This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Re: [PATCH] Re: [perl #39530] printf: bad formatting of hexadecimal conversion of...
[perl5.git] / regcomp.c
CommitLineData
a0d0e21e
LW
1/* regcomp.c
2 */
3
4/*
5 * "A fair jaw-cracker dwarf-language must be." --Samwise Gamgee
6 */
7
61296642
DM
8/* This file contains functions for compiling a regular expression. See
9 * also regexec.c which funnily enough, contains functions for executing
166f8a29 10 * a regular expression.
e4a054ea
DM
11 *
12 * This file is also copied at build time to ext/re/re_comp.c, where
13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14 * This causes the main functions to be compiled under new names and with
15 * debugging support added, which makes "use re 'debug'" work.
166f8a29
DM
16 */
17
a687059c
LW
18/* NOTE: this is derived from Henry Spencer's regexp code, and should not
19 * confused with the original package (see point 3 below). Thanks, Henry!
20 */
21
22/* Additional note: this code is very heavily munged from Henry's version
23 * in places. In some spots I've traded clarity for efficiency, so don't
24 * blame Henry for some of the lack of readability.
25 */
26
e50aee73
AD
27/* The names of the functions have been changed from regcomp and
28 * regexec to pregcomp and pregexec in order to avoid conflicts
29 * with the POSIX routines of the same names.
30*/
31
b9d5759e 32#ifdef PERL_EXT_RE_BUILD
54df2634 33#include "re_top.h"
b81d288d 34#endif
56953603 35
a687059c 36/*
e50aee73 37 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
38 *
39 * Copyright (c) 1986 by University of Toronto.
40 * Written by Henry Spencer. Not derived from licensed software.
41 *
42 * Permission is granted to anyone to use this software for any
43 * purpose on any computer system, and to redistribute it freely,
44 * subject to the following restrictions:
45 *
46 * 1. The author is not responsible for the consequences of use of
47 * this software, no matter how awful, even if they arise
48 * from defects in it.
49 *
50 * 2. The origin of this software must not be misrepresented, either
51 * by explicit claim or by omission.
52 *
53 * 3. Altered versions must be plainly marked as such, and must not
54 * be misrepresented as being the original software.
55 *
56 *
57 **** Alterations to Henry's code are...
58 ****
4bb101f2 59 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
b94e2f88 60 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, by Larry Wall and others
a687059c 61 ****
9ef589d8
LW
62 **** You may distribute under the terms of either the GNU General Public
63 **** License or the Artistic License, as specified in the README file.
64
a687059c
LW
65 *
66 * Beware that some of this code is subtly aware of the way operator
67 * precedence is structured in regular expressions. Serious changes in
68 * regular-expression syntax might require a total rethink.
69 */
70#include "EXTERN.h"
864dbfa3 71#define PERL_IN_REGCOMP_C
a687059c 72#include "perl.h"
d06ea78c 73
acfe0abc 74#ifndef PERL_IN_XSUB_RE
d06ea78c
GS
75# include "INTERN.h"
76#endif
c277df42
IZ
77
78#define REG_COMP_C
54df2634
NC
79#ifdef PERL_IN_XSUB_RE
80# include "re_comp.h"
81#else
82# include "regcomp.h"
83#endif
a687059c 84
d4cce5f1 85#ifdef op
11343788 86#undef op
d4cce5f1 87#endif /* op */
11343788 88
fe14fcc3 89#ifdef MSDOS
7e4e8c89 90# if defined(BUGGY_MSC6)
fe14fcc3 91 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
7e4e8c89 92# pragma optimize("a",off)
fe14fcc3 93 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
7e4e8c89
NC
94# pragma optimize("w",on )
95# endif /* BUGGY_MSC6 */
fe14fcc3
LW
96#endif /* MSDOS */
97
a687059c
LW
98#ifndef STATIC
99#define STATIC static
100#endif
101
830247a4 102typedef struct RExC_state_t {
e2509266 103 U32 flags; /* are we folding, multilining? */
830247a4
IZ
104 char *precomp; /* uncompiled string. */
105 regexp *rx;
fac92740 106 char *start; /* Start of input for compile */
830247a4
IZ
107 char *end; /* End of input for compile */
108 char *parse; /* Input-scan pointer. */
109 I32 whilem_seen; /* number of WHILEM in this expr */
fac92740 110 regnode *emit_start; /* Start of emitted-code area */
ffc61ed2 111 regnode *emit; /* Code-emit pointer; &regdummy = don't = compiling */
830247a4
IZ
112 I32 naughty; /* How bad is this pattern? */
113 I32 sawback; /* Did we see \1, ...? */
114 U32 seen;
115 I32 size; /* Code size. */
116 I32 npar; /* () count. */
117 I32 extralen;
118 I32 seen_zerolen;
119 I32 seen_evals;
1aa99e6b 120 I32 utf8;
830247a4
IZ
121#if ADD_TO_REGEXEC
122 char *starttry; /* -Dr: where regtry was called. */
123#define RExC_starttry (pRExC_state->starttry)
124#endif
3dab1dad 125#ifdef DEBUGGING
be8e71aa 126 const char *lastparse;
3dab1dad
YO
127 I32 lastnum;
128#define RExC_lastparse (pRExC_state->lastparse)
129#define RExC_lastnum (pRExC_state->lastnum)
130#endif
830247a4
IZ
131} RExC_state_t;
132
e2509266 133#define RExC_flags (pRExC_state->flags)
830247a4
IZ
134#define RExC_precomp (pRExC_state->precomp)
135#define RExC_rx (pRExC_state->rx)
fac92740 136#define RExC_start (pRExC_state->start)
830247a4
IZ
137#define RExC_end (pRExC_state->end)
138#define RExC_parse (pRExC_state->parse)
139#define RExC_whilem_seen (pRExC_state->whilem_seen)
fac92740 140#define RExC_offsets (pRExC_state->rx->offsets) /* I am not like the others */
830247a4 141#define RExC_emit (pRExC_state->emit)
fac92740 142#define RExC_emit_start (pRExC_state->emit_start)
830247a4
IZ
143#define RExC_naughty (pRExC_state->naughty)
144#define RExC_sawback (pRExC_state->sawback)
145#define RExC_seen (pRExC_state->seen)
146#define RExC_size (pRExC_state->size)
147#define RExC_npar (pRExC_state->npar)
148#define RExC_extralen (pRExC_state->extralen)
149#define RExC_seen_zerolen (pRExC_state->seen_zerolen)
150#define RExC_seen_evals (pRExC_state->seen_evals)
1aa99e6b 151#define RExC_utf8 (pRExC_state->utf8)
830247a4 152
a687059c
LW
153#define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
154#define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
155 ((*s) == '{' && regcurly(s)))
a687059c 156
35c8bce7
LW
157#ifdef SPSTART
158#undef SPSTART /* dratted cpp namespace... */
159#endif
a687059c
LW
160/*
161 * Flags to be passed up and down.
162 */
a687059c 163#define WORST 0 /* Worst case. */
821b33a5 164#define HASWIDTH 0x1 /* Known to match non-null strings. */
a0d0e21e
LW
165#define SIMPLE 0x2 /* Simple enough to be STAR/PLUS operand. */
166#define SPSTART 0x4 /* Starts with * or +. */
167#define TRYAGAIN 0x8 /* Weeded out a declaration. */
a687059c 168
3dab1dad
YO
169#define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
170
07be1b83
YO
171/* whether trie related optimizations are enabled */
172#if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
173#define TRIE_STUDY_OPT
174#define TRIE_STCLASS
175#endif
2c2d71f5
JH
176/* Length of a variant. */
177
178typedef struct scan_data_t {
179 I32 len_min;
180 I32 len_delta;
181 I32 pos_min;
182 I32 pos_delta;
183 SV *last_found;
184 I32 last_end; /* min value, <0 unless valid. */
185 I32 last_start_min;
186 I32 last_start_max;
187 SV **longest; /* Either &l_fixed, or &l_float. */
188 SV *longest_fixed;
189 I32 offset_fixed;
190 SV *longest_float;
191 I32 offset_float_min;
192 I32 offset_float_max;
193 I32 flags;
194 I32 whilem_c;
cb434fcc 195 I32 *last_closep;
653099ff 196 struct regnode_charclass_class *start_class;
2c2d71f5
JH
197} scan_data_t;
198
a687059c 199/*
e50aee73 200 * Forward declarations for pregcomp()'s friends.
a687059c 201 */
a0d0e21e 202
27da23d5
JH
203static const scan_data_t zero_scan_data =
204 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
c277df42
IZ
205
206#define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
07be1b83
YO
207#define SF_BEFORE_SEOL 0x0001
208#define SF_BEFORE_MEOL 0x0002
c277df42
IZ
209#define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
210#define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
211
09b7f37c
CB
212#ifdef NO_UNARY_PLUS
213# define SF_FIX_SHIFT_EOL (0+2)
214# define SF_FL_SHIFT_EOL (0+4)
215#else
216# define SF_FIX_SHIFT_EOL (+2)
217# define SF_FL_SHIFT_EOL (+4)
218#endif
c277df42
IZ
219
220#define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
221#define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
222
223#define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
224#define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
07be1b83
YO
225#define SF_IS_INF 0x0040
226#define SF_HAS_PAR 0x0080
227#define SF_IN_PAR 0x0100
228#define SF_HAS_EVAL 0x0200
229#define SCF_DO_SUBSTR 0x0400
653099ff
GS
230#define SCF_DO_STCLASS_AND 0x0800
231#define SCF_DO_STCLASS_OR 0x1000
232#define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
e1901655 233#define SCF_WHILEM_VISITED_POS 0x2000
c277df42 234
07be1b83
YO
235#define SCF_EXACT_TRIE 0x4000 /* should re study once we are done? */
236
eb160463 237#define UTF (RExC_utf8 != 0)
e2509266
JH
238#define LOC ((RExC_flags & PMf_LOCALE) != 0)
239#define FOLD ((RExC_flags & PMf_FOLD) != 0)
a0ed51b3 240
ffc61ed2 241#define OOB_UNICODE 12345678
93733859 242#define OOB_NAMEDCLASS -1
b8c5462f 243
a0ed51b3
LW
244#define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
245#define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
246
8615cb43 247
b45f050a
JF
248/* length of regex to show in messages that don't mark a position within */
249#define RegexLengthToShowInErrorMessages 127
250
251/*
252 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
253 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
254 * op/pragma/warn/regcomp.
255 */
7253e4e3
RK
256#define MARKER1 "<-- HERE" /* marker as it appears in the description */
257#define MARKER2 " <-- HERE " /* marker as it appears within the regex */
b81d288d 258
7253e4e3 259#define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
b45f050a
JF
260
261/*
262 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
263 * arg. Show regex, up to a maximum length. If it's too long, chop and add
264 * "...".
265 */
ccb2c380 266#define FAIL(msg) STMT_START { \
bfed75c6 267 const char *ellipses = ""; \
ccb2c380
MP
268 IV len = RExC_end - RExC_precomp; \
269 \
270 if (!SIZE_ONLY) \
271 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
272 if (len > RegexLengthToShowInErrorMessages) { \
273 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
274 len = RegexLengthToShowInErrorMessages - 10; \
275 ellipses = "..."; \
276 } \
277 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
278 msg, (int)len, RExC_precomp, ellipses); \
279} STMT_END
8615cb43 280
b45f050a 281/*
b45f050a
JF
282 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
283 */
ccb2c380 284#define Simple_vFAIL(m) STMT_START { \
a28509cc 285 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
286 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
287 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
288} STMT_END
b45f050a
JF
289
290/*
291 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
292 */
ccb2c380
MP
293#define vFAIL(m) STMT_START { \
294 if (!SIZE_ONLY) \
295 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
296 Simple_vFAIL(m); \
297} STMT_END
b45f050a
JF
298
299/*
300 * Like Simple_vFAIL(), but accepts two arguments.
301 */
ccb2c380 302#define Simple_vFAIL2(m,a1) STMT_START { \
a28509cc 303 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
304 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
305 (int)offset, RExC_precomp, RExC_precomp + offset); \
306} STMT_END
b45f050a
JF
307
308/*
309 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
310 */
ccb2c380
MP
311#define vFAIL2(m,a1) STMT_START { \
312 if (!SIZE_ONLY) \
313 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
314 Simple_vFAIL2(m, a1); \
315} STMT_END
b45f050a
JF
316
317
318/*
319 * Like Simple_vFAIL(), but accepts three arguments.
320 */
ccb2c380 321#define Simple_vFAIL3(m, a1, a2) STMT_START { \
a28509cc 322 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
323 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
324 (int)offset, RExC_precomp, RExC_precomp + offset); \
325} STMT_END
b45f050a
JF
326
327/*
328 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
329 */
ccb2c380
MP
330#define vFAIL3(m,a1,a2) STMT_START { \
331 if (!SIZE_ONLY) \
332 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
333 Simple_vFAIL3(m, a1, a2); \
334} STMT_END
b45f050a
JF
335
336/*
337 * Like Simple_vFAIL(), but accepts four arguments.
338 */
ccb2c380 339#define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
a28509cc 340 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
341 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
342 (int)offset, RExC_precomp, RExC_precomp + offset); \
343} STMT_END
b45f050a 344
ccb2c380 345#define vWARN(loc,m) STMT_START { \
a28509cc 346 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
347 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \
348 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
349} STMT_END
350
351#define vWARNdep(loc,m) STMT_START { \
a28509cc 352 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
353 Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
354 "%s" REPORT_LOCATION, \
355 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
356} STMT_END
357
358
359#define vWARN2(loc, m, a1) STMT_START { \
a28509cc 360 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
361 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
362 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
363} STMT_END
364
365#define vWARN3(loc, m, a1, a2) STMT_START { \
a28509cc 366 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
367 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
368 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
369} STMT_END
370
371#define vWARN4(loc, m, a1, a2, a3) STMT_START { \
a28509cc 372 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
373 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
374 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
375} STMT_END
376
377#define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
a28509cc 378 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
379 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
380 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
381} STMT_END
9d1d55b5 382
8615cb43 383
cd439c50 384/* Allow for side effects in s */
ccb2c380
MP
385#define REGC(c,s) STMT_START { \
386 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
387} STMT_END
cd439c50 388
fac92740
MJD
389/* Macros for recording node offsets. 20001227 mjd@plover.com
390 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
391 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
392 * Element 0 holds the number n.
07be1b83 393 * Position is 1 indexed.
fac92740
MJD
394 */
395
ccb2c380
MP
396#define Set_Node_Offset_To_R(node,byte) STMT_START { \
397 if (! SIZE_ONLY) { \
398 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
07be1b83 399 __LINE__, (node), (int)(byte))); \
ccb2c380 400 if((node) < 0) { \
551405c4 401 Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
ccb2c380
MP
402 } else { \
403 RExC_offsets[2*(node)-1] = (byte); \
404 } \
405 } \
406} STMT_END
407
408#define Set_Node_Offset(node,byte) \
409 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
410#define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
411
412#define Set_Node_Length_To_R(node,len) STMT_START { \
413 if (! SIZE_ONLY) { \
414 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
551405c4 415 __LINE__, (int)(node), (int)(len))); \
ccb2c380 416 if((node) < 0) { \
551405c4 417 Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
ccb2c380
MP
418 } else { \
419 RExC_offsets[2*(node)] = (len); \
420 } \
421 } \
422} STMT_END
423
424#define Set_Node_Length(node,len) \
425 Set_Node_Length_To_R((node)-RExC_emit_start, len)
426#define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
427#define Set_Node_Cur_Length(node) \
428 Set_Node_Length(node, RExC_parse - parse_start)
fac92740
MJD
429
430/* Get offsets and lengths */
431#define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
432#define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
433
07be1b83
YO
434#define Set_Node_Offset_Length(node,offset,len) STMT_START { \
435 Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
436 Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
437} STMT_END
438
439
440#if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
441#define EXPERIMENTAL_INPLACESCAN
442#endif
443
acfe0abc 444static void clear_re(pTHX_ void *r);
4327152a 445
653099ff
GS
446/* Mark that we cannot extend a found fixed substring at this point.
447 Updata the longest found anchored substring and the longest found
448 floating substrings if needed. */
449
4327152a 450STATIC void
097eb12c 451S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data)
c277df42 452{
e1ec3a88
AL
453 const STRLEN l = CHR_SVLEN(data->last_found);
454 const STRLEN old_l = CHR_SVLEN(*data->longest);
b81d288d 455
c277df42 456 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
6b43b216 457 SvSetMagicSV(*data->longest, data->last_found);
c277df42
IZ
458 if (*data->longest == data->longest_fixed) {
459 data->offset_fixed = l ? data->last_start_min : data->pos_min;
460 if (data->flags & SF_BEFORE_EOL)
b81d288d 461 data->flags
c277df42
IZ
462 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
463 else
464 data->flags &= ~SF_FIX_BEFORE_EOL;
a0ed51b3
LW
465 }
466 else {
c277df42 467 data->offset_float_min = l ? data->last_start_min : data->pos_min;
b81d288d
AB
468 data->offset_float_max = (l
469 ? data->last_start_max
c277df42 470 : data->pos_min + data->pos_delta);
9051bda5
HS
471 if ((U32)data->offset_float_max > (U32)I32_MAX)
472 data->offset_float_max = I32_MAX;
c277df42 473 if (data->flags & SF_BEFORE_EOL)
b81d288d 474 data->flags
c277df42
IZ
475 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
476 else
477 data->flags &= ~SF_FL_BEFORE_EOL;
478 }
479 }
480 SvCUR_set(data->last_found, 0);
0eda9292 481 {
a28509cc 482 SV * const sv = data->last_found;
097eb12c
AL
483 if (SvUTF8(sv) && SvMAGICAL(sv)) {
484 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
485 if (mg)
486 mg->mg_len = 0;
487 }
0eda9292 488 }
c277df42
IZ
489 data->last_end = -1;
490 data->flags &= ~SF_BEFORE_EOL;
491}
492
653099ff
GS
493/* Can match anything (initialization) */
494STATIC void
097eb12c 495S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 496{
653099ff 497 ANYOF_CLASS_ZERO(cl);
f8bef550 498 ANYOF_BITMAP_SETALL(cl);
1aa99e6b 499 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
653099ff
GS
500 if (LOC)
501 cl->flags |= ANYOF_LOCALE;
502}
503
504/* Can match anything (initialization) */
505STATIC int
5f66b61c 506S_cl_is_anything(const struct regnode_charclass_class *cl)
653099ff
GS
507{
508 int value;
509
aaa51d5e 510 for (value = 0; value <= ANYOF_MAX; value += 2)
653099ff
GS
511 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
512 return 1;
1aa99e6b
IH
513 if (!(cl->flags & ANYOF_UNICODE_ALL))
514 return 0;
f8bef550
NC
515 if (!ANYOF_BITMAP_TESTALLSET(cl))
516 return 0;
653099ff
GS
517 return 1;
518}
519
520/* Can match anything (initialization) */
521STATIC void
097eb12c 522S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 523{
8ecf7187 524 Zero(cl, 1, struct regnode_charclass_class);
653099ff 525 cl->type = ANYOF;
830247a4 526 cl_anything(pRExC_state, cl);
653099ff
GS
527}
528
529STATIC void
097eb12c 530S_cl_init_zero(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 531{
8ecf7187 532 Zero(cl, 1, struct regnode_charclass_class);
653099ff 533 cl->type = ANYOF;
830247a4 534 cl_anything(pRExC_state, cl);
653099ff
GS
535 if (LOC)
536 cl->flags |= ANYOF_LOCALE;
537}
538
539/* 'And' a given class with another one. Can create false positives */
540/* We assume that cl is not inverted */
541STATIC void
5f66b61c 542S_cl_and(struct regnode_charclass_class *cl,
a28509cc 543 const struct regnode_charclass_class *and_with)
653099ff 544{
653099ff
GS
545 if (!(and_with->flags & ANYOF_CLASS)
546 && !(cl->flags & ANYOF_CLASS)
547 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
548 && !(and_with->flags & ANYOF_FOLD)
549 && !(cl->flags & ANYOF_FOLD)) {
550 int i;
551
552 if (and_with->flags & ANYOF_INVERT)
553 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
554 cl->bitmap[i] &= ~and_with->bitmap[i];
555 else
556 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
557 cl->bitmap[i] &= and_with->bitmap[i];
558 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
559 if (!(and_with->flags & ANYOF_EOS))
560 cl->flags &= ~ANYOF_EOS;
1aa99e6b 561
14ebb1a2
JH
562 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE &&
563 !(and_with->flags & ANYOF_INVERT)) {
1aa99e6b
IH
564 cl->flags &= ~ANYOF_UNICODE_ALL;
565 cl->flags |= ANYOF_UNICODE;
566 ARG_SET(cl, ARG(and_with));
567 }
14ebb1a2
JH
568 if (!(and_with->flags & ANYOF_UNICODE_ALL) &&
569 !(and_with->flags & ANYOF_INVERT))
1aa99e6b 570 cl->flags &= ~ANYOF_UNICODE_ALL;
14ebb1a2
JH
571 if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)) &&
572 !(and_with->flags & ANYOF_INVERT))
1aa99e6b 573 cl->flags &= ~ANYOF_UNICODE;
653099ff
GS
574}
575
576/* 'OR' a given class with another one. Can create false positives */
577/* We assume that cl is not inverted */
578STATIC void
097eb12c 579S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
653099ff 580{
653099ff
GS
581 if (or_with->flags & ANYOF_INVERT) {
582 /* We do not use
583 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
584 * <= (B1 | !B2) | (CL1 | !CL2)
585 * which is wasteful if CL2 is small, but we ignore CL2:
586 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
587 * XXXX Can we handle case-fold? Unclear:
588 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
589 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
590 */
591 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
592 && !(or_with->flags & ANYOF_FOLD)
593 && !(cl->flags & ANYOF_FOLD) ) {
594 int i;
595
596 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
597 cl->bitmap[i] |= ~or_with->bitmap[i];
598 } /* XXXX: logic is complicated otherwise */
599 else {
830247a4 600 cl_anything(pRExC_state, cl);
653099ff
GS
601 }
602 } else {
603 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
604 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
b81d288d 605 && (!(or_with->flags & ANYOF_FOLD)
653099ff
GS
606 || (cl->flags & ANYOF_FOLD)) ) {
607 int i;
608
609 /* OR char bitmap and class bitmap separately */
610 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
611 cl->bitmap[i] |= or_with->bitmap[i];
612 if (or_with->flags & ANYOF_CLASS) {
613 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
614 cl->classflags[i] |= or_with->classflags[i];
615 cl->flags |= ANYOF_CLASS;
616 }
617 }
618 else { /* XXXX: logic is complicated, leave it along for a moment. */
830247a4 619 cl_anything(pRExC_state, cl);
653099ff
GS
620 }
621 }
622 if (or_with->flags & ANYOF_EOS)
623 cl->flags |= ANYOF_EOS;
1aa99e6b
IH
624
625 if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
626 ARG(cl) != ARG(or_with)) {
627 cl->flags |= ANYOF_UNICODE_ALL;
628 cl->flags &= ~ANYOF_UNICODE;
629 }
630 if (or_with->flags & ANYOF_UNICODE_ALL) {
631 cl->flags |= ANYOF_UNICODE_ALL;
632 cl->flags &= ~ANYOF_UNICODE;
633 }
653099ff
GS
634}
635
5d1c421c 636/*
a3621e74 637
3dab1dad 638 make_trie(startbranch,first,last,tail,flags,depth)
a3621e74
YO
639 startbranch: the first branch in the whole branch sequence
640 first : start branch of sequence of branch-exact nodes.
641 May be the same as startbranch
642 last : Thing following the last branch.
643 May be the same as tail.
644 tail : item following the branch sequence
645 flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/
3dab1dad 646 depth : indent depth
a3621e74
YO
647
648Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
649
650A trie is an N'ary tree where the branches are determined by digital
651decomposition of the key. IE, at the root node you look up the 1st character and
652follow that branch repeat until you find the end of the branches. Nodes can be
653marked as "accepting" meaning they represent a complete word. Eg:
654
655 /he|she|his|hers/
656
657would convert into the following structure. Numbers represent states, letters
658following numbers represent valid transitions on the letter from that state, if
659the number is in square brackets it represents an accepting state, otherwise it
660will be in parenthesis.
661
662 +-h->+-e->[3]-+-r->(8)-+-s->[9]
663 | |
664 | (2)
665 | |
666 (1) +-i->(6)-+-s->[7]
667 |
668 +-s->(3)-+-h->(4)-+-e->[5]
669
670 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
671
672This shows that when matching against the string 'hers' we will begin at state 1
673read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
674then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
675is also accepting. Thus we know that we can match both 'he' and 'hers' with a
676single traverse. We store a mapping from accepting to state to which word was
677matched, and then when we have multiple possibilities we try to complete the
678rest of the regex in the order in which they occured in the alternation.
679
680The only prior NFA like behaviour that would be changed by the TRIE support is
681the silent ignoring of duplicate alternations which are of the form:
682
683 / (DUPE|DUPE) X? (?{ ... }) Y /x
684
685Thus EVAL blocks follwing a trie may be called a different number of times with
686and without the optimisation. With the optimisations dupes will be silently
687ignored. This inconsistant behaviour of EVAL type nodes is well established as
688the following demonstrates:
689
690 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
691
692which prints out 'word' three times, but
693
694 'words'=~/(word|word|word)(?{ print $1 })S/
695
696which doesnt print it out at all. This is due to other optimisations kicking in.
697
698Example of what happens on a structural level:
699
700The regexp /(ac|ad|ab)+/ will produce the folowing debug output:
701
702 1: CURLYM[1] {1,32767}(18)
703 5: BRANCH(8)
704 6: EXACT <ac>(16)
705 8: BRANCH(11)
706 9: EXACT <ad>(16)
707 11: BRANCH(14)
708 12: EXACT <ab>(16)
709 16: SUCCEED(0)
710 17: NOTHING(18)
711 18: END(0)
712
713This would be optimizable with startbranch=5, first=5, last=16, tail=16
714and should turn into:
715
716 1: CURLYM[1] {1,32767}(18)
717 5: TRIE(16)
718 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
719 <ac>
720 <ad>
721 <ab>
722 16: SUCCEED(0)
723 17: NOTHING(18)
724 18: END(0)
725
726Cases where tail != last would be like /(?foo|bar)baz/:
727
728 1: BRANCH(4)
729 2: EXACT <foo>(8)
730 4: BRANCH(7)
731 5: EXACT <bar>(8)
732 7: TAIL(8)
733 8: EXACT <baz>(10)
734 10: END(0)
735
736which would be optimizable with startbranch=1, first=1, last=7, tail=8
737and would end up looking like:
738
739 1: TRIE(8)
740 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
741 <foo>
742 <bar>
743 7: TAIL(8)
744 8: EXACT <baz>(10)
745 10: END(0)
746
3dab1dad
YO
747 d = uvuni_to_utf8_flags(d, uv, 0);
748
749is the recommended Unicode-aware way of saying
750
751 *(d++) = uv;
a3621e74
YO
752*/
753
3dab1dad
YO
754#define TRIE_STORE_REVCHAR \
755 STMT_START { \
756 SV *tmp = Perl_newSVpvf_nocontext( "%c", (int)uvc ); \
757 av_push( TRIE_REVCHARMAP(trie), tmp ); \
758 } STMT_END
a3621e74
YO
759
760#define TRIE_READ_CHAR STMT_START { \
07be1b83 761 wordlen++; \
a3621e74
YO
762 if ( UTF ) { \
763 if ( folder ) { \
764 if ( foldlen > 0 ) { \
765 uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags ); \
766 foldlen -= len; \
767 scan += len; \
768 len = 0; \
769 } else { \
e1ec3a88 770 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
a3621e74
YO
771 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
772 foldlen -= UNISKIP( uvc ); \
773 scan = foldbuf + UNISKIP( uvc ); \
774 } \
775 } else { \
e1ec3a88 776 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
a3621e74
YO
777 } \
778 } else { \
779 uvc = (U32)*uc; \
780 len = 1; \
781 } \
782} STMT_END
783
784
785#define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
786#define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
787#define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
788#define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
789
790#define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
791 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
792 TRIE_LIST_LEN( state ) *= 2; \
793 Renew( trie->states[ state ].trans.list, \
794 TRIE_LIST_LEN( state ), reg_trie_trans_le ); \
795 } \
796 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
797 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
798 TRIE_LIST_CUR( state )++; \
799} STMT_END
800
801#define TRIE_LIST_NEW(state) STMT_START { \
a02a5408 802 Newxz( trie->states[ state ].trans.list, \
a3621e74
YO
803 4, reg_trie_trans_le ); \
804 TRIE_LIST_CUR( state ) = 1; \
805 TRIE_LIST_LEN( state ) = 4; \
806} STMT_END
807
07be1b83
YO
808#define TRIE_HANDLE_WORD(state) STMT_START { \
809 if ( !trie->states[ state ].wordnum ) { \
810 /* we haven't inserted this word into the structure yet. */ \
811 if (trie->wordlen) \
812 trie->wordlen[ curword ] = wordlen; \
3dab1dad
YO
813 trie->states[ state ].wordnum = ++curword; \
814 DEBUG_r({ \
815 /* store the word for dumping */ \
816 SV* tmp; \
07be1b83
YO
817 if (OP(noper) != NOTHING) \
818 tmp = newSVpvn(STRING(noper), STR_LEN(noper)); \
3dab1dad 819 else \
07be1b83 820 tmp = newSVpvn( "", 0 ); \
3dab1dad
YO
821 if ( UTF ) SvUTF8_on( tmp ); \
822 av_push( trie->words, tmp ); \
823 }); \
824 } else { \
825 NOOP; /* It's a dupe. So ignore it. */ \
826 } \
827} STMT_END
828
829#ifdef DEBUGGING
07be1b83 830/*
3dab1dad
YO
831 dump_trie(trie)
832 dump_trie_interim_list(trie,next_alloc)
833 dump_trie_interim_table(trie,next_alloc)
834
835 These routines dump out a trie in a somewhat readable format.
07be1b83
YO
836 The _interim_ variants are used for debugging the interim
837 tables that are used to generate the final compressed
838 representation which is what dump_trie expects.
839
3dab1dad
YO
840 Part of the reason for their existance is to provide a form
841 of documentation as to how the different representations function.
07be1b83
YO
842
843*/
3dab1dad
YO
844
845/*
846 dump_trie(trie)
847 Dumps the final compressed table form of the trie to Perl_debug_log.
848 Used for debugging make_trie().
849*/
850
851STATIC void
852S_dump_trie(pTHX_ const struct _reg_trie_data *trie,U32 depth)
853{
854 U32 state;
855 GET_RE_DEBUG_FLAGS_DECL;
856
857 PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
858 (int)depth * 2 + 2,"",
859 "Match","Base","Ofs" );
860
861 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
be8e71aa 862 SV ** const tmp = av_fetch( trie->revcharmap, state, 0);
3dab1dad
YO
863 if ( tmp ) {
864 PerlIO_printf( Perl_debug_log, "%4.4s ", SvPV_nolen_const( *tmp ) );
865 }
866 }
867 PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
868 (int)depth * 2 + 2,"");
869
870 for( state = 0 ; state < trie->uniquecharcount ; state++ )
871 PerlIO_printf( Perl_debug_log, "-----");
872 PerlIO_printf( Perl_debug_log, "\n");
873
874 for( state = 1 ; state < TRIE_LASTSTATE(trie) ; state++ ) {
be8e71aa 875 const U32 base = trie->states[ state ].trans.base;
3dab1dad
YO
876
877 PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
878
879 if ( trie->states[ state ].wordnum ) {
880 PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
881 } else {
882 PerlIO_printf( Perl_debug_log, "%6s", "" );
883 }
884
885 PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
886
887 if ( base ) {
888 U32 ofs = 0;
889
890 while( ( base + ofs < trie->uniquecharcount ) ||
891 ( base + ofs - trie->uniquecharcount < trie->lasttrans
892 && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
893 ofs++;
894
895 PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
896
897 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
898 if ( ( base + ofs >= trie->uniquecharcount ) &&
899 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
900 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
901 {
902 PerlIO_printf( Perl_debug_log, "%4"UVXf" ",
903 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
904 } else {
905 PerlIO_printf( Perl_debug_log, "%4s "," ." );
906 }
907 }
908
909 PerlIO_printf( Perl_debug_log, "]");
910
911 }
912 PerlIO_printf( Perl_debug_log, "\n" );
913 }
914}
915/*
916 dump_trie_interim_list(trie,next_alloc)
917 Dumps a fully constructed but uncompressed trie in list form.
918 List tries normally only are used for construction when the number of
919 possible chars (trie->uniquecharcount) is very high.
920 Used for debugging make_trie().
921*/
922STATIC void
923S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie, U32 next_alloc,U32 depth)
924{
925 U32 state;
926 GET_RE_DEBUG_FLAGS_DECL;
927 /* print out the table precompression. */
928 PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s",
929 (int)depth * 2 + 2,"", (int)depth * 2 + 2,"");
930 PerlIO_printf( Perl_debug_log, "------:-----+-----------------" );
931
932 for( state=1 ; state < next_alloc ; state ++ ) {
933 U16 charid;
934
935 PerlIO_printf( Perl_debug_log, "\n%*s %4"UVXf" :",
936 (int)depth * 2 + 2,"", (UV)state );
937 if ( ! trie->states[ state ].wordnum ) {
938 PerlIO_printf( Perl_debug_log, "%5s| ","");
939 } else {
940 PerlIO_printf( Perl_debug_log, "W%4x| ",
941 trie->states[ state ].wordnum
942 );
943 }
944 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
be8e71aa 945 SV ** const tmp = av_fetch( trie->revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
3dab1dad
YO
946 PerlIO_printf( Perl_debug_log, "%s:%3X=%4"UVXf" | ",
947 SvPV_nolen_const( *tmp ),
948 TRIE_LIST_ITEM(state,charid).forid,
949 (UV)TRIE_LIST_ITEM(state,charid).newstate
950 );
951 }
952
953 }
954}
955
956/*
957 dump_trie_interim_table(trie,next_alloc)
958 Dumps a fully constructed but uncompressed trie in table form.
959 This is the normal DFA style state transition table, with a few
960 twists to facilitate compression later.
961 Used for debugging make_trie().
962*/
963STATIC void
964S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie, U32 next_alloc, U32 depth)
965{
966 U32 state;
967 U16 charid;
968 GET_RE_DEBUG_FLAGS_DECL;
969
970 /*
971 print out the table precompression so that we can do a visual check
972 that they are identical.
973 */
974
975 PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
976
977 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
be8e71aa 978 SV ** const tmp = av_fetch( trie->revcharmap, charid, 0);
3dab1dad
YO
979 if ( tmp ) {
980 PerlIO_printf( Perl_debug_log, "%4.4s ", SvPV_nolen_const( *tmp ) );
981 }
982 }
983
984 PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
985
986 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
987 PerlIO_printf( Perl_debug_log, "%4s-", "----" );
988 }
989
990 PerlIO_printf( Perl_debug_log, "\n" );
991
992 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
993
994 PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
995 (int)depth * 2 + 2,"",
996 (UV)TRIE_NODENUM( state ) );
997
998 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
999 PerlIO_printf( Perl_debug_log, "%4"UVXf" ",
1000 (UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next ) );
1001 }
1002 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1003 PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
1004 } else {
1005 PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
1006 trie->states[ TRIE_NODENUM( state ) ].wordnum );
1007 }
1008 }
07be1b83 1009}
3dab1dad
YO
1010
1011#endif
1012
07be1b83
YO
1013#define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
1014 ( ( base + charid >= ucharcount \
1015 && base + charid < ubound \
1016 && state == trie->trans[ base - ucharcount + charid ].check \
1017 && trie->trans[ base - ucharcount + charid ].next ) \
1018 ? trie->trans[ base - ucharcount + charid ].next \
1019 : ( state==1 ? special : 0 ) \
1020 )
3dab1dad 1021
07be1b83
YO
1022STATIC void
1023S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode *stclass, U32 depth)
1024{
1025/* The Trie is constructed and compressed now so we can build a fail array now if its needed
1026
1027 This is apparently the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
1028 "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
1029 ISBN 0-201-10088-6
1030
1031 We find the fail state for each state in the trie, this state is the longest proper
1032 suffix of the current states 'word' that is also a proper prefix of another word in our
1033 trie. State 1 represents the word '' and is the thus the default fail state. This allows
1034 the DFA not to have to restart after its tried and failed a word at a given point, it
1035 simply continues as though it had been matching the other word in the first place.
1036 Consider
1037 'abcdgu'=~/abcdefg|cdgu/
1038 When we get to 'd' we are still matching the first word, we would encounter 'g' which would
1039 fail, which would bring use to the state representing 'd' in the second word where we would
1040 try 'g' and succeed, prodceding to match 'cdgu'.
1041 */
1042 /* add a fail transition */
1043 reg_trie_data *trie=(reg_trie_data *)RExC_rx->data->data[ARG(source)];
1044 U32 *q;
be8e71aa
YO
1045 const U32 ucharcount = trie->uniquecharcount;
1046 const U32 numstates = trie->laststate;
1047 const U32 ubound = trie->lasttrans + ucharcount;
07be1b83
YO
1048 U32 q_read = 0;
1049 U32 q_write = 0;
1050 U32 charid;
1051 U32 base = trie->states[ 1 ].trans.base;
07be1b83
YO
1052 U32 *fail;
1053 reg_ac_data *aho;
1054 const U32 data_slot = add_data( pRExC_state, 1, "T" );
f9049ba1
SP
1055#ifndef DEBUGGING
1056 PERL_UNUSED_ARG(depth);
1057#endif
07be1b83
YO
1058 GET_RE_DEBUG_FLAGS_DECL;
1059
1060 ARG_SET( stclass, data_slot );
1061 Newxz( aho, 1, reg_ac_data );
1062 RExC_rx->data->data[ data_slot ] = (void*)aho;
1063 aho->trie=trie;
1064 aho->states=(reg_trie_state *)savepvn((const char*)trie->states,
1065 (trie->laststate+1)*sizeof(reg_trie_state));
1066 Newxz( q, numstates, U32);
1067 Newxz( aho->fail, numstates, U32 );
283d8f99
YO
1068 aho->refcount = 1;
1069 fail = aho->fail;
07be1b83
YO
1070 fail[ 0 ] = fail[ 1 ] = 1;
1071
1072 for ( charid = 0; charid < ucharcount ; charid++ ) {
be8e71aa
YO
1073 const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
1074 if ( newstate ) {
07be1b83
YO
1075 q[ q_write ] = newstate;
1076 /* set to point at the root */
1077 fail[ q[ q_write++ ] ]=1;
1078 }
1079 }
1080 while ( q_read < q_write) {
be8e71aa 1081 const U32 cur = q[ q_read++ % numstates ];
07be1b83
YO
1082 base = trie->states[ cur ].trans.base;
1083
1084 for ( charid = 0 ; charid < ucharcount ; charid++ ) {
be8e71aa
YO
1085 const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
1086 if (ch_state) {
07be1b83
YO
1087 U32 fail_state = cur;
1088 U32 fail_base;
1089 do {
1090 fail_state = fail[ fail_state ];
1091 fail_base = aho->states[ fail_state ].trans.base;
1092 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
1093
1094 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
1095 fail[ ch_state ] = fail_state;
1096 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
1097 {
1098 aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum;
1099 }
1100 q[ q_write++ % numstates] = ch_state;
1101 }
1102 }
1103 }
1104
1105 DEBUG_TRIE_COMPILE_MORE_r({
1106 PerlIO_printf(Perl_debug_log, "%*sFail: 1", (int)(depth * 2), "");
1107 for( q_read=2; q_read<numstates; q_read++ ) {
1108 PerlIO_printf(Perl_debug_log, ", %"UVuf, fail[q_read]);
1109 }
1110 PerlIO_printf(Perl_debug_log, "\n");
1111 });
1112 Safefree(q);
1113 /*RExC_seen |= REG_SEEN_TRIEDFA;*/
1114}
3dab1dad
YO
1115
1116
1117
a3621e74 1118STATIC I32
3dab1dad 1119S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 flags, U32 depth)
a3621e74 1120{
27da23d5 1121 dVAR;
a3621e74
YO
1122 /* first pass, loop through and scan words */
1123 reg_trie_data *trie;
1124 regnode *cur;
9f7f3913 1125 const U32 uniflags = UTF8_ALLOW_DEFAULT;
a3621e74
YO
1126 STRLEN len = 0;
1127 UV uvc = 0;
1128 U16 curword = 0;
1129 U32 next_alloc = 0;
1130 /* we just use folder as a flag in utf8 */
e1ec3a88 1131 const U8 * const folder = ( flags == EXACTF
a3621e74
YO
1132 ? PL_fold
1133 : ( flags == EXACTFL
1134 ? PL_fold_locale
1135 : NULL
1136 )
1137 );
1138
e1ec3a88 1139 const U32 data_slot = add_data( pRExC_state, 1, "t" );
a3621e74 1140 SV *re_trie_maxbuff;
3dab1dad
YO
1141#ifndef DEBUGGING
1142 /* these are only used during construction but are useful during
8e11feef
RGS
1143 * debugging so we store them in the struct when debugging.
1144 * Wordcount is actually superfluous in debugging as we have
1145 * (AV*)trie->words to use for it, but that's not available when
1146 * not debugging... We could make the macro use the AV during
1147 * debugging though...
1148 */
3dab1dad
YO
1149 U16 trie_wordcount=0;
1150 STRLEN trie_charcount=0;
07be1b83 1151 /*U32 trie_laststate=0;*/
3dab1dad 1152 AV *trie_revcharmap;
f9049ba1 1153 PERL_UNUSED_ARG(depth);
3dab1dad 1154#endif
a3621e74
YO
1155 GET_RE_DEBUG_FLAGS_DECL;
1156
a02a5408 1157 Newxz( trie, 1, reg_trie_data );
a3621e74 1158 trie->refcount = 1;
3dab1dad 1159 trie->startstate = 1;
a3621e74 1160 RExC_rx->data->data[ data_slot ] = (void*)trie;
a02a5408 1161 Newxz( trie->charmap, 256, U16 );
3dab1dad
YO
1162 if (!(UTF && folder))
1163 Newxz( trie->bitmap, ANYOF_BITMAP_SIZE, char );
a3621e74
YO
1164 DEBUG_r({
1165 trie->words = newAV();
a3621e74 1166 });
3dab1dad 1167 TRIE_REVCHARMAP(trie) = newAV();
a3621e74 1168
0111c4fd 1169 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
a3621e74 1170 if (!SvIOK(re_trie_maxbuff)) {
0111c4fd 1171 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
a3621e74 1172 }
3dab1dad
YO
1173 DEBUG_OPTIMISE_r({
1174 PerlIO_printf( Perl_debug_log,
1175 "%*smake_trie start==%d, first==%d, last==%d, tail==%d\n",
1176 (int)depth * 2 + 2, "",
1177 REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
1178 REG_NODE_NUM(last), REG_NODE_NUM(tail));
1179 });
a3621e74
YO
1180 /* -- First loop and Setup --
1181
1182 We first traverse the branches and scan each word to determine if it
1183 contains widechars, and how many unique chars there are, this is
1184 important as we have to build a table with at least as many columns as we
1185 have unique chars.
1186
1187 We use an array of integers to represent the character codes 0..255
1188 (trie->charmap) and we use a an HV* to store unicode characters. We use the
1189 native representation of the character value as the key and IV's for the
1190 coded index.
1191
1192 *TODO* If we keep track of how many times each character is used we can
1193 remap the columns so that the table compression later on is more
1194 efficient in terms of memory by ensuring most common value is in the
1195 middle and the least common are on the outside. IMO this would be better
1196 than a most to least common mapping as theres a decent chance the most
1197 common letter will share a node with the least common, meaning the node
1198 will not be compressable. With a middle is most common approach the worst
1199 case is when we have the least common nodes twice.
1200
1201 */
1202
a3621e74 1203 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
c445ea15 1204 regnode * const noper = NEXTOPER( cur );
e1ec3a88 1205 const U8 *uc = (U8*)STRING( noper );
a28509cc 1206 const U8 * const e = uc + STR_LEN( noper );
a3621e74
YO
1207 STRLEN foldlen = 0;
1208 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2af232bd 1209 const U8 *scan = (U8*)NULL;
07be1b83 1210 U32 wordlen = 0; /* required init */
3dab1dad 1211 STRLEN chars=0;
a3621e74 1212
3dab1dad
YO
1213 TRIE_WORDCOUNT(trie)++;
1214 if (OP(noper) == NOTHING) {
1215 trie->minlen= 0;
1216 continue;
1217 }
1218 if (trie->bitmap) {
1219 TRIE_BITMAP_SET(trie,*uc);
1220 if ( folder ) TRIE_BITMAP_SET(trie,folder[ *uc ]);
1221 }
a3621e74 1222 for ( ; uc < e ; uc += len ) {
3dab1dad 1223 TRIE_CHARCOUNT(trie)++;
a3621e74 1224 TRIE_READ_CHAR;
3dab1dad 1225 chars++;
a3621e74
YO
1226 if ( uvc < 256 ) {
1227 if ( !trie->charmap[ uvc ] ) {
1228 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
1229 if ( folder )
1230 trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
3dab1dad 1231 TRIE_STORE_REVCHAR;
a3621e74
YO
1232 }
1233 } else {
1234 SV** svpp;
1235 if ( !trie->widecharmap )
1236 trie->widecharmap = newHV();
1237
1238 svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 1 );
1239
1240 if ( !svpp )
e4584336 1241 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
a3621e74
YO
1242
1243 if ( !SvTRUE( *svpp ) ) {
1244 sv_setiv( *svpp, ++trie->uniquecharcount );
3dab1dad 1245 TRIE_STORE_REVCHAR;
a3621e74
YO
1246 }
1247 }
1248 }
3dab1dad
YO
1249 if( cur == first ) {
1250 trie->minlen=chars;
1251 trie->maxlen=chars;
1252 } else if (chars < trie->minlen) {
1253 trie->minlen=chars;
1254 } else if (chars > trie->maxlen) {
1255 trie->maxlen=chars;
1256 }
1257
a3621e74
YO
1258 } /* end first pass */
1259 DEBUG_TRIE_COMPILE_r(
3dab1dad
YO
1260 PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1261 (int)depth * 2 + 2,"",
1262 ( trie->widecharmap ? "UTF8" : "NATIVE" ), TRIE_WORDCOUNT(trie),
be8e71aa
YO
1263 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
1264 (int)trie->minlen, (int)trie->maxlen )
a3621e74 1265 );
07be1b83 1266 Newxz( trie->wordlen, TRIE_WORDCOUNT(trie), U32 );
a3621e74
YO
1267
1268 /*
1269 We now know what we are dealing with in terms of unique chars and
1270 string sizes so we can calculate how much memory a naive
0111c4fd
RGS
1271 representation using a flat table will take. If it's over a reasonable
1272 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
a3621e74
YO
1273 conservative but potentially much slower representation using an array
1274 of lists.
1275
1276 At the end we convert both representations into the same compressed
1277 form that will be used in regexec.c for matching with. The latter
1278 is a form that cannot be used to construct with but has memory
1279 properties similar to the list form and access properties similar
1280 to the table form making it both suitable for fast searches and
1281 small enough that its feasable to store for the duration of a program.
1282
1283 See the comment in the code where the compressed table is produced
1284 inplace from the flat tabe representation for an explanation of how
1285 the compression works.
1286
1287 */
1288
1289
3dab1dad 1290 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
a3621e74
YO
1291 /*
1292 Second Pass -- Array Of Lists Representation
1293
1294 Each state will be represented by a list of charid:state records
1295 (reg_trie_trans_le) the first such element holds the CUR and LEN
1296 points of the allocated array. (See defines above).
1297
1298 We build the initial structure using the lists, and then convert
1299 it into the compressed table form which allows faster lookups
1300 (but cant be modified once converted).
a3621e74
YO
1301 */
1302
a3621e74
YO
1303 STRLEN transcount = 1;
1304
3dab1dad 1305 Newxz( trie->states, TRIE_CHARCOUNT(trie) + 2, reg_trie_state );
a3621e74
YO
1306 TRIE_LIST_NEW(1);
1307 next_alloc = 2;
1308
1309 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1310
c445ea15
AL
1311 regnode * const noper = NEXTOPER( cur );
1312 U8 *uc = (U8*)STRING( noper );
1313 const U8 * const e = uc + STR_LEN( noper );
1314 U32 state = 1; /* required init */
1315 U16 charid = 0; /* sanity init */
1316 U8 *scan = (U8*)NULL; /* sanity init */
1317 STRLEN foldlen = 0; /* required init */
07be1b83 1318 U32 wordlen = 0; /* required init */
c445ea15
AL
1319 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1320
3dab1dad 1321 if (OP(noper) != NOTHING) {
c445ea15
AL
1322 for ( ; uc < e ; uc += len ) {
1323
1324 TRIE_READ_CHAR;
1325
1326 if ( uvc < 256 ) {
1327 charid = trie->charmap[ uvc ];
1328 } else {
1329 SV** const svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 0);
1330 if ( !svpp ) {
1331 charid = 0;
1332 } else {
1333 charid=(U16)SvIV( *svpp );
1334 }
1335 }
1336 if ( charid ) {
a3621e74 1337
c445ea15
AL
1338 U16 check;
1339 U32 newstate = 0;
a3621e74 1340
c445ea15
AL
1341 charid--;
1342 if ( !trie->states[ state ].trans.list ) {
1343 TRIE_LIST_NEW( state );
1344 }
1345 for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1346 if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1347 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1348 break;
1349 }
1350 }
1351 if ( ! newstate ) {
1352 newstate = next_alloc++;
1353 TRIE_LIST_PUSH( state, charid, newstate );
1354 transcount++;
1355 }
1356 state = newstate;
1357 } else {
1358 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
a28509cc 1359 }
c445ea15
AL
1360 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1361 }
c445ea15 1362 }
3dab1dad 1363 TRIE_HANDLE_WORD(state);
a3621e74
YO
1364
1365 } /* end second pass */
1366
3dab1dad 1367 TRIE_LASTSTATE(trie) = next_alloc;
a3621e74
YO
1368 Renew( trie->states, next_alloc, reg_trie_state );
1369
3dab1dad
YO
1370 /* and now dump it out before we compress it */
1371 DEBUG_TRIE_COMPILE_MORE_r(
1372 dump_trie_interim_list(trie,next_alloc,depth+1)
a3621e74 1373 );
a3621e74 1374
a02a5408 1375 Newxz( trie->trans, transcount ,reg_trie_trans );
a3621e74
YO
1376 {
1377 U32 state;
a3621e74
YO
1378 U32 tp = 0;
1379 U32 zp = 0;
1380
1381
1382 for( state=1 ; state < next_alloc ; state ++ ) {
1383 U32 base=0;
1384
1385 /*
1386 DEBUG_TRIE_COMPILE_MORE_r(
1387 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1388 );
1389 */
1390
1391 if (trie->states[state].trans.list) {
1392 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1393 U16 maxid=minid;
a28509cc 1394 U16 idx;
a3621e74
YO
1395
1396 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
c445ea15
AL
1397 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1398 if ( forid < minid ) {
1399 minid=forid;
1400 } else if ( forid > maxid ) {
1401 maxid=forid;
1402 }
a3621e74
YO
1403 }
1404 if ( transcount < tp + maxid - minid + 1) {
1405 transcount *= 2;
1406 Renew( trie->trans, transcount, reg_trie_trans );
1407 Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1408 }
1409 base = trie->uniquecharcount + tp - minid;
1410 if ( maxid == minid ) {
1411 U32 set = 0;
1412 for ( ; zp < tp ; zp++ ) {
1413 if ( ! trie->trans[ zp ].next ) {
1414 base = trie->uniquecharcount + zp - minid;
1415 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1416 trie->trans[ zp ].check = state;
1417 set = 1;
1418 break;
1419 }
1420 }
1421 if ( !set ) {
1422 trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1423 trie->trans[ tp ].check = state;
1424 tp++;
1425 zp = tp;
1426 }
1427 } else {
1428 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
c445ea15 1429 const U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
a3621e74
YO
1430 trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1431 trie->trans[ tid ].check = state;
1432 }
1433 tp += ( maxid - minid + 1 );
1434 }
1435 Safefree(trie->states[ state ].trans.list);
1436 }
1437 /*
1438 DEBUG_TRIE_COMPILE_MORE_r(
1439 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1440 );
1441 */
1442 trie->states[ state ].trans.base=base;
1443 }
cc601c31 1444 trie->lasttrans = tp + 1;
a3621e74
YO
1445 }
1446 } else {
1447 /*
1448 Second Pass -- Flat Table Representation.
1449
1450 we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1451 We know that we will need Charcount+1 trans at most to store the data
1452 (one row per char at worst case) So we preallocate both structures
1453 assuming worst case.
1454
1455 We then construct the trie using only the .next slots of the entry
1456 structs.
1457
1458 We use the .check field of the first entry of the node temporarily to
1459 make compression both faster and easier by keeping track of how many non
1460 zero fields are in the node.
1461
1462 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1463 transition.
1464
1465 There are two terms at use here: state as a TRIE_NODEIDX() which is a
1466 number representing the first entry of the node, and state as a
1467 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1468 TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1469 are 2 entrys per node. eg:
1470
1471 A B A B
1472 1. 2 4 1. 3 7
1473 2. 0 3 3. 0 5
1474 3. 0 0 5. 0 0
1475 4. 0 0 7. 0 0
1476
1477 The table is internally in the right hand, idx form. However as we also
1478 have to deal with the states array which is indexed by nodenum we have to
1479 use TRIE_NODENUM() to convert.
1480
1481 */
1482
3dab1dad
YO
1483
1484 Newxz( trie->trans, ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1,
a3621e74 1485 reg_trie_trans );
3dab1dad 1486 Newxz( trie->states, TRIE_CHARCOUNT(trie) + 2, reg_trie_state );
a3621e74
YO
1487 next_alloc = trie->uniquecharcount + 1;
1488
3dab1dad 1489
a3621e74
YO
1490 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1491
c445ea15 1492 regnode * const noper = NEXTOPER( cur );
a28509cc
AL
1493 const U8 *uc = (U8*)STRING( noper );
1494 const U8 * const e = uc + STR_LEN( noper );
a3621e74
YO
1495
1496 U32 state = 1; /* required init */
1497
1498 U16 charid = 0; /* sanity init */
1499 U32 accept_state = 0; /* sanity init */
1500 U8 *scan = (U8*)NULL; /* sanity init */
1501
1502 STRLEN foldlen = 0; /* required init */
07be1b83 1503 U32 wordlen = 0; /* required init */
a3621e74
YO
1504 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1505
3dab1dad 1506 if ( OP(noper) != NOTHING ) {
a3621e74
YO
1507 for ( ; uc < e ; uc += len ) {
1508
1509 TRIE_READ_CHAR;
1510
1511 if ( uvc < 256 ) {
1512 charid = trie->charmap[ uvc ];
1513 } else {
c445ea15
AL
1514 SV* const * const svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 0);
1515 charid = svpp ? (U16)SvIV(*svpp) : 0;
a3621e74
YO
1516 }
1517 if ( charid ) {
1518 charid--;
1519 if ( !trie->trans[ state + charid ].next ) {
1520 trie->trans[ state + charid ].next = next_alloc;
1521 trie->trans[ state ].check++;
1522 next_alloc += trie->uniquecharcount;
1523 }
1524 state = trie->trans[ state + charid ].next;
1525 } else {
e4584336 1526 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
a3621e74
YO
1527 }
1528 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1529 }
a3621e74 1530 }
3dab1dad
YO
1531 accept_state = TRIE_NODENUM( state );
1532 TRIE_HANDLE_WORD(accept_state);
a3621e74
YO
1533
1534 } /* end second pass */
1535
3dab1dad
YO
1536 /* and now dump it out before we compress it */
1537 DEBUG_TRIE_COMPILE_MORE_r(
1538 dump_trie_interim_table(trie,next_alloc,depth+1)
1539 );
a3621e74 1540
a3621e74
YO
1541 {
1542 /*
1543 * Inplace compress the table.*
1544
1545 For sparse data sets the table constructed by the trie algorithm will
1546 be mostly 0/FAIL transitions or to put it another way mostly empty.
1547 (Note that leaf nodes will not contain any transitions.)
1548
1549 This algorithm compresses the tables by eliminating most such
1550 transitions, at the cost of a modest bit of extra work during lookup:
1551
1552 - Each states[] entry contains a .base field which indicates the
1553 index in the state[] array wheres its transition data is stored.
1554
1555 - If .base is 0 there are no valid transitions from that node.
1556
1557 - If .base is nonzero then charid is added to it to find an entry in
1558 the trans array.
1559
1560 -If trans[states[state].base+charid].check!=state then the
1561 transition is taken to be a 0/Fail transition. Thus if there are fail
1562 transitions at the front of the node then the .base offset will point
1563 somewhere inside the previous nodes data (or maybe even into a node
1564 even earlier), but the .check field determines if the transition is
1565 valid.
1566
1567 The following process inplace converts the table to the compressed
1568 table: We first do not compress the root node 1,and mark its all its
1569 .check pointers as 1 and set its .base pointer as 1 as well. This
1570 allows to do a DFA construction from the compressed table later, and
1571 ensures that any .base pointers we calculate later are greater than
1572 0.
1573
1574 - We set 'pos' to indicate the first entry of the second node.
1575
1576 - We then iterate over the columns of the node, finding the first and
1577 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
1578 and set the .check pointers accordingly, and advance pos
1579 appropriately and repreat for the next node. Note that when we copy
1580 the next pointers we have to convert them from the original
1581 NODEIDX form to NODENUM form as the former is not valid post
1582 compression.
1583
1584 - If a node has no transitions used we mark its base as 0 and do not
1585 advance the pos pointer.
1586
1587 - If a node only has one transition we use a second pointer into the
1588 structure to fill in allocated fail transitions from other states.
1589 This pointer is independent of the main pointer and scans forward
1590 looking for null transitions that are allocated to a state. When it
1591 finds one it writes the single transition into the "hole". If the
1592 pointer doesnt find one the single transition is appeneded as normal.
1593
1594 - Once compressed we can Renew/realloc the structures to release the
1595 excess space.
1596
1597 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
1598 specifically Fig 3.47 and the associated pseudocode.
1599
1600 demq
1601 */
a3b680e6 1602 const U32 laststate = TRIE_NODENUM( next_alloc );
a28509cc 1603 U32 state, charid;
a3621e74 1604 U32 pos = 0, zp=0;
3dab1dad 1605 TRIE_LASTSTATE(trie) = laststate;
a3621e74
YO
1606
1607 for ( state = 1 ; state < laststate ; state++ ) {
1608 U8 flag = 0;
a28509cc
AL
1609 const U32 stateidx = TRIE_NODEIDX( state );
1610 const U32 o_used = trie->trans[ stateidx ].check;
1611 U32 used = trie->trans[ stateidx ].check;
a3621e74
YO
1612 trie->trans[ stateidx ].check = 0;
1613
1614 for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
1615 if ( flag || trie->trans[ stateidx + charid ].next ) {
1616 if ( trie->trans[ stateidx + charid ].next ) {
1617 if (o_used == 1) {
1618 for ( ; zp < pos ; zp++ ) {
1619 if ( ! trie->trans[ zp ].next ) {
1620 break;
1621 }
1622 }
1623 trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
1624 trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1625 trie->trans[ zp ].check = state;
1626 if ( ++zp > pos ) pos = zp;
1627 break;
1628 }
1629 used--;
1630 }
1631 if ( !flag ) {
1632 flag = 1;
1633 trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
1634 }
1635 trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1636 trie->trans[ pos ].check = state;
1637 pos++;
1638 }
1639 }
1640 }
cc601c31 1641 trie->lasttrans = pos + 1;
a3621e74
YO
1642 Renew( trie->states, laststate + 1, reg_trie_state);
1643 DEBUG_TRIE_COMPILE_MORE_r(
e4584336 1644 PerlIO_printf( Perl_debug_log,
3dab1dad
YO
1645 "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
1646 (int)depth * 2 + 2,"",
1647 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
5d7488b2
AL
1648 (IV)next_alloc,
1649 (IV)pos,
a3621e74
YO
1650 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
1651 );
1652
1653 } /* end table compress */
1654 }
cc601c31
YO
1655 /* resize the trans array to remove unused space */
1656 Renew( trie->trans, trie->lasttrans, reg_trie_trans);
a3621e74 1657
3dab1dad
YO
1658 /* and now dump out the compressed format */
1659 DEBUG_TRIE_COMPILE_r(
1660 dump_trie(trie,depth+1)
1661 );
07be1b83 1662
3dab1dad
YO
1663 { /* Modify the program and insert the new TRIE node*/
1664 regnode *convert;
1665 U8 nodetype =(U8)(flags & 0xFF);
1666 char *str=NULL;
07be1b83
YO
1667#ifdef DEBUGGING
1668 U32 mjd_offset;
1669 U32 mjd_nodelen;
1670#endif
a3621e74 1671 /*
3dab1dad
YO
1672 This means we convert either the first branch or the first Exact,
1673 depending on whether the thing following (in 'last') is a branch
1674 or not and whther first is the startbranch (ie is it a sub part of
1675 the alternation or is it the whole thing.)
1676 Assuming its a sub part we conver the EXACT otherwise we convert
1677 the whole branch sequence, including the first.
a3621e74 1678 */
3dab1dad
YO
1679 /* Find the node we are going to overwrite */
1680 if ( first == startbranch && OP( last ) != BRANCH ) {
07be1b83 1681 /* whole branch chain */
3dab1dad 1682 convert = first;
07be1b83
YO
1683 DEBUG_r({
1684 const regnode *nop = NEXTOPER( convert );
1685 mjd_offset= Node_Offset((nop));
1686 mjd_nodelen= Node_Length((nop));
1687 });
1688 } else {
1689 /* branch sub-chain */
3dab1dad
YO
1690 convert = NEXTOPER( first );
1691 NEXT_OFF( first ) = (U16)(last - first);
07be1b83
YO
1692 DEBUG_r({
1693 mjd_offset= Node_Offset((convert));
1694 mjd_nodelen= Node_Length((convert));
1695 });
1696 }
1697 DEBUG_OPTIMISE_r(
1698 PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
1699 (int)depth * 2 + 2, "",
1700 mjd_offset,mjd_nodelen)
1701 );
a3621e74 1702
3dab1dad
YO
1703 /* But first we check to see if there is a common prefix we can
1704 split out as an EXACT and put in front of the TRIE node. */
1705 trie->startstate= 1;
1706 if ( trie->bitmap && !trie->widecharmap ) {
1707 U32 state;
1708 DEBUG_OPTIMISE_r(
8e11feef
RGS
1709 PerlIO_printf(Perl_debug_log, "%*sLaststate:%"UVuf"\n",
1710 (int)depth * 2 + 2, "",
1711 TRIE_LASTSTATE(trie))
1712 );
1713 for ( state = 1 ; state < TRIE_LASTSTATE(trie)-1 ; state++ ) {
a3621e74 1714 U32 ofs = 0;
8e11feef
RGS
1715 I32 idx = -1;
1716 U32 count = 0;
1717 const U32 base = trie->states[ state ].trans.base;
a3621e74 1718
3dab1dad 1719 if ( trie->states[state].wordnum )
8e11feef 1720 count = 1;
a3621e74 1721
8e11feef 1722 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
cc601c31
YO
1723 if ( ( base + ofs >= trie->uniquecharcount ) &&
1724 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
a3621e74
YO
1725 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1726 {
3dab1dad 1727 if ( ++count > 1 ) {
8e11feef 1728 SV **tmp = av_fetch( TRIE_REVCHARMAP(trie), ofs, 0);
07be1b83 1729 const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
8e11feef 1730 if ( state == 1 ) break;
3dab1dad
YO
1731 if ( count == 2 ) {
1732 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
1733 DEBUG_OPTIMISE_r(
8e11feef
RGS
1734 PerlIO_printf(Perl_debug_log,
1735 "%*sNew Start State=%"UVuf" Class: [",
1736 (int)depth * 2 + 2, "",
3dab1dad 1737 state));
be8e71aa
YO
1738 if (idx >= 0) {
1739 SV ** const tmp = av_fetch( TRIE_REVCHARMAP(trie), idx, 0);
1740 const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
8e11feef 1741
3dab1dad 1742 TRIE_BITMAP_SET(trie,*ch);
8e11feef
RGS
1743 if ( folder )
1744 TRIE_BITMAP_SET(trie, folder[ *ch ]);
3dab1dad 1745 DEBUG_OPTIMISE_r(
07be1b83 1746 PerlIO_printf(Perl_debug_log, (char*)ch)
3dab1dad 1747 );
8e11feef
RGS
1748 }
1749 }
1750 TRIE_BITMAP_SET(trie,*ch);
1751 if ( folder )
1752 TRIE_BITMAP_SET(trie,folder[ *ch ]);
1753 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
1754 }
1755 idx = ofs;
1756 }
3dab1dad
YO
1757 }
1758 if ( count == 1 ) {
1759 SV **tmp = av_fetch( TRIE_REVCHARMAP(trie), idx, 0);
8e11feef 1760 const char *ch = SvPV_nolen_const( *tmp );
3dab1dad 1761 DEBUG_OPTIMISE_r(
8e11feef
RGS
1762 PerlIO_printf( Perl_debug_log,
1763 "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
1764 (int)depth * 2 + 2, "",
1765 state, idx, ch)
3dab1dad
YO
1766 );
1767 if ( state==1 ) {
1768 OP( convert ) = nodetype;
1769 str=STRING(convert);
1770 STR_LEN(convert)=0;
1771 }
1772 *str++=*ch;
1773 STR_LEN(convert)++;
a3621e74 1774
8e11feef 1775 } else {
f9049ba1 1776#ifdef DEBUGGING
8e11feef
RGS
1777 if (state>1)
1778 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
f9049ba1 1779#endif
8e11feef
RGS
1780 break;
1781 }
1782 }
3dab1dad 1783 if (str) {
8e11feef 1784 regnode *n = convert+NODE_SZ_STR(convert);
07be1b83 1785 NEXT_OFF(convert) = NODE_SZ_STR(convert);
8e11feef 1786 trie->startstate = state;
07be1b83
YO
1787 trie->minlen -= (state - 1);
1788 trie->maxlen -= (state - 1);
1789 DEBUG_r({
1790 regnode *fix = convert;
1791 mjd_nodelen++;
1792 Set_Node_Offset_Length(convert, mjd_offset, state - 1);
1793 while( ++fix < n ) {
1794 Set_Node_Offset_Length(fix, 0, 0);
1795 }
1796 });
8e11feef
RGS
1797 if (trie->maxlen) {
1798 convert = n;
1799 } else {
3dab1dad
YO
1800 NEXT_OFF(convert) = (U16)(tail - convert);
1801 }
1802 }
1803 }
1804 if ( trie->maxlen ) {
1805 OP( convert ) = TRIE;
8e11feef
RGS
1806 NEXT_OFF( convert ) = (U16)(tail - convert);
1807 ARG_SET( convert, data_slot );
a3621e74 1808
3dab1dad
YO
1809 /* store the type in the flags */
1810 convert->flags = nodetype;
1811 /* XXX We really should free up the resource in trie now, as we wont use them */
1812 }
a3621e74
YO
1813 /* needed for dumping*/
1814 DEBUG_r({
1815 regnode *optimize = convert + NODE_STEP_REGNODE + regarglen[ TRIE ];
07be1b83
YO
1816 regnode *opt = convert;
1817 while (++opt<optimize) {
1818 Set_Node_Offset_Length(opt,0,0);
1819 }
a3621e74
YO
1820 /* We now need to mark all of the space originally used by the
1821 branches as optimized away. This keeps the dumpuntil from
1822 throwing a wobbly as it doesnt use regnext() to traverse the
1823 opcodes.
07be1b83 1824 We also "fix" the offsets
a3621e74
YO
1825 */
1826 while( optimize < last ) {
07be1b83 1827 mjd_nodelen += Node_Length((optimize));
a3621e74 1828 OP( optimize ) = OPTIMIZED;
07be1b83 1829 Set_Node_Offset_Length(optimize,0,0);
a3621e74
YO
1830 optimize++;
1831 }
07be1b83 1832 Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
a3621e74
YO
1833 });
1834 } /* end node insert */
07be1b83 1835#ifndef DEBUGGING
6e8b4190 1836 SvREFCNT_dec(TRIE_REVCHARMAP(trie));
07be1b83 1837#endif
a3621e74
YO
1838 return 1;
1839}
1840
a3621e74 1841/*
5d1c421c
JH
1842 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
1843 * These need to be revisited when a newer toolchain becomes available.
1844 */
1845#if defined(__sparc64__) && defined(__GNUC__)
1846# if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
1847# undef SPARC64_GCC_WORKAROUND
1848# define SPARC64_GCC_WORKAROUND 1
1849# endif
1850#endif
1851
07be1b83
YO
1852#define DEBUG_PEEP(str,scan,depth) \
1853 DEBUG_OPTIMISE_r({ \
1854 SV * const mysv=sv_newmortal(); \
1855 regnode *Next = regnext(scan); \
1856 regprop(RExC_rx, mysv, scan); \
1857 PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s [%d]\n", \
1858 (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
1859 Next ? (REG_NODE_NUM(Next)) : 0 ); \
1860 });
1861
1862#define JOIN_EXACT(scan,min,flags) \
1863 if (PL_regkind[OP(scan)] == EXACT) \
1864 join_exact(pRExC_state,(scan),(min),(flags),NULL,depth+1)
1865
be8e71aa 1866STATIC U32
07be1b83
YO
1867S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, I32 *min, U32 flags,regnode *val, U32 depth) {
1868 /* Merge several consecutive EXACTish nodes into one. */
1869 regnode *n = regnext(scan);
1870 U32 stringok = 1;
1871 regnode *next = scan + NODE_SZ_STR(scan);
1872 U32 merged = 0;
1873 U32 stopnow = 0;
1874#ifdef DEBUGGING
1875 regnode *stop = scan;
f9049ba1
SP
1876#else
1877 PERL_UNUSED_ARG(flags);
1878 PERL_UNUSED_ARG(val);
1879 PERL_UNUSED_ARG(depth);
07be1b83
YO
1880#endif
1881 GET_RE_DEBUG_FLAGS_DECL;
1882 DEBUG_PEEP("join",scan,depth);
1883
1884 /* Skip NOTHING, merge EXACT*. */
1885 while (n &&
1886 ( PL_regkind[OP(n)] == NOTHING ||
1887 (stringok && (OP(n) == OP(scan))))
1888 && NEXT_OFF(n)
1889 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
1890
1891 if (OP(n) == TAIL || n > next)
1892 stringok = 0;
1893 if (PL_regkind[OP(n)] == NOTHING) {
1894
1895 DEBUG_PEEP("skip:",n,depth);
1896 NEXT_OFF(scan) += NEXT_OFF(n);
1897 next = n + NODE_STEP_REGNODE;
1898#ifdef DEBUGGING
1899 if (stringok)
1900 stop = n;
1901#endif
1902 n = regnext(n);
1903 }
1904 else if (stringok) {
1905 const int oldl = STR_LEN(scan);
1906 regnode * const nnext = regnext(n);
1907
1908 DEBUG_PEEP("merg",n,depth);
1909
1910 merged++;
1911 if (oldl + STR_LEN(n) > U8_MAX)
1912 break;
1913 NEXT_OFF(scan) += NEXT_OFF(n);
1914 STR_LEN(scan) += STR_LEN(n);
1915 next = n + NODE_SZ_STR(n);
1916 /* Now we can overwrite *n : */
1917 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
1918#ifdef DEBUGGING
1919 stop = next - 1;
1920#endif
1921 n = nnext;
1922 if (stopnow) break;
1923 }
1924
1925#ifdef EXPERIMENTAL_INPLACESCAN
1926 if (flags && !NEXT_OFF(n)) {
1927 DEBUG_PEEP("atch",val,depth);
1928 if (reg_off_by_arg[OP(n)]) {
1929 ARG_SET(n, val - n);
1930 }
1931 else {
1932 NEXT_OFF(n) = val - n;
1933 }
1934 stopnow=1;
1935 }
1936#endif
1937 }
1938
1939 if (UTF && ( OP(scan) == EXACTF ) && ( STR_LEN(scan) >= 6 ) ) {
1940 /*
1941 Two problematic code points in Unicode casefolding of EXACT nodes:
1942
1943 U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
1944 U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
1945
1946 which casefold to
1947
1948 Unicode UTF-8
1949
1950 U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
1951 U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
1952
1953 This means that in case-insensitive matching (or "loose matching",
1954 as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
1955 length of the above casefolded versions) can match a target string
1956 of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
1957 This would rather mess up the minimum length computation.
1958
1959 What we'll do is to look for the tail four bytes, and then peek
1960 at the preceding two bytes to see whether we need to decrease
1961 the minimum length by four (six minus two).
1962
1963 Thanks to the design of UTF-8, there cannot be false matches:
1964 A sequence of valid UTF-8 bytes cannot be a subsequence of
1965 another valid sequence of UTF-8 bytes.
1966
1967 */
1968 char * const s0 = STRING(scan), *s, *t;
1969 char * const s1 = s0 + STR_LEN(scan) - 1;
1970 char * const s2 = s1 - 4;
1971 const char t0[] = "\xcc\x88\xcc\x81";
1972 const char * const t1 = t0 + 3;
1973
1974 for (s = s0 + 2;
1975 s < s2 && (t = ninstr(s, s1, t0, t1));
1976 s = t + 4) {
1977 if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
1978 ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
1979 *min -= 4;
1980 }
1981 }
1982
1983#ifdef DEBUGGING
1984 /* Allow dumping */
1985 n = scan + NODE_SZ_STR(scan);
1986 while (n <= stop) {
1987 if (PL_regkind[OP(n)] != NOTHING || OP(n) == NOTHING) {
1988 OP(n) = OPTIMIZED;
1989 NEXT_OFF(n) = 0;
1990 }
1991 n++;
1992 }
1993#endif
1994 DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
1995 return stopnow;
1996}
1997
653099ff
GS
1998/* REx optimizer. Converts nodes into quickier variants "in place".
1999 Finds fixed substrings. */
2000
a0288114 2001/* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
c277df42
IZ
2002 to the position after last scanned or to NULL. */
2003
07be1b83
YO
2004
2005
76e3520e 2006STATIC I32
9a957fbc
AL
2007S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap,
2008 regnode *last, scan_data_t *data, U32 flags, U32 depth)
c277df42
IZ
2009 /* scanp: Start here (read-write). */
2010 /* deltap: Write maxlen-minlen here. */
2011 /* last: Stop before this one. */
2012{
97aff369 2013 dVAR;
c277df42
IZ
2014 I32 min = 0, pars = 0, code;
2015 regnode *scan = *scanp, *next;
2016 I32 delta = 0;
2017 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
aca2d497 2018 int is_inf_internal = 0; /* The studied chunk is infinite */
c277df42
IZ
2019 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
2020 scan_data_t data_fake;
653099ff 2021 struct regnode_charclass_class and_with; /* Valid if flags & SCF_DO_STCLASS_OR */
a3621e74
YO
2022 SV *re_trie_maxbuff = NULL;
2023
2024 GET_RE_DEBUG_FLAGS_DECL;
13a24bad
YO
2025#ifdef DEBUGGING
2026 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
2027#endif
b81d288d 2028
c277df42
IZ
2029 while (scan && OP(scan) != END && scan < last) {
2030 /* Peephole optimizer: */
07be1b83 2031 DEBUG_PEEP("Peep",scan,depth);
a3621e74 2032
07be1b83 2033 JOIN_EXACT(scan,&min,0);
a3621e74 2034
653099ff
GS
2035 /* Follow the next-chain of the current node and optimize
2036 away all the NOTHINGs from it. */
c277df42 2037 if (OP(scan) != CURLYX) {
a3b680e6 2038 const int max = (reg_off_by_arg[OP(scan)]
048cfca1
GS
2039 ? I32_MAX
2040 /* I32 may be smaller than U16 on CRAYs! */
2041 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
c277df42
IZ
2042 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
2043 int noff;
2044 regnode *n = scan;
b81d288d 2045
c277df42
IZ
2046 /* Skip NOTHING and LONGJMP. */
2047 while ((n = regnext(n))
3dab1dad 2048 && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
c277df42
IZ
2049 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
2050 && off + noff < max)
2051 off += noff;
2052 if (reg_off_by_arg[OP(scan)])
2053 ARG(scan) = off;
b81d288d 2054 else
c277df42
IZ
2055 NEXT_OFF(scan) = off;
2056 }
a3621e74 2057
07be1b83 2058
3dab1dad 2059
653099ff
GS
2060 /* The principal pseudo-switch. Cannot be a switch, since we
2061 look into several different things. */
b81d288d 2062 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
c277df42
IZ
2063 || OP(scan) == IFTHEN || OP(scan) == SUSPEND) {
2064 next = regnext(scan);
2065 code = OP(scan);
a3621e74 2066 /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
b81d288d
AB
2067
2068 if (OP(next) == code || code == IFTHEN || code == SUSPEND) {
c277df42 2069 I32 max1 = 0, min1 = I32_MAX, num = 0;
653099ff 2070 struct regnode_charclass_class accum;
d4c19fe8 2071 regnode * const startbranch=scan;
c277df42 2072
653099ff 2073 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
830247a4 2074 scan_commit(pRExC_state, data); /* Cannot merge strings after this. */
653099ff 2075 if (flags & SCF_DO_STCLASS)
830247a4 2076 cl_init_zero(pRExC_state, &accum);
a3621e74 2077
c277df42 2078 while (OP(scan) == code) {
830247a4 2079 I32 deltanext, minnext, f = 0, fake;
653099ff 2080 struct regnode_charclass_class this_class;
c277df42
IZ
2081
2082 num++;
2083 data_fake.flags = 0;
b81d288d 2084 if (data) {
2c2d71f5 2085 data_fake.whilem_c = data->whilem_c;
cb434fcc
IZ
2086 data_fake.last_closep = data->last_closep;
2087 }
2088 else
2089 data_fake.last_closep = &fake;
c277df42
IZ
2090 next = regnext(scan);
2091 scan = NEXTOPER(scan);
2092 if (code != BRANCH)
2093 scan = NEXTOPER(scan);
653099ff 2094 if (flags & SCF_DO_STCLASS) {
830247a4 2095 cl_init(pRExC_state, &this_class);
653099ff
GS
2096 data_fake.start_class = &this_class;
2097 f = SCF_DO_STCLASS_AND;
b81d288d 2098 }
e1901655
IZ
2099 if (flags & SCF_WHILEM_VISITED_POS)
2100 f |= SCF_WHILEM_VISITED_POS;
a3621e74 2101
653099ff 2102 /* we suppose the run is continuous, last=next...*/
830247a4 2103 minnext = study_chunk(pRExC_state, &scan, &deltanext,
a3621e74 2104 next, &data_fake, f,depth+1);
b81d288d 2105 if (min1 > minnext)
c277df42
IZ
2106 min1 = minnext;
2107 if (max1 < minnext + deltanext)
2108 max1 = minnext + deltanext;
2109 if (deltanext == I32_MAX)
aca2d497 2110 is_inf = is_inf_internal = 1;
c277df42
IZ
2111 scan = next;
2112 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
2113 pars++;
3dab1dad
YO
2114 if (data) {
2115 if (data_fake.flags & SF_HAS_EVAL)
07be1b83 2116 data->flags |= SF_HAS_EVAL;
2c2d71f5 2117 data->whilem_c = data_fake.whilem_c;
3dab1dad 2118 }
653099ff 2119 if (flags & SCF_DO_STCLASS)
830247a4 2120 cl_or(pRExC_state, &accum, &this_class);
b81d288d 2121 if (code == SUSPEND)
c277df42
IZ
2122 break;
2123 }
2124 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
2125 min1 = 0;
2126 if (flags & SCF_DO_SUBSTR) {
2127 data->pos_min += min1;
2128 data->pos_delta += max1 - min1;
2129 if (max1 != min1 || is_inf)
2130 data->longest = &(data->longest_float);
2131 }
2132 min += min1;
2133 delta += max1 - min1;
653099ff 2134 if (flags & SCF_DO_STCLASS_OR) {
830247a4 2135 cl_or(pRExC_state, data->start_class, &accum);
653099ff
GS
2136 if (min1) {
2137 cl_and(data->start_class, &and_with);
2138 flags &= ~SCF_DO_STCLASS;
2139 }
2140 }
2141 else if (flags & SCF_DO_STCLASS_AND) {
de0c8cb8
GS
2142 if (min1) {
2143 cl_and(data->start_class, &accum);
653099ff 2144 flags &= ~SCF_DO_STCLASS;
de0c8cb8
GS
2145 }
2146 else {
b81d288d 2147 /* Switch to OR mode: cache the old value of
de0c8cb8
GS
2148 * data->start_class */
2149 StructCopy(data->start_class, &and_with,
2150 struct regnode_charclass_class);
2151 flags &= ~SCF_DO_STCLASS_AND;
2152 StructCopy(&accum, data->start_class,
2153 struct regnode_charclass_class);
2154 flags |= SCF_DO_STCLASS_OR;
2155 data->start_class->flags |= ANYOF_EOS;
2156 }
653099ff 2157 }
a3621e74
YO
2158
2159 /* demq.
2160
2161 Assuming this was/is a branch we are dealing with: 'scan' now
2162 points at the item that follows the branch sequence, whatever
2163 it is. We now start at the beginning of the sequence and look
2164 for subsequences of
2165
2166 BRANCH->EXACT=>X
2167 BRANCH->EXACT=>X
2168
2169 which would be constructed from a pattern like /A|LIST|OF|WORDS/
2170
2171 If we can find such a subseqence we need to turn the first
2172 element into a trie and then add the subsequent branch exact
2173 strings to the trie.
2174
2175 We have two cases
2176
2177 1. patterns where the whole set of branch can be converted to a trie,
2178
2179 2. patterns where only a subset of the alternations can be
2180 converted to a trie.
2181
2182 In case 1 we can replace the whole set with a single regop
2183 for the trie. In case 2 we need to keep the start and end
2184 branchs so
2185
2186 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
2187 becomes BRANCH TRIE; BRANCH X;
2188
2189 Hypthetically when we know the regex isnt anchored we can
2190 turn a case 1 into a DFA and let it rip... Every time it finds a match
2191 it would just call its tail, no WHILEM/CURLY needed.
2192
2193 */
07be1b83 2194 if (PERL_ENABLE_TRIE_OPTIMISATION) {
3dab1dad 2195 int made=0;
0111c4fd
RGS
2196 if (!re_trie_maxbuff) {
2197 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
2198 if (!SvIOK(re_trie_maxbuff))
2199 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2200 }
a3621e74
YO
2201 if ( SvIV(re_trie_maxbuff)>=0 && OP( startbranch )==BRANCH ) {
2202 regnode *cur;
2203 regnode *first = (regnode *)NULL;
2204 regnode *last = (regnode *)NULL;
2205 regnode *tail = scan;
2206 U8 optype = 0;
2207 U32 count=0;
2208
2209#ifdef DEBUGGING
c445ea15 2210 SV * const mysv = sv_newmortal(); /* for dumping */
a3621e74
YO
2211#endif
2212 /* var tail is used because there may be a TAIL
2213 regop in the way. Ie, the exacts will point to the
2214 thing following the TAIL, but the last branch will
2215 point at the TAIL. So we advance tail. If we
2216 have nested (?:) we may have to move through several
2217 tails.
2218 */
2219
2220 while ( OP( tail ) == TAIL ) {
2221 /* this is the TAIL generated by (?:) */
2222 tail = regnext( tail );
2223 }
2224
3dab1dad 2225
a3621e74 2226 DEBUG_OPTIMISE_r({
32fc9b6a 2227 regprop(RExC_rx, mysv, tail );
3dab1dad
YO
2228 PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
2229 (int)depth * 2 + 2, "",
2230 "Looking for TRIE'able sequences. Tail node is: ",
2231 SvPV_nolen_const( mysv )
a3621e74
YO
2232 );
2233 });
3dab1dad 2234
a3621e74
YO
2235 /*
2236
2237 step through the branches, cur represents each
2238 branch, noper is the first thing to be matched
2239 as part of that branch and noper_next is the
2240 regnext() of that node. if noper is an EXACT
2241 and noper_next is the same as scan (our current
2242 position in the regex) then the EXACT branch is
2243 a possible optimization target. Once we have
2244 two or more consequetive such branches we can
2245 create a trie of the EXACT's contents and stich
2246 it in place. If the sequence represents all of
2247 the branches we eliminate the whole thing and
2248 replace it with a single TRIE. If it is a
2249 subsequence then we need to stitch it in. This
2250 means the first branch has to remain, and needs
2251 to be repointed at the item on the branch chain
2252 following the last branch optimized. This could
2253 be either a BRANCH, in which case the
2254 subsequence is internal, or it could be the
2255 item following the branch sequence in which
2256 case the subsequence is at the end.
2257
2258 */
2259
2260 /* dont use tail as the end marker for this traverse */
2261 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
aec46f14
AL
2262 regnode * const noper = NEXTOPER( cur );
2263 regnode * const noper_next = regnext( noper );
a3621e74 2264
a3621e74 2265 DEBUG_OPTIMISE_r({
32fc9b6a 2266 regprop(RExC_rx, mysv, cur);
3dab1dad
YO
2267 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
2268 (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
a3621e74 2269
32fc9b6a 2270 regprop(RExC_rx, mysv, noper);
a3621e74 2271 PerlIO_printf( Perl_debug_log, " -> %s",
cfd0369c 2272 SvPV_nolen_const(mysv));
a3621e74
YO
2273
2274 if ( noper_next ) {
32fc9b6a 2275 regprop(RExC_rx, mysv, noper_next );
a3621e74 2276 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
cfd0369c 2277 SvPV_nolen_const(mysv));
a3621e74 2278 }
3dab1dad
YO
2279 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d)\n",
2280 REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur) );
a3621e74 2281 });
3dab1dad
YO
2282 if ( (((first && optype!=NOTHING) ? OP( noper ) == optype
2283 : PL_regkind[ OP( noper ) ] == EXACT )
2284 || OP(noper) == NOTHING )
a3621e74
YO
2285 && noper_next == tail && count<U16_MAX)
2286 {
2287 count++;
3dab1dad
YO
2288 if ( !first || optype == NOTHING ) {
2289 if (!first) first = cur;
a3621e74
YO
2290 optype = OP( noper );
2291 } else {
a3621e74 2292 last = cur;
a3621e74
YO
2293 }
2294 } else {
2295 if ( last ) {
3dab1dad 2296 made+=make_trie( pRExC_state, startbranch, first, cur, tail, optype, depth+1 );
a3621e74 2297 }
3dab1dad 2298 if ( PL_regkind[ OP( noper ) ] == EXACT
a3621e74
YO
2299 && noper_next == tail )
2300 {
2301 count = 1;
2302 first = cur;
2303 optype = OP( noper );
2304 } else {
2305 count = 0;
2306 first = NULL;
2307 optype = 0;
2308 }
2309 last = NULL;
2310 }
2311 }
2312 DEBUG_OPTIMISE_r({
32fc9b6a 2313 regprop(RExC_rx, mysv, cur);
a3621e74 2314 PerlIO_printf( Perl_debug_log,
3dab1dad
YO
2315 "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
2316 "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
a3621e74
YO
2317
2318 });
2319 if ( last ) {
3dab1dad
YO
2320 made+= make_trie( pRExC_state, startbranch, first, scan, tail, optype, depth+1 );
2321#ifdef TRIE_STUDY_OPT
07be1b83
YO
2322 if ( made && startbranch == first ) {
2323 if ( OP(first)!=TRIE )
2324 flags |= SCF_EXACT_TRIE;
2325 else {
2326 regnode *chk=*scanp;
2327 while ( OP( chk ) == OPEN )
2328 chk = regnext( chk );
2329 if (chk==first)
2330 flags |= SCF_EXACT_TRIE;
2331 }
2332 }
3dab1dad 2333#endif
07be1b83 2334 }
a3621e74 2335 }
3dab1dad
YO
2336
2337 } /* do trie */
a0ed51b3 2338 }
a3621e74 2339 else if ( code == BRANCHJ ) { /* single branch is optimized. */
c277df42 2340 scan = NEXTOPER(NEXTOPER(scan));
a3621e74 2341 } else /* single branch is optimized. */
c277df42
IZ
2342 scan = NEXTOPER(scan);
2343 continue;
a0ed51b3
LW
2344 }
2345 else if (OP(scan) == EXACT) {
cd439c50 2346 I32 l = STR_LEN(scan);
c445ea15 2347 UV uc;
a0ed51b3 2348 if (UTF) {
a3b680e6 2349 const U8 * const s = (U8*)STRING(scan);
1aa99e6b 2350 l = utf8_length(s, s + l);
9041c2e3 2351 uc = utf8_to_uvchr(s, NULL);
c445ea15
AL
2352 } else {
2353 uc = *((U8*)STRING(scan));
a0ed51b3
LW
2354 }
2355 min += l;
c277df42 2356 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
c277df42
IZ
2357 /* The code below prefers earlier match for fixed
2358 offset, later match for variable offset. */
2359 if (data->last_end == -1) { /* Update the start info. */
2360 data->last_start_min = data->pos_min;
2361 data->last_start_max = is_inf
b81d288d 2362 ? I32_MAX : data->pos_min + data->pos_delta;
c277df42 2363 }
cd439c50 2364 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
45f47268
NC
2365 if (UTF)
2366 SvUTF8_on(data->last_found);
0eda9292 2367 {
9a957fbc 2368 SV * const sv = data->last_found;
a28509cc 2369 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
0eda9292
JH
2370 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2371 if (mg && mg->mg_len >= 0)
5e43f467
JH
2372 mg->mg_len += utf8_length((U8*)STRING(scan),
2373 (U8*)STRING(scan)+STR_LEN(scan));
0eda9292 2374 }
c277df42
IZ
2375 data->last_end = data->pos_min + l;
2376 data->pos_min += l; /* As in the first entry. */
2377 data->flags &= ~SF_BEFORE_EOL;
2378 }
653099ff
GS
2379 if (flags & SCF_DO_STCLASS_AND) {
2380 /* Check whether it is compatible with what we know already! */
2381 int compat = 1;
2382
1aa99e6b 2383 if (uc >= 0x100 ||
516a5887 2384 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
1aa99e6b 2385 && !ANYOF_BITMAP_TEST(data->start_class, uc)
653099ff 2386 && (!(data->start_class->flags & ANYOF_FOLD)
1aa99e6b 2387 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
516a5887 2388 )
653099ff
GS
2389 compat = 0;
2390 ANYOF_CLASS_ZERO(data->start_class);
2391 ANYOF_BITMAP_ZERO(data->start_class);
2392 if (compat)
1aa99e6b 2393 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff 2394 data->start_class->flags &= ~ANYOF_EOS;
9b877dbb
IH
2395 if (uc < 0x100)
2396 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
653099ff
GS
2397 }
2398 else if (flags & SCF_DO_STCLASS_OR) {
2399 /* false positive possible if the class is case-folded */
1aa99e6b 2400 if (uc < 0x100)
9b877dbb
IH
2401 ANYOF_BITMAP_SET(data->start_class, uc);
2402 else
2403 data->start_class->flags |= ANYOF_UNICODE_ALL;
653099ff
GS
2404 data->start_class->flags &= ~ANYOF_EOS;
2405 cl_and(data->start_class, &and_with);
2406 }
2407 flags &= ~SCF_DO_STCLASS;
a0ed51b3 2408 }
3dab1dad 2409 else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
cd439c50 2410 I32 l = STR_LEN(scan);
1aa99e6b 2411 UV uc = *((U8*)STRING(scan));
653099ff
GS
2412
2413 /* Search for fixed substrings supports EXACT only. */
ecaa9b9c
NC
2414 if (flags & SCF_DO_SUBSTR) {
2415 assert(data);
830247a4 2416 scan_commit(pRExC_state, data);
ecaa9b9c 2417 }
a0ed51b3 2418 if (UTF) {
6136c704 2419 const U8 * const s = (U8 *)STRING(scan);
1aa99e6b 2420 l = utf8_length(s, s + l);
9041c2e3 2421 uc = utf8_to_uvchr(s, NULL);
a0ed51b3
LW
2422 }
2423 min += l;
ecaa9b9c 2424 if (flags & SCF_DO_SUBSTR)
a0ed51b3 2425 data->pos_min += l;
653099ff
GS
2426 if (flags & SCF_DO_STCLASS_AND) {
2427 /* Check whether it is compatible with what we know already! */
2428 int compat = 1;
2429
1aa99e6b 2430 if (uc >= 0x100 ||
516a5887 2431 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
1aa99e6b 2432 && !ANYOF_BITMAP_TEST(data->start_class, uc)
516a5887 2433 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
653099ff
GS
2434 compat = 0;
2435 ANYOF_CLASS_ZERO(data->start_class);
2436 ANYOF_BITMAP_ZERO(data->start_class);
2437 if (compat) {
1aa99e6b 2438 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff
GS
2439 data->start_class->flags &= ~ANYOF_EOS;
2440 data->start_class->flags |= ANYOF_FOLD;
2441 if (OP(scan) == EXACTFL)
2442 data->start_class->flags |= ANYOF_LOCALE;
2443 }
2444 }
2445 else if (flags & SCF_DO_STCLASS_OR) {
2446 if (data->start_class->flags & ANYOF_FOLD) {
2447 /* false positive possible if the class is case-folded.
2448 Assume that the locale settings are the same... */
1aa99e6b
IH
2449 if (uc < 0x100)
2450 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff
GS
2451 data->start_class->flags &= ~ANYOF_EOS;
2452 }
2453 cl_and(data->start_class, &and_with);
2454 }
2455 flags &= ~SCF_DO_STCLASS;
a0ed51b3 2456 }
3dab1dad
YO
2457#ifdef TRIE_STUDY_OPT
2458 else if (OP(scan) == TRIE) {
2459 reg_trie_data *trie=RExC_rx->data->data[ ARG(scan) ];
2460 min += trie->minlen;
07be1b83 2461 delta += (trie->maxlen - trie->minlen);
3dab1dad
YO
2462 flags &= ~SCF_DO_STCLASS; /* xxx */
2463 if (flags & SCF_DO_SUBSTR) {
2464 scan_commit(pRExC_state,data); /* Cannot expect anything... */
2465 data->pos_min += trie->minlen;
07be1b83
YO
2466 data->pos_delta += (trie->maxlen - trie->minlen);
2467 if (trie->maxlen != trie->minlen)
2468 data->longest = &(data->longest_float);
3dab1dad
YO
2469 }
2470 }
2471#endif
bfed75c6 2472 else if (strchr((const char*)PL_varies,OP(scan))) {
9c5ffd7c 2473 I32 mincount, maxcount, minnext, deltanext, fl = 0;
aa7a4b56 2474 I32 f = flags, pos_before = 0;
d4c19fe8 2475 regnode * const oscan = scan;
653099ff
GS
2476 struct regnode_charclass_class this_class;
2477 struct regnode_charclass_class *oclass = NULL;
727f22e3 2478 I32 next_is_eval = 0;
653099ff 2479
3dab1dad 2480 switch (PL_regkind[OP(scan)]) {
653099ff 2481 case WHILEM: /* End of (?:...)* . */
c277df42
IZ
2482 scan = NEXTOPER(scan);
2483 goto finish;
2484 case PLUS:
653099ff 2485 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
c277df42 2486 next = NEXTOPER(scan);
653099ff 2487 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
b81d288d
AB
2488 mincount = 1;
2489 maxcount = REG_INFTY;
c277df42
IZ
2490 next = regnext(scan);
2491 scan = NEXTOPER(scan);
2492 goto do_curly;
2493 }
2494 }
2495 if (flags & SCF_DO_SUBSTR)
2496 data->pos_min++;
2497 min++;
2498 /* Fall through. */
2499 case STAR:
653099ff
GS
2500 if (flags & SCF_DO_STCLASS) {
2501 mincount = 0;
b81d288d 2502 maxcount = REG_INFTY;
653099ff
GS
2503 next = regnext(scan);
2504 scan = NEXTOPER(scan);
2505 goto do_curly;
2506 }
b81d288d 2507 is_inf = is_inf_internal = 1;
c277df42
IZ
2508 scan = regnext(scan);
2509 if (flags & SCF_DO_SUBSTR) {
830247a4 2510 scan_commit(pRExC_state, data); /* Cannot extend fixed substrings */
c277df42
IZ
2511 data->longest = &(data->longest_float);
2512 }
2513 goto optimize_curly_tail;
2514 case CURLY:
b81d288d 2515 mincount = ARG1(scan);
c277df42
IZ
2516 maxcount = ARG2(scan);
2517 next = regnext(scan);
cb434fcc
IZ
2518 if (OP(scan) == CURLYX) {
2519 I32 lp = (data ? *(data->last_closep) : 0);
a3621e74 2520 scan->flags = ((lp <= U8_MAX) ? (U8)lp : U8_MAX);
cb434fcc 2521 }
c277df42 2522 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
727f22e3 2523 next_is_eval = (OP(scan) == EVAL);
c277df42
IZ
2524 do_curly:
2525 if (flags & SCF_DO_SUBSTR) {
830247a4 2526 if (mincount == 0) scan_commit(pRExC_state,data); /* Cannot extend fixed substrings */
c277df42
IZ
2527 pos_before = data->pos_min;
2528 }
2529 if (data) {
2530 fl = data->flags;
2531 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
2532 if (is_inf)
2533 data->flags |= SF_IS_INF;
2534 }
653099ff 2535 if (flags & SCF_DO_STCLASS) {
830247a4 2536 cl_init(pRExC_state, &this_class);
653099ff
GS
2537 oclass = data->start_class;
2538 data->start_class = &this_class;
2539 f |= SCF_DO_STCLASS_AND;
2540 f &= ~SCF_DO_STCLASS_OR;
2541 }
e1901655
IZ
2542 /* These are the cases when once a subexpression
2543 fails at a particular position, it cannot succeed
2544 even after backtracking at the enclosing scope.
b81d288d 2545
e1901655
IZ
2546 XXXX what if minimal match and we are at the
2547 initial run of {n,m}? */
2548 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
2549 f &= ~SCF_WHILEM_VISITED_POS;
653099ff 2550
c277df42 2551 /* This will finish on WHILEM, setting scan, or on NULL: */
b81d288d 2552 minnext = study_chunk(pRExC_state, &scan, &deltanext, last, data,
a3621e74
YO
2553 (mincount == 0
2554 ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
653099ff
GS
2555
2556 if (flags & SCF_DO_STCLASS)
2557 data->start_class = oclass;
2558 if (mincount == 0 || minnext == 0) {
2559 if (flags & SCF_DO_STCLASS_OR) {
830247a4 2560 cl_or(pRExC_state, data->start_class, &this_class);
653099ff
GS
2561 }
2562 else if (flags & SCF_DO_STCLASS_AND) {
b81d288d 2563 /* Switch to OR mode: cache the old value of
653099ff
GS
2564 * data->start_class */
2565 StructCopy(data->start_class, &and_with,
2566 struct regnode_charclass_class);
2567 flags &= ~SCF_DO_STCLASS_AND;
2568 StructCopy(&this_class, data->start_class,
2569 struct regnode_charclass_class);
2570 flags |= SCF_DO_STCLASS_OR;
2571 data->start_class->flags |= ANYOF_EOS;
2572 }
2573 } else { /* Non-zero len */
2574 if (flags & SCF_DO_STCLASS_OR) {
830247a4 2575 cl_or(pRExC_state, data->start_class, &this_class);
653099ff
GS
2576 cl_and(data->start_class, &and_with);
2577 }
2578 else if (flags & SCF_DO_STCLASS_AND)
2579 cl_and(data->start_class, &this_class);
2580 flags &= ~SCF_DO_STCLASS;
2581 }
c277df42
IZ
2582 if (!scan) /* It was not CURLYX, but CURLY. */
2583 scan = next;
041457d9
DM
2584 if ( /* ? quantifier ok, except for (?{ ... }) */
2585 (next_is_eval || !(mincount == 0 && maxcount == 1))
84037bb0 2586 && (minnext == 0) && (deltanext == 0)
99799961 2587 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
041457d9
DM
2588 && maxcount <= REG_INFTY/3 /* Complement check for big count */
2589 && ckWARN(WARN_REGEXP))
b45f050a 2590 {
830247a4 2591 vWARN(RExC_parse,
b45f050a
JF
2592 "Quantifier unexpected on zero-length expression");
2593 }
2594
c277df42 2595 min += minnext * mincount;
b81d288d 2596 is_inf_internal |= ((maxcount == REG_INFTY
155aba94
GS
2597 && (minnext + deltanext) > 0)
2598 || deltanext == I32_MAX);
aca2d497 2599 is_inf |= is_inf_internal;
c277df42
IZ
2600 delta += (minnext + deltanext) * maxcount - minnext * mincount;
2601
2602 /* Try powerful optimization CURLYX => CURLYN. */
b81d288d 2603 if ( OP(oscan) == CURLYX && data
c277df42
IZ
2604 && data->flags & SF_IN_PAR
2605 && !(data->flags & SF_HAS_EVAL)
2606 && !deltanext && minnext == 1 ) {
2607 /* Try to optimize to CURLYN. */
2608 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
d4c19fe8 2609 regnode * const nxt1 = nxt;
497b47a8
JH
2610#ifdef DEBUGGING
2611 regnode *nxt2;
2612#endif
c277df42
IZ
2613
2614 /* Skip open. */
2615 nxt = regnext(nxt);
bfed75c6 2616 if (!strchr((const char*)PL_simple,OP(nxt))
3dab1dad 2617 && !(PL_regkind[OP(nxt)] == EXACT
b81d288d 2618 && STR_LEN(nxt) == 1))
c277df42 2619 goto nogo;
497b47a8 2620#ifdef DEBUGGING
c277df42 2621 nxt2 = nxt;
497b47a8 2622#endif
c277df42 2623 nxt = regnext(nxt);
b81d288d 2624 if (OP(nxt) != CLOSE)
c277df42
IZ
2625 goto nogo;
2626 /* Now we know that nxt2 is the only contents: */
eb160463 2627 oscan->flags = (U8)ARG(nxt);
c277df42
IZ
2628 OP(oscan) = CURLYN;
2629 OP(nxt1) = NOTHING; /* was OPEN. */
2630#ifdef DEBUGGING
2631 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
2632 NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
2633 NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
2634 OP(nxt) = OPTIMIZED; /* was CLOSE. */
2635 OP(nxt + 1) = OPTIMIZED; /* was count. */
2636 NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
b81d288d 2637#endif
c277df42 2638 }
c277df42
IZ
2639 nogo:
2640
2641 /* Try optimization CURLYX => CURLYM. */
b81d288d 2642 if ( OP(oscan) == CURLYX && data
c277df42 2643 && !(data->flags & SF_HAS_PAR)
c277df42 2644 && !(data->flags & SF_HAS_EVAL)
0e788c72
HS
2645 && !deltanext /* atom is fixed width */
2646 && minnext != 0 /* CURLYM can't handle zero width */
2647 ) {
c277df42
IZ
2648 /* XXXX How to optimize if data == 0? */
2649 /* Optimize to a simpler form. */
2650 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
2651 regnode *nxt2;
2652
2653 OP(oscan) = CURLYM;
2654 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
b81d288d 2655 && (OP(nxt2) != WHILEM))
c277df42
IZ
2656 nxt = nxt2;
2657 OP(nxt2) = SUCCEED; /* Whas WHILEM */
c277df42
IZ
2658 /* Need to optimize away parenths. */
2659 if (data->flags & SF_IN_PAR) {
2660 /* Set the parenth number. */
2661 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
2662
b81d288d 2663 if (OP(nxt) != CLOSE)
b45f050a 2664 FAIL("Panic opt close");
eb160463 2665 oscan->flags = (U8)ARG(nxt);
c277df42
IZ
2666 OP(nxt1) = OPTIMIZED; /* was OPEN. */
2667 OP(nxt) = OPTIMIZED; /* was CLOSE. */
2668#ifdef DEBUGGING
2669 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
2670 OP(nxt + 1) = OPTIMIZED; /* was count. */
2671 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
2672 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
b81d288d 2673#endif
c277df42
IZ
2674#if 0
2675 while ( nxt1 && (OP(nxt1) != WHILEM)) {
2676 regnode *nnxt = regnext(nxt1);
b81d288d 2677
c277df42
IZ
2678 if (nnxt == nxt) {
2679 if (reg_off_by_arg[OP(nxt1)])
2680 ARG_SET(nxt1, nxt2 - nxt1);
2681 else if (nxt2 - nxt1 < U16_MAX)
2682 NEXT_OFF(nxt1) = nxt2 - nxt1;
2683 else
2684 OP(nxt) = NOTHING; /* Cannot beautify */
2685 }
2686 nxt1 = nnxt;
2687 }
2688#endif
2689 /* Optimize again: */
b81d288d 2690 study_chunk(pRExC_state, &nxt1, &deltanext, nxt,
a3621e74 2691 NULL, 0,depth+1);
a0ed51b3
LW
2692 }
2693 else
c277df42 2694 oscan->flags = 0;
c277df42 2695 }
e1901655
IZ
2696 else if ((OP(oscan) == CURLYX)
2697 && (flags & SCF_WHILEM_VISITED_POS)
2698 /* See the comment on a similar expression above.
2699 However, this time it not a subexpression
2700 we care about, but the expression itself. */
2701 && (maxcount == REG_INFTY)
2702 && data && ++data->whilem_c < 16) {
2703 /* This stays as CURLYX, we can put the count/of pair. */
2c2d71f5
JH
2704 /* Find WHILEM (as in regexec.c) */
2705 regnode *nxt = oscan + NEXT_OFF(oscan);
2706
2707 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
2708 nxt += ARG(nxt);
eb160463
GS
2709 PREVOPER(nxt)->flags = (U8)(data->whilem_c
2710 | (RExC_whilem_seen << 4)); /* On WHILEM */
2c2d71f5 2711 }
b81d288d 2712 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
c277df42
IZ
2713 pars++;
2714 if (flags & SCF_DO_SUBSTR) {
c445ea15 2715 SV *last_str = NULL;
c277df42
IZ
2716 int counted = mincount != 0;
2717
2718 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
5d1c421c
JH
2719#if defined(SPARC64_GCC_WORKAROUND)
2720 I32 b = 0;
2721 STRLEN l = 0;
cfd0369c 2722 const char *s = NULL;
5d1c421c
JH
2723 I32 old = 0;
2724
2725 if (pos_before >= data->last_start_min)
2726 b = pos_before;
2727 else
2728 b = data->last_start_min;
2729
2730 l = 0;
cfd0369c 2731 s = SvPV_const(data->last_found, l);
5d1c421c
JH
2732 old = b - data->last_start_min;
2733
2734#else
b81d288d 2735 I32 b = pos_before >= data->last_start_min
c277df42
IZ
2736 ? pos_before : data->last_start_min;
2737 STRLEN l;
d4c19fe8 2738 const char * const s = SvPV_const(data->last_found, l);
a0ed51b3 2739 I32 old = b - data->last_start_min;
5d1c421c 2740#endif
a0ed51b3
LW
2741
2742 if (UTF)
2743 old = utf8_hop((U8*)s, old) - (U8*)s;
c277df42 2744
a0ed51b3 2745 l -= old;
c277df42 2746 /* Get the added string: */
79cb57f6 2747 last_str = newSVpvn(s + old, l);
0e933229
IH
2748 if (UTF)
2749 SvUTF8_on(last_str);
c277df42
IZ
2750 if (deltanext == 0 && pos_before == b) {
2751 /* What was added is a constant string */
2752 if (mincount > 1) {
2753 SvGROW(last_str, (mincount * l) + 1);
b81d288d 2754 repeatcpy(SvPVX(last_str) + l,
3f7c398e 2755 SvPVX_const(last_str), l, mincount - 1);
b162af07 2756 SvCUR_set(last_str, SvCUR(last_str) * mincount);
c277df42 2757 /* Add additional parts. */
b81d288d 2758 SvCUR_set(data->last_found,
c277df42
IZ
2759 SvCUR(data->last_found) - l);
2760 sv_catsv(data->last_found, last_str);
0eda9292
JH
2761 {
2762 SV * sv = data->last_found;
2763 MAGIC *mg =
2764 SvUTF8(sv) && SvMAGICAL(sv) ?
2765 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2766 if (mg && mg->mg_len >= 0)
2767 mg->mg_len += CHR_SVLEN(last_str);
2768 }
c277df42
IZ
2769 data->last_end += l * (mincount - 1);
2770 }
2a8d9689
HS
2771 } else {
2772 /* start offset must point into the last copy */
2773 data->last_start_min += minnext * (mincount - 1);
c152dc43
HS
2774 data->last_start_max += is_inf ? I32_MAX
2775 : (maxcount - 1) * (minnext + data->pos_delta);
c277df42
IZ
2776 }
2777 }
2778 /* It is counted once already... */
2779 data->pos_min += minnext * (mincount - counted);
2780 data->pos_delta += - counted * deltanext +
2781 (minnext + deltanext) * maxcount - minnext * mincount;
2782 if (mincount != maxcount) {
653099ff
GS
2783 /* Cannot extend fixed substrings found inside
2784 the group. */
830247a4 2785 scan_commit(pRExC_state,data);
c277df42 2786 if (mincount && last_str) {
d4c19fe8
AL
2787 SV * const sv = data->last_found;
2788 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
45f47268
NC
2789 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2790
2791 if (mg)
2792 mg->mg_len = -1;
2793 sv_setsv(sv, last_str);
c277df42 2794 data->last_end = data->pos_min;
b81d288d 2795 data->last_start_min =
a0ed51b3 2796 data->pos_min - CHR_SVLEN(last_str);
b81d288d
AB
2797 data->last_start_max = is_inf
2798 ? I32_MAX
c277df42 2799 : data->pos_min + data->pos_delta
a0ed51b3 2800 - CHR_SVLEN(last_str);
c277df42
IZ
2801 }
2802 data->longest = &(data->longest_float);
2803 }
aca2d497 2804 SvREFCNT_dec(last_str);
c277df42 2805 }
405ff068 2806 if (data && (fl & SF_HAS_EVAL))
c277df42
IZ
2807 data->flags |= SF_HAS_EVAL;
2808 optimize_curly_tail:
c277df42 2809 if (OP(oscan) != CURLYX) {
3dab1dad 2810 while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
c277df42
IZ
2811 && NEXT_OFF(next))
2812 NEXT_OFF(oscan) += NEXT_OFF(next);
2813 }
c277df42 2814 continue;
653099ff 2815 default: /* REF and CLUMP only? */
c277df42 2816 if (flags & SCF_DO_SUBSTR) {
830247a4 2817 scan_commit(pRExC_state,data); /* Cannot expect anything... */
c277df42
IZ
2818 data->longest = &(data->longest_float);
2819 }
aca2d497 2820 is_inf = is_inf_internal = 1;
653099ff 2821 if (flags & SCF_DO_STCLASS_OR)
830247a4 2822 cl_anything(pRExC_state, data->start_class);
653099ff 2823 flags &= ~SCF_DO_STCLASS;
c277df42
IZ
2824 break;
2825 }
a0ed51b3 2826 }
bfed75c6 2827 else if (strchr((const char*)PL_simple,OP(scan))) {
9c5ffd7c 2828 int value = 0;
653099ff 2829
c277df42 2830 if (flags & SCF_DO_SUBSTR) {
830247a4 2831 scan_commit(pRExC_state,data);
c277df42
IZ
2832 data->pos_min++;
2833 }
2834 min++;
653099ff
GS
2835 if (flags & SCF_DO_STCLASS) {
2836 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
2837
2838 /* Some of the logic below assumes that switching
2839 locale on will only add false positives. */
3dab1dad 2840 switch (PL_regkind[OP(scan)]) {
653099ff 2841 case SANY:
653099ff
GS
2842 default:
2843 do_default:
2844 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
2845 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
830247a4 2846 cl_anything(pRExC_state, data->start_class);
653099ff
GS
2847 break;
2848 case REG_ANY:
2849 if (OP(scan) == SANY)
2850 goto do_default;
2851 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
2852 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
2853 || (data->start_class->flags & ANYOF_CLASS));
830247a4 2854 cl_anything(pRExC_state, data->start_class);
653099ff
GS
2855 }
2856 if (flags & SCF_DO_STCLASS_AND || !value)
2857 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
2858 break;
2859 case ANYOF:
2860 if (flags & SCF_DO_STCLASS_AND)
2861 cl_and(data->start_class,
2862 (struct regnode_charclass_class*)scan);
2863 else
830247a4 2864 cl_or(pRExC_state, data->start_class,
653099ff
GS
2865 (struct regnode_charclass_class*)scan);
2866 break;
2867 case ALNUM:
2868 if (flags & SCF_DO_STCLASS_AND) {
2869 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2870 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
2871 for (value = 0; value < 256; value++)
2872 if (!isALNUM(value))
2873 ANYOF_BITMAP_CLEAR(data->start_class, value);
2874 }
2875 }
2876 else {
2877 if (data->start_class->flags & ANYOF_LOCALE)
2878 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
2879 else {
2880 for (value = 0; value < 256; value++)
2881 if (isALNUM(value))
b81d288d 2882 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
2883 }
2884 }
2885 break;
2886 case ALNUML:
2887 if (flags & SCF_DO_STCLASS_AND) {
2888 if (data->start_class->flags & ANYOF_LOCALE)
2889 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
2890 }
2891 else {
2892 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
2893 data->start_class->flags |= ANYOF_LOCALE;
2894 }
2895 break;
2896 case NALNUM:
2897 if (flags & SCF_DO_STCLASS_AND) {
2898 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2899 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
2900 for (value = 0; value < 256; value++)
2901 if (isALNUM(value))
2902 ANYOF_BITMAP_CLEAR(data->start_class, value);
2903 }
2904 }
2905 else {
2906 if (data->start_class->flags & ANYOF_LOCALE)
2907 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
2908 else {
2909 for (value = 0; value < 256; value++)
2910 if (!isALNUM(value))
b81d288d 2911 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
2912 }
2913 }
2914 break;
2915 case NALNUML:
2916 if (flags & SCF_DO_STCLASS_AND) {
2917 if (data->start_class->flags & ANYOF_LOCALE)
2918 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
2919 }
2920 else {
2921 data->start_class->flags |= ANYOF_LOCALE;
2922 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
2923 }
2924 break;
2925 case SPACE:
2926 if (flags & SCF_DO_STCLASS_AND) {
2927 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2928 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
2929 for (value = 0; value < 256; value++)
2930 if (!isSPACE(value))
2931 ANYOF_BITMAP_CLEAR(data->start_class, value);
2932 }
2933 }
2934 else {
2935 if (data->start_class->flags & ANYOF_LOCALE)
2936 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
2937 else {
2938 for (value = 0; value < 256; value++)
2939 if (isSPACE(value))
b81d288d 2940 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
2941 }
2942 }
2943 break;
2944 case SPACEL:
2945 if (flags & SCF_DO_STCLASS_AND) {
2946 if (data->start_class->flags & ANYOF_LOCALE)
2947 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
2948 }
2949 else {
2950 data->start_class->flags |= ANYOF_LOCALE;
2951 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
2952 }
2953 break;
2954 case NSPACE:
2955 if (flags & SCF_DO_STCLASS_AND) {
2956 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2957 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
2958 for (value = 0; value < 256; value++)
2959 if (isSPACE(value))
2960 ANYOF_BITMAP_CLEAR(data->start_class, value);
2961 }
2962 }
2963 else {
2964 if (data->start_class->flags & ANYOF_LOCALE)
2965 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
2966 else {
2967 for (value = 0; value < 256; value++)
2968 if (!isSPACE(value))
b81d288d 2969 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
2970 }
2971 }
2972 break;
2973 case NSPACEL:
2974 if (flags & SCF_DO_STCLASS_AND) {
2975 if (data->start_class->flags & ANYOF_LOCALE) {
2976 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
2977 for (value = 0; value < 256; value++)
2978 if (!isSPACE(value))
2979 ANYOF_BITMAP_CLEAR(data->start_class, value);
2980 }
2981 }
2982 else {
2983 data->start_class->flags |= ANYOF_LOCALE;
2984 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
2985 }
2986 break;
2987 case DIGIT:
2988 if (flags & SCF_DO_STCLASS_AND) {
2989 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
2990 for (value = 0; value < 256; value++)
2991 if (!isDIGIT(value))
2992 ANYOF_BITMAP_CLEAR(data->start_class, value);
2993 }
2994 else {
2995 if (data->start_class->flags & ANYOF_LOCALE)
2996 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
2997 else {
2998 for (value = 0; value < 256; value++)
2999 if (isDIGIT(value))
b81d288d 3000 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
3001 }
3002 }
3003 break;
3004 case NDIGIT:
3005 if (flags & SCF_DO_STCLASS_AND) {
3006 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
3007 for (value = 0; value < 256; value++)
3008 if (isDIGIT(value))
3009 ANYOF_BITMAP_CLEAR(data->start_class, value);
3010 }
3011 else {
3012 if (data->start_class->flags & ANYOF_LOCALE)
3013 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
3014 else {
3015 for (value = 0; value < 256; value++)
3016 if (!isDIGIT(value))
b81d288d 3017 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
3018 }
3019 }
3020 break;
3021 }
3022 if (flags & SCF_DO_STCLASS_OR)
3023 cl_and(data->start_class, &and_with);
3024 flags &= ~SCF_DO_STCLASS;
3025 }
a0ed51b3 3026 }
3dab1dad 3027 else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
c277df42
IZ
3028 data->flags |= (OP(scan) == MEOL
3029 ? SF_BEFORE_MEOL
3030 : SF_BEFORE_SEOL);
a0ed51b3 3031 }
3dab1dad 3032 else if ( PL_regkind[OP(scan)] == BRANCHJ
653099ff
GS
3033 /* Lookbehind, or need to calculate parens/evals/stclass: */
3034 && (scan->flags || data || (flags & SCF_DO_STCLASS))
c277df42 3035 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
653099ff 3036 /* Lookahead/lookbehind */
cb434fcc 3037 I32 deltanext, minnext, fake = 0;
c277df42 3038 regnode *nscan;
653099ff
GS
3039 struct regnode_charclass_class intrnl;
3040 int f = 0;
c277df42
IZ
3041
3042 data_fake.flags = 0;
b81d288d 3043 if (data) {
2c2d71f5 3044 data_fake.whilem_c = data->whilem_c;
cb434fcc
IZ
3045 data_fake.last_closep = data->last_closep;
3046 }
3047 else
3048 data_fake.last_closep = &fake;
653099ff
GS
3049 if ( flags & SCF_DO_STCLASS && !scan->flags
3050 && OP(scan) == IFMATCH ) { /* Lookahead */
830247a4 3051 cl_init(pRExC_state, &intrnl);
653099ff 3052 data_fake.start_class = &intrnl;
e1901655 3053 f |= SCF_DO_STCLASS_AND;
653099ff 3054 }
e1901655
IZ
3055 if (flags & SCF_WHILEM_VISITED_POS)
3056 f |= SCF_WHILEM_VISITED_POS;
c277df42
IZ
3057 next = regnext(scan);
3058 nscan = NEXTOPER(NEXTOPER(scan));
a3621e74 3059 minnext = study_chunk(pRExC_state, &nscan, &deltanext, last, &data_fake, f,depth+1);
c277df42
IZ
3060 if (scan->flags) {
3061 if (deltanext) {
9baa0206 3062 vFAIL("Variable length lookbehind not implemented");
a0ed51b3
LW
3063 }
3064 else if (minnext > U8_MAX) {
9baa0206 3065 vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
c277df42 3066 }
eb160463 3067 scan->flags = (U8)minnext;
c277df42 3068 }
be8e71aa
YO
3069 if (data) {
3070 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3071 pars++;
3072 if (data_fake.flags & SF_HAS_EVAL)
3073 data->flags |= SF_HAS_EVAL;
2c2d71f5 3074 data->whilem_c = data_fake.whilem_c;
be8e71aa 3075 }
e1901655 3076 if (f & SCF_DO_STCLASS_AND) {
a28509cc 3077 const int was = (data->start_class->flags & ANYOF_EOS);
653099ff
GS
3078
3079 cl_and(data->start_class, &intrnl);
3080 if (was)
3081 data->start_class->flags |= ANYOF_EOS;
3082 }
a0ed51b3
LW
3083 }
3084 else if (OP(scan) == OPEN) {
c277df42 3085 pars++;
a0ed51b3 3086 }
cb434fcc 3087 else if (OP(scan) == CLOSE) {
eb160463 3088 if ((I32)ARG(scan) == is_par) {
cb434fcc 3089 next = regnext(scan);
c277df42 3090
cb434fcc
IZ
3091 if ( next && (OP(next) != WHILEM) && next < last)
3092 is_par = 0; /* Disable optimization */
3093 }
3094 if (data)
3095 *(data->last_closep) = ARG(scan);
a0ed51b3
LW
3096 }
3097 else if (OP(scan) == EVAL) {
c277df42
IZ
3098 if (data)
3099 data->flags |= SF_HAS_EVAL;
3100 }
96776eda 3101 else if (OP(scan) == LOGICAL && scan->flags == 2) { /* Embedded follows */
0f5d15d6 3102 if (flags & SCF_DO_SUBSTR) {
830247a4 3103 scan_commit(pRExC_state,data);
0f5d15d6
IZ
3104 data->longest = &(data->longest_float);
3105 }
3106 is_inf = is_inf_internal = 1;
653099ff 3107 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
830247a4 3108 cl_anything(pRExC_state, data->start_class);
96776eda 3109 flags &= ~SCF_DO_STCLASS;
0f5d15d6 3110 }
c277df42
IZ
3111 /* Else: zero-length, ignore. */
3112 scan = regnext(scan);
3113 }
3114
3115 finish:
3116 *scanp = scan;
aca2d497 3117 *deltap = is_inf_internal ? I32_MAX : delta;
b81d288d 3118 if (flags & SCF_DO_SUBSTR && is_inf)
c277df42
IZ
3119 data->pos_delta = I32_MAX - data->pos_min;
3120 if (is_par > U8_MAX)
3121 is_par = 0;
3122 if (is_par && pars==1 && data) {
3123 data->flags |= SF_IN_PAR;
3124 data->flags &= ~SF_HAS_PAR;
a0ed51b3
LW
3125 }
3126 else if (pars && data) {
c277df42
IZ
3127 data->flags |= SF_HAS_PAR;
3128 data->flags &= ~SF_IN_PAR;
3129 }
653099ff
GS
3130 if (flags & SCF_DO_STCLASS_OR)
3131 cl_and(data->start_class, &and_with);
07be1b83
YO
3132 if (flags & SCF_EXACT_TRIE)
3133 data->flags |= SCF_EXACT_TRIE;
c277df42
IZ
3134 return min;
3135}
3136
76e3520e 3137STATIC I32
5f66b61c 3138S_add_data(RExC_state_t *pRExC_state, I32 n, const char *s)
c277df42 3139{
830247a4 3140 if (RExC_rx->data) {
b81d288d
AB
3141 Renewc(RExC_rx->data,
3142 sizeof(*RExC_rx->data) + sizeof(void*) * (RExC_rx->data->count + n - 1),
c277df42 3143 char, struct reg_data);
830247a4
IZ
3144 Renew(RExC_rx->data->what, RExC_rx->data->count + n, U8);
3145 RExC_rx->data->count += n;
a0ed51b3
LW
3146 }
3147 else {
a02a5408 3148 Newxc(RExC_rx->data, sizeof(*RExC_rx->data) + sizeof(void*) * (n - 1),
c277df42 3149 char, struct reg_data);
a02a5408 3150 Newx(RExC_rx->data->what, n, U8);
830247a4 3151 RExC_rx->data->count = n;
c277df42 3152 }
830247a4
IZ
3153 Copy(s, RExC_rx->data->what + RExC_rx->data->count - n, n, U8);
3154 return RExC_rx->data->count - n;
c277df42
IZ
3155}
3156
76234dfb 3157#ifndef PERL_IN_XSUB_RE
d88dccdf 3158void
864dbfa3 3159Perl_reginitcolors(pTHX)
d88dccdf 3160{
97aff369 3161 dVAR;
1df70142 3162 const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
d88dccdf 3163 if (s) {
1df70142
AL
3164 char *t = savepv(s);
3165 int i = 0;
3166 PL_colors[0] = t;
d88dccdf 3167 while (++i < 6) {
1df70142
AL
3168 t = strchr(t, '\t');
3169 if (t) {
3170 *t = '\0';
3171 PL_colors[i] = ++t;
d88dccdf
IZ
3172 }
3173 else
1df70142 3174 PL_colors[i] = t = (char *)"";
d88dccdf
IZ
3175 }
3176 } else {
1df70142 3177 int i = 0;
b81d288d 3178 while (i < 6)
06b5626a 3179 PL_colors[i++] = (char *)"";
d88dccdf
IZ
3180 }
3181 PL_colorset = 1;
3182}
76234dfb 3183#endif
8615cb43 3184
07be1b83 3185
a687059c 3186/*
e50aee73 3187 - pregcomp - compile a regular expression into internal code
a687059c
LW
3188 *
3189 * We can't allocate space until we know how big the compiled form will be,
3190 * but we can't compile it (and thus know how big it is) until we've got a
3191 * place to put the code. So we cheat: we compile it twice, once with code
3192 * generation turned off and size counting turned on, and once "for real".
3193 * This also means that we don't allocate space until we are sure that the
3194 * thing really will compile successfully, and we never have to move the
3195 * code and thus invalidate pointers into it. (Note that it has to be in
3196 * one piece because free() must be able to free it all.) [NB: not true in perl]
3197 *
3198 * Beware that the optimization-preparation code in here knows about some
3199 * of the structure of the compiled regexp. [I'll say.]
3200 */
3201regexp *
864dbfa3 3202Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
a687059c 3203{
97aff369 3204 dVAR;
a0d0e21e 3205 register regexp *r;
c277df42 3206 regnode *scan;
c277df42 3207 regnode *first;
a0d0e21e 3208 I32 flags;
a0d0e21e
LW
3209 I32 minlen = 0;
3210 I32 sawplus = 0;
3211 I32 sawopen = 0;
2c2d71f5 3212 scan_data_t data;
830247a4 3213 RExC_state_t RExC_state;
be8e71aa 3214 RExC_state_t * const pRExC_state = &RExC_state;
07be1b83
YO
3215#ifdef TRIE_STUDY_OPT
3216 int restudied= 0;
3217 RExC_state_t copyRExC_state;
3218#endif
a0d0e21e 3219
a3621e74
YO
3220 GET_RE_DEBUG_FLAGS_DECL;
3221
a0d0e21e 3222 if (exp == NULL)
c277df42 3223 FAIL("NULL regexp argument");
a0d0e21e 3224
a5961de5 3225 RExC_utf8 = pm->op_pmdynflags & PMdf_CMP_UTF8;
a0ed51b3 3226
5cfc7842 3227 RExC_precomp = exp;
a3621e74
YO
3228 DEBUG_r(if (!PL_colorset) reginitcolors());
3229 DEBUG_COMPILE_r({
3230 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s \"%s%*s%s\"\n",
a5961de5
JH
3231 PL_colors[4],PL_colors[5],PL_colors[0],
3232 (int)(xend - exp), RExC_precomp, PL_colors[1]);
3233 });
e2509266 3234 RExC_flags = pm->op_pmflags;
830247a4 3235 RExC_sawback = 0;
bbce6d69 3236
830247a4
IZ
3237 RExC_seen = 0;
3238 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
3239 RExC_seen_evals = 0;
3240 RExC_extralen = 0;
c277df42 3241
bbce6d69 3242 /* First pass: determine size, legality. */
830247a4 3243 RExC_parse = exp;
fac92740 3244 RExC_start = exp;
830247a4
IZ
3245 RExC_end = xend;
3246 RExC_naughty = 0;
3247 RExC_npar = 1;
3248 RExC_size = 0L;
3249 RExC_emit = &PL_regdummy;
3250 RExC_whilem_seen = 0;
85ddcde9
JH
3251#if 0 /* REGC() is (currently) a NOP at the first pass.
3252 * Clever compilers notice this and complain. --jhi */
830247a4 3253 REGC((U8)REG_MAGIC, (char*)RExC_emit);
85ddcde9 3254#endif
3dab1dad
YO
3255 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n"));
3256 if (reg(pRExC_state, 0, &flags,1) == NULL) {
c445ea15 3257 RExC_precomp = NULL;
a0d0e21e
LW
3258 return(NULL);
3259 }
3dab1dad
YO
3260 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "Required "));
3261 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "size %"IVdf" nodes ", (IV)RExC_size));
3262 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nStarting second pass (creation)\n"));
07be1b83
YO
3263 DEBUG_PARSE_r({
3264 RExC_lastnum=0;
3265 RExC_lastparse=NULL;
3266 });
c277df42 3267
07be1b83 3268
c277df42
IZ
3269 /* Small enough for pointer-storage convention?
3270 If extralen==0, this means that we will not need long jumps. */
830247a4
IZ
3271 if (RExC_size >= 0x10000L && RExC_extralen)
3272 RExC_size += RExC_extralen;
c277df42 3273 else
830247a4
IZ
3274 RExC_extralen = 0;
3275 if (RExC_whilem_seen > 15)
3276 RExC_whilem_seen = 15;
a0d0e21e 3277
bbce6d69 3278 /* Allocate space and initialize. */
a02a5408 3279 Newxc(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode),
c277df42 3280 char, regexp);
a0d0e21e 3281 if (r == NULL)
b45f050a
JF
3282 FAIL("Regexp out of space");
3283
0f79a09d
GS
3284#ifdef DEBUGGING
3285 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
830247a4 3286 Zero(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode), char);
0f79a09d 3287#endif
c277df42 3288 r->refcnt = 1;
bbce6d69 3289 r->prelen = xend - exp;
5cfc7842 3290 r->precomp = savepvn(RExC_precomp, r->prelen);
cf93c79d 3291 r->subbeg = NULL;
f8c7b90f 3292#ifdef PERL_OLD_COPY_ON_WRITE
c445ea15 3293 r->saved_copy = NULL;
ed252734 3294#endif
cf93c79d 3295 r->reganch = pm->op_pmflags & PMf_COMPILETIME;
830247a4 3296 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
38d1b06f 3297 r->lastparen = 0; /* mg.c reads this. */
4327152a
IZ
3298
3299 r->substrs = 0; /* Useful during FAIL. */
3300 r->startp = 0; /* Useful during FAIL. */
3301 r->endp = 0; /* Useful during FAIL. */
3302
a02a5408 3303 Newxz(r->offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
fac92740 3304 if (r->offsets) {
2af232bd 3305 r->offsets[0] = RExC_size;
fac92740 3306 }
a3621e74 3307 DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
2af232bd
SS
3308 "%s %"UVuf" bytes for offset annotations.\n",
3309 r->offsets ? "Got" : "Couldn't get",
392fbf5d 3310 (UV)((2*RExC_size+1) * sizeof(U32))));
fac92740 3311
830247a4 3312 RExC_rx = r;
bbce6d69 3313
3314 /* Second pass: emit code. */
e2509266 3315 RExC_flags = pm->op_pmflags; /* don't let top level (?i) bleed */
830247a4
IZ
3316 RExC_parse = exp;
3317 RExC_end = xend;
3318 RExC_naughty = 0;
3319 RExC_npar = 1;
fac92740 3320 RExC_emit_start = r->program;
830247a4 3321 RExC_emit = r->program;
2cd61cdb 3322 /* Store the count of eval-groups for security checks: */
eb160463 3323 RExC_emit->next_off = (U16)((RExC_seen_evals > U16_MAX) ? U16_MAX : RExC_seen_evals);
830247a4 3324 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
c277df42 3325 r->data = 0;
3dab1dad 3326 if (reg(pRExC_state, 0, &flags,1) == NULL)
a0d0e21e 3327 return(NULL);
07be1b83
YO
3328 /* XXXX To minimize changes to RE engine we always allocate
3329 3-units-long substrs field. */
3330 Newx(r->substrs, 1, struct reg_substr_data);
a0d0e21e 3331
07be1b83
YO
3332reStudy:
3333 Zero(r->substrs, 1, struct reg_substr_data);
3334 StructCopy(&zero_scan_data, &data, scan_data_t);
a3621e74 3335
07be1b83
YO
3336#ifdef TRIE_STUDY_OPT
3337 if ( restudied ) {
3338 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
3339 RExC_state=copyRExC_state;
3340 if (data.longest_fixed)
3341 SvREFCNT_dec(data.longest_fixed);
3342 if (data.longest_float)
3343 SvREFCNT_dec(data.longest_float);
3344 if (data.last_found)
3345 SvREFCNT_dec(data.last_found);
3346 } else {
3347 copyRExC_state=RExC_state;
3348 }
3349#endif
a0d0e21e 3350 /* Dig out information for optimizations. */
cf93c79d 3351 r->reganch = pm->op_pmflags & PMf_COMPILETIME; /* Again? */
e2509266 3352 pm->op_pmflags = RExC_flags;
a0ed51b3 3353 if (UTF)
5ff6fc6d 3354 r->reganch |= ROPT_UTF8; /* Unicode in it? */
c277df42 3355 r->regstclass = NULL;
830247a4 3356 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
a0ed51b3 3357 r->reganch |= ROPT_NAUGHTY;
c277df42 3358 scan = r->program + 1; /* First BRANCH. */
2779dcf1 3359
653099ff 3360 /* XXXX Should not we check for something else? Usually it is OPEN1... */
c277df42 3361 if (OP(scan) != BRANCH) { /* Only one top-level choice. */
c277df42 3362 I32 fake;
c5254dd6 3363 STRLEN longest_float_length, longest_fixed_length;
07be1b83 3364 struct regnode_charclass_class ch_class; /* pointed to by data */
653099ff 3365 int stclass_flag;
07be1b83 3366 I32 last_close = 0; /* pointed to by data */
a0d0e21e
LW
3367
3368 first = scan;
c277df42 3369 /* Skip introductions and multiplicators >= 1. */
a0d0e21e 3370 while ((OP(first) == OPEN && (sawopen = 1)) ||
653099ff 3371 /* An OR of *one* alternative - should not happen now. */
a0d0e21e 3372 (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) ||
07be1b83
YO
3373 /* for now we can't handle lookbehind IFMATCH*/
3374 (OP(first) == IFMATCH && !first->flags) ||
a0d0e21e
LW
3375 (OP(first) == PLUS) ||
3376 (OP(first) == MINMOD) ||
653099ff 3377 /* An {n,m} with n>0 */
07be1b83
YO
3378 (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) )
3379 {
3380 DEBUG_PEEP("first:",first,0);
a0d0e21e
LW
3381 if (OP(first) == PLUS)
3382 sawplus = 1;
3383 else
3dab1dad 3384 first += regarglen[OP(first)];
07be1b83
YO
3385 if (OP(first) == IFMATCH) {
3386 first = NEXTOPER(first);
3387 first += EXTRA_STEP_2ARGS;
3388 } else /*xxx possible optimisation for /(?=)/*/
3389 first = NEXTOPER(first);
a687059c
LW
3390 }
3391
a0d0e21e
LW
3392 /* Starting-point info. */
3393 again:
07be1b83 3394 /* Ignore EXACT as we deal with it later. */
3dab1dad 3395 if (PL_regkind[OP(first)] == EXACT) {
1aa99e6b 3396 if (OP(first) == EXACT)
6f207bd3 3397 NOOP; /* Empty, get anchored substr later. */
1aa99e6b 3398 else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
b3c9acc1
IZ
3399 r->regstclass = first;
3400 }
07be1b83
YO
3401#ifdef TRIE_STCLASS
3402 else if (OP(first) == TRIE &&
3403 ((reg_trie_data *)r->data->data[ ARG(first) ])->minlen>0)
3404 {
3405 /* this can happen only on restudy */
3406 struct regnode_1 *trie_op;
3407 Newxz(trie_op,1,struct regnode_1);
3408 StructCopy(first,trie_op,struct regnode_1);
3409 make_trie_failtable(pRExC_state, (regnode *)first, (regnode *)trie_op, 0);
3410 r->regstclass = (regnode *)trie_op;
3411 }
3412#endif
bfed75c6 3413 else if (strchr((const char*)PL_simple,OP(first)))
a0d0e21e 3414 r->regstclass = first;
3dab1dad
YO
3415 else if (PL_regkind[OP(first)] == BOUND ||
3416 PL_regkind[OP(first)] == NBOUND)
a0d0e21e 3417 r->regstclass = first;
3dab1dad 3418 else if (PL_regkind[OP(first)] == BOL) {
cad2e5aa
JH
3419 r->reganch |= (OP(first) == MBOL
3420 ? ROPT_ANCH_MBOL
3421 : (OP(first) == SBOL
3422 ? ROPT_ANCH_SBOL
3423 : ROPT_ANCH_BOL));
a0d0e21e 3424 first = NEXTOPER(first);
774d564b 3425 goto again;
3426 }
3427 else if (OP(first) == GPOS) {
3428 r->reganch |= ROPT_ANCH_GPOS;
3429 first = NEXTOPER(first);
3430 goto again;
a0d0e21e 3431 }
e09294f4 3432 else if (!sawopen && (OP(first) == STAR &&
3dab1dad 3433 PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
a0d0e21e
LW
3434 !(r->reganch & ROPT_ANCH) )
3435 {
3436 /* turn .* into ^.* with an implied $*=1 */
1df70142
AL
3437 const int type =
3438 (OP(NEXTOPER(first)) == REG_ANY)
3439 ? ROPT_ANCH_MBOL
3440 : ROPT_ANCH_SBOL;
cad2e5aa 3441 r->reganch |= type | ROPT_IMPLICIT;
a0d0e21e 3442 first = NEXTOPER(first);
774d564b 3443 goto again;
a0d0e21e 3444 }
b81d288d 3445 if (sawplus && (!sawopen || !RExC_sawback)
830247a4 3446 && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
cad2e5aa
JH
3447 /* x+ must match at the 1st pos of run of x's */
3448 r->reganch |= ROPT_SKIP;
a0d0e21e 3449
c277df42 3450 /* Scan is after the zeroth branch, first is atomic matcher. */
be8e71aa
YO
3451#ifdef TRIE_STUDY_OPT
3452 DEBUG_COMPILE_r(
3453 if (!restudied)
3454 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
3455 (IV)(first - scan + 1))
3456 );
3457#else
3458 DEBUG_COMPILE_r(
3459 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
3460 (IV)(first - scan + 1))
3461 );
3462#endif
3463
3464
a0d0e21e
LW
3465 /*
3466 * If there's something expensive in the r.e., find the
3467 * longest literal string that must appear and make it the
3468 * regmust. Resolve ties in favor of later strings, since
3469 * the regstart check works with the beginning of the r.e.
3470 * and avoiding duplication strengthens checking. Not a
3471 * strong reason, but sufficient in the absence of others.
3472 * [Now we resolve ties in favor of the earlier string if
c277df42 3473 * it happens that c_offset_min has been invalidated, since the
a0d0e21e
LW
3474 * earlier string may buy us something the later one won't.]
3475 */
a0d0e21e 3476 minlen = 0;
a687059c 3477
396482e1
GA
3478 data.longest_fixed = newSVpvs("");
3479 data.longest_float = newSVpvs("");
3480 data.last_found = newSVpvs("");
c277df42
IZ
3481 data.longest = &(data.longest_fixed);
3482 first = scan;
653099ff 3483 if (!r->regstclass) {
830247a4 3484 cl_init(pRExC_state, &ch_class);
653099ff
GS
3485 data.start_class = &ch_class;
3486 stclass_flag = SCF_DO_STCLASS_AND;
3487 } else /* XXXX Check for BOUND? */
3488 stclass_flag = 0;
cb434fcc 3489 data.last_closep = &last_close;
653099ff 3490
830247a4 3491 minlen = study_chunk(pRExC_state, &first, &fake, scan + RExC_size, /* Up to end */
a3621e74 3492 &data, SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
07be1b83
YO
3493
3494#ifdef TRIE_STUDY_OPT
3495 if ( (data.flags & SCF_EXACT_TRIE) && ! restudied++ ) {
3496 goto reStudy;
3497 }
3498#endif
3499
830247a4 3500 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
b81d288d 3501 && data.last_start_min == 0 && data.last_end > 0
830247a4
IZ
3502 && !RExC_seen_zerolen
3503 && (!(RExC_seen & REG_SEEN_GPOS) || (r->reganch & ROPT_ANCH_GPOS)))
c277df42 3504 r->reganch |= ROPT_CHECK_ALL;
830247a4 3505 scan_commit(pRExC_state, &data);
c277df42
IZ
3506 SvREFCNT_dec(data.last_found);
3507
a0ed51b3 3508 longest_float_length = CHR_SVLEN(data.longest_float);
c5254dd6 3509 if (longest_float_length
c277df42
IZ
3510 || (data.flags & SF_FL_BEFORE_EOL
3511 && (!(data.flags & SF_FL_BEFORE_MEOL)
e2509266 3512 || (RExC_flags & PMf_MULTILINE)))) {
cf93c79d
IZ
3513 int t;
3514
a0ed51b3 3515 if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
aca2d497
IZ
3516 && data.offset_fixed == data.offset_float_min
3517 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
3518 goto remove_float; /* As in (a)+. */
3519
33b8afdf
JH
3520 if (SvUTF8(data.longest_float)) {
3521 r->float_utf8 = data.longest_float;
c445ea15 3522 r->float_substr = NULL;
33b8afdf
JH
3523 } else {
3524 r->float_substr = data.longest_float;
c445ea15 3525 r->float_utf8 = NULL;
33b8afdf 3526 }
c277df42
IZ
3527 r->float_min_offset = data.offset_float_min;
3528 r->float_max_offset = data.offset_float_max;
cf93c79d
IZ
3529 t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
3530 && (!(data.flags & SF_FL_BEFORE_MEOL)
e2509266 3531 || (RExC_flags & PMf_MULTILINE)));
33b8afdf 3532 fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
a0ed51b3
LW
3533 }
3534 else {
aca2d497 3535 remove_float:
c445ea15 3536 r->float_substr = r->float_utf8 = NULL;
c277df42 3537 SvREFCNT_dec(data.longest_float);
c5254dd6 3538 longest_float_length = 0;
a0d0e21e 3539 }
c277df42 3540
a0ed51b3 3541 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
c5254dd6 3542 if (longest_fixed_length
c277df42
IZ
3543 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
3544 && (!(data.flags & SF_FIX_BEFORE_MEOL)
e2509266 3545 || (RExC_flags & PMf_MULTILINE)))) {
cf93c79d
IZ
3546 int t;
3547
33b8afdf
JH
3548 if (SvUTF8(data.longest_fixed)) {
3549 r->anchored_utf8 = data.longest_fixed;
c445ea15 3550 r->anchored_substr = NULL;
33b8afdf
JH
3551 } else {
3552 r->anchored_substr = data.longest_fixed;
c445ea15 3553 r->anchored_utf8 = NULL;
33b8afdf 3554 }
c277df42 3555 r->anchored_offset = data.offset_fixed;
cf93c79d
IZ
3556 t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
3557 && (!(data.flags & SF_FIX_BEFORE_MEOL)
e2509266 3558 || (RExC_flags & PMf_MULTILINE)));
33b8afdf 3559 fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
a0ed51b3
LW
3560 }
3561 else {
c445ea15 3562 r->anchored_substr = r->anchored_utf8 = NULL;
c277df42 3563 SvREFCNT_dec(data.longest_fixed);
c5254dd6 3564 longest_fixed_length = 0;
a0d0e21e 3565 }
b81d288d 3566 if (r->regstclass
ffc61ed2 3567 && (OP(r->regstclass) == REG_ANY || OP(r->regstclass) == SANY))
653099ff 3568 r->regstclass = NULL;
33b8afdf
JH
3569 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
3570 && stclass_flag
653099ff 3571 && !(data.start_class->flags & ANYOF_EOS)
eb160463
GS
3572 && !cl_is_anything(data.start_class))
3573 {
1df70142 3574 const I32 n = add_data(pRExC_state, 1, "f");
653099ff 3575
a02a5408 3576 Newx(RExC_rx->data->data[n], 1,
653099ff
GS
3577 struct regnode_charclass_class);
3578 StructCopy(data.start_class,
830247a4 3579 (struct regnode_charclass_class*)RExC_rx->data->data[n],
653099ff 3580 struct regnode_charclass_class);
830247a4 3581 r->regstclass = (regnode*)RExC_rx->data->data[n];
653099ff 3582 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
a3621e74 3583 DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
32fc9b6a 3584 regprop(r, sv, (regnode*)data.start_class);
9c5ffd7c 3585 PerlIO_printf(Perl_debug_log,
a0288114 3586 "synthetic stclass \"%s\".\n",
3f7c398e 3587 SvPVX_const(sv));});
653099ff 3588 }
c277df42
IZ
3589
3590 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
c5254dd6 3591 if (longest_fixed_length > longest_float_length) {
c277df42 3592 r->check_substr = r->anchored_substr;
33b8afdf 3593 r->check_utf8 = r->anchored_utf8;
c277df42
IZ
3594 r->check_offset_min = r->check_offset_max = r->anchored_offset;
3595 if (r->reganch & ROPT_ANCH_SINGLE)
3596 r->reganch |= ROPT_NOSCAN;
a0ed51b3
LW
3597 }
3598 else {
c277df42 3599 r->check_substr = r->float_substr;
33b8afdf 3600 r->check_utf8 = r->float_utf8;
c277df42
IZ
3601 r->check_offset_min = data.offset_float_min;
3602 r->check_offset_max = data.offset_float_max;
a0d0e21e 3603 }
30382c73
IZ
3604 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
3605 This should be changed ASAP! */
33b8afdf 3606 if ((r->check_substr || r->check_utf8) && !(r->reganch & ROPT_ANCH_GPOS)) {
cad2e5aa 3607 r->reganch |= RE_USE_INTUIT;
33b8afdf 3608 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
cad2e5aa
JH
3609 r->reganch |= RE_INTUIT_TAIL;
3610 }
a0ed51b3
LW
3611 }
3612 else {
c277df42
IZ
3613 /* Several toplevels. Best we can is to set minlen. */
3614 I32 fake;
653099ff 3615 struct regnode_charclass_class ch_class;
cb434fcc 3616 I32 last_close = 0;
c277df42 3617
a3621e74 3618 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "\n"));
07be1b83 3619
c277df42 3620 scan = r->program + 1;
830247a4 3621 cl_init(pRExC_state, &ch_class);
653099ff 3622 data.start_class = &ch_class;
cb434fcc 3623 data.last_closep = &last_close;
07be1b83
YO
3624
3625 minlen = study_chunk(pRExC_state, &scan, &fake, scan + RExC_size,
3626 &data, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
3627
3628#ifdef TRIE_STUDY_OPT
3629 if ( (data.flags & SCF_EXACT_TRIE) && ! restudied++ ) {
3630 goto reStudy;
3631 }
3632#endif
3633
33b8afdf 3634 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
c445ea15 3635 = r->float_substr = r->float_utf8 = NULL;
653099ff 3636 if (!(data.start_class->flags & ANYOF_EOS)
eb160463
GS
3637 && !cl_is_anything(data.start_class))
3638 {
1df70142 3639 const I32 n = add_data(pRExC_state, 1, "f");
653099ff 3640
a02a5408 3641 Newx(RExC_rx->data->data[n], 1,
653099ff
GS
3642 struct regnode_charclass_class);
3643 StructCopy(data.start_class,
830247a4 3644 (struct regnode_charclass_class*)RExC_rx->data->data[n],
653099ff 3645 struct regnode_charclass_class);
830247a4 3646 r->regstclass = (regnode*)RExC_rx->data->data[n];
653099ff 3647 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
a3621e74 3648 DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
32fc9b6a 3649 regprop(r, sv, (regnode*)data.start_class);
9c5ffd7c 3650 PerlIO_printf(Perl_debug_log,
a0288114 3651 "synthetic stclass \"%s\".\n",
3f7c398e 3652 SvPVX_const(sv));});
653099ff 3653 }
a0d0e21e
LW
3654 }
3655
a0d0e21e 3656 r->minlen = minlen;
b81d288d 3657 if (RExC_seen & REG_SEEN_GPOS)
c277df42 3658 r->reganch |= ROPT_GPOS_SEEN;
830247a4 3659 if (RExC_seen & REG_SEEN_LOOKBEHIND)
c277df42 3660 r->reganch |= ROPT_LOOKBEHIND_SEEN;
830247a4 3661 if (RExC_seen & REG_SEEN_EVAL)
ce862d02 3662 r->reganch |= ROPT_EVAL_SEEN;
f33976b4
DB
3663 if (RExC_seen & REG_SEEN_CANY)
3664 r->reganch |= ROPT_CANY_SEEN;
a02a5408
JC
3665 Newxz(r->startp, RExC_npar, I32);
3666 Newxz(r->endp, RExC_npar, I32);
07be1b83 3667
f2278c82 3668 DEBUG_r( RX_DEBUG_on(r) );
be8e71aa
YO
3669 DEBUG_DUMP_r({
3670 PerlIO_printf(Perl_debug_log,"Final program:\n");
3dab1dad
YO
3671 regdump(r);
3672 });
8e9a8a48
YO
3673 DEBUG_OFFSETS_r(if (r->offsets) {
3674 const U32 len = r->offsets[0];
3675 U32 i;
3676 GET_RE_DEBUG_FLAGS_DECL;
3677 PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)r->offsets[0]);
3678 for (i = 1; i <= len; i++) {
3679 if (r->offsets[i*2-1] || r->offsets[i*2])
3680 PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
3681 i, (UV)r->offsets[i*2-1], (UV)r->offsets[i*2]);
3682 }
3683 PerlIO_printf(Perl_debug_log, "\n");
3684 });
a0d0e21e 3685 return(r);
a687059c
LW
3686}
3687
3dab1dad
YO
3688
3689#define DEBUG_PARSE_MSG(funcname) DEBUG_PARSE_r({ \
3690 int rem=(int)(RExC_end - RExC_parse); \
3691 int cut; \
3692 int num; \
3693 int iscut=0; \
3694 if (rem>10) { \
3695 rem=10; \
3696 iscut=1; \
3697 } \
3698 cut=10-rem; \
3699 if (RExC_lastparse!=RExC_parse) \
3700 PerlIO_printf(Perl_debug_log," >%.*s%-*s", \
3701 rem, RExC_parse, \
3702 cut + 4, \
3703 iscut ? "..." : "<" \
3704 ); \
3705 else \
3706 PerlIO_printf(Perl_debug_log,"%16s",""); \
3707 \
3708 if (SIZE_ONLY) \
3709 num=RExC_size; \
3710 else \
3711 num=REG_NODE_NUM(RExC_emit); \
3712 if (RExC_lastnum!=num) \
be8e71aa 3713 PerlIO_printf(Perl_debug_log,"|%4d",num); \
3dab1dad 3714 else \
be8e71aa
YO
3715 PerlIO_printf(Perl_debug_log,"|%4s",""); \
3716 PerlIO_printf(Perl_debug_log,"|%*s%-4s", \
3717 (int)((depth*2)), "", \
3dab1dad
YO
3718 (funcname) \
3719 ); \
3720 RExC_lastnum=num; \
3721 RExC_lastparse=RExC_parse; \
3722})
3723
07be1b83
YO
3724
3725
3dab1dad
YO
3726#define DEBUG_PARSE(funcname) DEBUG_PARSE_r({ \
3727 DEBUG_PARSE_MSG((funcname)); \
3728 PerlIO_printf(Perl_debug_log,"%4s","\n"); \
3729})
a687059c
LW
3730/*
3731 - reg - regular expression, i.e. main body or parenthesized thing
3732 *
3733 * Caller must absorb opening parenthesis.
3734 *
3735 * Combining parenthesis handling with the base level of regular expression
3736 * is a trifle forced, but the need to tie the tails of the branches to what
3737 * follows makes it hard to avoid.
3738 */
07be1b83
YO
3739#define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
3740#ifdef DEBUGGING
3741#define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
3742#else
3743#define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
3744#endif
3dab1dad 3745
76e3520e 3746STATIC regnode *
3dab1dad 3747S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
c277df42 3748 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
a687059c 3749{
27da23d5 3750 dVAR;
c277df42
IZ
3751 register regnode *ret; /* Will be the head of the group. */
3752 register regnode *br;
3753 register regnode *lastbr;
cbbf8932 3754 register regnode *ender = NULL;
a0d0e21e 3755 register I32 parno = 0;
cbbf8932
AL
3756 I32 flags;
3757 const I32 oregflags = RExC_flags;
6136c704
AL
3758 bool have_branch = 0;
3759 bool is_open = 0;
9d1d55b5
JP
3760
3761 /* for (?g), (?gc), and (?o) warnings; warning
3762 about (?c) will warn about (?g) -- japhy */
3763
6136c704
AL
3764#define WASTED_O 0x01
3765#define WASTED_G 0x02
3766#define WASTED_C 0x04
3767#define WASTED_GC (0x02|0x04)
cbbf8932 3768 I32 wastedflags = 0x00;
9d1d55b5 3769
fac92740 3770 char * parse_start = RExC_parse; /* MJD */
a28509cc 3771 char * const oregcomp_parse = RExC_parse;
a0d0e21e 3772
3dab1dad
YO
3773 GET_RE_DEBUG_FLAGS_DECL;
3774 DEBUG_PARSE("reg ");
3775
3776
821b33a5 3777 *flagp = 0; /* Tentatively. */
a0d0e21e 3778
9d1d55b5 3779
a0d0e21e
LW
3780 /* Make an OPEN node, if parenthesized. */
3781 if (paren) {
fac92740 3782 if (*RExC_parse == '?') { /* (?...) */
2b36a5a0
JH
3783 U32 posflags = 0, negflags = 0;
3784 U32 *flagsp = &posflags;
6136c704 3785 bool is_logical = 0;
a28509cc 3786 const char * const seqstart = RExC_parse;
ca9dfc88 3787
830247a4
IZ
3788 RExC_parse++;
3789 paren = *RExC_parse++;
c277df42 3790 ret = NULL; /* For look-ahead/behind. */
a0d0e21e 3791 switch (paren) {
fac92740 3792 case '<': /* (?<...) */
830247a4 3793 RExC_seen |= REG_SEEN_LOOKBEHIND;
b81d288d 3794 if (*RExC_parse == '!')
c277df42 3795 paren = ',';
b81d288d 3796 if (*RExC_parse != '=' && *RExC_parse != '!')
c277df42 3797 goto unknown;
830247a4 3798 RExC_parse++;
fac92740
MJD
3799 case '=': /* (?=...) */
3800 case '!': /* (?!...) */
830247a4 3801 RExC_seen_zerolen++;
fac92740
MJD
3802 case ':': /* (?:...) */
3803 case '>': /* (?>...) */
a0d0e21e 3804 break;
fac92740
MJD
3805 case '$': /* (?$...) */
3806 case '@': /* (?@...) */
8615cb43 3807 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
a0d0e21e 3808 break;
fac92740 3809 case '#': /* (?#...) */
830247a4
IZ
3810 while (*RExC_parse && *RExC_parse != ')')
3811 RExC_parse++;
3812 if (*RExC_parse != ')')
c277df42 3813 FAIL("Sequence (?#... not terminated");
830247a4 3814 nextchar(pRExC_state);
a0d0e21e
LW
3815 *flagp = TRYAGAIN;
3816 return NULL;
fac92740 3817 case 'p': /* (?p...) */
9014280d 3818 if (SIZE_ONLY && ckWARN2(WARN_DEPRECATED, WARN_REGEXP))
f9373011 3819 vWARNdep(RExC_parse, "(?p{}) is deprecated - use (??{})");
8c8ad484 3820 /* FALL THROUGH*/
fac92740 3821 case '?': /* (??...) */
6136c704 3822 is_logical = 1;
438a3801
YST
3823 if (*RExC_parse != '{')
3824 goto unknown;
830247a4 3825 paren = *RExC_parse++;
0f5d15d6 3826 /* FALL THROUGH */
fac92740 3827 case '{': /* (?{...}) */
c277df42 3828 {
c277df42
IZ
3829 I32 count = 1, n = 0;
3830 char c;
830247a4 3831 char *s = RExC_parse;
c277df42 3832
830247a4
IZ
3833 RExC_seen_zerolen++;
3834 RExC_seen |= REG_SEEN_EVAL;
3835 while (count && (c = *RExC_parse)) {
6136c704
AL
3836 if (c == '\\') {
3837 if (RExC_parse[1])
3838 RExC_parse++;
3839 }
b81d288d 3840 else if (c == '{')
c277df42 3841 count++;
b81d288d 3842 else if (c == '}')
c277df42 3843 count--;
830247a4 3844 RExC_parse++;
c277df42 3845 }
6136c704 3846 if (*RExC_parse != ')') {
b81d288d 3847 RExC_parse = s;
b45f050a
JF
3848 vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
3849 }
c277df42 3850 if (!SIZE_ONLY) {
f3548bdc 3851 PAD *pad;
6136c704
AL
3852 OP_4tree *sop, *rop;
3853 SV * const sv = newSVpvn(s, RExC_parse - 1 - s);
c277df42 3854
569233ed
SB
3855 ENTER;
3856 Perl_save_re_context(aTHX);
f3548bdc 3857 rop = sv_compile_2op(sv, &sop, "re", &pad);
9b978d73
DM
3858 sop->op_private |= OPpREFCOUNTED;
3859 /* re_dup will OpREFCNT_inc */
3860 OpREFCNT_set(sop, 1);
569233ed 3861 LEAVE;
c277df42 3862
830247a4
IZ
3863 n = add_data(pRExC_state, 3, "nop");
3864 RExC_rx->data->data[n] = (void*)rop;
3865 RExC_rx->data->data[n+1] = (void*)sop;
f3548bdc 3866 RExC_rx->data->data[n+2] = (void*)pad;
c277df42 3867 SvREFCNT_dec(sv);
a0ed51b3 3868 }
e24b16f9 3869 else { /* First pass */
830247a4 3870 if (PL_reginterp_cnt < ++RExC_seen_evals
923e4eb5 3871 && IN_PERL_RUNTIME)
2cd61cdb
IZ
3872 /* No compiled RE interpolated, has runtime
3873 components ===> unsafe. */
3874 FAIL("Eval-group not allowed at runtime, use re 'eval'");
5b61d3f7 3875 if (PL_tainting && PL_tainted)
cc6b7395 3876 FAIL("Eval-group in insecure regular expression");
54df2634 3877#if PERL_VERSION > 8
923e4eb5 3878 if (IN_PERL_COMPILETIME)
b5c19bd7 3879 PL_cv_has_eval = 1;
54df2634 3880#endif
c277df42 3881 }
b5c19bd7 3882
830247a4 3883 nextchar(pRExC_state);
6136c704 3884 if (is_logical) {
830247a4 3885 ret = reg_node(pRExC_state, LOGICAL);
0f5d15d6
IZ
3886 if (!SIZE_ONLY)
3887 ret->flags = 2;
3dab1dad 3888 REGTAIL(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
fac92740 3889 /* deal with the length of this later - MJD */
0f5d15d6
IZ
3890 return ret;
3891 }
ccb2c380
MP
3892 ret = reganode(pRExC_state, EVAL, n);
3893 Set_Node_Length(ret, RExC_parse - parse_start + 1);
3894 Set_Node_Offset(ret, parse_start);
3895 return ret;
c277df42 3896 }
fac92740 3897 case '(': /* (?(?{...})...) and (?(?=...)...) */
c277df42 3898 {
fac92740 3899 if (RExC_parse[0] == '?') { /* (?(?...)) */
b81d288d
AB
3900 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
3901 || RExC_parse[1] == '<'
830247a4 3902 || RExC_parse[1] == '{') { /* Lookahead or eval. */
c277df42
IZ
3903 I32 flag;
3904
830247a4 3905 ret = reg_node(pRExC_state, LOGICAL);
0f5d15d6
IZ
3906 if (!SIZE_ONLY)
3907 ret->flags = 1;
3dab1dad 3908 REGTAIL(pRExC_state, ret, reg(pRExC_state, 1, &flag,depth+1));
c277df42 3909 goto insert_if;
b81d288d 3910 }
a0ed51b3 3911 }
830247a4 3912 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
fac92740 3913 /* (?(1)...) */
6136c704 3914 char c;
830247a4 3915 parno = atoi(RExC_parse++);
c277df42 3916
830247a4
IZ
3917 while (isDIGIT(*RExC_parse))
3918 RExC_parse++;
fac92740 3919 ret = reganode(pRExC_state, GROUPP, parno);
2af232bd 3920
830247a4 3921 if ((c = *nextchar(pRExC_state)) != ')')
b45f050a 3922 vFAIL("Switch condition not recognized");
c277df42 3923 insert_if:
3dab1dad
YO
3924 REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
3925 br = regbranch(pRExC_state, &flags, 1,depth+1);
c277df42 3926 if (br == NULL)
830247a4 3927 br = reganode(pRExC_state, LONGJMP, 0);
c277df42 3928 else
3dab1dad 3929 REGTAIL(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
830247a4 3930 c = *nextchar(pRExC_state);
d1b80229
IZ
3931 if (flags&HASWIDTH)
3932 *flagp |= HASWIDTH;
c277df42 3933 if (c == '|') {
830247a4 3934 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
3dab1dad
YO
3935 regbranch(pRExC_state, &flags, 1,depth+1);
3936 REGTAIL(pRExC_state, ret, lastbr);
d1b80229
IZ
3937 if (flags&HASWIDTH)
3938 *flagp |= HASWIDTH;
830247a4 3939 c = *nextchar(pRExC_state);
a0ed51b3
LW
3940 }
3941 else
c277df42
IZ
3942 lastbr = NULL;
3943 if (c != ')')
8615cb43 3944 vFAIL("Switch (?(condition)... contains too many branches");
830247a4 3945 ender = reg_node(pRExC_state, TAIL);
3dab1dad 3946 REGTAIL(pRExC_state, br, ender);
c277df42 3947 if (lastbr) {
3dab1dad
YO
3948 REGTAIL(pRExC_state, lastbr, ender);
3949 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
a0ed51b3
LW
3950 }
3951 else
3dab1dad 3952 REGTAIL(pRExC_state, ret, ender);
c277df42 3953 return ret;
a0ed51b3
LW
3954 }
3955 else {
830247a4 3956 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
c277df42
IZ
3957 }
3958 }
1b1626e4 3959 case 0:
830247a4 3960 RExC_parse--; /* for vFAIL to print correctly */
8615cb43 3961 vFAIL("Sequence (? incomplete");
1b1626e4 3962 break;
a0d0e21e 3963 default:
830247a4 3964 --RExC_parse;
fac92740 3965 parse_flags: /* (?i) */
830247a4 3966 while (*RExC_parse && strchr("iogcmsx", *RExC_parse)) {
9d1d55b5
JP
3967 /* (?g), (?gc) and (?o) are useless here
3968 and must be globally applied -- japhy */
3969
3970 if (*RExC_parse == 'o' || *RExC_parse == 'g') {
3971 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6136c704 3972 const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G;
9d1d55b5
JP
3973 if (! (wastedflags & wflagbit) ) {
3974 wastedflags |= wflagbit;
3975 vWARN5(
3976 RExC_parse + 1,
3977 "Useless (%s%c) - %suse /%c modifier",
3978 flagsp == &negflags ? "?-" : "?",
3979 *RExC_parse,
3980 flagsp == &negflags ? "don't " : "",
3981 *RExC_parse
3982 );
3983 }
3984 }
3985 }
3986 else if (*RExC_parse == 'c') {
3987 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6136c704
AL
3988 if (! (wastedflags & WASTED_C) ) {
3989 wastedflags |= WASTED_GC;
9d1d55b5
JP
3990 vWARN3(
3991 RExC_parse + 1,
3992 "Useless (%sc) - %suse /gc modifier",
3993 flagsp == &negflags ? "?-" : "?",
3994 flagsp == &negflags ? "don't " : ""
3995 );
3996 }
3997 }
3998 }
3999 else { pmflag(flagsp, *RExC_parse); }
4000
830247a4 4001 ++RExC_parse;
ca9dfc88 4002 }
830247a4 4003 if (*RExC_parse == '-') {
ca9dfc88 4004 flagsp = &negflags;
9d1d55b5 4005 wastedflags = 0; /* reset so (?g-c) warns twice */
830247a4 4006 ++RExC_parse;
ca9dfc88 4007 goto parse_flags;
48c036b1 4008 }
e2509266
JH
4009 RExC_flags |= posflags;
4010 RExC_flags &= ~negflags;
830247a4
IZ
4011 if (*RExC_parse == ':') {
4012 RExC_parse++;
ca9dfc88
IZ
4013 paren = ':';
4014 break;
4015 }
c277df42 4016 unknown:
830247a4
IZ
4017 if (*RExC_parse != ')') {
4018 RExC_parse++;
4019 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
b45f050a 4020 }
830247a4 4021 nextchar(pRExC_state);
a0d0e21e
LW
4022 *flagp = TRYAGAIN;
4023 return NULL;
4024 }
4025 }
fac92740 4026 else { /* (...) */
830247a4
IZ
4027 parno = RExC_npar;
4028 RExC_npar++;
4029 ret = reganode(pRExC_state, OPEN, parno);
fac92740
MJD
4030 Set_Node_Length(ret, 1); /* MJD */
4031 Set_Node_Offset(ret, RExC_parse); /* MJD */
6136c704 4032 is_open = 1;
a0d0e21e 4033 }
a0ed51b3 4034 }
fac92740 4035 else /* ! paren */
a0d0e21e
LW
4036 ret = NULL;
4037
4038 /* Pick up the branches, linking them together. */
fac92740 4039 parse_start = RExC_parse; /* MJD */
3dab1dad 4040 br = regbranch(pRExC_state, &flags, 1,depth+1);
fac92740 4041 /* branch_len = (paren != 0); */
2af232bd 4042
a0d0e21e
LW
4043 if (br == NULL)
4044 return(NULL);
830247a4
IZ
4045 if (*RExC_parse == '|') {
4046 if (!SIZE_ONLY && RExC_extralen) {
4047 reginsert(pRExC_state, BRANCHJ, br);
a0ed51b3 4048 }
fac92740 4049 else { /* MJD */
830247a4 4050 reginsert(pRExC_state, BRANCH, br);
fac92740
MJD
4051 Set_Node_Length(br, paren != 0);
4052 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
4053 }
c277df42
IZ
4054 have_branch = 1;
4055 if (SIZE_ONLY)
830247a4 4056 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
a0ed51b3
LW
4057 }
4058 else if (paren == ':') {
c277df42
IZ
4059 *flagp |= flags&SIMPLE;
4060 }
6136c704 4061 if (is_open) { /* Starts with OPEN. */
3dab1dad 4062 REGTAIL(pRExC_state, ret, br); /* OPEN -> first. */
a0ed51b3
LW
4063 }
4064 else if (paren != '?') /* Not Conditional */
a0d0e21e 4065 ret = br;
32a0ca98 4066 *flagp |= flags & (SPSTART | HASWIDTH);
c277df42 4067 lastbr = br;
830247a4
IZ
4068 while (*RExC_parse == '|') {
4069 if (!SIZE_ONLY && RExC_extralen) {
4070 ender = reganode(pRExC_state, LONGJMP,0);
3dab1dad 4071 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
c277df42
IZ
4072 }
4073 if (SIZE_ONLY)
830247a4
IZ
4074 RExC_extralen += 2; /* Account for LONGJMP. */
4075 nextchar(pRExC_state);
3dab1dad 4076 br = regbranch(pRExC_state, &flags, 0, depth+1);
2af232bd 4077
a687059c 4078 if (br == NULL)
a0d0e21e 4079 return(NULL);
3dab1dad 4080 REGTAIL(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
c277df42 4081 lastbr = br;
821b33a5
IZ
4082 if (flags&HASWIDTH)
4083 *flagp |= HASWIDTH;
a687059c 4084 *flagp |= flags&SPSTART;
a0d0e21e
LW
4085 }
4086
c277df42
IZ
4087 if (have_branch || paren != ':') {
4088 /* Make a closing node, and hook it on the end. */
4089 switch (paren) {
4090 case ':':
830247a4 4091 ender = reg_node(pRExC_state, TAIL);
c277df42
IZ
4092 break;
4093 case 1:
830247a4 4094 ender = reganode(pRExC_state, CLOSE, parno);
fac92740
MJD
4095 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
4096 Set_Node_Length(ender,1); /* MJD */
c277df42
IZ
4097 break;
4098 case '<':
c277df42
IZ
4099 case ',':
4100 case '=':
4101 case '!':
c277df42 4102 *flagp &= ~HASWIDTH;
821b33a5
IZ
4103 /* FALL THROUGH */
4104 case '>':
830247a4 4105 ender = reg_node(pRExC_state, SUCCEED);
c277df42
IZ
4106 break;
4107 case 0:
830247a4 4108 ender = reg_node(pRExC_state, END);
c277df42
IZ
4109 break;
4110 }
07be1b83 4111 REGTAIL_STUDY(pRExC_state, lastbr, ender);
a0d0e21e 4112
9674d46a 4113 if (have_branch && !SIZE_ONLY) {
c277df42 4114 /* Hook the tails of the branches to the closing node. */
9674d46a
AL
4115 for (br = ret; br; br = regnext(br)) {
4116 const U8 op = PL_regkind[OP(br)];
4117 if (op == BRANCH) {
07be1b83 4118 REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
9674d46a
AL
4119 }
4120 else if (op == BRANCHJ) {
07be1b83 4121 REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
9674d46a 4122 }
c277df42
IZ
4123 }
4124 }
a0d0e21e 4125 }
c277df42
IZ
4126
4127 {
e1ec3a88
AL
4128 const char *p;
4129 static const char parens[] = "=!<,>";
c277df42
IZ
4130
4131 if (paren && (p = strchr(parens, paren))) {
eb160463 4132 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
c277df42
IZ
4133 int flag = (p - parens) > 1;
4134
4135 if (paren == '>')
4136 node = SUSPEND, flag = 0;
830247a4 4137 reginsert(pRExC_state, node,ret);
45948336
EP
4138 Set_Node_Cur_Length(ret);
4139 Set_Node_Offset(ret, parse_start + 1);
c277df42 4140 ret->flags = flag;
07be1b83 4141 REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
c277df42 4142 }
a0d0e21e
LW
4143 }
4144
4145 /* Check for proper termination. */
ce3e6498 4146 if (paren) {
e2509266 4147 RExC_flags = oregflags;
830247a4
IZ
4148 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
4149 RExC_parse = oregcomp_parse;
380a0633 4150 vFAIL("Unmatched (");
ce3e6498 4151 }
a0ed51b3 4152 }
830247a4
IZ
4153 else if (!paren && RExC_parse < RExC_end) {
4154 if (*RExC_parse == ')') {
4155 RExC_parse++;
380a0633 4156 vFAIL("Unmatched )");
a0ed51b3
LW
4157 }
4158 else
b45f050a 4159 FAIL("Junk on end of regexp"); /* "Can't happen". */
a0d0e21e
LW
4160 /* NOTREACHED */
4161 }
a687059c 4162
a0d0e21e 4163 return(ret);
a687059c
LW
4164}
4165
4166/*
4167 - regbranch - one alternative of an | operator
4168 *
4169 * Implements the concatenation operator.
4170 */
76e3520e 4171STATIC regnode *
3dab1dad 4172S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
a687059c 4173{
97aff369 4174 dVAR;
c277df42
IZ
4175 register regnode *ret;
4176 register regnode *chain = NULL;
4177 register regnode *latest;
4178 I32 flags = 0, c = 0;
3dab1dad
YO
4179 GET_RE_DEBUG_FLAGS_DECL;
4180 DEBUG_PARSE("brnc");
b81d288d 4181 if (first)
c277df42
IZ
4182 ret = NULL;
4183 else {
b81d288d 4184 if (!SIZE_ONLY && RExC_extralen)
830247a4 4185 ret = reganode(pRExC_state, BRANCHJ,0);
fac92740 4186 else {
830247a4 4187 ret = reg_node(pRExC_state, BRANCH);
fac92740
MJD
4188 Set_Node_Length(ret, 1);
4189 }
c277df42
IZ
4190 }
4191
b81d288d 4192 if (!first && SIZE_ONLY)
830247a4 4193 RExC_extralen += 1; /* BRANCHJ */
b81d288d 4194
c277df42 4195 *flagp = WORST; /* Tentatively. */
a0d0e21e 4196
830247a4
IZ
4197 RExC_parse--;
4198 nextchar(pRExC_state);
4199 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
a0d0e21e 4200 flags &= ~TRYAGAIN;
3dab1dad 4201 latest = regpiece(pRExC_state, &flags,depth+1);
a0d0e21e
LW
4202 if (latest == NULL) {
4203 if (flags & TRYAGAIN)
4204 continue;
4205 return(NULL);
a0ed51b3
LW
4206 }
4207 else if (ret == NULL)
c277df42 4208 ret = latest;
a0d0e21e 4209 *flagp |= flags&HASWIDTH;
c277df42 4210 if (chain == NULL) /* First piece. */
a0d0e21e
LW
4211 *flagp |= flags&SPSTART;
4212 else {
830247a4 4213 RExC_naughty++;
3dab1dad 4214 REGTAIL(pRExC_state, chain, latest);
a687059c 4215 }
a0d0e21e 4216 chain = latest;
c277df42
IZ
4217 c++;
4218 }
4219 if (chain == NULL) { /* Loop ran zero times. */
830247a4 4220 chain = reg_node(pRExC_state, NOTHING);
c277df42
IZ
4221 if (ret == NULL)
4222 ret = chain;
4223 }
4224 if (c == 1) {
4225 *flagp |= flags&SIMPLE;
a0d0e21e 4226 }
a687059c 4227
d4c19fe8 4228 return ret;
a687059c
LW
4229}
4230
4231/*
4232 - regpiece - something followed by possible [*+?]
4233 *
4234 * Note that the branching code sequences used for ? and the general cases
4235 * of * and + are somewhat optimized: they use the same NOTHING node as
4236 * both the endmarker for their branch list and the body of the last branch.
4237 * It might seem that this node could be dispensed with entirely, but the
4238 * endmarker role is not redundant.
4239 */
76e3520e 4240STATIC regnode *
3dab1dad 4241S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
a687059c 4242{
97aff369 4243 dVAR;
c277df42 4244 register regnode *ret;
a0d0e21e
LW
4245 register char op;
4246 register char *next;
4247 I32 flags;
1df70142 4248 const char * const origparse = RExC_parse;
a0d0e21e 4249 I32 min;
c277df42 4250 I32 max = REG_INFTY;
fac92740 4251 char *parse_start;
3dab1dad
YO
4252 GET_RE_DEBUG_FLAGS_DECL;
4253 DEBUG_PARSE("piec");
a0d0e21e 4254
3dab1dad 4255 ret = regatom(pRExC_state, &flags,depth+1);
a0d0e21e
LW
4256 if (ret == NULL) {
4257 if (flags & TRYAGAIN)
4258 *flagp |= TRYAGAIN;
4259 return(NULL);
4260 }
4261
830247a4 4262 op = *RExC_parse;
a0d0e21e 4263
830247a4 4264 if (op == '{' && regcurly(RExC_parse)) {
3dab1dad 4265 const char *maxpos = NULL;
fac92740 4266 parse_start = RExC_parse; /* MJD */
830247a4 4267 next = RExC_parse + 1;
a0d0e21e
LW
4268 while (isDIGIT(*next) || *next == ',') {
4269 if (*next == ',') {
4270 if (maxpos)
4271 break;
4272 else
4273 maxpos = next;
a687059c 4274 }
a0d0e21e
LW
4275 next++;
4276 }
4277 if (*next == '}') { /* got one */
4278 if (!maxpos)
4279 maxpos = next;
830247a4
IZ
4280 RExC_parse++;
4281 min = atoi(RExC_parse);
a0d0e21e
LW
4282 if (*maxpos == ',')
4283 maxpos++;
4284 else
830247a4 4285 maxpos = RExC_parse;
a0d0e21e
LW
4286 max = atoi(maxpos);
4287 if (!max && *maxpos != '0')
c277df42
IZ
4288 max = REG_INFTY; /* meaning "infinity" */
4289 else if (max >= REG_INFTY)
8615cb43 4290 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
830247a4
IZ
4291 RExC_parse = next;
4292 nextchar(pRExC_state);
a0d0e21e
LW
4293
4294 do_curly:
4295 if ((flags&SIMPLE)) {
830247a4
IZ
4296 RExC_naughty += 2 + RExC_naughty / 2;
4297 reginsert(pRExC_state, CURLY, ret);
fac92740
MJD
4298 Set_Node_Offset(ret, parse_start+1); /* MJD */
4299 Set_Node_Cur_Length(ret);
a0d0e21e
LW
4300 }
4301 else {
3dab1dad 4302 regnode * const w = reg_node(pRExC_state, WHILEM);
2c2d71f5
JH
4303
4304 w->flags = 0;
3dab1dad 4305 REGTAIL(pRExC_state, ret, w);
830247a4
IZ
4306 if (!SIZE_ONLY && RExC_extralen) {
4307 reginsert(pRExC_state, LONGJMP,ret);
4308 reginsert(pRExC_state, NOTHING,ret);
c277df42
IZ
4309 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
4310 }
830247a4 4311 reginsert(pRExC_state, CURLYX,ret);
fac92740
MJD
4312 /* MJD hk */
4313 Set_Node_Offset(ret, parse_start+1);
2af232bd 4314 Set_Node_Length(ret,
fac92740 4315 op == '{' ? (RExC_parse - parse_start) : 1);
2af232bd 4316
830247a4 4317 if (!SIZE_ONLY && RExC_extralen)
c277df42 4318 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
3dab1dad 4319 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
c277df42 4320 if (SIZE_ONLY)
830247a4
IZ
4321 RExC_whilem_seen++, RExC_extralen += 3;
4322 RExC_naughty += 4 + RExC_naughty; /* compound interest */
a0d0e21e 4323 }
c277df42 4324 ret->flags = 0;
a0d0e21e
LW
4325
4326 if (min > 0)
821b33a5
IZ
4327 *flagp = WORST;
4328 if (max > 0)
4329 *flagp |= HASWIDTH;
a0d0e21e 4330 if (max && max < min)
8615cb43 4331 vFAIL("Can't do {n,m} with n > m");
c277df42 4332 if (!SIZE_ONLY) {
eb160463
GS
4333 ARG1_SET(ret, (U16)min);
4334 ARG2_SET(ret, (U16)max);
a687059c 4335 }
a687059c 4336
a0d0e21e 4337 goto nest_check;
a687059c 4338 }
a0d0e21e 4339 }
a687059c 4340
a0d0e21e
LW
4341 if (!ISMULT1(op)) {
4342 *flagp = flags;
a687059c 4343 return(ret);
a0d0e21e 4344 }
bb20fd44 4345
c277df42 4346#if 0 /* Now runtime fix should be reliable. */
b45f050a
JF
4347
4348 /* if this is reinstated, don't forget to put this back into perldiag:
4349
4350 =item Regexp *+ operand could be empty at {#} in regex m/%s/
4351
4352 (F) The part of the regexp subject to either the * or + quantifier
4353 could match an empty string. The {#} shows in the regular
4354 expression about where the problem was discovered.
4355
4356 */
4357
bb20fd44 4358 if (!(flags&HASWIDTH) && op != '?')
b45f050a 4359 vFAIL("Regexp *+ operand could be empty");
b81d288d 4360#endif
bb20fd44 4361
fac92740 4362 parse_start = RExC_parse;
830247a4 4363 nextchar(pRExC_state);
a0d0e21e 4364
821b33a5 4365 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
a0d0e21e
LW
4366
4367 if (op == '*' && (flags&SIMPLE)) {
830247a4 4368 reginsert(pRExC_state, STAR, ret);
c277df42 4369 ret->flags = 0;
830247a4 4370 RExC_naughty += 4;
a0d0e21e
LW
4371 }
4372 else if (op == '*') {
4373 min = 0;
4374 goto do_curly;
a0ed51b3
LW
4375 }
4376 else if (op == '+' && (flags&SIMPLE)) {
830247a4 4377 reginsert(pRExC_state, PLUS, ret);
c277df42 4378 ret->flags = 0;
830247a4 4379 RExC_naughty += 3;
a0d0e21e
LW
4380 }
4381 else if (op == '+') {
4382 min = 1;
4383 goto do_curly;
a0ed51b3
LW
4384 }
4385 else if (op == '?') {
a0d0e21e
LW
4386 min = 0; max = 1;
4387 goto do_curly;
4388 }
4389 nest_check:
041457d9 4390 if (!SIZE_ONLY && !(flags&HASWIDTH) && max > REG_INFTY/3 && ckWARN(WARN_REGEXP)) {
830247a4 4391 vWARN3(RExC_parse,
b45f050a 4392 "%.*s matches null string many times",
afd78fd5 4393 (int)(RExC_parse >= origparse ? RExC_parse - origparse : 0),
b45f050a 4394 origparse);
a0d0e21e
LW
4395 }
4396
830247a4
IZ
4397 if (*RExC_parse == '?') {
4398 nextchar(pRExC_state);
4399 reginsert(pRExC_state, MINMOD, ret);
3dab1dad 4400 REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
a0d0e21e 4401 }
830247a4
IZ
4402 if (ISMULT2(RExC_parse)) {
4403 RExC_parse++;
b45f050a
JF
4404 vFAIL("Nested quantifiers");
4405 }
a0d0e21e
LW
4406
4407 return(ret);
a687059c
LW
4408}
4409
4410/*
4411 - regatom - the lowest level
4412 *
4413 * Optimization: gobbles an entire sequence of ordinary characters so that
4414 * it can turn them into a single node, which is smaller to store and
4415 * faster to run. Backslashed characters are exceptions, each becoming a
4416 * separate node; the code is simpler that way and it's not worth fixing.
4417 *
7f6f358c
YO
4418 * [Yes, it is worth fixing, some scripts can run twice the speed.]
4419 * [It looks like its ok, as in S_study_chunk we merge adjacent EXACT nodes]
4420 */
76e3520e 4421STATIC regnode *
3dab1dad 4422S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
a687059c 4423{
97aff369 4424 dVAR;
cbbf8932 4425 register regnode *ret = NULL;
a0d0e21e 4426 I32 flags;
45948336 4427 char *parse_start = RExC_parse;
3dab1dad
YO
4428 GET_RE_DEBUG_FLAGS_DECL;
4429 DEBUG_PARSE("atom");
a0d0e21e
LW
4430 *flagp = WORST; /* Tentatively. */
4431
4432tryagain:
830247a4 4433 switch (*RExC_parse) {
a0d0e21e 4434 case '^':
830247a4
IZ
4435 RExC_seen_zerolen++;
4436 nextchar(pRExC_state);
e2509266 4437 if (RExC_flags & PMf_MULTILINE)
830247a4 4438 ret = reg_node(pRExC_state, MBOL);
e2509266 4439 else if (RExC_flags & PMf_SINGLELINE)
830247a4 4440 ret = reg_node(pRExC_state, SBOL);
a0d0e21e 4441 else
830247a4 4442 ret = reg_node(pRExC_state, BOL);
fac92740 4443 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
4444 break;
4445 case '$':
830247a4 4446 nextchar(pRExC_state);
b81d288d 4447 if (*RExC_parse)
830247a4 4448 RExC_seen_zerolen++;
e2509266 4449 if (RExC_flags & PMf_MULTILINE)
830247a4 4450 ret = reg_node(pRExC_state, MEOL);
e2509266 4451 else if (RExC_flags & PMf_SINGLELINE)
830247a4 4452 ret = reg_node(pRExC_state, SEOL);
a0d0e21e 4453 else
830247a4 4454 ret = reg_node(pRExC_state, EOL);
fac92740 4455 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
4456 break;
4457 case '.':
830247a4 4458 nextchar(pRExC_state);
e2509266 4459 if (RExC_flags & PMf_SINGLELINE)
ffc61ed2
JH
4460 ret = reg_node(pRExC_state, SANY);
4461 else
4462 ret = reg_node(pRExC_state, REG_ANY);
4463 *flagp |= HASWIDTH|SIMPLE;
830247a4 4464 RExC_naughty++;
fac92740 4465 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
4466 break;
4467 case '[':
b45f050a 4468 {
3dab1dad
YO
4469 char * const oregcomp_parse = ++RExC_parse;
4470 ret = regclass(pRExC_state,depth+1);
830247a4
IZ
4471 if (*RExC_parse != ']') {
4472 RExC_parse = oregcomp_parse;
b45f050a
JF
4473 vFAIL("Unmatched [");
4474 }
830247a4 4475 nextchar(pRExC_state);
a0d0e21e 4476 *flagp |= HASWIDTH|SIMPLE;
fac92740 4477 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
a0d0e21e 4478 break;
b45f050a 4479 }
a0d0e21e 4480 case '(':
830247a4 4481 nextchar(pRExC_state);
3dab1dad 4482 ret = reg(pRExC_state, 1, &flags,depth+1);
a0d0e21e 4483 if (ret == NULL) {
bf93d4cc 4484 if (flags & TRYAGAIN) {
830247a4 4485 if (RExC_parse == RExC_end) {
bf93d4cc
GS
4486 /* Make parent create an empty node if needed. */
4487 *flagp |= TRYAGAIN;
4488 return(NULL);
4489 }
a0d0e21e 4490 goto tryagain;
bf93d4cc 4491 }
a0d0e21e
LW
4492 return(NULL);
4493 }
c277df42 4494 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE);
a0d0e21e
LW
4495 break;
4496 case '|':
4497 case ')':
4498 if (flags & TRYAGAIN) {
4499 *flagp |= TRYAGAIN;
4500 return NULL;
4501 }
b45f050a 4502 vFAIL("Internal urp");
a0d0e21e
LW
4503 /* Supposed to be caught earlier. */
4504 break;
85afd4ae 4505 case '{':
830247a4
IZ
4506 if (!regcurly(RExC_parse)) {
4507 RExC_parse++;
85afd4ae
CS
4508 goto defchar;
4509 }
4510 /* FALL THROUGH */
a0d0e21e
LW
4511 case '?':
4512 case '+':
4513 case '*':
830247a4 4514 RExC_parse++;
b45f050a 4515 vFAIL("Quantifier follows nothing");
a0d0e21e
LW
4516 break;
4517 case '\\':
830247a4 4518 switch (*++RExC_parse) {
a0d0e21e 4519 case 'A':
830247a4
IZ
4520 RExC_seen_zerolen++;
4521 ret = reg_node(pRExC_state, SBOL);
a0d0e21e 4522 *flagp |= SIMPLE;
830247a4 4523 nextchar(pRExC_state);
fac92740 4524 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
4525 break;
4526 case 'G':
830247a4
IZ
4527 ret = reg_node(pRExC_state, GPOS);
4528 RExC_seen |= REG_SEEN_GPOS;
a0d0e21e 4529 *flagp |= SIMPLE;
830247a4 4530 nextchar(pRExC_state);
fac92740 4531 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
4532 break;
4533 case 'Z':
830247a4 4534 ret = reg_node(pRExC_state, SEOL);
a0d0e21e 4535 *flagp |= SIMPLE;
a1917ab9 4536 RExC_seen_zerolen++; /* Do not optimize RE away */
830247a4 4537 nextchar(pRExC_state);
a0d0e21e 4538 break;
b85d18e9 4539 case 'z':
830247a4 4540 ret = reg_node(pRExC_state, EOS);
b85d18e9 4541 *flagp |= SIMPLE;
830247a4
IZ
4542 RExC_seen_zerolen++; /* Do not optimize RE away */
4543 nextchar(pRExC_state);
fac92740 4544 Set_Node_Length(ret, 2); /* MJD */
b85d18e9 4545 break;
4a2d328f 4546 case 'C':
f33976b4
DB
4547 ret = reg_node(pRExC_state, CANY);
4548 RExC_seen |= REG_SEEN_CANY;
a0ed51b3 4549 *flagp |= HASWIDTH|SIMPLE;
830247a4 4550 nextchar(pRExC_state);
fac92740 4551 Set_Node_Length(ret, 2); /* MJD */
a0ed51b3
LW
4552 break;
4553 case 'X':
830247a4 4554 ret = reg_node(pRExC_state, CLUMP);
a0ed51b3 4555 *flagp |= HASWIDTH;
830247a4 4556 nextchar(pRExC_state);
fac92740 4557 Set_Node_Length(ret, 2); /* MJD */
a0ed51b3 4558 break;
a0d0e21e 4559 case 'w':
eb160463 4560 ret = reg_node(pRExC_state, (U8)(LOC ? ALNUML : ALNUM));
a0d0e21e 4561 *flagp |= HASWIDTH|SIMPLE;
830247a4 4562 nextchar(pRExC_state);
fac92740 4563 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
4564 break;
4565 case 'W':
eb160463 4566 ret = reg_node(pRExC_state, (U8)(LOC ? NALNUML : NALNUM));
a0d0e21e 4567 *flagp |= HASWIDTH|SIMPLE;
830247a4 4568 nextchar(pRExC_state);
fac92740 4569 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
4570 break;
4571 case 'b':
830247a4
IZ
4572 RExC_seen_zerolen++;
4573 RExC_seen |= REG_SEEN_LOOKBEHIND;
eb160463 4574 ret = reg_node(pRExC_state, (U8)(LOC ? BOUNDL : BOUND));
a0d0e21e 4575 *flagp |= SIMPLE;
830247a4 4576 nextchar(pRExC_state);
fac92740 4577 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
4578 break;
4579 case 'B':
830247a4
IZ
4580 RExC_seen_zerolen++;
4581 RExC_seen |= REG_SEEN_LOOKBEHIND;
eb160463 4582 ret = reg_node(pRExC_state, (U8)(LOC ? NBOUNDL : NBOUND));
a0d0e21e 4583 *flagp |= SIMPLE;
830247a4 4584 nextchar(pRExC_state);
fac92740 4585 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
4586 break;
4587 case 's':
eb160463 4588 ret = reg_node(pRExC_state, (U8)(LOC ? SPACEL : SPACE));
a0d0e21e 4589 *flagp |= HASWIDTH|SIMPLE;
830247a4 4590 nextchar(pRExC_state);
fac92740 4591 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
4592 break;
4593 case 'S':
eb160463 4594 ret = reg_node(pRExC_state, (U8)(LOC ? NSPACEL : NSPACE));
a0d0e21e 4595 *flagp |= HASWIDTH|SIMPLE;
830247a4 4596 nextchar(pRExC_state);
fac92740 4597 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
4598 break;
4599 case 'd':
ffc61ed2 4600 ret = reg_node(pRExC_state, DIGIT);
a0d0e21e 4601 *flagp |= HASWIDTH|SIMPLE;
830247a4 4602 nextchar(pRExC_state);
fac92740 4603 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
4604 break;
4605 case 'D':
ffc61ed2 4606 ret = reg_node(pRExC_state, NDIGIT);
a0d0e21e 4607 *flagp |= HASWIDTH|SIMPLE;
830247a4 4608 nextchar(pRExC_state);
fac92740 4609 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e 4610 break;
a14b48bc
LW
4611 case 'p':
4612 case 'P':
3568d838 4613 {
3dab1dad 4614 char* const oldregxend = RExC_end;
ccb2c380 4615 char* parse_start = RExC_parse - 2;
a14b48bc 4616
830247a4 4617 if (RExC_parse[1] == '{') {
3568d838 4618 /* a lovely hack--pretend we saw [\pX] instead */
830247a4
IZ
4619 RExC_end = strchr(RExC_parse, '}');
4620 if (!RExC_end) {
3dab1dad 4621 const U8 c = (U8)*RExC_parse;
830247a4
IZ
4622 RExC_parse += 2;
4623 RExC_end = oldregxend;
0da60cf5 4624 vFAIL2("Missing right brace on \\%c{}", c);
b45f050a 4625 }
830247a4 4626 RExC_end++;
a14b48bc 4627 }
af6f566e 4628 else {
830247a4 4629 RExC_end = RExC_parse + 2;
af6f566e
HS
4630 if (RExC_end > oldregxend)
4631 RExC_end = oldregxend;
4632 }
830247a4 4633 RExC_parse--;
a14b48bc 4634
3dab1dad 4635 ret = regclass(pRExC_state,depth+1);
a14b48bc 4636
830247a4
IZ
4637 RExC_end = oldregxend;
4638 RExC_parse--;
ccb2c380
MP
4639
4640 Set_Node_Offset(ret, parse_start + 2);
4641 Set_Node_Cur_Length(ret);
830247a4 4642 nextchar(pRExC_state);
a14b48bc
LW
4643 *flagp |= HASWIDTH|SIMPLE;
4644 }
4645 break;
a0d0e21e
LW
4646 case 'n':
4647 case 'r':
4648 case 't':
4649 case 'f':
4650 case 'e':
4651 case 'a':
4652 case 'x':
4653 case 'c':
4654 case '0':
4655 goto defchar;
4656 case '1': case '2': case '3': case '4':
4657 case '5': case '6': case '7': case '8': case '9':
4658 {
1df70142 4659 const I32 num = atoi(RExC_parse);
a0d0e21e 4660
830247a4 4661 if (num > 9 && num >= RExC_npar)
a0d0e21e
LW
4662 goto defchar;
4663 else {
3dab1dad 4664 char * const parse_start = RExC_parse - 1; /* MJD */
830247a4
IZ
4665 while (isDIGIT(*RExC_parse))
4666 RExC_parse++;
b45f050a 4667
eb160463 4668 if (!SIZE_ONLY && num > (I32)RExC_rx->nparens)
9baa0206 4669 vFAIL("Reference to nonexistent group");
830247a4 4670 RExC_sawback = 1;
eb160463
GS
4671 ret = reganode(pRExC_state,
4672 (U8)(FOLD ? (LOC ? REFFL : REFF) : REF),
4673 num);
a0d0e21e 4674 *flagp |= HASWIDTH;
2af232bd 4675
fac92740 4676 /* override incorrect value set in reganode MJD */
2af232bd 4677 Set_Node_Offset(ret, parse_start+1);
fac92740 4678 Set_Node_Cur_Length(ret); /* MJD */
830247a4
IZ
4679 RExC_parse--;
4680 nextchar(pRExC_state);
a0d0e21e
LW
4681 }
4682 }
4683 break;
4684 case '\0':
830247a4 4685 if (RExC_parse >= RExC_end)
b45f050a 4686 FAIL("Trailing \\");
a0d0e21e
LW
4687 /* FALL THROUGH */
4688 default:
a0288114 4689 /* Do not generate "unrecognized" warnings here, we fall
c9f97d15 4690 back into the quick-grab loop below */
45948336 4691 parse_start--;
a0d0e21e
LW
4692 goto defchar;
4693 }
4694 break;
4633a7c4
LW
4695
4696 case '#':
e2509266 4697 if (RExC_flags & PMf_EXTENDED) {
3dab1dad
YO
4698 while (RExC_parse < RExC_end && *RExC_parse != '\n')
4699 RExC_parse++;
830247a4 4700 if (RExC_parse < RExC_end)
4633a7c4
LW
4701 goto tryagain;
4702 }
4703 /* FALL THROUGH */
4704
a0d0e21e 4705 default: {
ba210ebe 4706 register STRLEN len;
58ae7d3f 4707 register UV ender;
a0d0e21e 4708 register char *p;
3dab1dad 4709 char *s;
80aecb99 4710 STRLEN foldlen;
89ebb4a3 4711 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
f06dbbb7
JH
4712
4713 parse_start = RExC_parse - 1;
a0d0e21e 4714
830247a4 4715 RExC_parse++;
a0d0e21e
LW
4716
4717 defchar:
58ae7d3f 4718 ender = 0;
eb160463
GS
4719 ret = reg_node(pRExC_state,
4720 (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
cd439c50 4721 s = STRING(ret);
830247a4
IZ
4722 for (len = 0, p = RExC_parse - 1;
4723 len < 127 && p < RExC_end;
a0d0e21e
LW
4724 len++)
4725 {
3dab1dad 4726 char * const oldp = p;
5b5a24f7 4727
e2509266 4728 if (RExC_flags & PMf_EXTENDED)
830247a4 4729 p = regwhite(p, RExC_end);
a0d0e21e
LW
4730 switch (*p) {
4731 case '^':
4732 case '$':
4733 case '.':
4734 case '[':
4735 case '(':
4736 case ')':
4737 case '|':
4738 goto loopdone;
4739 case '\\':
4740 switch (*++p) {
4741 case 'A':
1ed8eac0
JF
4742 case 'C':
4743 case 'X':
a0d0e21e
LW
4744 case 'G':
4745 case 'Z':
b85d18e9 4746 case 'z':
a0d0e21e
LW
4747 case 'w':
4748 case 'W':
4749 case 'b':
4750 case 'B':
4751 case 's':
4752 case 'S':
4753 case 'd':
4754 case 'D':
a14b48bc
LW
4755 case 'p':
4756 case 'P':
a0d0e21e
LW
4757 --p;
4758 goto loopdone;
4759 case 'n':
4760 ender = '\n';
4761 p++;
a687059c 4762 break;
a0d0e21e
LW
4763 case 'r':
4764 ender = '\r';
4765 p++;
a687059c 4766 break;
a0d0e21e
LW
4767 case 't':
4768 ender = '\t';
4769 p++;
a687059c 4770 break;
a0d0e21e
LW
4771 case 'f':
4772 ender = '\f';
4773 p++;
a687059c 4774 break;
a0d0e21e 4775 case 'e':
c7f1f016 4776 ender = ASCII_TO_NATIVE('\033');
a0d0e21e 4777 p++;
a687059c 4778 break;
a0d0e21e 4779 case 'a':
c7f1f016 4780 ender = ASCII_TO_NATIVE('\007');
a0d0e21e 4781 p++;
a687059c 4782 break;
a0d0e21e 4783 case 'x':
a0ed51b3 4784 if (*++p == '{') {
1df70142 4785 char* const e = strchr(p, '}');
b81d288d 4786
b45f050a 4787 if (!e) {
830247a4 4788 RExC_parse = p + 1;
b45f050a
JF
4789 vFAIL("Missing right brace on \\x{}");
4790 }
de5f0749 4791 else {
a4c04bdc
NC
4792 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
4793 | PERL_SCAN_DISALLOW_PREFIX;
1df70142 4794 STRLEN numlen = e - p - 1;
53305cf1 4795 ender = grok_hex(p + 1, &numlen, &flags, NULL);
aaa80028
JH
4796 if (ender > 0xff)
4797 RExC_utf8 = 1;
a0ed51b3
LW
4798 p = e + 1;
4799 }
a0ed51b3
LW
4800 }
4801 else {
a4c04bdc 4802 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
1df70142 4803 STRLEN numlen = 2;
53305cf1 4804 ender = grok_hex(p, &numlen, &flags, NULL);
a0ed51b3
LW
4805 p += numlen;
4806 }
a687059c 4807 break;
a0d0e21e
LW
4808 case 'c':
4809 p++;
bbce6d69 4810 ender = UCHARAT(p++);
4811 ender = toCTRL(ender);
a687059c 4812 break;
a0d0e21e
LW
4813 case '0': case '1': case '2': case '3':case '4':
4814 case '5': case '6': case '7': case '8':case '9':
4815 if (*p == '0' ||
830247a4 4816 (isDIGIT(p[1]) && atoi(p) >= RExC_npar) ) {
53305cf1 4817 I32 flags = 0;
1df70142 4818 STRLEN numlen = 3;
53305cf1 4819 ender = grok_oct(p, &numlen, &flags, NULL);
a0d0e21e
LW
4820 p += numlen;
4821 }
4822 else {
4823 --p;
4824 goto loopdone;
a687059c
LW
4825 }
4826 break;
a0d0e21e 4827 case '\0':
830247a4 4828 if (p >= RExC_end)
b45f050a 4829 FAIL("Trailing \\");
a687059c 4830 /* FALL THROUGH */
a0d0e21e 4831 default:
041457d9 4832 if (!SIZE_ONLY&& isALPHA(*p) && ckWARN(WARN_REGEXP))
4193bef7 4833 vWARN2(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
a0ed51b3 4834 goto normal_default;
a0d0e21e
LW
4835 }
4836 break;
a687059c 4837 default:
a0ed51b3 4838 normal_default:
fd400ab9 4839 if (UTF8_IS_START(*p) && UTF) {
1df70142 4840 STRLEN numlen;
5e12f4fb 4841 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
9f7f3913 4842 &numlen, UTF8_ALLOW_DEFAULT);
a0ed51b3
LW
4843 p += numlen;
4844 }
4845 else
4846 ender = *p++;
a0d0e21e 4847 break;
a687059c 4848 }
e2509266 4849 if (RExC_flags & PMf_EXTENDED)
830247a4 4850 p = regwhite(p, RExC_end);
60a8b682
JH
4851 if (UTF && FOLD) {
4852 /* Prime the casefolded buffer. */
ac7e0132 4853 ender = toFOLD_uni(ender, tmpbuf, &foldlen);
60a8b682 4854 }
a0d0e21e
LW
4855 if (ISMULT2(p)) { /* Back off on ?+*. */
4856 if (len)
4857 p = oldp;
16ea2a2e 4858 else if (UTF) {
80aecb99 4859 if (FOLD) {
60a8b682 4860 /* Emit all the Unicode characters. */
1df70142 4861 STRLEN numlen;
80aecb99
JH
4862 for (foldbuf = tmpbuf;
4863 foldlen;
4864 foldlen -= numlen) {
4865 ender = utf8_to_uvchr(foldbuf, &numlen);
9dc45d57 4866 if (numlen > 0) {
71207a34 4867 const STRLEN unilen = reguni(pRExC_state, ender, s);
0ebc6274
JH
4868 s += unilen;
4869 len += unilen;
4870 /* In EBCDIC the numlen
4871 * and unilen can differ. */
9dc45d57 4872 foldbuf += numlen;
47654450
JH
4873 if (numlen >= foldlen)
4874 break;
9dc45d57
JH
4875 }
4876 else
4877 break; /* "Can't happen." */
80aecb99
JH
4878 }
4879 }
4880 else {
71207a34 4881 const STRLEN unilen = reguni(pRExC_state, ender, s);
9ede7db1 4882 if (unilen > 0) {
0ebc6274
JH
4883 s += unilen;
4884 len += unilen;
9dc45d57 4885 }
80aecb99 4886 }
a0ed51b3 4887 }
a0d0e21e
LW
4888 else {
4889 len++;
eb160463 4890 REGC((char)ender, s++);
a0d0e21e
LW
4891 }
4892 break;
a687059c 4893 }
16ea2a2e 4894 if (UTF) {
80aecb99 4895 if (FOLD) {
60a8b682 4896 /* Emit all the Unicode characters. */
1df70142 4897 STRLEN numlen;
80aecb99
JH
4898 for (foldbuf = tmpbuf;
4899 foldlen;
4900 foldlen -= numlen) {
4901 ender = utf8_to_uvchr(foldbuf, &numlen);
9dc45d57 4902 if (numlen > 0) {
71207a34 4903 const STRLEN unilen = reguni(pRExC_state, ender, s);
0ebc6274
JH
4904 len += unilen;
4905 s += unilen;
4906 /* In EBCDIC the numlen
4907 * and unilen can differ. */
9dc45d57 4908 foldbuf += numlen;
47654450
JH
4909 if (numlen >= foldlen)
4910 break;
9dc45d57
JH
4911 }
4912 else
4913 break;
80aecb99
JH
4914 }
4915 }
4916 else {
71207a34 4917 const STRLEN unilen = reguni(pRExC_state, ender, s);
9ede7db1 4918 if (unilen > 0) {
0ebc6274
JH
4919 s += unilen;
4920 len += unilen;
9dc45d57 4921 }
80aecb99
JH
4922 }
4923 len--;
a0ed51b3
LW
4924 }
4925 else
eb160463 4926 REGC((char)ender, s++);
a0d0e21e
LW
4927 }
4928 loopdone:
830247a4 4929 RExC_parse = p - 1;
fac92740 4930 Set_Node_Cur_Length(ret); /* MJD */
830247a4 4931 nextchar(pRExC_state);
793db0cb
JH
4932 {
4933 /* len is STRLEN which is unsigned, need to copy to signed */
4934 IV iv = len;
4935 if (iv < 0)
4936 vFAIL("Internal disaster");
4937 }
a0d0e21e
LW
4938 if (len > 0)
4939 *flagp |= HASWIDTH;
090f7165 4940 if (len == 1 && UNI_IS_INVARIANT(ender))
a0d0e21e 4941 *flagp |= SIMPLE;
3dab1dad 4942
cd439c50 4943 if (SIZE_ONLY)
830247a4 4944 RExC_size += STR_SZ(len);
3dab1dad
YO
4945 else {
4946 STR_LEN(ret) = len;
830247a4 4947 RExC_emit += STR_SZ(len);
07be1b83 4948 }
3dab1dad 4949 }
a0d0e21e
LW
4950 break;
4951 }
a687059c 4952
60a8b682
JH
4953 /* If the encoding pragma is in effect recode the text of
4954 * any EXACT-kind nodes. */
3dab1dad
YO
4955 if (PL_encoding && PL_regkind[OP(ret)] == EXACT) {
4956 const STRLEN oldlen = STR_LEN(ret);
4957 SV * const sv = sv_2mortal(newSVpvn(STRING(ret), oldlen));
d0063567
DK
4958
4959 if (RExC_utf8)
4960 SvUTF8_on(sv);
4961 if (sv_utf8_downgrade(sv, TRUE)) {
1df70142
AL
4962 const char * const s = sv_recode_to_utf8(sv, PL_encoding);
4963 const STRLEN newlen = SvCUR(sv);
d0063567
DK
4964
4965 if (SvUTF8(sv))
4966 RExC_utf8 = 1;
4967 if (!SIZE_ONLY) {
a3621e74
YO
4968 GET_RE_DEBUG_FLAGS_DECL;
4969 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "recode %*s to %*s\n",
d0063567
DK
4970 (int)oldlen, STRING(ret),
4971 (int)newlen, s));
4972 Copy(s, STRING(ret), newlen, char);
4973 STR_LEN(ret) += newlen - oldlen;
4974 RExC_emit += STR_SZ(newlen) - STR_SZ(oldlen);
4975 } else
4976 RExC_size += STR_SZ(newlen) - STR_SZ(oldlen);
4977 }
a72c7584
JH
4978 }
4979
a0d0e21e 4980 return(ret);
a687059c
LW
4981}
4982
873ef191 4983STATIC char *
5f66b61c 4984S_regwhite(char *p, const char *e)
5b5a24f7
CS
4985{
4986 while (p < e) {
4987 if (isSPACE(*p))
4988 ++p;
4989 else if (*p == '#') {
4990 do {
4991 p++;
4992 } while (p < e && *p != '\n');
4993 }
4994 else
4995 break;
4996 }
4997 return p;
4998}
4999
b8c5462f
JH
5000/* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
5001 Character classes ([:foo:]) can also be negated ([:^foo:]).
5002 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
5003 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
beeb77fc 5004 but trigger failures because they are currently unimplemented. */
9a86a77b
JH
5005
5006#define POSIXCC_DONE(c) ((c) == ':')
5007#define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
5008#define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
5009
b8c5462f 5010STATIC I32
830247a4 5011S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
620e46c5 5012{
97aff369 5013 dVAR;
936ed897 5014 I32 namedclass = OOB_NAMEDCLASS;
620e46c5 5015
830247a4 5016 if (value == '[' && RExC_parse + 1 < RExC_end &&
620e46c5 5017 /* I smell either [: or [= or [. -- POSIX has been here, right? */
9a86a77b 5018 POSIXCC(UCHARAT(RExC_parse))) {
1df70142 5019 const char c = UCHARAT(RExC_parse);
097eb12c 5020 char* const s = RExC_parse++;
b81d288d 5021
9a86a77b 5022 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
830247a4
IZ
5023 RExC_parse++;
5024 if (RExC_parse == RExC_end)
620e46c5 5025 /* Grandfather lone [:, [=, [. */
830247a4 5026 RExC_parse = s;
620e46c5 5027 else {
3dab1dad 5028 const char* const t = RExC_parse++; /* skip over the c */
80916619
NC
5029 assert(*t == c);
5030
9a86a77b 5031 if (UCHARAT(RExC_parse) == ']') {
3dab1dad 5032 const char *posixcc = s + 1;
830247a4 5033 RExC_parse++; /* skip over the ending ] */
3dab1dad 5034
b8c5462f 5035 if (*s == ':') {
1df70142
AL
5036 const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
5037 const I32 skip = t - posixcc;
80916619
NC
5038
5039 /* Initially switch on the length of the name. */
5040 switch (skip) {
5041 case 4:
3dab1dad
YO
5042 if (memEQ(posixcc, "word", 4)) /* this is not POSIX, this is the Perl \w */
5043 namedclass = complement ? ANYOF_NALNUM : ANYOF_ALNUM;
cc4319de 5044 break;
80916619
NC
5045 case 5:
5046 /* Names all of length 5. */
5047 /* alnum alpha ascii blank cntrl digit graph lower
5048 print punct space upper */
5049 /* Offset 4 gives the best switch position. */
5050 switch (posixcc[4]) {
5051 case 'a':
3dab1dad
YO
5052 if (memEQ(posixcc, "alph", 4)) /* alpha */
5053 namedclass = complement ? ANYOF_NALPHA : ANYOF_ALPHA;
80916619
NC
5054 break;
5055 case 'e':
3dab1dad
YO
5056 if (memEQ(posixcc, "spac", 4)) /* space */
5057 namedclass = complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
80916619
NC
5058 break;
5059 case 'h':
3dab1dad
YO
5060 if (memEQ(posixcc, "grap", 4)) /* graph */
5061 namedclass = complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
80916619
NC
5062 break;
5063 case 'i':
3dab1dad
YO
5064 if (memEQ(posixcc, "asci", 4)) /* ascii */
5065 namedclass = complement ? ANYOF_NASCII : ANYOF_ASCII;
80916619
NC
5066 break;
5067 case 'k':
3dab1dad
YO
5068 if (memEQ(posixcc, "blan", 4)) /* blank */
5069 namedclass = complement ? ANYOF_NBLANK : ANYOF_BLANK;
80916619
NC
5070 break;
5071 case 'l':
3dab1dad
YO
5072 if (memEQ(posixcc, "cntr", 4)) /* cntrl */
5073 namedclass = complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
80916619
NC
5074 break;
5075 case 'm':
3dab1dad
YO
5076 if (memEQ(posixcc, "alnu", 4)) /* alnum */
5077 namedclass = complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
80916619
NC
5078 break;
5079 case 'r':
3dab1dad
YO
5080 if (memEQ(posixcc, "lowe", 4)) /* lower */
5081 namedclass = complement ? ANYOF_NLOWER : ANYOF_LOWER;
5082 else if (memEQ(posixcc, "uppe", 4)) /* upper */
5083 namedclass = complement ? ANYOF_NUPPER : ANYOF_UPPER;
80916619
NC
5084 break;
5085 case 't':
3dab1dad
YO
5086 if (memEQ(posixcc, "digi", 4)) /* digit */
5087 namedclass = complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
5088 else if (memEQ(posixcc, "prin", 4)) /* print */
5089 namedclass = complement ? ANYOF_NPRINT : ANYOF_PRINT;
5090 else if (memEQ(posixcc, "punc", 4)) /* punct */
5091 namedclass = complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
80916619 5092 break;
b8c5462f
JH
5093 }
5094 break;
80916619 5095 case 6:
3dab1dad
YO
5096 if (memEQ(posixcc, "xdigit", 6))
5097 namedclass = complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
b8c5462f
JH
5098 break;
5099 }
80916619
NC
5100
5101 if (namedclass == OOB_NAMEDCLASS)
b45f050a
JF
5102 Simple_vFAIL3("POSIX class [:%.*s:] unknown",
5103 t - s - 1, s + 1);
80916619
NC
5104 assert (posixcc[skip] == ':');
5105 assert (posixcc[skip+1] == ']');
b45f050a 5106 } else if (!SIZE_ONLY) {
b8c5462f 5107 /* [[=foo=]] and [[.foo.]] are still future. */
b45f050a 5108
830247a4 5109 /* adjust RExC_parse so the warning shows after
b45f050a 5110 the class closes */
9a86a77b 5111 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
830247a4 5112 RExC_parse++;
b45f050a
JF
5113 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
5114 }
b8c5462f
JH
5115 } else {
5116 /* Maternal grandfather:
5117 * "[:" ending in ":" but not in ":]" */
830247a4 5118 RExC_parse = s;
767d463e 5119 }
620e46c5
JH
5120 }
5121 }
5122
b8c5462f
JH
5123 return namedclass;
5124}
5125
5126STATIC void
830247a4 5127S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
b8c5462f 5128{
97aff369 5129 dVAR;
3dab1dad 5130 if (POSIXCC(UCHARAT(RExC_parse))) {
1df70142
AL
5131 const char *s = RExC_parse;
5132 const char c = *s++;
b8c5462f 5133
3dab1dad 5134 while (isALNUM(*s))
b8c5462f
JH
5135 s++;
5136 if (*s && c == *s && s[1] == ']') {
cd84f5b2
RGS
5137 if (ckWARN(WARN_REGEXP))
5138 vWARN3(s+2,
5139 "POSIX syntax [%c %c] belongs inside character classes",
5140 c, c);
b45f050a
JF
5141
5142 /* [[=foo=]] and [[.foo.]] are still future. */
9a86a77b 5143 if (POSIXCC_NOTYET(c)) {
830247a4 5144 /* adjust RExC_parse so the error shows after
b45f050a 5145 the class closes */
9a86a77b 5146 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
3dab1dad 5147 NOOP;
b45f050a
JF
5148 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
5149 }
b8c5462f
JH
5150 }
5151 }
620e46c5
JH
5152}
5153
7f6f358c
YO
5154
5155/*
5156 parse a class specification and produce either an ANYOF node that
5157 matches the pattern. If the pattern matches a single char only and
5158 that char is < 256 then we produce an EXACT node instead.
5159*/
76e3520e 5160STATIC regnode *
3dab1dad 5161S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth)
a687059c 5162{
97aff369 5163 dVAR;
07be1b83 5164 register UV value;
9a86a77b 5165 register UV nextvalue;
3568d838 5166 register IV prevvalue = OOB_UNICODE;
ffc61ed2 5167 register IV range = 0;
c277df42 5168 register regnode *ret;
ba210ebe 5169 STRLEN numlen;
ffc61ed2 5170 IV namedclass;
cbbf8932 5171 char *rangebegin = NULL;
936ed897 5172 bool need_class = 0;
c445ea15 5173 SV *listsv = NULL;
ffc61ed2 5174 UV n;
9e55ce06 5175 bool optimize_invert = TRUE;
cbbf8932 5176 AV* unicode_alternate = NULL;
1b2d223b
JH
5177#ifdef EBCDIC
5178 UV literal_endpoint = 0;
5179#endif
f9049ba1
SP
5180#ifndef DEBUGGING
5181 PERL_UNUSED_ARG(depth);
5182#endif
7f6f358c 5183 UV stored = 0; /* number of chars stored in the class */
ffc61ed2 5184
3dab1dad 5185 regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
7f6f358c 5186 case we need to change the emitted regop to an EXACT. */
07be1b83 5187 const char * orig_parse = RExC_parse;
3dab1dad
YO
5188 GET_RE_DEBUG_FLAGS_DECL;
5189 DEBUG_PARSE("clas");
7f6f358c
YO
5190
5191 /* Assume we are going to generate an ANYOF node. */
ffc61ed2
JH
5192 ret = reganode(pRExC_state, ANYOF, 0);
5193
5194 if (!SIZE_ONLY)
5195 ANYOF_FLAGS(ret) = 0;
5196
9a86a77b 5197 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
ffc61ed2
JH
5198 RExC_naughty++;
5199 RExC_parse++;
5200 if (!SIZE_ONLY)
5201 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
5202 }
a0d0e21e 5203
73060fc4 5204 if (SIZE_ONLY) {
830247a4 5205 RExC_size += ANYOF_SKIP;
73060fc4
JH
5206 listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
5207 }
936ed897 5208 else {
830247a4 5209 RExC_emit += ANYOF_SKIP;
936ed897
IZ
5210 if (FOLD)
5211 ANYOF_FLAGS(ret) |= ANYOF_FOLD;
5212 if (LOC)
5213 ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
ffc61ed2 5214 ANYOF_BITMAP_ZERO(ret);
396482e1 5215 listsv = newSVpvs("# comment\n");
a0d0e21e 5216 }
b8c5462f 5217
9a86a77b
JH
5218 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
5219
b938889d 5220 if (!SIZE_ONLY && POSIXCC(nextvalue))
830247a4 5221 checkposixcc(pRExC_state);
b8c5462f 5222
f064b6ad
HS
5223 /* allow 1st char to be ] (allowing it to be - is dealt with later) */
5224 if (UCHARAT(RExC_parse) == ']')
5225 goto charclassloop;
ffc61ed2 5226
9a86a77b 5227 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
ffc61ed2
JH
5228
5229 charclassloop:
5230
5231 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
5232
73b437c8 5233 if (!range)
830247a4 5234 rangebegin = RExC_parse;
ffc61ed2 5235 if (UTF) {
5e12f4fb 5236 value = utf8n_to_uvchr((U8*)RExC_parse,
3568d838 5237 RExC_end - RExC_parse,
9f7f3913 5238 &numlen, UTF8_ALLOW_DEFAULT);
ffc61ed2
JH
5239 RExC_parse += numlen;
5240 }
5241 else
5242 value = UCHARAT(RExC_parse++);
7f6f358c 5243
9a86a77b
JH
5244 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
5245 if (value == '[' && POSIXCC(nextvalue))
830247a4 5246 namedclass = regpposixcc(pRExC_state, value);
620e46c5 5247 else if (value == '\\') {
ffc61ed2 5248 if (UTF) {
5e12f4fb 5249 value = utf8n_to_uvchr((U8*)RExC_parse,
ffc61ed2 5250 RExC_end - RExC_parse,
9f7f3913 5251 &numlen, UTF8_ALLOW_DEFAULT);
ffc61ed2
JH
5252 RExC_parse += numlen;
5253 }
5254 else
5255 value = UCHARAT(RExC_parse++);
470c3474 5256 /* Some compilers cannot handle switching on 64-bit integer
ffc61ed2 5257 * values, therefore value cannot be an UV. Yes, this will
e2962f66
JH
5258 * be a problem later if we want switch on Unicode.
5259 * A similar issue a little bit later when switching on
5260 * namedclass. --jhi */
ffc61ed2 5261 switch ((I32)value) {
b8c5462f
JH
5262 case 'w': namedclass = ANYOF_ALNUM; break;
5263 case 'W': namedclass = ANYOF_NALNUM; break;
5264 case 's': namedclass = ANYOF_SPACE; break;
5265 case 'S': namedclass = ANYOF_NSPACE; break;
5266 case 'd': namedclass = ANYOF_DIGIT; break;
5267 case 'D': namedclass = ANYOF_NDIGIT; break;
ffc61ed2
JH
5268 case 'p':
5269 case 'P':
3dab1dad
YO
5270 {
5271 char *e;
af6f566e 5272 if (RExC_parse >= RExC_end)
2a4859cd 5273 vFAIL2("Empty \\%c{}", (U8)value);
ffc61ed2 5274 if (*RExC_parse == '{') {
1df70142 5275 const U8 c = (U8)value;
ffc61ed2
JH
5276 e = strchr(RExC_parse++, '}');
5277 if (!e)
0da60cf5 5278 vFAIL2("Missing right brace on \\%c{}", c);
ab13f0c7
JH
5279 while (isSPACE(UCHARAT(RExC_parse)))
5280 RExC_parse++;
5281 if (e == RExC_parse)
0da60cf5 5282 vFAIL2("Empty \\%c{}", c);
ffc61ed2 5283 n = e - RExC_parse;
ab13f0c7
JH
5284 while (isSPACE(UCHARAT(RExC_parse + n - 1)))
5285 n--;
ffc61ed2
JH
5286 }
5287 else {
5288 e = RExC_parse;
5289 n = 1;
5290 }
5291 if (!SIZE_ONLY) {
ab13f0c7
JH
5292 if (UCHARAT(RExC_parse) == '^') {
5293 RExC_parse++;
5294 n--;
5295 value = value == 'p' ? 'P' : 'p'; /* toggle */
5296 while (isSPACE(UCHARAT(RExC_parse))) {
5297 RExC_parse++;
5298 n--;
5299 }
5300 }
097eb12c
AL
5301 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%.*s\n",
5302 (value=='p' ? '+' : '!'), (int)n, RExC_parse);
ffc61ed2
JH
5303 }
5304 RExC_parse = e + 1;
5305 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
f81125e2 5306 namedclass = ANYOF_MAX; /* no official name, but it's named */
3dab1dad 5307 }
f81125e2 5308 break;
b8c5462f
JH
5309 case 'n': value = '\n'; break;
5310 case 'r': value = '\r'; break;
5311 case 't': value = '\t'; break;
5312 case 'f': value = '\f'; break;
5313 case 'b': value = '\b'; break;
c7f1f016
NIS
5314 case 'e': value = ASCII_TO_NATIVE('\033');break;
5315 case 'a': value = ASCII_TO_NATIVE('\007');break;
b8c5462f 5316 case 'x':
ffc61ed2 5317 if (*RExC_parse == '{') {
a4c04bdc
NC
5318 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
5319 | PERL_SCAN_DISALLOW_PREFIX;
3dab1dad 5320 char * const e = strchr(RExC_parse++, '}');
b81d288d 5321 if (!e)
ffc61ed2 5322 vFAIL("Missing right brace on \\x{}");
53305cf1
NC
5323
5324 numlen = e - RExC_parse;
5325 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
ffc61ed2
JH
5326 RExC_parse = e + 1;
5327 }
5328 else {
a4c04bdc 5329 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
53305cf1
NC
5330 numlen = 2;
5331 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
ffc61ed2
JH
5332 RExC_parse += numlen;
5333 }
b8c5462f
JH
5334 break;
5335 case 'c':
830247a4 5336 value = UCHARAT(RExC_parse++);
b8c5462f
JH
5337 value = toCTRL(value);
5338 break;
5339 case '0': case '1': case '2': case '3': case '4':
5340 case '5': case '6': case '7': case '8': case '9':
53305cf1
NC
5341 {
5342 I32 flags = 0;
5343 numlen = 3;
5344 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
830247a4 5345 RExC_parse += numlen;
b8c5462f 5346 break;
53305cf1 5347 }
1028017a 5348 default:
041457d9 5349 if (!SIZE_ONLY && isALPHA(value) && ckWARN(WARN_REGEXP))
ffc61ed2
JH
5350 vWARN2(RExC_parse,
5351 "Unrecognized escape \\%c in character class passed through",
5352 (int)value);
1028017a 5353 break;
b8c5462f 5354 }
ffc61ed2 5355 } /* end of \blah */
1b2d223b
JH
5356#ifdef EBCDIC
5357 else
5358 literal_endpoint++;
5359#endif
ffc61ed2
JH
5360
5361 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
5362
5363 if (!SIZE_ONLY && !need_class)
936ed897 5364 ANYOF_CLASS_ZERO(ret);
ffc61ed2 5365
936ed897 5366 need_class = 1;
ffc61ed2
JH
5367
5368 /* a bad range like a-\d, a-[:digit:] ? */
5369 if (range) {
73b437c8 5370 if (!SIZE_ONLY) {
afd78fd5 5371 if (ckWARN(WARN_REGEXP)) {
097eb12c 5372 const int w =
afd78fd5
JH
5373 RExC_parse >= rangebegin ?
5374 RExC_parse - rangebegin : 0;
830247a4 5375 vWARN4(RExC_parse,
b45f050a 5376 "False [] range \"%*.*s\"",
097eb12c 5377 w, w, rangebegin);
afd78fd5 5378 }
3568d838
JH
5379 if (prevvalue < 256) {
5380 ANYOF_BITMAP_SET(ret, prevvalue);
ffc61ed2
JH
5381 ANYOF_BITMAP_SET(ret, '-');
5382 }
5383 else {
5384 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
5385 Perl_sv_catpvf(aTHX_ listsv,
3568d838 5386 "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
ffc61ed2 5387 }
b8c5462f 5388 }
ffc61ed2
JH
5389
5390 range = 0; /* this was not a true range */
73b437c8 5391 }
ffc61ed2 5392
73b437c8 5393 if (!SIZE_ONLY) {
c49a72a9
NC
5394 const char *what = NULL;
5395 char yesno = 0;
5396
3568d838
JH
5397 if (namedclass > OOB_NAMEDCLASS)
5398 optimize_invert = FALSE;
e2962f66
JH
5399 /* Possible truncation here but in some 64-bit environments
5400 * the compiler gets heartburn about switch on 64-bit values.
5401 * A similar issue a little earlier when switching on value.
98f323fa 5402 * --jhi */
e2962f66 5403 switch ((I32)namedclass) {
73b437c8
JH
5404 case ANYOF_ALNUM:
5405 if (LOC)
936ed897 5406 ANYOF_CLASS_SET(ret, ANYOF_ALNUM);
73b437c8
JH
5407 else {
5408 for (value = 0; value < 256; value++)
5409 if (isALNUM(value))
936ed897 5410 ANYOF_BITMAP_SET(ret, value);
73b437c8 5411 }
c49a72a9
NC
5412 yesno = '+';
5413 what = "Word";
73b437c8
JH
5414 break;
5415 case ANYOF_NALNUM:
5416 if (LOC)
936ed897 5417 ANYOF_CLASS_SET(ret, ANYOF_NALNUM);
73b437c8
JH
5418 else {
5419 for (value = 0; value < 256; value++)
5420 if (!isALNUM(value))
936ed897 5421 ANYOF_BITMAP_SET(ret, value);
73b437c8 5422 }
c49a72a9
NC
5423 yesno = '!';
5424 what = "Word";
73b437c8 5425 break;
ffc61ed2 5426 case ANYOF_ALNUMC:
73b437c8 5427 if (LOC)
ffc61ed2 5428 ANYOF_CLASS_SET(ret, ANYOF_ALNUMC);
73b437c8
JH
5429 else {
5430 for (value = 0; value < 256; value++)
ffc61ed2 5431 if (isALNUMC(value))
936ed897 5432 ANYOF_BITMAP_SET(ret, value);
73b437c8 5433 }
c49a72a9
NC
5434 yesno = '+';
5435 what = "Alnum";
73b437c8
JH
5436 break;
5437 case ANYOF_NALNUMC:
5438 if (LOC)
936ed897 5439 ANYOF_CLASS_SET(ret, ANYOF_NALNUMC);
73b437c8
JH
5440 else {
5441 for (value = 0; value < 256; value++)
5442 if (!isALNUMC(value))
936ed897 5443 ANYOF_BITMAP_SET(ret, value);
73b437c8 5444 }
c49a72a9
NC
5445 yesno = '!';
5446 what = "Alnum";
73b437c8
JH
5447 break;
5448 case ANYOF_ALPHA:
5449 if (LOC)
936ed897 5450 ANYOF_CLASS_SET(ret, ANYOF_ALPHA);
73b437c8
JH
5451 else {
5452 for (value = 0; value < 256; value++)
5453 if (isALPHA(value))
936ed897 5454 ANYOF_BITMAP_SET(ret, value);
73b437c8 5455 }
c49a72a9
NC
5456 yesno = '+';
5457 what = "Alpha";
73b437c8
JH
5458 break;
5459 case ANYOF_NALPHA:
5460 if (LOC)
936ed897 5461 ANYOF_CLASS_SET(ret, ANYOF_NALPHA);
73b437c8
JH
5462 else {
5463 for (value = 0; value < 256; value++)
5464 if (!isALPHA(value))
936ed897 5465 ANYOF_BITMAP_SET(ret, value);
73b437c8 5466 }
c49a72a9
NC
5467 yesno = '!';
5468 what = "Alpha";
73b437c8
JH
5469 break;
5470 case ANYOF_ASCII:
5471 if (LOC)
936ed897 5472 ANYOF_CLASS_SET(ret, ANYOF_ASCII);
73b437c8 5473 else {
c7f1f016 5474#ifndef EBCDIC
1ba5c669
JH
5475 for (value = 0; value < 128; value++)
5476 ANYOF_BITMAP_SET(ret, value);
5477#else /* EBCDIC */
ffbc6a93 5478 for (value = 0; value < 256; value++) {
3a3c4447
JH
5479 if (isASCII(value))
5480 ANYOF_BITMAP_SET(ret, value);
ffbc6a93 5481 }
1ba5c669 5482#endif /* EBCDIC */
73b437c8 5483 }
c49a72a9
NC
5484 yesno = '+';
5485 what = "ASCII";
73b437c8
JH
5486 break;
5487 case ANYOF_NASCII:
5488 if (LOC)
936ed897 5489 ANYOF_CLASS_SET(ret, ANYOF_NASCII);
73b437c8 5490 else {
c7f1f016 5491#ifndef EBCDIC
1ba5c669
JH
5492 for (value = 128; value < 256; value++)
5493 ANYOF_BITMAP_SET(ret, value);
5494#else /* EBCDIC */
ffbc6a93 5495 for (value = 0; value < 256; value++) {
3a3c4447
JH
5496 if (!isASCII(value))
5497 ANYOF_BITMAP_SET(ret, value);
ffbc6a93 5498 }
1ba5c669 5499#endif /* EBCDIC */
73b437c8 5500 }
c49a72a9
NC
5501 yesno = '!';
5502 what = "ASCII";
73b437c8 5503 break;
aaa51d5e
JF
5504 case ANYOF_BLANK:
5505 if (LOC)
5506 ANYOF_CLASS_SET(ret, ANYOF_BLANK);
5507 else {
5508 for (value = 0; value < 256; value++)
5509 if (isBLANK(value))
5510 ANYOF_BITMAP_SET(ret, value);
5511 }
c49a72a9
NC
5512 yesno = '+';
5513 what = "Blank";
aaa51d5e
JF
5514 break;
5515 case ANYOF_NBLANK:
5516 if (LOC)
5517 ANYOF_CLASS_SET(ret, ANYOF_NBLANK);
5518 else {
5519 for (value = 0; value < 256; value++)
5520 if (!isBLANK(value))
5521 ANYOF_BITMAP_SET(ret, value);
5522 }
c49a72a9
NC
5523 yesno = '!';
5524 what = "Blank";
aaa51d5e 5525 break;
73b437c8
JH
5526 case ANYOF_CNTRL:
5527 if (LOC)
936ed897 5528 ANYOF_CLASS_SET(ret, ANYOF_CNTRL);
73b437c8
JH
5529 else {
5530 for (value = 0; value < 256; value++)
5531 if (isCNTRL(value))
936ed897 5532 ANYOF_BITMAP_SET(ret, value);
73b437c8 5533 }
c49a72a9
NC
5534 yesno = '+';
5535 what = "Cntrl";
73b437c8
JH
5536 break;
5537 case ANYOF_NCNTRL:
5538 if (LOC)
936ed897 5539 ANYOF_CLASS_SET(ret, ANYOF_NCNTRL);
73b437c8
JH
5540 else {
5541 for (value = 0; value < 256; value++)
5542 if (!isCNTRL(value))
936ed897 5543 ANYOF_BITMAP_SET(ret, value);
73b437c8 5544 }
c49a72a9
NC
5545 yesno = '!';
5546 what = "Cntrl";
ffc61ed2
JH
5547 break;
5548 case ANYOF_DIGIT:
5549 if (LOC)
5550 ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
5551 else {
5552 /* consecutive digits assumed */
5553 for (value = '0'; value <= '9'; value++)
5554 ANYOF_BITMAP_SET(ret, value);
5555 }
c49a72a9
NC
5556 yesno = '+';
5557 what = "Digit";
ffc61ed2
JH
5558 break;
5559 case ANYOF_NDIGIT:
5560 if (LOC)
5561 ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
5562 else {
5563 /* consecutive digits assumed */
5564 for (value = 0; value < '0'; value++)
5565 ANYOF_BITMAP_SET(ret, value);
5566 for (value = '9' + 1; value < 256; value++)
5567 ANYOF_BITMAP_SET(ret, value);
5568 }
c49a72a9
NC
5569 yesno = '!';
5570 what = "Digit";
73b437c8
JH
5571 break;
5572 case ANYOF_GRAPH:
5573 if (LOC)
936ed897 5574 ANYOF_CLASS_SET(ret, ANYOF_GRAPH);
73b437c8
JH
5575 else {
5576 for (value = 0; value < 256; value++)
5577 if (isGRAPH(value))
936ed897 5578 ANYOF_BITMAP_SET(ret, value);
73b437c8 5579 }
c49a72a9
NC
5580 yesno = '+';
5581 what = "Graph";
73b437c8
JH
5582 break;
5583 case ANYOF_NGRAPH:
5584 if (LOC)
936ed897 5585 ANYOF_CLASS_SET(ret, ANYOF_NGRAPH);
73b437c8
JH
5586 else {
5587 for (value = 0; value < 256; value++)
5588 if (!isGRAPH(value))
936ed897 5589 ANYOF_BITMAP_SET(ret, value);
73b437c8 5590 }
c49a72a9
NC
5591 yesno = '!';
5592 what = "Graph";
73b437c8
JH
5593 break;
5594 case ANYOF_LOWER:
5595 if (LOC)
936ed897 5596 ANYOF_CLASS_SET(ret, ANYOF_LOWER);
73b437c8
JH
5597 else {
5598 for (value = 0; value < 256; value++)
5599 if (isLOWER(value))
936ed897 5600 ANYOF_BITMAP_SET(ret, value);
73b437c8 5601 }
c49a72a9
NC
5602 yesno = '+';
5603 what = "Lower";
73b437c8
JH
5604 break;
5605 case ANYOF_NLOWER:
5606 if (LOC)
936ed897 5607 ANYOF_CLASS_SET(ret, ANYOF_NLOWER);
73b437c8
JH
5608 else {
5609 for (value = 0; value < 256; value++)
5610 if (!isLOWER(value))
936ed897 5611 ANYOF_BITMAP_SET(ret, value);
73b437c8 5612 }
c49a72a9
NC
5613 yesno = '!';
5614 what = "Lower";
73b437c8
JH
5615 break;
5616 case ANYOF_PRINT:
5617 if (LOC)
936ed897 5618 ANYOF_CLASS_SET(ret, ANYOF_PRINT);
73b437c8
JH
5619 else {
5620 for (value = 0; value < 256; value++)
5621 if (isPRINT(value))
936ed897 5622 ANYOF_BITMAP_SET(ret, value);
73b437c8 5623 }
c49a72a9
NC
5624 yesno = '+';
5625 what = "Print";
73b437c8
JH
5626 break;
5627 case ANYOF_NPRINT:
5628 if (LOC)
936ed897 5629 ANYOF_CLASS_SET(ret, ANYOF_NPRINT);
73b437c8
JH
5630 else {
5631 for (value = 0; value < 256; value++)
5632 if (!isPRINT(value))
936ed897 5633 ANYOF_BITMAP_SET(ret, value);
73b437c8 5634 }
c49a72a9
NC
5635 yesno = '!';
5636 what = "Print";
73b437c8 5637 break;
aaa51d5e
JF
5638 case ANYOF_PSXSPC:
5639 if (LOC)
5640 ANYOF_CLASS_SET(ret, ANYOF_PSXSPC);
5641 else {
5642 for (value = 0; value < 256; value++)
5643 if (isPSXSPC(value))
5644 ANYOF_BITMAP_SET(ret, value);
5645 }
c49a72a9
NC
5646 yesno = '+';
5647 what = "Space";
aaa51d5e
JF
5648 break;
5649 case ANYOF_NPSXSPC:
5650 if (LOC)
5651 ANYOF_CLASS_SET(ret, ANYOF_NPSXSPC);
5652 else {
5653 for (value = 0; value < 256; value++)
5654 if (!isPSXSPC(value))
5655 ANYOF_BITMAP_SET(ret, value);
5656 }
c49a72a9
NC
5657 yesno = '!';
5658 what = "Space";
aaa51d5e 5659 break;
73b437c8
JH
5660 case ANYOF_PUNCT:
5661 if (LOC)
936ed897 5662 ANYOF_CLASS_SET(ret, ANYOF_PUNCT);
73b437c8
JH
5663 else {
5664 for (value = 0; value < 256; value++)
5665 if (isPUNCT(value))
936ed897 5666 ANYOF_BITMAP_SET(ret, value);
73b437c8 5667 }
c49a72a9
NC
5668 yesno = '+';
5669 what = "Punct";
73b437c8
JH
5670 break;
5671 case ANYOF_NPUNCT:
5672 if (LOC)
936ed897 5673 ANYOF_CLASS_SET(ret, ANYOF_NPUNCT);
73b437c8
JH
5674 else {
5675 for (value = 0; value < 256; value++)
5676 if (!isPUNCT(value))
936ed897 5677 ANYOF_BITMAP_SET(ret, value);
73b437c8 5678 }
c49a72a9
NC
5679 yesno = '!';
5680 what = "Punct";
ffc61ed2
JH
5681 break;
5682 case ANYOF_SPACE:
5683 if (LOC)
5684 ANYOF_CLASS_SET(ret, ANYOF_SPACE);
5685 else {
5686 for (value = 0; value < 256; value++)
5687 if (isSPACE(value))
5688 ANYOF_BITMAP_SET(ret, value);
5689 }
c49a72a9
NC
5690 yesno = '+';
5691 what = "SpacePerl";
ffc61ed2
JH
5692 break;
5693 case ANYOF_NSPACE:
5694 if (LOC)
5695 ANYOF_CLASS_SET(ret, ANYOF_NSPACE);
5696 else {
5697 for (value = 0; value < 256; value++)
5698 if (!isSPACE(value))
5699 ANYOF_BITMAP_SET(ret, value);
5700 }
c49a72a9
NC
5701 yesno = '!';
5702 what = "SpacePerl";
73b437c8
JH
5703 break;
5704 case ANYOF_UPPER:
5705 if (LOC)
936ed897 5706 ANYOF_CLASS_SET(ret, ANYOF_UPPER);
73b437c8
JH
5707 else {
5708 for (value = 0; value < 256; value++)
5709 if (isUPPER(value))
936ed897 5710 ANYOF_BITMAP_SET(ret, value);
73b437c8 5711 }
c49a72a9
NC
5712 yesno = '+';
5713 what = "Upper";
73b437c8
JH
5714 break;
5715 case ANYOF_NUPPER:
5716 if (LOC)
936ed897 5717 ANYOF_CLASS_SET(ret, ANYOF_NUPPER);
73b437c8
JH
5718 else {
5719 for (value = 0; value < 256; value++)
5720 if (!isUPPER(value))
936ed897 5721 ANYOF_BITMAP_SET(ret, value);
73b437c8 5722 }
c49a72a9
NC
5723 yesno = '!';
5724 what = "Upper";
73b437c8
JH
5725 break;
5726 case ANYOF_XDIGIT:
5727 if (LOC)
936ed897 5728 ANYOF_CLASS_SET(ret, ANYOF_XDIGIT);
73b437c8
JH
5729 else {
5730 for (value = 0; value < 256; value++)
5731 if (isXDIGIT(value))
936ed897 5732 ANYOF_BITMAP_SET(ret, value);
73b437c8 5733 }
c49a72a9
NC
5734 yesno = '+';
5735 what = "XDigit";
73b437c8
JH
5736 break;
5737 case ANYOF_NXDIGIT:
5738 if (LOC)
936ed897 5739 ANYOF_CLASS_SET(ret, ANYOF_NXDIGIT);
73b437c8
JH
5740 else {
5741 for (value = 0; value < 256; value++)
5742 if (!isXDIGIT(value))
936ed897 5743 ANYOF_BITMAP_SET(ret, value);
73b437c8 5744 }
c49a72a9
NC
5745 yesno = '!';
5746 what = "XDigit";
73b437c8 5747 break;
f81125e2
JP
5748 case ANYOF_MAX:
5749 /* this is to handle \p and \P */
5750 break;
73b437c8 5751 default:
b45f050a 5752 vFAIL("Invalid [::] class");
73b437c8 5753 break;
b8c5462f 5754 }
c49a72a9
NC
5755 if (what) {
5756 /* Strings such as "+utf8::isWord\n" */
5757 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::Is%s\n", yesno, what);
5758 }
b8c5462f 5759 if (LOC)
936ed897 5760 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
73b437c8 5761 continue;
a0d0e21e 5762 }
ffc61ed2
JH
5763 } /* end of namedclass \blah */
5764
a0d0e21e 5765 if (range) {
eb160463 5766 if (prevvalue > (IV)value) /* b-a */ {
d4c19fe8
AL
5767 const int w = RExC_parse - rangebegin;
5768 Simple_vFAIL4("Invalid [] range \"%*.*s\"", w, w, rangebegin);
3568d838 5769 range = 0; /* not a valid range */
73b437c8 5770 }
a0d0e21e
LW
5771 }
5772 else {
3568d838 5773 prevvalue = value; /* save the beginning of the range */
830247a4
IZ
5774 if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
5775 RExC_parse[1] != ']') {
5776 RExC_parse++;
ffc61ed2
JH
5777
5778 /* a bad range like \w-, [:word:]- ? */
5779 if (namedclass > OOB_NAMEDCLASS) {
afd78fd5 5780 if (ckWARN(WARN_REGEXP)) {
d4c19fe8 5781 const int w =
afd78fd5
JH
5782 RExC_parse >= rangebegin ?
5783 RExC_parse - rangebegin : 0;
830247a4 5784 vWARN4(RExC_parse,
b45f050a 5785 "False [] range \"%*.*s\"",
097eb12c 5786 w, w, rangebegin);
afd78fd5 5787 }
73b437c8 5788 if (!SIZE_ONLY)
936ed897 5789 ANYOF_BITMAP_SET(ret, '-');
73b437c8 5790 } else
ffc61ed2
JH
5791 range = 1; /* yeah, it's a range! */
5792 continue; /* but do it the next time */
a0d0e21e 5793 }
a687059c 5794 }
ffc61ed2 5795
93733859 5796 /* now is the next time */
07be1b83 5797 /*stored += (value - prevvalue + 1);*/
ae5c130c 5798 if (!SIZE_ONLY) {
3568d838 5799 if (prevvalue < 256) {
1df70142 5800 const IV ceilvalue = value < 256 ? value : 255;
3dab1dad 5801 IV i;
3568d838 5802#ifdef EBCDIC
1b2d223b
JH
5803 /* In EBCDIC [\x89-\x91] should include
5804 * the \x8e but [i-j] should not. */
5805 if (literal_endpoint == 2 &&
5806 ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
5807 (isUPPER(prevvalue) && isUPPER(ceilvalue))))
ffc61ed2 5808 {
3568d838
JH
5809 if (isLOWER(prevvalue)) {
5810 for (i = prevvalue; i <= ceilvalue; i++)
ffc61ed2
JH
5811 if (isLOWER(i))
5812 ANYOF_BITMAP_SET(ret, i);
5813 } else {
3568d838 5814 for (i = prevvalue; i <= ceilvalue; i++)
ffc61ed2
JH
5815 if (isUPPER(i))
5816 ANYOF_BITMAP_SET(ret, i);
5817 }
8ada0baa 5818 }
ffc61ed2 5819 else
8ada0baa 5820#endif
07be1b83
YO
5821 for (i = prevvalue; i <= ceilvalue; i++) {
5822 if (!ANYOF_BITMAP_TEST(ret,i)) {
5823 stored++;
5824 ANYOF_BITMAP_SET(ret, i);
5825 }
5826 }
3568d838 5827 }
a5961de5 5828 if (value > 255 || UTF) {
1df70142
AL
5829 const UV prevnatvalue = NATIVE_TO_UNI(prevvalue);
5830 const UV natvalue = NATIVE_TO_UNI(value);
07be1b83 5831 stored+=2; /* can't optimize this class */
ffc61ed2 5832 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
b08decb7 5833 if (prevnatvalue < natvalue) { /* what about > ? */
ffc61ed2 5834 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
b08decb7
JH
5835 prevnatvalue, natvalue);
5836 }
5837 else if (prevnatvalue == natvalue) {
5838 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue);
09091399 5839 if (FOLD) {
89ebb4a3 5840 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
254ba52a 5841 STRLEN foldlen;
1df70142 5842 const UV f = to_uni_fold(natvalue, foldbuf, &foldlen);
254ba52a 5843
c840d2a2
JH
5844 /* If folding and foldable and a single
5845 * character, insert also the folded version
5846 * to the charclass. */
9e55ce06 5847 if (f != value) {
eb160463 5848 if (foldlen == (STRLEN)UNISKIP(f))
9e55ce06
JH
5849 Perl_sv_catpvf(aTHX_ listsv,
5850 "%04"UVxf"\n", f);
5851 else {
5852 /* Any multicharacter foldings
5853 * require the following transform:
5854 * [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst)
5855 * where E folds into "pq" and F folds
5856 * into "rst", all other characters
5857 * fold to single characters. We save
5858 * away these multicharacter foldings,
5859 * to be later saved as part of the
5860 * additional "s" data. */
5861 SV *sv;
5862
5863 if (!unicode_alternate)
5864 unicode_alternate = newAV();
5865 sv = newSVpvn((char*)foldbuf, foldlen);
5866 SvUTF8_on(sv);
5867 av_push(unicode_alternate, sv);
5868 }
5869 }
254ba52a 5870
60a8b682
JH
5871 /* If folding and the value is one of the Greek
5872 * sigmas insert a few more sigmas to make the
5873 * folding rules of the sigmas to work right.
5874 * Note that not all the possible combinations
5875 * are handled here: some of them are handled
9e55ce06
JH
5876 * by the standard folding rules, and some of
5877 * them (literal or EXACTF cases) are handled
5878 * during runtime in regexec.c:S_find_byclass(). */
09091399
JH
5879 if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) {
5880 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 5881 (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA);
09091399 5882 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 5883 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
09091399
JH
5884 }
5885 else if (value == UNICODE_GREEK_CAPITAL_LETTER_SIGMA)
5886 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 5887 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
09091399
JH
5888 }
5889 }
ffc61ed2 5890 }
1b2d223b
JH
5891#ifdef EBCDIC
5892 literal_endpoint = 0;
5893#endif
8ada0baa 5894 }
ffc61ed2
JH
5895
5896 range = 0; /* this range (if it was one) is done now */
a0d0e21e 5897 }
ffc61ed2 5898
936ed897 5899 if (need_class) {
4f66b38d 5900 ANYOF_FLAGS(ret) |= ANYOF_LARGE;
936ed897 5901 if (SIZE_ONLY)
830247a4 5902 RExC_size += ANYOF_CLASS_ADD_SKIP;
936ed897 5903 else
830247a4 5904 RExC_emit += ANYOF_CLASS_ADD_SKIP;
936ed897 5905 }
ffc61ed2 5906
7f6f358c
YO
5907
5908 if (SIZE_ONLY)
5909 return ret;
5910 /****** !SIZE_ONLY AFTER HERE *********/
5911
5912 if( stored == 1 && value < 256
5913 && !( ANYOF_FLAGS(ret) & ( ANYOF_FLAGS_ALL ^ ANYOF_FOLD ) )
5914 ) {
5915 /* optimize single char class to an EXACT node
5916 but *only* when its not a UTF/high char */
07be1b83
YO
5917 const char * cur_parse= RExC_parse;
5918 RExC_emit = (regnode *)orig_emit;
5919 RExC_parse = (char *)orig_parse;
7f6f358c
YO
5920 ret = reg_node(pRExC_state,
5921 (U8)((ANYOF_FLAGS(ret) & ANYOF_FOLD) ? EXACTF : EXACT));
07be1b83 5922 RExC_parse = (char *)cur_parse;
7f6f358c
YO
5923 *STRING(ret)= (char)value;
5924 STR_LEN(ret)= 1;
5925 RExC_emit += STR_SZ(1);
5926 return ret;
5927 }
ae5c130c 5928 /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
7f6f358c 5929 if ( /* If the only flag is folding (plus possibly inversion). */
516a5887
JH
5930 ((ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD)
5931 ) {
a0ed51b3 5932 for (value = 0; value < 256; ++value) {
936ed897 5933 if (ANYOF_BITMAP_TEST(ret, value)) {
eb160463 5934 UV fold = PL_fold[value];
ffc61ed2
JH
5935
5936 if (fold != value)
5937 ANYOF_BITMAP_SET(ret, fold);
ae5c130c
GS
5938 }
5939 }
936ed897 5940 ANYOF_FLAGS(ret) &= ~ANYOF_FOLD;
ae5c130c 5941 }
ffc61ed2 5942
ae5c130c 5943 /* optimize inverted simple patterns (e.g. [^a-z]) */
7f6f358c 5944 if (optimize_invert &&
ffc61ed2
JH
5945 /* If the only flag is inversion. */
5946 (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
b8c5462f 5947 for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
936ed897 5948 ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL;
1aa99e6b 5949 ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
ae5c130c 5950 }
7f6f358c 5951 {
097eb12c 5952 AV * const av = newAV();
ffc61ed2 5953 SV *rv;
9e55ce06 5954 /* The 0th element stores the character class description
6a0407ee 5955 * in its textual form: used later (regexec.c:Perl_regclass_swash())
9e55ce06
JH
5956 * to initialize the appropriate swash (which gets stored in
5957 * the 1st element), and also useful for dumping the regnode.
5958 * The 2nd element stores the multicharacter foldings,
6a0407ee 5959 * used later (regexec.c:S_reginclass()). */
ffc61ed2
JH
5960 av_store(av, 0, listsv);
5961 av_store(av, 1, NULL);
9e55ce06 5962 av_store(av, 2, (SV*)unicode_alternate);
ffc61ed2 5963 rv = newRV_noinc((SV*)av);
19860706 5964 n = add_data(pRExC_state, 1, "s");
830247a4 5965 RExC_rx->data->data[n] = (void*)rv;
ffc61ed2 5966 ARG_SET(ret, n);
a0ed51b3 5967 }
a0ed51b3
LW
5968 return ret;
5969}
5970
76e3520e 5971STATIC char*
830247a4 5972S_nextchar(pTHX_ RExC_state_t *pRExC_state)
a0d0e21e 5973{
097eb12c 5974 char* const retval = RExC_parse++;
a0d0e21e 5975
4633a7c4 5976 for (;;) {
830247a4
IZ
5977 if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
5978 RExC_parse[2] == '#') {
e994fd66
AE
5979 while (*RExC_parse != ')') {
5980 if (RExC_parse == RExC_end)
5981 FAIL("Sequence (?#... not terminated");
830247a4 5982 RExC_parse++;
e994fd66 5983 }
830247a4 5984 RExC_parse++;
4633a7c4
LW
5985 continue;
5986 }
e2509266 5987 if (RExC_flags & PMf_EXTENDED) {
830247a4
IZ
5988 if (isSPACE(*RExC_parse)) {
5989 RExC_parse++;
748a9306
LW
5990 continue;
5991 }
830247a4 5992 else if (*RExC_parse == '#') {
e994fd66
AE
5993 while (RExC_parse < RExC_end)
5994 if (*RExC_parse++ == '\n') break;
748a9306
LW
5995 continue;
5996 }
748a9306 5997 }
4633a7c4 5998 return retval;
a0d0e21e 5999 }
a687059c
LW
6000}
6001
6002/*
c277df42 6003- reg_node - emit a node
a0d0e21e 6004*/
76e3520e 6005STATIC regnode * /* Location. */
830247a4 6006S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
a687059c 6007{
97aff369 6008 dVAR;
c277df42 6009 register regnode *ptr;
504618e9 6010 regnode * const ret = RExC_emit;
07be1b83 6011 GET_RE_DEBUG_FLAGS_DECL;
a687059c 6012
c277df42 6013 if (SIZE_ONLY) {
830247a4
IZ
6014 SIZE_ALIGN(RExC_size);
6015 RExC_size += 1;
a0d0e21e
LW
6016 return(ret);
6017 }
c277df42 6018 NODE_ALIGN_FILL(ret);
a0d0e21e 6019 ptr = ret;
c277df42 6020 FILL_ADVANCE_NODE(ptr, op);
fac92740 6021 if (RExC_offsets) { /* MJD */
07be1b83 6022 MJD_OFFSET_DEBUG(("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
fac92740
MJD
6023 "reg_node", __LINE__,
6024 reg_name[op],
07be1b83
YO
6025 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
6026 ? "Overwriting end of array!\n" : "OK",
6027 (UV)(RExC_emit - RExC_emit_start),
6028 (UV)(RExC_parse - RExC_start),
6029 (UV)RExC_offsets[0]));
ccb2c380 6030 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
fac92740 6031 }
07be1b83 6032
830247a4 6033 RExC_emit = ptr;
a687059c 6034
a0d0e21e 6035 return(ret);
a687059c
LW
6036}
6037
6038/*
a0d0e21e
LW
6039- reganode - emit a node with an argument
6040*/
76e3520e 6041STATIC regnode * /* Location. */
830247a4 6042S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
fe14fcc3 6043{
97aff369 6044 dVAR;
c277df42 6045 register regnode *ptr;
504618e9 6046 regnode * const ret = RExC_emit;
07be1b83 6047 GET_RE_DEBUG_FLAGS_DECL;
fe14fcc3 6048
c277df42 6049 if (SIZE_ONLY) {
830247a4
IZ
6050 SIZE_ALIGN(RExC_size);
6051 RExC_size += 2;
a0d0e21e
LW
6052 return(ret);
6053 }
fe14fcc3 6054
c277df42 6055 NODE_ALIGN_FILL(ret);
a0d0e21e 6056 ptr = ret;
c277df42 6057 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
fac92740 6058 if (RExC_offsets) { /* MJD */
07be1b83 6059 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
fac92740 6060 "reganode",
ccb2c380
MP
6061 __LINE__,
6062 reg_name[op],
07be1b83 6063 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ?
fac92740 6064 "Overwriting end of array!\n" : "OK",
07be1b83
YO
6065 (UV)(RExC_emit - RExC_emit_start),
6066 (UV)(RExC_parse - RExC_start),
6067 (UV)RExC_offsets[0]));
ccb2c380 6068 Set_Cur_Node_Offset;
fac92740
MJD
6069 }
6070
830247a4 6071 RExC_emit = ptr;
fe14fcc3 6072
a0d0e21e 6073 return(ret);
fe14fcc3
LW
6074}
6075
6076/*
cd439c50 6077- reguni - emit (if appropriate) a Unicode character
a0ed51b3 6078*/
71207a34
AL
6079STATIC STRLEN
6080S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
a0ed51b3 6081{
97aff369 6082 dVAR;
71207a34 6083 return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
a0ed51b3
LW
6084}
6085
6086/*
a0d0e21e
LW
6087- reginsert - insert an operator in front of already-emitted operand
6088*
6089* Means relocating the operand.
6090*/
76e3520e 6091STATIC void
830247a4 6092S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd)
a687059c 6093{
97aff369 6094 dVAR;
c277df42
IZ
6095 register regnode *src;
6096 register regnode *dst;
6097 register regnode *place;
504618e9 6098 const int offset = regarglen[(U8)op];
07be1b83 6099 GET_RE_DEBUG_FLAGS_DECL;
22c35a8c 6100/* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
c277df42
IZ
6101
6102 if (SIZE_ONLY) {
830247a4 6103 RExC_size += NODE_STEP_REGNODE + offset;
a0d0e21e
LW
6104 return;
6105 }
a687059c 6106
830247a4
IZ
6107 src = RExC_emit;
6108 RExC_emit += NODE_STEP_REGNODE + offset;
6109 dst = RExC_emit;
fac92740 6110 while (src > opnd) {
c277df42 6111 StructCopy(--src, --dst, regnode);
fac92740 6112 if (RExC_offsets) { /* MJD 20010112 */
07be1b83 6113 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
fac92740 6114 "reg_insert",
ccb2c380
MP
6115 __LINE__,
6116 reg_name[op],
07be1b83
YO
6117 (UV)(dst - RExC_emit_start) > RExC_offsets[0]
6118 ? "Overwriting end of array!\n" : "OK",
6119 (UV)(src - RExC_emit_start),
6120 (UV)(dst - RExC_emit_start),
6121 (UV)RExC_offsets[0]));
ccb2c380
MP
6122 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
6123 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
fac92740
MJD
6124 }
6125 }
6126
a0d0e21e
LW
6127
6128 place = opnd; /* Op node, where operand used to be. */
fac92740 6129 if (RExC_offsets) { /* MJD */
07be1b83 6130 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
fac92740 6131 "reginsert",
ccb2c380
MP
6132 __LINE__,
6133 reg_name[op],
07be1b83 6134 (UV)(place - RExC_emit_start) > RExC_offsets[0]
fac92740 6135 ? "Overwriting end of array!\n" : "OK",
07be1b83
YO
6136 (UV)(place - RExC_emit_start),
6137 (UV)(RExC_parse - RExC_start),
fac92740 6138 RExC_offsets[0]));
ccb2c380 6139 Set_Node_Offset(place, RExC_parse);
45948336 6140 Set_Node_Length(place, 1);
fac92740 6141 }
c277df42
IZ
6142 src = NEXTOPER(place);
6143 FILL_ADVANCE_NODE(place, op);
6144 Zero(src, offset, regnode);
a687059c
LW
6145}
6146
6147/*
c277df42 6148- regtail - set the next-pointer at the end of a node chain of p to val.
3dab1dad 6149- SEE ALSO: regtail_study
a0d0e21e 6150*/
097eb12c 6151/* TODO: All three parms should be const */
76e3520e 6152STATIC void
3dab1dad 6153S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
a687059c 6154{
97aff369 6155 dVAR;
c277df42 6156 register regnode *scan;
f9049ba1
SP
6157#ifndef DEBUGGING
6158 PERL_UNUSED_ARG(depth);
6159#endif
3dab1dad 6160 GET_RE_DEBUG_FLAGS_DECL;
a0d0e21e 6161
c277df42 6162 if (SIZE_ONLY)
a0d0e21e
LW
6163 return;
6164
6165 /* Find last node. */
6166 scan = p;
6167 for (;;) {
504618e9 6168 regnode * const temp = regnext(scan);
3dab1dad
YO
6169 DEBUG_PARSE_r({
6170 SV * const mysv=sv_newmortal();
6171 DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
6172 regprop(RExC_rx, mysv, scan);
6173 PerlIO_printf(Perl_debug_log, "~ %s (%d)\n",
6174 SvPV_nolen_const(mysv), REG_NODE_NUM(scan));
6175 });
6176 if (temp == NULL)
6177 break;
6178 scan = temp;
6179 }
6180
6181 if (reg_off_by_arg[OP(scan)]) {
6182 ARG_SET(scan, val - scan);
6183 }
6184 else {
6185 NEXT_OFF(scan) = val - scan;
6186 }
6187}
6188
07be1b83 6189#ifdef DEBUGGING
3dab1dad
YO
6190/*
6191- regtail_study - set the next-pointer at the end of a node chain of p to val.
6192- Look for optimizable sequences at the same time.
6193- currently only looks for EXACT chains.
07be1b83
YO
6194
6195This is expermental code. The idea is to use this routine to perform
6196in place optimizations on branches and groups as they are constructed,
6197with the long term intention of removing optimization from study_chunk so
6198that it is purely analytical.
6199
6200Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
6201to control which is which.
6202
3dab1dad
YO
6203*/
6204/* TODO: All four parms should be const */
07be1b83 6205
3dab1dad
YO
6206STATIC U8
6207S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
6208{
6209 dVAR;
6210 register regnode *scan;
07be1b83
YO
6211 U8 exact = PSEUDO;
6212#ifdef EXPERIMENTAL_INPLACESCAN
6213 I32 min = 0;
6214#endif
6215
3dab1dad
YO
6216 GET_RE_DEBUG_FLAGS_DECL;
6217
07be1b83 6218
3dab1dad
YO
6219 if (SIZE_ONLY)
6220 return exact;
6221
6222 /* Find last node. */
6223
6224 scan = p;
6225 for (;;) {
6226 regnode * const temp = regnext(scan);
07be1b83
YO
6227#ifdef EXPERIMENTAL_INPLACESCAN
6228 if (PL_regkind[OP(scan)] == EXACT)
6229 if (join_exact(pRExC_state,scan,&min,1,val,depth+1))
6230 return EXACT;
6231#endif
3dab1dad
YO
6232 if ( exact ) {
6233 switch (OP(scan)) {
6234 case EXACT:
6235 case EXACTF:
6236 case EXACTFL:
6237 if( exact == PSEUDO )
6238 exact= OP(scan);
07be1b83
YO
6239 else if ( exact != OP(scan) )
6240 exact= 0;
3dab1dad
YO
6241 case NOTHING:
6242 break;
6243 default:
6244 exact= 0;
6245 }
6246 }
6247 DEBUG_PARSE_r({
6248 SV * const mysv=sv_newmortal();
6249 DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
6250 regprop(RExC_rx, mysv, scan);
6251 PerlIO_printf(Perl_debug_log, "~ %s (%s) (%d)\n",
6252 SvPV_nolen_const(mysv),
6253 reg_name[exact],
6254 REG_NODE_NUM(scan));
6255 });
a0d0e21e
LW
6256 if (temp == NULL)
6257 break;
6258 scan = temp;
6259 }
07be1b83
YO
6260 DEBUG_PARSE_r({
6261 SV * const mysv_val=sv_newmortal();
6262 DEBUG_PARSE_MSG("");
6263 regprop(RExC_rx, mysv_val, val);
6264 PerlIO_printf(Perl_debug_log, "~ attach to %s (%d) offset to %d\n",
6265 SvPV_nolen_const(mysv_val),
6266 REG_NODE_NUM(val),
6267 val - scan
6268 );
6269 });
c277df42
IZ
6270 if (reg_off_by_arg[OP(scan)]) {
6271 ARG_SET(scan, val - scan);
a0ed51b3
LW
6272 }
6273 else {
c277df42
IZ
6274 NEXT_OFF(scan) = val - scan;
6275 }
3dab1dad
YO
6276
6277 return exact;
a687059c 6278}
07be1b83 6279#endif
a687059c
LW
6280
6281/*
a687059c
LW
6282 - regcurly - a little FSA that accepts {\d+,?\d*}
6283 */
79072805 6284STATIC I32
5f66b61c 6285S_regcurly(register const char *s)
a687059c
LW
6286{
6287 if (*s++ != '{')
6288 return FALSE;
f0fcb552 6289 if (!isDIGIT(*s))
a687059c 6290 return FALSE;
f0fcb552 6291 while (isDIGIT(*s))
a687059c
LW
6292 s++;
6293 if (*s == ',')
6294 s++;
f0fcb552 6295 while (isDIGIT(*s))
a687059c
LW
6296 s++;
6297 if (*s != '}')
6298 return FALSE;
6299 return TRUE;
6300}
6301
a687059c
LW
6302
6303/*
fd181c75 6304 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
a687059c
LW
6305 */
6306void
097eb12c 6307Perl_regdump(pTHX_ const regexp *r)
a687059c 6308{
35ff7856 6309#ifdef DEBUGGING
97aff369 6310 dVAR;
c445ea15 6311 SV * const sv = sv_newmortal();
a687059c 6312
4f639d21 6313 (void)dumpuntil(r, r->program, r->program + 1, NULL, sv, 0);
a0d0e21e
LW
6314
6315 /* Header fields of interest. */
c277df42 6316 if (r->anchored_substr)
7b0972df 6317 PerlIO_printf(Perl_debug_log,
a0288114 6318 "anchored \"%s%.*s%s\"%s at %"IVdf" ",
3280af22 6319 PL_colors[0],
7b0972df 6320 (int)(SvCUR(r->anchored_substr) - (SvTAIL(r->anchored_substr)!=0)),
3f7c398e 6321 SvPVX_const(r->anchored_substr),
3280af22 6322 PL_colors[1],
c277df42 6323 SvTAIL(r->anchored_substr) ? "$" : "",
7b0972df 6324 (IV)r->anchored_offset);
33b8afdf
JH
6325 else if (r->anchored_utf8)
6326 PerlIO_printf(Perl_debug_log,
a0288114 6327 "anchored utf8 \"%s%.*s%s\"%s at %"IVdf" ",
33b8afdf
JH
6328 PL_colors[0],
6329 (int)(SvCUR(r->anchored_utf8) - (SvTAIL(r->anchored_utf8)!=0)),
3f7c398e 6330 SvPVX_const(r->anchored_utf8),
33b8afdf
JH
6331 PL_colors[1],
6332 SvTAIL(r->anchored_utf8) ? "$" : "",
6333 (IV)r->anchored_offset);
c277df42 6334 if (r->float_substr)
7b0972df 6335 PerlIO_printf(Perl_debug_log,
a0288114 6336 "floating \"%s%.*s%s\"%s at %"IVdf"..%"UVuf" ",
3280af22 6337 PL_colors[0],
b81d288d 6338 (int)(SvCUR(r->float_substr) - (SvTAIL(r->float_substr)!=0)),
3f7c398e 6339 SvPVX_const(r->float_substr),
3280af22 6340 PL_colors[1],
c277df42 6341 SvTAIL(r->float_substr) ? "$" : "",
7b0972df 6342 (IV)r->float_min_offset, (UV)r->float_max_offset);
33b8afdf
JH
6343 else if (r->float_utf8)
6344 PerlIO_printf(Perl_debug_log,
a0288114 6345 "floating utf8 \"%s%.*s%s\"%s at %"IVdf"..%"UVuf" ",
33b8afdf
JH
6346 PL_colors[0],
6347 (int)(SvCUR(r->float_utf8) - (SvTAIL(r->float_utf8)!=0)),
3f7c398e 6348 SvPVX_const(r->float_utf8),
33b8afdf
JH
6349 PL_colors[1],
6350 SvTAIL(r->float_utf8) ? "$" : "",
6351 (IV)r->float_min_offset, (UV)r->float_max_offset);
6352 if (r->check_substr || r->check_utf8)
b81d288d
AB
6353 PerlIO_printf(Perl_debug_log,
6354 r->check_substr == r->float_substr
33b8afdf 6355 && r->check_utf8 == r->float_utf8
c277df42
IZ
6356 ? "(checking floating" : "(checking anchored");
6357 if (r->reganch & ROPT_NOSCAN)
6358 PerlIO_printf(Perl_debug_log, " noscan");
6359 if (r->reganch & ROPT_CHECK_ALL)
6360 PerlIO_printf(Perl_debug_log, " isall");
33b8afdf 6361 if (r->check_substr || r->check_utf8)
c277df42
IZ
6362 PerlIO_printf(Perl_debug_log, ") ");
6363
46fc3d4c 6364 if (r->regstclass) {
32fc9b6a 6365 regprop(r, sv, r->regstclass);
3f7c398e 6366 PerlIO_printf(Perl_debug_log, "stclass \"%s\" ", SvPVX_const(sv));
46fc3d4c 6367 }
774d564b 6368 if (r->reganch & ROPT_ANCH) {
6369 PerlIO_printf(Perl_debug_log, "anchored");
6370 if (r->reganch & ROPT_ANCH_BOL)
6371 PerlIO_printf(Perl_debug_log, "(BOL)");
c277df42
IZ
6372 if (r->reganch & ROPT_ANCH_MBOL)
6373 PerlIO_printf(Perl_debug_log, "(MBOL)");
cad2e5aa
JH
6374 if (r->reganch & ROPT_ANCH_SBOL)
6375 PerlIO_printf(Perl_debug_log, "(SBOL)");
774d564b 6376 if (r->reganch & ROPT_ANCH_GPOS)
6377 PerlIO_printf(Perl_debug_log, "(GPOS)");
6378 PerlIO_putc(Perl_debug_log, ' ');
6379 }
c277df42
IZ
6380 if (r->reganch & ROPT_GPOS_SEEN)
6381 PerlIO_printf(Perl_debug_log, "GPOS ");
a0d0e21e 6382 if (r->reganch & ROPT_SKIP)
760ac839 6383 PerlIO_printf(Perl_debug_log, "plus ");
a0d0e21e 6384 if (r->reganch & ROPT_IMPLICIT)
760ac839 6385 PerlIO_printf(Perl_debug_log, "implicit ");
760ac839 6386 PerlIO_printf(Perl_debug_log, "minlen %ld ", (long) r->minlen);
ce862d02
IZ
6387 if (r->reganch & ROPT_EVAL_SEEN)
6388 PerlIO_printf(Perl_debug_log, "with eval ");
760ac839 6389 PerlIO_printf(Perl_debug_log, "\n");
65e66c80 6390#else
96a5add6 6391 PERL_UNUSED_CONTEXT;
65e66c80 6392 PERL_UNUSED_ARG(r);
17c3b450 6393#endif /* DEBUGGING */
a687059c
LW
6394}
6395
6396/*
a0d0e21e
LW
6397- regprop - printable representation of opcode
6398*/
46fc3d4c 6399void
32fc9b6a 6400Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
a687059c 6401{
35ff7856 6402#ifdef DEBUGGING
97aff369 6403 dVAR;
9b155405 6404 register int k;
a0d0e21e 6405
54dc92de 6406 sv_setpvn(sv, "", 0);
9b155405 6407 if (OP(o) >= reg_num) /* regnode.type is unsigned */
830247a4
IZ
6408 /* It would be nice to FAIL() here, but this may be called from
6409 regexec.c, and it would be hard to supply pRExC_state. */
6410 Perl_croak(aTHX_ "Corrupted regexp opcode");
bfed75c6 6411 sv_catpv(sv, reg_name[OP(o)]); /* Take off const! */
9b155405 6412
3dab1dad 6413 k = PL_regkind[OP(o)];
9b155405 6414
2a782b5b 6415 if (k == EXACT) {
396482e1 6416 SV * const dsv = sv_2mortal(newSVpvs(""));
c728cb41
JH
6417 /* Using is_utf8_string() is a crude hack but it may
6418 * be the best for now since we have no flag "this EXACTish
6419 * node was UTF-8" --jhi */
1df70142 6420 const bool do_utf8 = is_utf8_string((U8*)STRING(o), STR_LEN(o));
2d03de9c 6421 const char * const s = do_utf8 ?
c728cb41
JH
6422 pv_uni_display(dsv, (U8*)STRING(o), STR_LEN(o), 60,
6423 UNI_DISPLAY_REGEX) :
2a782b5b 6424 STRING(o);
e1ec3a88 6425 const int len = do_utf8 ?
2a782b5b
JH
6426 strlen(s) :
6427 STR_LEN(o);
6428 Perl_sv_catpvf(aTHX_ sv, " <%s%.*s%s>",
6429 PL_colors[0],
6430 len, s,
6431 PL_colors[1]);
bb263b4e 6432 } else if (k == TRIE) {
3dab1dad
YO
6433 Perl_sv_catpvf(aTHX_ sv, "-%s",reg_name[o->flags]);
6434 /* print the details of the trie in dumpuntil instead, as
4f639d21 6435 * prog->data isn't available here */
a3621e74 6436 } else if (k == CURLY) {
cb434fcc 6437 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
cea2e8a9
GS
6438 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
6439 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
a0d0e21e 6440 }
2c2d71f5
JH
6441 else if (k == WHILEM && o->flags) /* Ordinal/of */
6442 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
9b155405 6443 else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP )
894356b3 6444 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
9b155405 6445 else if (k == LOGICAL)
04ebc1ab 6446 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
653099ff
GS
6447 else if (k == ANYOF) {
6448 int i, rangestart = -1;
2d03de9c 6449 const U8 flags = ANYOF_FLAGS(o);
0bd48802
AL
6450
6451 /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
6452 static const char * const anyofs[] = {
653099ff
GS
6453 "\\w",
6454 "\\W",
6455 "\\s",
6456 "\\S",
6457 "\\d",
6458 "\\D",
6459 "[:alnum:]",
6460 "[:^alnum:]",
6461 "[:alpha:]",
6462 "[:^alpha:]",
6463 "[:ascii:]",
6464 "[:^ascii:]",
6465 "[:ctrl:]",
6466 "[:^ctrl:]",
6467 "[:graph:]",
6468 "[:^graph:]",
6469 "[:lower:]",
6470 "[:^lower:]",
6471 "[:print:]",
6472 "[:^print:]",
6473 "[:punct:]",
6474 "[:^punct:]",
6475 "[:upper:]",
aaa51d5e 6476 "[:^upper:]",
653099ff 6477 "[:xdigit:]",
aaa51d5e
JF
6478 "[:^xdigit:]",
6479 "[:space:]",
6480 "[:^space:]",
6481 "[:blank:]",
6482 "[:^blank:]"
653099ff
GS
6483 };
6484
19860706 6485 if (flags & ANYOF_LOCALE)
396482e1 6486 sv_catpvs(sv, "{loc}");
19860706 6487 if (flags & ANYOF_FOLD)
396482e1 6488 sv_catpvs(sv, "{i}");
653099ff 6489 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
19860706 6490 if (flags & ANYOF_INVERT)
396482e1 6491 sv_catpvs(sv, "^");
ffc61ed2
JH
6492 for (i = 0; i <= 256; i++) {
6493 if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
6494 if (rangestart == -1)
6495 rangestart = i;
6496 } else if (rangestart != -1) {
6497 if (i <= rangestart + 3)
6498 for (; rangestart < i; rangestart++)
653099ff 6499 put_byte(sv, rangestart);
ffc61ed2
JH
6500 else {
6501 put_byte(sv, rangestart);
396482e1 6502 sv_catpvs(sv, "-");
ffc61ed2 6503 put_byte(sv, i - 1);
653099ff 6504 }
ffc61ed2 6505 rangestart = -1;
653099ff 6506 }
847a199f 6507 }
ffc61ed2
JH
6508
6509 if (o->flags & ANYOF_CLASS)
bb7a0f54 6510 for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++)
ffc61ed2
JH
6511 if (ANYOF_CLASS_TEST(o,i))
6512 sv_catpv(sv, anyofs[i]);
6513
6514 if (flags & ANYOF_UNICODE)
396482e1 6515 sv_catpvs(sv, "{unicode}");
1aa99e6b 6516 else if (flags & ANYOF_UNICODE_ALL)
396482e1 6517 sv_catpvs(sv, "{unicode_all}");
ffc61ed2
JH
6518
6519 {
6520 SV *lv;
32fc9b6a 6521 SV * const sw = regclass_swash(prog, o, FALSE, &lv, 0);
b81d288d 6522
ffc61ed2
JH
6523 if (lv) {
6524 if (sw) {
89ebb4a3 6525 U8 s[UTF8_MAXBYTES_CASE+1];
b81d288d 6526
ffc61ed2 6527 for (i = 0; i <= 256; i++) { /* just the first 256 */
1df70142 6528 uvchr_to_utf8(s, i);
ffc61ed2 6529
3568d838 6530 if (i < 256 && swash_fetch(sw, s, TRUE)) {
ffc61ed2
JH
6531 if (rangestart == -1)
6532 rangestart = i;
6533 } else if (rangestart != -1) {
ffc61ed2
JH
6534 if (i <= rangestart + 3)
6535 for (; rangestart < i; rangestart++) {
2d03de9c
AL
6536 const U8 * const e = uvchr_to_utf8(s,rangestart);
6537 U8 *p;
6538 for(p = s; p < e; p++)
ffc61ed2
JH
6539 put_byte(sv, *p);
6540 }
6541 else {
2d03de9c
AL
6542 const U8 *e = uvchr_to_utf8(s,rangestart);
6543 U8 *p;
6544 for (p = s; p < e; p++)
ffc61ed2 6545 put_byte(sv, *p);
396482e1 6546 sv_catpvs(sv, "-");
2d03de9c
AL
6547 e = uvchr_to_utf8(s, i-1);
6548 for (p = s; p < e; p++)
1df70142 6549 put_byte(sv, *p);
ffc61ed2
JH
6550 }
6551 rangestart = -1;
6552 }
19860706 6553 }
ffc61ed2 6554
396482e1 6555 sv_catpvs(sv, "..."); /* et cetera */
19860706 6556 }
fde631ed 6557
ffc61ed2 6558 {
2e0de35c 6559 char *s = savesvpv(lv);
c445ea15 6560 char * const origs = s;
b81d288d 6561
3dab1dad
YO
6562 while (*s && *s != '\n')
6563 s++;
b81d288d 6564
ffc61ed2 6565 if (*s == '\n') {
2d03de9c 6566 const char * const t = ++s;
ffc61ed2
JH
6567
6568 while (*s) {
6569 if (*s == '\n')
6570 *s = ' ';
6571 s++;
6572 }
6573 if (s[-1] == ' ')
6574 s[-1] = 0;
6575
6576 sv_catpv(sv, t);
fde631ed 6577 }
b81d288d 6578
ffc61ed2 6579 Safefree(origs);
fde631ed
JH
6580 }
6581 }
653099ff 6582 }
ffc61ed2 6583
653099ff
GS
6584 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
6585 }
9b155405 6586 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
07be1b83 6587 Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
65e66c80 6588#else
96a5add6 6589 PERL_UNUSED_CONTEXT;
65e66c80
SP
6590 PERL_UNUSED_ARG(sv);
6591 PERL_UNUSED_ARG(o);
f9049ba1 6592 PERL_UNUSED_ARG(prog);
17c3b450 6593#endif /* DEBUGGING */
35ff7856 6594}
a687059c 6595
cad2e5aa
JH
6596SV *
6597Perl_re_intuit_string(pTHX_ regexp *prog)
6598{ /* Assume that RE_INTUIT is set */
97aff369 6599 dVAR;
a3621e74 6600 GET_RE_DEBUG_FLAGS_DECL;
96a5add6
AL
6601 PERL_UNUSED_CONTEXT;
6602
a3621e74 6603 DEBUG_COMPILE_r(
cfd0369c 6604 {
2d03de9c 6605 const char * const s = SvPV_nolen_const(prog->check_substr
cfd0369c 6606 ? prog->check_substr : prog->check_utf8);
cad2e5aa
JH
6607
6608 if (!PL_colorset) reginitcolors();
6609 PerlIO_printf(Perl_debug_log,
a0288114 6610 "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
33b8afdf
JH
6611 PL_colors[4],
6612 prog->check_substr ? "" : "utf8 ",
6613 PL_colors[5],PL_colors[0],
cad2e5aa
JH
6614 s,
6615 PL_colors[1],
6616 (strlen(s) > 60 ? "..." : ""));
6617 } );
6618
33b8afdf 6619 return prog->check_substr ? prog->check_substr : prog->check_utf8;
cad2e5aa
JH
6620}
6621
2b69d0c2 6622void
864dbfa3 6623Perl_pregfree(pTHX_ struct regexp *r)
a687059c 6624{
27da23d5 6625 dVAR;
9e55ce06 6626#ifdef DEBUGGING
c445ea15 6627 SV * const dsv = PERL_DEBUG_PAD_ZERO(0);
9e55ce06 6628#endif
fc32ee4a 6629 GET_RE_DEBUG_FLAGS_DECL;
a3621e74 6630
7821416a
IZ
6631 if (!r || (--r->refcnt > 0))
6632 return;
be8e71aa 6633 DEBUG_COMPILE_r(if (RX_DEBUG(r)){
c445ea15 6634 const char * const s = (r->reganch & ROPT_UTF8)
e1ec3a88 6635 ? pv_uni_display(dsv, (U8*)r->precomp, r->prelen, 60, UNI_DISPLAY_REGEX)
9f369894 6636 : pv_display(dsv, r->precomp, r->prelen, 0, 60);
e1ec3a88 6637 const int len = SvCUR(dsv);
9e55ce06
JH
6638 if (!PL_colorset)
6639 reginitcolors();
6640 PerlIO_printf(Perl_debug_log,
a3621e74 6641 "%sFreeing REx:%s %s%*.*s%s%s\n",
9e55ce06
JH
6642 PL_colors[4],PL_colors[5],PL_colors[0],
6643 len, len, s,
6644 PL_colors[1],
6645 len > 60 ? "..." : "");
6646 });
cad2e5aa 6647
43c5f42d
NC
6648 /* gcov results gave these as non-null 100% of the time, so there's no
6649 optimisation in checking them before calling Safefree */
6650 Safefree(r->precomp);
6651 Safefree(r->offsets); /* 20010421 MJD */
ed252734 6652 RX_MATCH_COPY_FREE(r);
f8c7b90f 6653#ifdef PERL_OLD_COPY_ON_WRITE
ed252734
NC
6654 if (r->saved_copy)
6655 SvREFCNT_dec(r->saved_copy);
6656#endif
a193d654
GS
6657 if (r->substrs) {
6658 if (r->anchored_substr)
6659 SvREFCNT_dec(r->anchored_substr);
33b8afdf
JH
6660 if (r->anchored_utf8)
6661 SvREFCNT_dec(r->anchored_utf8);
a193d654
GS
6662 if (r->float_substr)
6663 SvREFCNT_dec(r->float_substr);
33b8afdf
JH
6664 if (r->float_utf8)
6665 SvREFCNT_dec(r->float_utf8);
2779dcf1 6666 Safefree(r->substrs);
a193d654 6667 }
c277df42
IZ
6668 if (r->data) {
6669 int n = r->data->count;
f3548bdc
DM
6670 PAD* new_comppad = NULL;
6671 PAD* old_comppad;
4026c95a 6672 PADOFFSET refcnt;
dfad63ad 6673
c277df42 6674 while (--n >= 0) {
261faec3 6675 /* If you add a ->what type here, update the comment in regcomp.h */
c277df42
IZ
6676 switch (r->data->what[n]) {
6677 case 's':
6678 SvREFCNT_dec((SV*)r->data->data[n]);
6679 break;
653099ff
GS
6680 case 'f':
6681 Safefree(r->data->data[n]);
6682 break;
dfad63ad
HS
6683 case 'p':
6684 new_comppad = (AV*)r->data->data[n];
6685 break;
c277df42 6686 case 'o':
dfad63ad 6687 if (new_comppad == NULL)
cea2e8a9 6688 Perl_croak(aTHX_ "panic: pregfree comppad");
f3548bdc
DM
6689 PAD_SAVE_LOCAL(old_comppad,
6690 /* Watch out for global destruction's random ordering. */
c445ea15 6691 (SvTYPE(new_comppad) == SVt_PVAV) ? new_comppad : NULL
f3548bdc 6692 );
b34c0dd4 6693 OP_REFCNT_LOCK;
4026c95a
SH
6694 refcnt = OpREFCNT_dec((OP_4tree*)r->data->data[n]);
6695 OP_REFCNT_UNLOCK;
6696 if (!refcnt)
9b978d73 6697 op_free((OP_4tree*)r->data->data[n]);
9b978d73 6698
f3548bdc 6699 PAD_RESTORE_LOCAL(old_comppad);
dfad63ad
HS
6700 SvREFCNT_dec((SV*)new_comppad);
6701 new_comppad = NULL;
c277df42
IZ
6702 break;
6703 case 'n':
9e55ce06 6704 break;
07be1b83 6705 case 'T':
be8e71aa
YO
6706 { /* Aho Corasick add-on structure for a trie node.
6707 Used in stclass optimization only */
07be1b83
YO
6708 U32 refcount;
6709 reg_ac_data *aho=(reg_ac_data*)r->data->data[n];
6710 OP_REFCNT_LOCK;
6711 refcount = --aho->refcount;
6712 OP_REFCNT_UNLOCK;
6713 if ( !refcount ) {
6714 Safefree(aho->states);
6715 Safefree(aho->fail);
6716 aho->trie=NULL; /* not necessary to free this as it is
6717 handled by the 't' case */
6718 Safefree(r->data->data[n]); /* do this last!!!! */
be8e71aa 6719 Safefree(r->regstclass);
07be1b83
YO
6720 }
6721 }
6722 break;
a3621e74 6723 case 't':
07be1b83 6724 {
be8e71aa 6725 /* trie structure. */
07be1b83
YO
6726 U32 refcount;
6727 reg_trie_data *trie=(reg_trie_data*)r->data->data[n];
6728 OP_REFCNT_LOCK;
6729 refcount = --trie->refcount;
6730 OP_REFCNT_UNLOCK;
6731 if ( !refcount ) {
6732 Safefree(trie->charmap);
6733 if (trie->widecharmap)
6734 SvREFCNT_dec((SV*)trie->widecharmap);
6735 Safefree(trie->states);
6736 Safefree(trie->trans);
6737 if (trie->bitmap)
6738 Safefree(trie->bitmap);
6739 if (trie->wordlen)
6740 Safefree(trie->wordlen);
a3621e74 6741#ifdef DEBUGGING
be8e71aa
YO
6742 if (RX_DEBUG(r)) {
6743 if (trie->words)
6744 SvREFCNT_dec((SV*)trie->words);
6745 if (trie->revcharmap)
6746 SvREFCNT_dec((SV*)trie->revcharmap);
6747 }
a3621e74 6748#endif
07be1b83 6749 Safefree(r->data->data[n]); /* do this last!!!! */
a3621e74 6750 }
07be1b83
YO
6751 }
6752 break;
c277df42 6753 default:
830247a4 6754 Perl_croak(aTHX_ "panic: regfree data code '%c'", r->data->what[n]);
c277df42
IZ
6755 }
6756 }
6757 Safefree(r->data->what);
6758 Safefree(r->data);
a0d0e21e
LW
6759 }
6760 Safefree(r->startp);
6761 Safefree(r->endp);
6762 Safefree(r);
a687059c 6763}
c277df42 6764
76234dfb 6765#ifndef PERL_IN_XSUB_RE
c277df42
IZ
6766/*
6767 - regnext - dig the "next" pointer out of a node
c277df42
IZ
6768 */
6769regnode *
864dbfa3 6770Perl_regnext(pTHX_ register regnode *p)
c277df42 6771{
97aff369 6772 dVAR;
c277df42
IZ
6773 register I32 offset;
6774
3280af22 6775 if (p == &PL_regdummy)
c277df42
IZ
6776 return(NULL);
6777
6778 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
6779 if (offset == 0)
6780 return(NULL);
6781
c277df42 6782 return(p+offset);
c277df42 6783}
76234dfb 6784#endif
c277df42 6785
01f988be 6786STATIC void
cea2e8a9 6787S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
c277df42
IZ
6788{
6789 va_list args;
6790 STRLEN l1 = strlen(pat1);
6791 STRLEN l2 = strlen(pat2);
6792 char buf[512];
06bf62c7 6793 SV *msv;
73d840c0 6794 const char *message;
c277df42
IZ
6795
6796 if (l1 > 510)
6797 l1 = 510;
6798 if (l1 + l2 > 510)
6799 l2 = 510 - l1;
6800 Copy(pat1, buf, l1 , char);
6801 Copy(pat2, buf + l1, l2 , char);
3b818b81
GS
6802 buf[l1 + l2] = '\n';
6803 buf[l1 + l2 + 1] = '\0';
8736538c
AS
6804#ifdef I_STDARG
6805 /* ANSI variant takes additional second argument */
c277df42 6806 va_start(args, pat2);
8736538c
AS
6807#else
6808 va_start(args);
6809#endif
5a844595 6810 msv = vmess(buf, &args);
c277df42 6811 va_end(args);
cfd0369c 6812 message = SvPV_const(msv,l1);
c277df42
IZ
6813 if (l1 > 512)
6814 l1 = 512;
6815 Copy(message, buf, l1 , char);
197cf9b9 6816 buf[l1-1] = '\0'; /* Overwrite \n */
cea2e8a9 6817 Perl_croak(aTHX_ "%s", buf);
c277df42 6818}
a0ed51b3
LW
6819
6820/* XXX Here's a total kludge. But we need to re-enter for swash routines. */
6821
76234dfb 6822#ifndef PERL_IN_XSUB_RE
a0ed51b3 6823void
864dbfa3 6824Perl_save_re_context(pTHX)
b81d288d 6825{
97aff369 6826 dVAR;
1ade1aa1
NC
6827
6828 struct re_save_state *state;
6829
6830 SAVEVPTR(PL_curcop);
6831 SSGROW(SAVESTACK_ALLOC_FOR_RE_SAVE_STATE + 1);
6832
6833 state = (struct re_save_state *)(PL_savestack + PL_savestack_ix);
6834 PL_savestack_ix += SAVESTACK_ALLOC_FOR_RE_SAVE_STATE;
6835 SSPUSHINT(SAVEt_RE_STATE);
6836
46ab3289 6837 Copy(&PL_reg_state, state, 1, struct re_save_state);
1ade1aa1 6838
a0ed51b3 6839 PL_reg_start_tmp = 0;
a0ed51b3 6840 PL_reg_start_tmpl = 0;
c445ea15 6841 PL_reg_oldsaved = NULL;
a5db57d6 6842 PL_reg_oldsavedlen = 0;
a5db57d6 6843 PL_reg_maxiter = 0;
a5db57d6 6844 PL_reg_leftiter = 0;
c445ea15 6845 PL_reg_poscache = NULL;
a5db57d6 6846 PL_reg_poscache_size = 0;
1ade1aa1
NC
6847#ifdef PERL_OLD_COPY_ON_WRITE
6848 PL_nrs = NULL;
6849#endif
ada6e8a9 6850
c445ea15
AL
6851 /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
6852 if (PL_curpm) {
6853 const REGEXP * const rx = PM_GETRE(PL_curpm);
6854 if (rx) {
1df70142 6855 U32 i;
ada6e8a9 6856 for (i = 1; i <= rx->nparens; i++) {
1df70142 6857 char digits[TYPE_CHARS(long)];
d9fad198 6858 const STRLEN len = my_snprintf(digits, sizeof(digits), "%lu", (long)i);
49f27e4b
NC
6859 GV *const *const gvp
6860 = (GV**)hv_fetch(PL_defstash, digits, len, 0);
6861
b37c2d43
AL
6862 if (gvp) {
6863 GV * const gv = *gvp;
6864 if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
6865 save_scalar(gv);
49f27e4b 6866 }
ada6e8a9
AMS
6867 }
6868 }
6869 }
a0ed51b3 6870}
76234dfb 6871#endif
51371543 6872
51371543 6873static void
acfe0abc 6874clear_re(pTHX_ void *r)
51371543 6875{
97aff369 6876 dVAR;
51371543
GS
6877 ReREFCNT_dec((regexp *)r);
6878}
ffbc6a93 6879
a28509cc
AL
6880#ifdef DEBUGGING
6881
6882STATIC void
6883S_put_byte(pTHX_ SV *sv, int c)
6884{
6885 if (isCNTRL(c) || c == 255 || !isPRINT(c))
6886 Perl_sv_catpvf(aTHX_ sv, "\\%o", c);
6887 else if (c == '-' || c == ']' || c == '\\' || c == '^')
6888 Perl_sv_catpvf(aTHX_ sv, "\\%c", c);
6889 else
6890 Perl_sv_catpvf(aTHX_ sv, "%c", c);
6891}
6892
3dab1dad
YO
6893#define CLEAR_OPTSTART \
6894 if (optstart) STMT_START { \
07be1b83 6895 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, " (%d nodes)\n", node - optstart)); \
3dab1dad
YO
6896 optstart=NULL; \
6897 } STMT_END
6898
6899#define DUMPUNTIL(a,b,c,d,e,f) CLEAR_OPTSTART; node=dumpuntil(a,b,c,d,e,f);
6900
b5a2f8d8
NC
6901STATIC const regnode *
6902S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
6903 const regnode *last, SV* sv, I32 l)
a28509cc 6904{
97aff369 6905 dVAR;
a28509cc 6906 register U8 op = EXACT; /* Arbitrary non-END op. */
b5a2f8d8 6907 register const regnode *next;
3dab1dad
YO
6908 const regnode *optstart= NULL;
6909 GET_RE_DEBUG_FLAGS_DECL;
a28509cc
AL
6910
6911 while (op != END && (!last || node < last)) {
6912 /* While that wasn't END last time... */
6913
6914 NODE_ALIGN(node);
6915 op = OP(node);
6916 if (op == CLOSE)
6917 l--;
b5a2f8d8 6918 next = regnext((regnode *)node);
07be1b83 6919
a28509cc 6920 /* Where, what. */
8e11feef 6921 if (OP(node) == OPTIMIZED) {
e68ec53f 6922 if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
8e11feef 6923 optstart = node;
3dab1dad 6924 else
8e11feef 6925 goto after_print;
3dab1dad
YO
6926 } else
6927 CLEAR_OPTSTART;
07be1b83 6928
32fc9b6a 6929 regprop(r, sv, node);
a28509cc
AL
6930 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
6931 (int)(2*l + 1), "", SvPVX_const(sv));
3dab1dad
YO
6932
6933 if (OP(node) != OPTIMIZED) {
8e11feef
RGS
6934 if (next == NULL) /* Next ptr. */
6935 PerlIO_printf(Perl_debug_log, "(0)");
6936 else
6937 PerlIO_printf(Perl_debug_log, "(%"IVdf")", (IV)(next - start));
6938 (void)PerlIO_putc(Perl_debug_log, '\n');
3dab1dad
YO
6939 }
6940
a28509cc
AL
6941 after_print:
6942 if (PL_regkind[(U8)op] == BRANCHJ) {
be8e71aa
YO
6943 assert(next);
6944 {
6945 register const regnode *nnode = (OP(next) == LONGJMP
b5a2f8d8
NC
6946 ? regnext((regnode *)next)
6947 : next);
be8e71aa
YO
6948 if (last && nnode > last)
6949 nnode = last;
6950 DUMPUNTIL(r, start, NEXTOPER(NEXTOPER(node)), nnode, sv, l + 1);
6951 }
a28509cc
AL
6952 }
6953 else if (PL_regkind[(U8)op] == BRANCH) {
be8e71aa 6954 assert(next);
3dab1dad 6955 DUMPUNTIL(r, start, NEXTOPER(node), next, sv, l + 1);
a28509cc
AL
6956 }
6957 else if ( PL_regkind[(U8)op] == TRIE ) {
6958 const I32 n = ARG(node);
4f639d21 6959 const reg_trie_data * const trie = (reg_trie_data*)r->data->data[n];
a28509cc
AL
6960 const I32 arry_len = av_len(trie->words)+1;
6961 I32 word_idx;
6962 PerlIO_printf(Perl_debug_log,
07be1b83 6963 "%*s[StS:%"UVuf" Wds:%d Cs:%d Uq:%d #Sts:%"IVdf" Mn:%d Mx:%d",
8e11feef
RGS
6964 (int)(2*(l+3)),
6965 "",
6966 trie->startstate,
6967 TRIE_WORDCOUNT(trie),
6968 (int)TRIE_CHARCOUNT(trie),
6969 trie->uniquecharcount,
6970 (IV)TRIE_LASTSTATE(trie)-1,
be8e71aa
YO
6971 (int)trie->minlen,
6972 (int)trie->maxlen
8e11feef 6973 );
3dab1dad
YO
6974 if (trie->bitmap) {
6975 int i;
6976 int rangestart= -1;
6977 sv_setpvn(sv, "", 0);
6978 for (i = 0; i <= 256; i++) {
6979 if (i < 256 && TRIE_BITMAP_TEST(trie,i)) {
6980 if (rangestart == -1)
6981 rangestart = i;
6982 } else if (rangestart != -1) {
6983 if (i <= rangestart + 3)
6984 for (; rangestart < i; rangestart++)
6985 put_byte(sv, rangestart);
6986 else {
6987 put_byte(sv, rangestart);
6988 sv_catpvs(sv, "-");
6989 put_byte(sv, i - 1);
6990 }
6991 rangestart = -1;
6992 }
6993 }
07be1b83 6994 PerlIO_printf(Perl_debug_log, " Stcls:%s]\n", SvPVX_const(sv));
3dab1dad 6995 } else
07be1b83 6996 PerlIO_printf(Perl_debug_log, " No-Stcls]\n");
a28509cc
AL
6997
6998 for (word_idx=0; word_idx < arry_len; word_idx++) {
097eb12c 6999 SV ** const elem_ptr = av_fetch(trie->words,word_idx,0);
a28509cc
AL
7000 if (elem_ptr) {
7001 PerlIO_printf(Perl_debug_log, "%*s<%s%s%s>\n",
7002 (int)(2*(l+4)), "",
7003 PL_colors[0],
cfd0369c 7004 SvPV_nolen_const(*elem_ptr),
a28509cc
AL
7005 PL_colors[1]
7006 );
a28509cc 7007 }
a28509cc
AL
7008 }
7009
7010 node = NEXTOPER(node);
7011 node += regarglen[(U8)op];
7012
7013 }
7014 else if ( op == CURLY) { /* "next" might be very big: optimizer */
3dab1dad 7015 DUMPUNTIL(r, start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
a28509cc
AL
7016 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1, sv, l + 1);
7017 }
7018 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
be8e71aa 7019 assert(next);
3dab1dad 7020 DUMPUNTIL(r, start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
a28509cc
AL
7021 next, sv, l + 1);
7022 }
7023 else if ( op == PLUS || op == STAR) {
3dab1dad 7024 DUMPUNTIL(r, start, NEXTOPER(node), NEXTOPER(node) + 1, sv, l + 1);
a28509cc
AL
7025 }
7026 else if (op == ANYOF) {
7027 /* arglen 1 + class block */
7028 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_LARGE)
7029 ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
7030 node = NEXTOPER(node);
7031 }
7032 else if (PL_regkind[(U8)op] == EXACT) {
7033 /* Literal string, where present. */
7034 node += NODE_SZ_STR(node) - 1;
7035 node = NEXTOPER(node);
7036 }
7037 else {
7038 node = NEXTOPER(node);
7039 node += regarglen[(U8)op];
7040 }
7041 if (op == CURLYX || op == OPEN)
7042 l++;
7043 else if (op == WHILEM)
7044 l--;
7045 }
3dab1dad 7046 CLEAR_OPTSTART;
a28509cc
AL
7047 return node;
7048}
7049
7050#endif /* DEBUGGING */
7051
241d1a3b
NC
7052/*
7053 * Local variables:
7054 * c-indentation-style: bsd
7055 * c-basic-offset: 4
7056 * indent-tabs-mode: t
7057 * End:
7058 *
37442d52
RGS
7059 * ex: set ts=8 sts=4 sw=4 noet:
7060 */