This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
; was RE: [PATCH perlio.c pp.c regcomp.c] silence format and comparison warnings
[perl5.git] / regcomp.c
CommitLineData
a0d0e21e
LW
1/* regcomp.c
2 */
3
4/*
5 * "A fair jaw-cracker dwarf-language must be." --Samwise Gamgee
6 */
7
61296642
DM
8/* This file contains functions for compiling a regular expression. See
9 * also regexec.c which funnily enough, contains functions for executing
166f8a29 10 * a regular expression.
e4a054ea
DM
11 *
12 * This file is also copied at build time to ext/re/re_comp.c, where
13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14 * This causes the main functions to be compiled under new names and with
15 * debugging support added, which makes "use re 'debug'" work.
166f8a29
DM
16 */
17
a687059c
LW
18/* NOTE: this is derived from Henry Spencer's regexp code, and should not
19 * confused with the original package (see point 3 below). Thanks, Henry!
20 */
21
22/* Additional note: this code is very heavily munged from Henry's version
23 * in places. In some spots I've traded clarity for efficiency, so don't
24 * blame Henry for some of the lack of readability.
25 */
26
e50aee73
AD
27/* The names of the functions have been changed from regcomp and
28 * regexec to pregcomp and pregexec in order to avoid conflicts
29 * with the POSIX routines of the same names.
30*/
31
b9d5759e 32#ifdef PERL_EXT_RE_BUILD
54df2634 33#include "re_top.h"
b81d288d 34#endif
56953603 35
a687059c 36/*
e50aee73 37 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
38 *
39 * Copyright (c) 1986 by University of Toronto.
40 * Written by Henry Spencer. Not derived from licensed software.
41 *
42 * Permission is granted to anyone to use this software for any
43 * purpose on any computer system, and to redistribute it freely,
44 * subject to the following restrictions:
45 *
46 * 1. The author is not responsible for the consequences of use of
47 * this software, no matter how awful, even if they arise
48 * from defects in it.
49 *
50 * 2. The origin of this software must not be misrepresented, either
51 * by explicit claim or by omission.
52 *
53 * 3. Altered versions must be plainly marked as such, and must not
54 * be misrepresented as being the original software.
55 *
56 *
57 **** Alterations to Henry's code are...
58 ****
4bb101f2 59 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
b94e2f88 60 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, by Larry Wall and others
a687059c 61 ****
9ef589d8
LW
62 **** You may distribute under the terms of either the GNU General Public
63 **** License or the Artistic License, as specified in the README file.
64
a687059c
LW
65 *
66 * Beware that some of this code is subtly aware of the way operator
67 * precedence is structured in regular expressions. Serious changes in
68 * regular-expression syntax might require a total rethink.
69 */
70#include "EXTERN.h"
864dbfa3 71#define PERL_IN_REGCOMP_C
a687059c 72#include "perl.h"
d06ea78c 73
acfe0abc 74#ifndef PERL_IN_XSUB_RE
d06ea78c
GS
75# include "INTERN.h"
76#endif
c277df42
IZ
77
78#define REG_COMP_C
54df2634
NC
79#ifdef PERL_IN_XSUB_RE
80# include "re_comp.h"
81#else
82# include "regcomp.h"
83#endif
a687059c 84
d4cce5f1 85#ifdef op
11343788 86#undef op
d4cce5f1 87#endif /* op */
11343788 88
fe14fcc3 89#ifdef MSDOS
7e4e8c89 90# if defined(BUGGY_MSC6)
fe14fcc3 91 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
7e4e8c89 92# pragma optimize("a",off)
fe14fcc3 93 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
7e4e8c89
NC
94# pragma optimize("w",on )
95# endif /* BUGGY_MSC6 */
fe14fcc3
LW
96#endif /* MSDOS */
97
a687059c
LW
98#ifndef STATIC
99#define STATIC static
100#endif
101
830247a4 102typedef struct RExC_state_t {
e2509266 103 U32 flags; /* are we folding, multilining? */
830247a4
IZ
104 char *precomp; /* uncompiled string. */
105 regexp *rx;
fac92740 106 char *start; /* Start of input for compile */
830247a4
IZ
107 char *end; /* End of input for compile */
108 char *parse; /* Input-scan pointer. */
109 I32 whilem_seen; /* number of WHILEM in this expr */
fac92740 110 regnode *emit_start; /* Start of emitted-code area */
ffc61ed2 111 regnode *emit; /* Code-emit pointer; &regdummy = don't = compiling */
830247a4
IZ
112 I32 naughty; /* How bad is this pattern? */
113 I32 sawback; /* Did we see \1, ...? */
114 U32 seen;
115 I32 size; /* Code size. */
116 I32 npar; /* () count. */
117 I32 extralen;
118 I32 seen_zerolen;
119 I32 seen_evals;
1aa99e6b 120 I32 utf8;
830247a4
IZ
121#if ADD_TO_REGEXEC
122 char *starttry; /* -Dr: where regtry was called. */
123#define RExC_starttry (pRExC_state->starttry)
124#endif
3dab1dad 125#ifdef DEBUGGING
be8e71aa 126 const char *lastparse;
3dab1dad
YO
127 I32 lastnum;
128#define RExC_lastparse (pRExC_state->lastparse)
129#define RExC_lastnum (pRExC_state->lastnum)
130#endif
830247a4
IZ
131} RExC_state_t;
132
e2509266 133#define RExC_flags (pRExC_state->flags)
830247a4
IZ
134#define RExC_precomp (pRExC_state->precomp)
135#define RExC_rx (pRExC_state->rx)
fac92740 136#define RExC_start (pRExC_state->start)
830247a4
IZ
137#define RExC_end (pRExC_state->end)
138#define RExC_parse (pRExC_state->parse)
139#define RExC_whilem_seen (pRExC_state->whilem_seen)
fac92740 140#define RExC_offsets (pRExC_state->rx->offsets) /* I am not like the others */
830247a4 141#define RExC_emit (pRExC_state->emit)
fac92740 142#define RExC_emit_start (pRExC_state->emit_start)
830247a4
IZ
143#define RExC_naughty (pRExC_state->naughty)
144#define RExC_sawback (pRExC_state->sawback)
145#define RExC_seen (pRExC_state->seen)
146#define RExC_size (pRExC_state->size)
147#define RExC_npar (pRExC_state->npar)
148#define RExC_extralen (pRExC_state->extralen)
149#define RExC_seen_zerolen (pRExC_state->seen_zerolen)
150#define RExC_seen_evals (pRExC_state->seen_evals)
1aa99e6b 151#define RExC_utf8 (pRExC_state->utf8)
830247a4 152
a687059c
LW
153#define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
154#define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
155 ((*s) == '{' && regcurly(s)))
a687059c 156
35c8bce7
LW
157#ifdef SPSTART
158#undef SPSTART /* dratted cpp namespace... */
159#endif
a687059c
LW
160/*
161 * Flags to be passed up and down.
162 */
a687059c 163#define WORST 0 /* Worst case. */
821b33a5 164#define HASWIDTH 0x1 /* Known to match non-null strings. */
a0d0e21e
LW
165#define SIMPLE 0x2 /* Simple enough to be STAR/PLUS operand. */
166#define SPSTART 0x4 /* Starts with * or +. */
167#define TRYAGAIN 0x8 /* Weeded out a declaration. */
a687059c 168
3dab1dad
YO
169#define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
170
07be1b83
YO
171/* whether trie related optimizations are enabled */
172#if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
173#define TRIE_STUDY_OPT
786e8c11 174#define FULL_TRIE_STUDY
07be1b83
YO
175#define TRIE_STCLASS
176#endif
2c2d71f5
JH
177/* Length of a variant. */
178
179typedef struct scan_data_t {
180 I32 len_min;
181 I32 len_delta;
182 I32 pos_min;
183 I32 pos_delta;
184 SV *last_found;
185 I32 last_end; /* min value, <0 unless valid. */
186 I32 last_start_min;
187 I32 last_start_max;
188 SV **longest; /* Either &l_fixed, or &l_float. */
189 SV *longest_fixed;
190 I32 offset_fixed;
191 SV *longest_float;
192 I32 offset_float_min;
193 I32 offset_float_max;
194 I32 flags;
195 I32 whilem_c;
cb434fcc 196 I32 *last_closep;
653099ff 197 struct regnode_charclass_class *start_class;
2c2d71f5
JH
198} scan_data_t;
199
a687059c 200/*
e50aee73 201 * Forward declarations for pregcomp()'s friends.
a687059c 202 */
a0d0e21e 203
27da23d5
JH
204static const scan_data_t zero_scan_data =
205 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
c277df42
IZ
206
207#define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
07be1b83
YO
208#define SF_BEFORE_SEOL 0x0001
209#define SF_BEFORE_MEOL 0x0002
c277df42
IZ
210#define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
211#define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
212
09b7f37c
CB
213#ifdef NO_UNARY_PLUS
214# define SF_FIX_SHIFT_EOL (0+2)
215# define SF_FL_SHIFT_EOL (0+4)
216#else
217# define SF_FIX_SHIFT_EOL (+2)
218# define SF_FL_SHIFT_EOL (+4)
219#endif
c277df42
IZ
220
221#define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
222#define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
223
224#define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
225#define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
07be1b83
YO
226#define SF_IS_INF 0x0040
227#define SF_HAS_PAR 0x0080
228#define SF_IN_PAR 0x0100
229#define SF_HAS_EVAL 0x0200
230#define SCF_DO_SUBSTR 0x0400
653099ff
GS
231#define SCF_DO_STCLASS_AND 0x0800
232#define SCF_DO_STCLASS_OR 0x1000
233#define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
e1901655 234#define SCF_WHILEM_VISITED_POS 0x2000
c277df42 235
786e8c11
YO
236#define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */
237
07be1b83 238
eb160463 239#define UTF (RExC_utf8 != 0)
e2509266
JH
240#define LOC ((RExC_flags & PMf_LOCALE) != 0)
241#define FOLD ((RExC_flags & PMf_FOLD) != 0)
a0ed51b3 242
ffc61ed2 243#define OOB_UNICODE 12345678
93733859 244#define OOB_NAMEDCLASS -1
b8c5462f 245
a0ed51b3
LW
246#define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
247#define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
248
8615cb43 249
b45f050a
JF
250/* length of regex to show in messages that don't mark a position within */
251#define RegexLengthToShowInErrorMessages 127
252
253/*
254 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
255 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
256 * op/pragma/warn/regcomp.
257 */
7253e4e3
RK
258#define MARKER1 "<-- HERE" /* marker as it appears in the description */
259#define MARKER2 " <-- HERE " /* marker as it appears within the regex */
b81d288d 260
7253e4e3 261#define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
b45f050a
JF
262
263/*
264 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
265 * arg. Show regex, up to a maximum length. If it's too long, chop and add
266 * "...".
267 */
ccb2c380 268#define FAIL(msg) STMT_START { \
bfed75c6 269 const char *ellipses = ""; \
ccb2c380
MP
270 IV len = RExC_end - RExC_precomp; \
271 \
272 if (!SIZE_ONLY) \
273 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
274 if (len > RegexLengthToShowInErrorMessages) { \
275 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
276 len = RegexLengthToShowInErrorMessages - 10; \
277 ellipses = "..."; \
278 } \
279 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
280 msg, (int)len, RExC_precomp, ellipses); \
281} STMT_END
8615cb43 282
b45f050a 283/*
b45f050a
JF
284 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
285 */
ccb2c380 286#define Simple_vFAIL(m) STMT_START { \
a28509cc 287 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
288 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
289 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
290} STMT_END
b45f050a
JF
291
292/*
293 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
294 */
ccb2c380
MP
295#define vFAIL(m) STMT_START { \
296 if (!SIZE_ONLY) \
297 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
298 Simple_vFAIL(m); \
299} STMT_END
b45f050a
JF
300
301/*
302 * Like Simple_vFAIL(), but accepts two arguments.
303 */
ccb2c380 304#define Simple_vFAIL2(m,a1) STMT_START { \
a28509cc 305 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
306 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
307 (int)offset, RExC_precomp, RExC_precomp + offset); \
308} STMT_END
b45f050a
JF
309
310/*
311 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
312 */
ccb2c380
MP
313#define vFAIL2(m,a1) STMT_START { \
314 if (!SIZE_ONLY) \
315 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
316 Simple_vFAIL2(m, a1); \
317} STMT_END
b45f050a
JF
318
319
320/*
321 * Like Simple_vFAIL(), but accepts three arguments.
322 */
ccb2c380 323#define Simple_vFAIL3(m, a1, a2) STMT_START { \
a28509cc 324 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
325 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
326 (int)offset, RExC_precomp, RExC_precomp + offset); \
327} STMT_END
b45f050a
JF
328
329/*
330 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
331 */
ccb2c380
MP
332#define vFAIL3(m,a1,a2) STMT_START { \
333 if (!SIZE_ONLY) \
334 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
335 Simple_vFAIL3(m, a1, a2); \
336} STMT_END
b45f050a
JF
337
338/*
339 * Like Simple_vFAIL(), but accepts four arguments.
340 */
ccb2c380 341#define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
a28509cc 342 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
343 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
344 (int)offset, RExC_precomp, RExC_precomp + offset); \
345} STMT_END
b45f050a 346
ccb2c380 347#define vWARN(loc,m) STMT_START { \
a28509cc 348 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
349 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \
350 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
351} STMT_END
352
353#define vWARNdep(loc,m) STMT_START { \
a28509cc 354 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
355 Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
356 "%s" REPORT_LOCATION, \
357 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
358} STMT_END
359
360
361#define vWARN2(loc, m, a1) STMT_START { \
a28509cc 362 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
363 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
364 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
365} STMT_END
366
367#define vWARN3(loc, m, a1, a2) STMT_START { \
a28509cc 368 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
369 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
370 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
371} STMT_END
372
373#define vWARN4(loc, m, a1, a2, a3) STMT_START { \
a28509cc 374 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
375 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
376 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
377} STMT_END
378
379#define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
a28509cc 380 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
381 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
382 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
383} STMT_END
9d1d55b5 384
8615cb43 385
cd439c50 386/* Allow for side effects in s */
ccb2c380
MP
387#define REGC(c,s) STMT_START { \
388 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
389} STMT_END
cd439c50 390
fac92740
MJD
391/* Macros for recording node offsets. 20001227 mjd@plover.com
392 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
393 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
394 * Element 0 holds the number n.
07be1b83 395 * Position is 1 indexed.
fac92740
MJD
396 */
397
ccb2c380
MP
398#define Set_Node_Offset_To_R(node,byte) STMT_START { \
399 if (! SIZE_ONLY) { \
400 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
07be1b83 401 __LINE__, (node), (int)(byte))); \
ccb2c380 402 if((node) < 0) { \
551405c4 403 Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
ccb2c380
MP
404 } else { \
405 RExC_offsets[2*(node)-1] = (byte); \
406 } \
407 } \
408} STMT_END
409
410#define Set_Node_Offset(node,byte) \
411 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
412#define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
413
414#define Set_Node_Length_To_R(node,len) STMT_START { \
415 if (! SIZE_ONLY) { \
416 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
551405c4 417 __LINE__, (int)(node), (int)(len))); \
ccb2c380 418 if((node) < 0) { \
551405c4 419 Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
ccb2c380
MP
420 } else { \
421 RExC_offsets[2*(node)] = (len); \
422 } \
423 } \
424} STMT_END
425
426#define Set_Node_Length(node,len) \
427 Set_Node_Length_To_R((node)-RExC_emit_start, len)
428#define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
429#define Set_Node_Cur_Length(node) \
430 Set_Node_Length(node, RExC_parse - parse_start)
fac92740
MJD
431
432/* Get offsets and lengths */
433#define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
434#define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
435
07be1b83
YO
436#define Set_Node_Offset_Length(node,offset,len) STMT_START { \
437 Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
438 Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
439} STMT_END
440
441
442#if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
443#define EXPERIMENTAL_INPLACESCAN
444#endif
445
acfe0abc 446static void clear_re(pTHX_ void *r);
4327152a 447
653099ff 448/* Mark that we cannot extend a found fixed substring at this point.
786e8c11 449 Update the longest found anchored substring and the longest found
653099ff
GS
450 floating substrings if needed. */
451
4327152a 452STATIC void
097eb12c 453S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data)
c277df42 454{
e1ec3a88
AL
455 const STRLEN l = CHR_SVLEN(data->last_found);
456 const STRLEN old_l = CHR_SVLEN(*data->longest);
b81d288d 457
c277df42 458 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
6b43b216 459 SvSetMagicSV(*data->longest, data->last_found);
c277df42
IZ
460 if (*data->longest == data->longest_fixed) {
461 data->offset_fixed = l ? data->last_start_min : data->pos_min;
462 if (data->flags & SF_BEFORE_EOL)
b81d288d 463 data->flags
c277df42
IZ
464 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
465 else
466 data->flags &= ~SF_FIX_BEFORE_EOL;
a0ed51b3
LW
467 }
468 else {
c277df42 469 data->offset_float_min = l ? data->last_start_min : data->pos_min;
b81d288d
AB
470 data->offset_float_max = (l
471 ? data->last_start_max
c277df42 472 : data->pos_min + data->pos_delta);
9051bda5
HS
473 if ((U32)data->offset_float_max > (U32)I32_MAX)
474 data->offset_float_max = I32_MAX;
c277df42 475 if (data->flags & SF_BEFORE_EOL)
b81d288d 476 data->flags
c277df42
IZ
477 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
478 else
479 data->flags &= ~SF_FL_BEFORE_EOL;
480 }
481 }
482 SvCUR_set(data->last_found, 0);
0eda9292 483 {
a28509cc 484 SV * const sv = data->last_found;
097eb12c
AL
485 if (SvUTF8(sv) && SvMAGICAL(sv)) {
486 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
487 if (mg)
488 mg->mg_len = 0;
489 }
0eda9292 490 }
c277df42
IZ
491 data->last_end = -1;
492 data->flags &= ~SF_BEFORE_EOL;
493}
494
653099ff
GS
495/* Can match anything (initialization) */
496STATIC void
097eb12c 497S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 498{
653099ff 499 ANYOF_CLASS_ZERO(cl);
f8bef550 500 ANYOF_BITMAP_SETALL(cl);
1aa99e6b 501 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
653099ff
GS
502 if (LOC)
503 cl->flags |= ANYOF_LOCALE;
504}
505
506/* Can match anything (initialization) */
507STATIC int
5f66b61c 508S_cl_is_anything(const struct regnode_charclass_class *cl)
653099ff
GS
509{
510 int value;
511
aaa51d5e 512 for (value = 0; value <= ANYOF_MAX; value += 2)
653099ff
GS
513 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
514 return 1;
1aa99e6b
IH
515 if (!(cl->flags & ANYOF_UNICODE_ALL))
516 return 0;
10edeb5d 517 if (!ANYOF_BITMAP_TESTALLSET((const void*)cl))
f8bef550 518 return 0;
653099ff
GS
519 return 1;
520}
521
522/* Can match anything (initialization) */
523STATIC void
097eb12c 524S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 525{
8ecf7187 526 Zero(cl, 1, struct regnode_charclass_class);
653099ff 527 cl->type = ANYOF;
830247a4 528 cl_anything(pRExC_state, cl);
653099ff
GS
529}
530
531STATIC void
097eb12c 532S_cl_init_zero(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 533{
8ecf7187 534 Zero(cl, 1, struct regnode_charclass_class);
653099ff 535 cl->type = ANYOF;
830247a4 536 cl_anything(pRExC_state, cl);
653099ff
GS
537 if (LOC)
538 cl->flags |= ANYOF_LOCALE;
539}
540
541/* 'And' a given class with another one. Can create false positives */
542/* We assume that cl is not inverted */
543STATIC void
5f66b61c 544S_cl_and(struct regnode_charclass_class *cl,
a28509cc 545 const struct regnode_charclass_class *and_with)
653099ff 546{
653099ff
GS
547 if (!(and_with->flags & ANYOF_CLASS)
548 && !(cl->flags & ANYOF_CLASS)
549 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
550 && !(and_with->flags & ANYOF_FOLD)
551 && !(cl->flags & ANYOF_FOLD)) {
552 int i;
553
554 if (and_with->flags & ANYOF_INVERT)
555 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
556 cl->bitmap[i] &= ~and_with->bitmap[i];
557 else
558 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
559 cl->bitmap[i] &= and_with->bitmap[i];
560 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
561 if (!(and_with->flags & ANYOF_EOS))
562 cl->flags &= ~ANYOF_EOS;
1aa99e6b 563
14ebb1a2
JH
564 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE &&
565 !(and_with->flags & ANYOF_INVERT)) {
1aa99e6b
IH
566 cl->flags &= ~ANYOF_UNICODE_ALL;
567 cl->flags |= ANYOF_UNICODE;
568 ARG_SET(cl, ARG(and_with));
569 }
14ebb1a2
JH
570 if (!(and_with->flags & ANYOF_UNICODE_ALL) &&
571 !(and_with->flags & ANYOF_INVERT))
1aa99e6b 572 cl->flags &= ~ANYOF_UNICODE_ALL;
14ebb1a2
JH
573 if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)) &&
574 !(and_with->flags & ANYOF_INVERT))
1aa99e6b 575 cl->flags &= ~ANYOF_UNICODE;
653099ff
GS
576}
577
578/* 'OR' a given class with another one. Can create false positives */
579/* We assume that cl is not inverted */
580STATIC void
097eb12c 581S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
653099ff 582{
653099ff
GS
583 if (or_with->flags & ANYOF_INVERT) {
584 /* We do not use
585 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
586 * <= (B1 | !B2) | (CL1 | !CL2)
587 * which is wasteful if CL2 is small, but we ignore CL2:
588 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
589 * XXXX Can we handle case-fold? Unclear:
590 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
591 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
592 */
593 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
594 && !(or_with->flags & ANYOF_FOLD)
595 && !(cl->flags & ANYOF_FOLD) ) {
596 int i;
597
598 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
599 cl->bitmap[i] |= ~or_with->bitmap[i];
600 } /* XXXX: logic is complicated otherwise */
601 else {
830247a4 602 cl_anything(pRExC_state, cl);
653099ff
GS
603 }
604 } else {
605 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
606 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
b81d288d 607 && (!(or_with->flags & ANYOF_FOLD)
653099ff
GS
608 || (cl->flags & ANYOF_FOLD)) ) {
609 int i;
610
611 /* OR char bitmap and class bitmap separately */
612 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
613 cl->bitmap[i] |= or_with->bitmap[i];
614 if (or_with->flags & ANYOF_CLASS) {
615 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
616 cl->classflags[i] |= or_with->classflags[i];
617 cl->flags |= ANYOF_CLASS;
618 }
619 }
620 else { /* XXXX: logic is complicated, leave it along for a moment. */
830247a4 621 cl_anything(pRExC_state, cl);
653099ff
GS
622 }
623 }
624 if (or_with->flags & ANYOF_EOS)
625 cl->flags |= ANYOF_EOS;
1aa99e6b
IH
626
627 if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
628 ARG(cl) != ARG(or_with)) {
629 cl->flags |= ANYOF_UNICODE_ALL;
630 cl->flags &= ~ANYOF_UNICODE;
631 }
632 if (or_with->flags & ANYOF_UNICODE_ALL) {
633 cl->flags |= ANYOF_UNICODE_ALL;
634 cl->flags &= ~ANYOF_UNICODE;
635 }
653099ff
GS
636}
637
a3621e74
YO
638#define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
639#define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
640#define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
641#define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
642
3dab1dad
YO
643
644#ifdef DEBUGGING
07be1b83 645/*
3dab1dad
YO
646 dump_trie(trie)
647 dump_trie_interim_list(trie,next_alloc)
648 dump_trie_interim_table(trie,next_alloc)
649
650 These routines dump out a trie in a somewhat readable format.
07be1b83
YO
651 The _interim_ variants are used for debugging the interim
652 tables that are used to generate the final compressed
653 representation which is what dump_trie expects.
654
3dab1dad
YO
655 Part of the reason for their existance is to provide a form
656 of documentation as to how the different representations function.
07be1b83
YO
657
658*/
3dab1dad
YO
659
660/*
661 dump_trie(trie)
662 Dumps the final compressed table form of the trie to Perl_debug_log.
663 Used for debugging make_trie().
664*/
665
666STATIC void
667S_dump_trie(pTHX_ const struct _reg_trie_data *trie,U32 depth)
668{
669 U32 state;
ab3bbdeb
YO
670 SV *sv=sv_newmortal();
671 int colwidth= trie->widecharmap ? 6 : 4;
3dab1dad
YO
672 GET_RE_DEBUG_FLAGS_DECL;
673
ab3bbdeb 674
3dab1dad
YO
675 PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
676 (int)depth * 2 + 2,"",
677 "Match","Base","Ofs" );
678
679 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
be8e71aa 680 SV ** const tmp = av_fetch( trie->revcharmap, state, 0);
3dab1dad 681 if ( tmp ) {
ab3bbdeb
YO
682 PerlIO_printf( Perl_debug_log, "%*s",
683 colwidth,
ddc5bc0f 684 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
685 PL_colors[0], PL_colors[1],
686 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
687 PERL_PV_ESCAPE_FIRSTCHAR
688 )
689 );
3dab1dad
YO
690 }
691 }
692 PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
693 (int)depth * 2 + 2,"");
694
695 for( state = 0 ; state < trie->uniquecharcount ; state++ )
ab3bbdeb 696 PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
3dab1dad
YO
697 PerlIO_printf( Perl_debug_log, "\n");
698
786e8c11 699 for( state = 1 ; state < trie->laststate ; state++ ) {
be8e71aa 700 const U32 base = trie->states[ state ].trans.base;
3dab1dad
YO
701
702 PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
703
704 if ( trie->states[ state ].wordnum ) {
705 PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
706 } else {
707 PerlIO_printf( Perl_debug_log, "%6s", "" );
708 }
709
710 PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
711
712 if ( base ) {
713 U32 ofs = 0;
714
715 while( ( base + ofs < trie->uniquecharcount ) ||
716 ( base + ofs - trie->uniquecharcount < trie->lasttrans
717 && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
718 ofs++;
719
720 PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
721
722 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
723 if ( ( base + ofs >= trie->uniquecharcount ) &&
724 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
725 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
726 {
ab3bbdeb
YO
727 PerlIO_printf( Perl_debug_log, "%*"UVXf,
728 colwidth,
3dab1dad
YO
729 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
730 } else {
ab3bbdeb 731 PerlIO_printf( Perl_debug_log, "%*s",colwidth," ." );
3dab1dad
YO
732 }
733 }
734
735 PerlIO_printf( Perl_debug_log, "]");
736
737 }
738 PerlIO_printf( Perl_debug_log, "\n" );
739 }
740}
741/*
742 dump_trie_interim_list(trie,next_alloc)
743 Dumps a fully constructed but uncompressed trie in list form.
744 List tries normally only are used for construction when the number of
745 possible chars (trie->uniquecharcount) is very high.
746 Used for debugging make_trie().
747*/
748STATIC void
749S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie, U32 next_alloc,U32 depth)
750{
751 U32 state;
ab3bbdeb
YO
752 SV *sv=sv_newmortal();
753 int colwidth= trie->widecharmap ? 6 : 4;
3dab1dad
YO
754 GET_RE_DEBUG_FLAGS_DECL;
755 /* print out the table precompression. */
ab3bbdeb
YO
756 PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
757 (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
758 "------:-----+-----------------\n" );
3dab1dad
YO
759
760 for( state=1 ; state < next_alloc ; state ++ ) {
761 U16 charid;
762
ab3bbdeb 763 PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
3dab1dad
YO
764 (int)depth * 2 + 2,"", (UV)state );
765 if ( ! trie->states[ state ].wordnum ) {
766 PerlIO_printf( Perl_debug_log, "%5s| ","");
767 } else {
768 PerlIO_printf( Perl_debug_log, "W%4x| ",
769 trie->states[ state ].wordnum
770 );
771 }
772 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
be8e71aa 773 SV ** const tmp = av_fetch( trie->revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
ab3bbdeb
YO
774 if ( tmp ) {
775 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
776 colwidth,
ddc5bc0f 777 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
778 PL_colors[0], PL_colors[1],
779 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
780 PERL_PV_ESCAPE_FIRSTCHAR
781 ) ,
3dab1dad
YO
782 TRIE_LIST_ITEM(state,charid).forid,
783 (UV)TRIE_LIST_ITEM(state,charid).newstate
784 );
785 }
ab3bbdeb
YO
786 }
787 PerlIO_printf( Perl_debug_log, "\n");
3dab1dad
YO
788 }
789}
790
791/*
792 dump_trie_interim_table(trie,next_alloc)
793 Dumps a fully constructed but uncompressed trie in table form.
794 This is the normal DFA style state transition table, with a few
795 twists to facilitate compression later.
796 Used for debugging make_trie().
797*/
798STATIC void
799S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie, U32 next_alloc, U32 depth)
800{
801 U32 state;
802 U16 charid;
ab3bbdeb
YO
803 SV *sv=sv_newmortal();
804 int colwidth= trie->widecharmap ? 6 : 4;
3dab1dad
YO
805 GET_RE_DEBUG_FLAGS_DECL;
806
807 /*
808 print out the table precompression so that we can do a visual check
809 that they are identical.
810 */
811
812 PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
813
814 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
be8e71aa 815 SV ** const tmp = av_fetch( trie->revcharmap, charid, 0);
3dab1dad 816 if ( tmp ) {
ab3bbdeb
YO
817 PerlIO_printf( Perl_debug_log, "%*s",
818 colwidth,
ddc5bc0f 819 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
820 PL_colors[0], PL_colors[1],
821 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
822 PERL_PV_ESCAPE_FIRSTCHAR
823 )
824 );
3dab1dad
YO
825 }
826 }
827
828 PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
829
830 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
ab3bbdeb 831 PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
3dab1dad
YO
832 }
833
834 PerlIO_printf( Perl_debug_log, "\n" );
835
836 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
837
838 PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
839 (int)depth * 2 + 2,"",
840 (UV)TRIE_NODENUM( state ) );
841
842 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
ab3bbdeb
YO
843 UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
844 if (v)
845 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
846 else
847 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
3dab1dad
YO
848 }
849 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
850 PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
851 } else {
852 PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
853 trie->states[ TRIE_NODENUM( state ) ].wordnum );
854 }
855 }
07be1b83 856}
3dab1dad
YO
857
858#endif
859
786e8c11
YO
860/* make_trie(startbranch,first,last,tail,word_count,flags,depth)
861 startbranch: the first branch in the whole branch sequence
862 first : start branch of sequence of branch-exact nodes.
863 May be the same as startbranch
864 last : Thing following the last branch.
865 May be the same as tail.
866 tail : item following the branch sequence
867 count : words in the sequence
868 flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/
869 depth : indent depth
3dab1dad 870
786e8c11 871Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
07be1b83 872
786e8c11
YO
873A trie is an N'ary tree where the branches are determined by digital
874decomposition of the key. IE, at the root node you look up the 1st character and
875follow that branch repeat until you find the end of the branches. Nodes can be
876marked as "accepting" meaning they represent a complete word. Eg:
07be1b83 877
786e8c11 878 /he|she|his|hers/
72f13be8 879
786e8c11
YO
880would convert into the following structure. Numbers represent states, letters
881following numbers represent valid transitions on the letter from that state, if
882the number is in square brackets it represents an accepting state, otherwise it
883will be in parenthesis.
07be1b83 884
786e8c11
YO
885 +-h->+-e->[3]-+-r->(8)-+-s->[9]
886 | |
887 | (2)
888 | |
889 (1) +-i->(6)-+-s->[7]
890 |
891 +-s->(3)-+-h->(4)-+-e->[5]
07be1b83 892
786e8c11
YO
893 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
894
895This shows that when matching against the string 'hers' we will begin at state 1
896read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
897then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
898is also accepting. Thus we know that we can match both 'he' and 'hers' with a
899single traverse. We store a mapping from accepting to state to which word was
900matched, and then when we have multiple possibilities we try to complete the
901rest of the regex in the order in which they occured in the alternation.
902
903The only prior NFA like behaviour that would be changed by the TRIE support is
904the silent ignoring of duplicate alternations which are of the form:
905
906 / (DUPE|DUPE) X? (?{ ... }) Y /x
907
908Thus EVAL blocks follwing a trie may be called a different number of times with
909and without the optimisation. With the optimisations dupes will be silently
910ignored. This inconsistant behaviour of EVAL type nodes is well established as
911the following demonstrates:
912
913 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
914
915which prints out 'word' three times, but
916
917 'words'=~/(word|word|word)(?{ print $1 })S/
918
919which doesnt print it out at all. This is due to other optimisations kicking in.
920
921Example of what happens on a structural level:
922
923The regexp /(ac|ad|ab)+/ will produce the folowing debug output:
924
925 1: CURLYM[1] {1,32767}(18)
926 5: BRANCH(8)
927 6: EXACT <ac>(16)
928 8: BRANCH(11)
929 9: EXACT <ad>(16)
930 11: BRANCH(14)
931 12: EXACT <ab>(16)
932 16: SUCCEED(0)
933 17: NOTHING(18)
934 18: END(0)
935
936This would be optimizable with startbranch=5, first=5, last=16, tail=16
937and should turn into:
938
939 1: CURLYM[1] {1,32767}(18)
940 5: TRIE(16)
941 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
942 <ac>
943 <ad>
944 <ab>
945 16: SUCCEED(0)
946 17: NOTHING(18)
947 18: END(0)
948
949Cases where tail != last would be like /(?foo|bar)baz/:
950
951 1: BRANCH(4)
952 2: EXACT <foo>(8)
953 4: BRANCH(7)
954 5: EXACT <bar>(8)
955 7: TAIL(8)
956 8: EXACT <baz>(10)
957 10: END(0)
958
959which would be optimizable with startbranch=1, first=1, last=7, tail=8
960and would end up looking like:
961
962 1: TRIE(8)
963 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
964 <foo>
965 <bar>
966 7: TAIL(8)
967 8: EXACT <baz>(10)
968 10: END(0)
969
970 d = uvuni_to_utf8_flags(d, uv, 0);
971
972is the recommended Unicode-aware way of saying
973
974 *(d++) = uv;
975*/
976
977#define TRIE_STORE_REVCHAR \
978 STMT_START { \
979 SV *tmp = Perl_newSVpvf_nocontext( "%c", (int)uvc ); \
980 if (UTF) SvUTF8_on(tmp); \
981 av_push( TRIE_REVCHARMAP(trie), tmp ); \
982 } STMT_END
983
984#define TRIE_READ_CHAR STMT_START { \
985 wordlen++; \
986 if ( UTF ) { \
987 if ( folder ) { \
988 if ( foldlen > 0 ) { \
989 uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags ); \
990 foldlen -= len; \
991 scan += len; \
992 len = 0; \
993 } else { \
994 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
995 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
996 foldlen -= UNISKIP( uvc ); \
997 scan = foldbuf + UNISKIP( uvc ); \
998 } \
999 } else { \
1000 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1001 } \
1002 } else { \
1003 uvc = (U32)*uc; \
1004 len = 1; \
1005 } \
1006} STMT_END
1007
1008
1009
1010#define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
1011 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
1012 TRIE_LIST_LEN( state ) *= 2; \
1013 Renew( trie->states[ state ].trans.list, \
1014 TRIE_LIST_LEN( state ), reg_trie_trans_le ); \
1015 } \
1016 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
1017 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
1018 TRIE_LIST_CUR( state )++; \
1019} STMT_END
07be1b83 1020
786e8c11
YO
1021#define TRIE_LIST_NEW(state) STMT_START { \
1022 Newxz( trie->states[ state ].trans.list, \
1023 4, reg_trie_trans_le ); \
1024 TRIE_LIST_CUR( state ) = 1; \
1025 TRIE_LIST_LEN( state ) = 4; \
1026} STMT_END
07be1b83 1027
786e8c11
YO
1028#define TRIE_HANDLE_WORD(state) STMT_START { \
1029 U16 dupe= trie->states[ state ].wordnum; \
1030 regnode * const noper_next = regnext( noper ); \
1031 \
1032 if (trie->wordlen) \
1033 trie->wordlen[ curword ] = wordlen; \
1034 DEBUG_r({ \
1035 /* store the word for dumping */ \
1036 SV* tmp; \
1037 if (OP(noper) != NOTHING) \
1038 tmp = newSVpvn(STRING(noper), STR_LEN(noper)); \
1039 else \
1040 tmp = newSVpvn( "", 0 ); \
1041 if ( UTF ) SvUTF8_on( tmp ); \
1042 av_push( trie->words, tmp ); \
1043 }); \
1044 \
1045 curword++; \
1046 \
1047 if ( noper_next < tail ) { \
1048 if (!trie->jump) \
1049 Newxz( trie->jump, word_count + 1, U16); \
1050 trie->jump[curword] = (U16)(tail - noper_next); \
1051 if (!jumper) \
1052 jumper = noper_next; \
1053 if (!nextbranch) \
1054 nextbranch= regnext(cur); \
1055 } \
1056 \
1057 if ( dupe ) { \
1058 /* So it's a dupe. This means we need to maintain a */\
1059 /* linked-list from the first to the next. */\
1060 /* we only allocate the nextword buffer when there */\
1061 /* a dupe, so first time we have to do the allocation */\
1062 if (!trie->nextword) \
1063 Newxz( trie->nextword, word_count + 1, U16); \
1064 while ( trie->nextword[dupe] ) \
1065 dupe= trie->nextword[dupe]; \
1066 trie->nextword[dupe]= curword; \
1067 } else { \
1068 /* we haven't inserted this word yet. */ \
1069 trie->states[ state ].wordnum = curword; \
1070 } \
1071} STMT_END
07be1b83 1072
3dab1dad 1073
786e8c11
YO
1074#define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
1075 ( ( base + charid >= ucharcount \
1076 && base + charid < ubound \
1077 && state == trie->trans[ base - ucharcount + charid ].check \
1078 && trie->trans[ base - ucharcount + charid ].next ) \
1079 ? trie->trans[ base - ucharcount + charid ].next \
1080 : ( state==1 ? special : 0 ) \
1081 )
3dab1dad 1082
786e8c11
YO
1083#define MADE_TRIE 1
1084#define MADE_JUMP_TRIE 2
1085#define MADE_EXACT_TRIE 4
3dab1dad 1086
a3621e74 1087STATIC I32
786e8c11 1088S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth)
a3621e74 1089{
27da23d5 1090 dVAR;
a3621e74
YO
1091 /* first pass, loop through and scan words */
1092 reg_trie_data *trie;
1093 regnode *cur;
9f7f3913 1094 const U32 uniflags = UTF8_ALLOW_DEFAULT;
a3621e74
YO
1095 STRLEN len = 0;
1096 UV uvc = 0;
1097 U16 curword = 0;
1098 U32 next_alloc = 0;
786e8c11
YO
1099 regnode *jumper = NULL;
1100 regnode *nextbranch = NULL;
a3621e74 1101 /* we just use folder as a flag in utf8 */
e1ec3a88 1102 const U8 * const folder = ( flags == EXACTF
a3621e74
YO
1103 ? PL_fold
1104 : ( flags == EXACTFL
1105 ? PL_fold_locale
1106 : NULL
1107 )
1108 );
1109
e1ec3a88 1110 const U32 data_slot = add_data( pRExC_state, 1, "t" );
a3621e74 1111 SV *re_trie_maxbuff;
3dab1dad
YO
1112#ifndef DEBUGGING
1113 /* these are only used during construction but are useful during
8e11feef 1114 * debugging so we store them in the struct when debugging.
8e11feef 1115 */
3dab1dad 1116 STRLEN trie_charcount=0;
3dab1dad
YO
1117 AV *trie_revcharmap;
1118#endif
a3621e74 1119 GET_RE_DEBUG_FLAGS_DECL;
72f13be8
YO
1120#ifndef DEBUGGING
1121 PERL_UNUSED_ARG(depth);
1122#endif
a3621e74 1123
a02a5408 1124 Newxz( trie, 1, reg_trie_data );
a3621e74 1125 trie->refcount = 1;
3dab1dad 1126 trie->startstate = 1;
786e8c11 1127 trie->wordcount = word_count;
a3621e74 1128 RExC_rx->data->data[ data_slot ] = (void*)trie;
a02a5408 1129 Newxz( trie->charmap, 256, U16 );
3dab1dad
YO
1130 if (!(UTF && folder))
1131 Newxz( trie->bitmap, ANYOF_BITMAP_SIZE, char );
a3621e74
YO
1132 DEBUG_r({
1133 trie->words = newAV();
a3621e74 1134 });
3dab1dad 1135 TRIE_REVCHARMAP(trie) = newAV();
a3621e74 1136
0111c4fd 1137 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
a3621e74 1138 if (!SvIOK(re_trie_maxbuff)) {
0111c4fd 1139 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
a3621e74 1140 }
3dab1dad
YO
1141 DEBUG_OPTIMISE_r({
1142 PerlIO_printf( Perl_debug_log,
786e8c11 1143 "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
3dab1dad
YO
1144 (int)depth * 2 + 2, "",
1145 REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
786e8c11
YO
1146 REG_NODE_NUM(last), REG_NODE_NUM(tail),
1147 depth);
3dab1dad 1148 });
a3621e74
YO
1149 /* -- First loop and Setup --
1150
1151 We first traverse the branches and scan each word to determine if it
1152 contains widechars, and how many unique chars there are, this is
1153 important as we have to build a table with at least as many columns as we
1154 have unique chars.
1155
1156 We use an array of integers to represent the character codes 0..255
1157 (trie->charmap) and we use a an HV* to store unicode characters. We use the
1158 native representation of the character value as the key and IV's for the
1159 coded index.
1160
1161 *TODO* If we keep track of how many times each character is used we can
1162 remap the columns so that the table compression later on is more
1163 efficient in terms of memory by ensuring most common value is in the
1164 middle and the least common are on the outside. IMO this would be better
1165 than a most to least common mapping as theres a decent chance the most
1166 common letter will share a node with the least common, meaning the node
1167 will not be compressable. With a middle is most common approach the worst
1168 case is when we have the least common nodes twice.
1169
1170 */
1171
a3621e74 1172 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
c445ea15 1173 regnode * const noper = NEXTOPER( cur );
e1ec3a88 1174 const U8 *uc = (U8*)STRING( noper );
a28509cc 1175 const U8 * const e = uc + STR_LEN( noper );
a3621e74
YO
1176 STRLEN foldlen = 0;
1177 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2af232bd 1178 const U8 *scan = (U8*)NULL;
07be1b83 1179 U32 wordlen = 0; /* required init */
3dab1dad 1180 STRLEN chars=0;
a3621e74 1181
3dab1dad
YO
1182 if (OP(noper) == NOTHING) {
1183 trie->minlen= 0;
1184 continue;
1185 }
1186 if (trie->bitmap) {
1187 TRIE_BITMAP_SET(trie,*uc);
1188 if ( folder ) TRIE_BITMAP_SET(trie,folder[ *uc ]);
1189 }
a3621e74 1190 for ( ; uc < e ; uc += len ) {
3dab1dad 1191 TRIE_CHARCOUNT(trie)++;
a3621e74 1192 TRIE_READ_CHAR;
3dab1dad 1193 chars++;
a3621e74
YO
1194 if ( uvc < 256 ) {
1195 if ( !trie->charmap[ uvc ] ) {
1196 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
1197 if ( folder )
1198 trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
3dab1dad 1199 TRIE_STORE_REVCHAR;
a3621e74
YO
1200 }
1201 } else {
1202 SV** svpp;
1203 if ( !trie->widecharmap )
1204 trie->widecharmap = newHV();
1205
1206 svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 1 );
1207
1208 if ( !svpp )
e4584336 1209 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
a3621e74
YO
1210
1211 if ( !SvTRUE( *svpp ) ) {
1212 sv_setiv( *svpp, ++trie->uniquecharcount );
3dab1dad 1213 TRIE_STORE_REVCHAR;
a3621e74
YO
1214 }
1215 }
1216 }
3dab1dad
YO
1217 if( cur == first ) {
1218 trie->minlen=chars;
1219 trie->maxlen=chars;
1220 } else if (chars < trie->minlen) {
1221 trie->minlen=chars;
1222 } else if (chars > trie->maxlen) {
1223 trie->maxlen=chars;
1224 }
1225
a3621e74
YO
1226 } /* end first pass */
1227 DEBUG_TRIE_COMPILE_r(
3dab1dad
YO
1228 PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1229 (int)depth * 2 + 2,"",
786e8c11 1230 ( trie->widecharmap ? "UTF8" : "NATIVE" ), word_count,
be8e71aa
YO
1231 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
1232 (int)trie->minlen, (int)trie->maxlen )
a3621e74 1233 );
786e8c11 1234 Newxz( trie->wordlen, word_count, U32 );
a3621e74
YO
1235
1236 /*
1237 We now know what we are dealing with in terms of unique chars and
1238 string sizes so we can calculate how much memory a naive
0111c4fd
RGS
1239 representation using a flat table will take. If it's over a reasonable
1240 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
a3621e74
YO
1241 conservative but potentially much slower representation using an array
1242 of lists.
1243
1244 At the end we convert both representations into the same compressed
1245 form that will be used in regexec.c for matching with. The latter
1246 is a form that cannot be used to construct with but has memory
1247 properties similar to the list form and access properties similar
1248 to the table form making it both suitable for fast searches and
1249 small enough that its feasable to store for the duration of a program.
1250
1251 See the comment in the code where the compressed table is produced
1252 inplace from the flat tabe representation for an explanation of how
1253 the compression works.
1254
1255 */
1256
1257
3dab1dad 1258 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
a3621e74
YO
1259 /*
1260 Second Pass -- Array Of Lists Representation
1261
1262 Each state will be represented by a list of charid:state records
1263 (reg_trie_trans_le) the first such element holds the CUR and LEN
1264 points of the allocated array. (See defines above).
1265
1266 We build the initial structure using the lists, and then convert
1267 it into the compressed table form which allows faster lookups
1268 (but cant be modified once converted).
a3621e74
YO
1269 */
1270
a3621e74
YO
1271 STRLEN transcount = 1;
1272
3dab1dad 1273 Newxz( trie->states, TRIE_CHARCOUNT(trie) + 2, reg_trie_state );
a3621e74
YO
1274 TRIE_LIST_NEW(1);
1275 next_alloc = 2;
1276
1277 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1278
c445ea15
AL
1279 regnode * const noper = NEXTOPER( cur );
1280 U8 *uc = (U8*)STRING( noper );
1281 const U8 * const e = uc + STR_LEN( noper );
1282 U32 state = 1; /* required init */
1283 U16 charid = 0; /* sanity init */
1284 U8 *scan = (U8*)NULL; /* sanity init */
1285 STRLEN foldlen = 0; /* required init */
07be1b83 1286 U32 wordlen = 0; /* required init */
c445ea15
AL
1287 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1288
3dab1dad 1289 if (OP(noper) != NOTHING) {
786e8c11 1290 for ( ; uc < e ; uc += len ) {
c445ea15 1291
786e8c11 1292 TRIE_READ_CHAR;
c445ea15 1293
786e8c11
YO
1294 if ( uvc < 256 ) {
1295 charid = trie->charmap[ uvc ];
c445ea15 1296 } else {
786e8c11
YO
1297 SV** const svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 0);
1298 if ( !svpp ) {
1299 charid = 0;
1300 } else {
1301 charid=(U16)SvIV( *svpp );
1302 }
c445ea15 1303 }
786e8c11
YO
1304 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1305 if ( charid ) {
a3621e74 1306
786e8c11
YO
1307 U16 check;
1308 U32 newstate = 0;
a3621e74 1309
786e8c11
YO
1310 charid--;
1311 if ( !trie->states[ state ].trans.list ) {
1312 TRIE_LIST_NEW( state );
c445ea15 1313 }
786e8c11
YO
1314 for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1315 if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1316 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1317 break;
1318 }
1319 }
1320 if ( ! newstate ) {
1321 newstate = next_alloc++;
1322 TRIE_LIST_PUSH( state, charid, newstate );
1323 transcount++;
1324 }
1325 state = newstate;
1326 } else {
1327 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
c445ea15 1328 }
a28509cc 1329 }
c445ea15 1330 }
3dab1dad 1331 TRIE_HANDLE_WORD(state);
a3621e74
YO
1332
1333 } /* end second pass */
1334
786e8c11 1335 trie->laststate = next_alloc;
a3621e74
YO
1336 Renew( trie->states, next_alloc, reg_trie_state );
1337
3dab1dad
YO
1338 /* and now dump it out before we compress it */
1339 DEBUG_TRIE_COMPILE_MORE_r(
1340 dump_trie_interim_list(trie,next_alloc,depth+1)
a3621e74 1341 );
a3621e74 1342
a02a5408 1343 Newxz( trie->trans, transcount ,reg_trie_trans );
a3621e74
YO
1344 {
1345 U32 state;
a3621e74
YO
1346 U32 tp = 0;
1347 U32 zp = 0;
1348
1349
1350 for( state=1 ; state < next_alloc ; state ++ ) {
1351 U32 base=0;
1352
1353 /*
1354 DEBUG_TRIE_COMPILE_MORE_r(
1355 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1356 );
1357 */
1358
1359 if (trie->states[state].trans.list) {
1360 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1361 U16 maxid=minid;
a28509cc 1362 U16 idx;
a3621e74
YO
1363
1364 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
c445ea15
AL
1365 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1366 if ( forid < minid ) {
1367 minid=forid;
1368 } else if ( forid > maxid ) {
1369 maxid=forid;
1370 }
a3621e74
YO
1371 }
1372 if ( transcount < tp + maxid - minid + 1) {
1373 transcount *= 2;
1374 Renew( trie->trans, transcount, reg_trie_trans );
1375 Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1376 }
1377 base = trie->uniquecharcount + tp - minid;
1378 if ( maxid == minid ) {
1379 U32 set = 0;
1380 for ( ; zp < tp ; zp++ ) {
1381 if ( ! trie->trans[ zp ].next ) {
1382 base = trie->uniquecharcount + zp - minid;
1383 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1384 trie->trans[ zp ].check = state;
1385 set = 1;
1386 break;
1387 }
1388 }
1389 if ( !set ) {
1390 trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1391 trie->trans[ tp ].check = state;
1392 tp++;
1393 zp = tp;
1394 }
1395 } else {
1396 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
c445ea15 1397 const U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
a3621e74
YO
1398 trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1399 trie->trans[ tid ].check = state;
1400 }
1401 tp += ( maxid - minid + 1 );
1402 }
1403 Safefree(trie->states[ state ].trans.list);
1404 }
1405 /*
1406 DEBUG_TRIE_COMPILE_MORE_r(
1407 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1408 );
1409 */
1410 trie->states[ state ].trans.base=base;
1411 }
cc601c31 1412 trie->lasttrans = tp + 1;
a3621e74
YO
1413 }
1414 } else {
1415 /*
1416 Second Pass -- Flat Table Representation.
1417
1418 we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1419 We know that we will need Charcount+1 trans at most to store the data
1420 (one row per char at worst case) So we preallocate both structures
1421 assuming worst case.
1422
1423 We then construct the trie using only the .next slots of the entry
1424 structs.
1425
1426 We use the .check field of the first entry of the node temporarily to
1427 make compression both faster and easier by keeping track of how many non
1428 zero fields are in the node.
1429
1430 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1431 transition.
1432
1433 There are two terms at use here: state as a TRIE_NODEIDX() which is a
1434 number representing the first entry of the node, and state as a
1435 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1436 TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1437 are 2 entrys per node. eg:
1438
1439 A B A B
1440 1. 2 4 1. 3 7
1441 2. 0 3 3. 0 5
1442 3. 0 0 5. 0 0
1443 4. 0 0 7. 0 0
1444
1445 The table is internally in the right hand, idx form. However as we also
1446 have to deal with the states array which is indexed by nodenum we have to
1447 use TRIE_NODENUM() to convert.
1448
1449 */
1450
3dab1dad
YO
1451
1452 Newxz( trie->trans, ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1,
a3621e74 1453 reg_trie_trans );
3dab1dad 1454 Newxz( trie->states, TRIE_CHARCOUNT(trie) + 2, reg_trie_state );
a3621e74
YO
1455 next_alloc = trie->uniquecharcount + 1;
1456
3dab1dad 1457
a3621e74
YO
1458 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1459
c445ea15 1460 regnode * const noper = NEXTOPER( cur );
a28509cc
AL
1461 const U8 *uc = (U8*)STRING( noper );
1462 const U8 * const e = uc + STR_LEN( noper );
a3621e74
YO
1463
1464 U32 state = 1; /* required init */
1465
1466 U16 charid = 0; /* sanity init */
1467 U32 accept_state = 0; /* sanity init */
1468 U8 *scan = (U8*)NULL; /* sanity init */
1469
1470 STRLEN foldlen = 0; /* required init */
07be1b83 1471 U32 wordlen = 0; /* required init */
a3621e74
YO
1472 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1473
3dab1dad 1474 if ( OP(noper) != NOTHING ) {
786e8c11 1475 for ( ; uc < e ; uc += len ) {
a3621e74 1476
786e8c11 1477 TRIE_READ_CHAR;
a3621e74 1478
786e8c11
YO
1479 if ( uvc < 256 ) {
1480 charid = trie->charmap[ uvc ];
1481 } else {
1482 SV* const * const svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 0);
1483 charid = svpp ? (U16)SvIV(*svpp) : 0;
a3621e74 1484 }
786e8c11
YO
1485 if ( charid ) {
1486 charid--;
1487 if ( !trie->trans[ state + charid ].next ) {
1488 trie->trans[ state + charid ].next = next_alloc;
1489 trie->trans[ state ].check++;
1490 next_alloc += trie->uniquecharcount;
1491 }
1492 state = trie->trans[ state + charid ].next;
1493 } else {
1494 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1495 }
1496 /* charid is now 0 if we dont know the char read, or nonzero if we do */
a3621e74 1497 }
a3621e74 1498 }
3dab1dad
YO
1499 accept_state = TRIE_NODENUM( state );
1500 TRIE_HANDLE_WORD(accept_state);
a3621e74
YO
1501
1502 } /* end second pass */
1503
3dab1dad
YO
1504 /* and now dump it out before we compress it */
1505 DEBUG_TRIE_COMPILE_MORE_r(
1506 dump_trie_interim_table(trie,next_alloc,depth+1)
1507 );
a3621e74 1508
a3621e74
YO
1509 {
1510 /*
1511 * Inplace compress the table.*
1512
1513 For sparse data sets the table constructed by the trie algorithm will
1514 be mostly 0/FAIL transitions or to put it another way mostly empty.
1515 (Note that leaf nodes will not contain any transitions.)
1516
1517 This algorithm compresses the tables by eliminating most such
1518 transitions, at the cost of a modest bit of extra work during lookup:
1519
1520 - Each states[] entry contains a .base field which indicates the
1521 index in the state[] array wheres its transition data is stored.
1522
1523 - If .base is 0 there are no valid transitions from that node.
1524
1525 - If .base is nonzero then charid is added to it to find an entry in
1526 the trans array.
1527
1528 -If trans[states[state].base+charid].check!=state then the
1529 transition is taken to be a 0/Fail transition. Thus if there are fail
1530 transitions at the front of the node then the .base offset will point
1531 somewhere inside the previous nodes data (or maybe even into a node
1532 even earlier), but the .check field determines if the transition is
1533 valid.
1534
786e8c11 1535 XXX - wrong maybe?
a3621e74
YO
1536 The following process inplace converts the table to the compressed
1537 table: We first do not compress the root node 1,and mark its all its
1538 .check pointers as 1 and set its .base pointer as 1 as well. This
1539 allows to do a DFA construction from the compressed table later, and
1540 ensures that any .base pointers we calculate later are greater than
1541 0.
1542
1543 - We set 'pos' to indicate the first entry of the second node.
1544
1545 - We then iterate over the columns of the node, finding the first and
1546 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
1547 and set the .check pointers accordingly, and advance pos
1548 appropriately and repreat for the next node. Note that when we copy
1549 the next pointers we have to convert them from the original
1550 NODEIDX form to NODENUM form as the former is not valid post
1551 compression.
1552
1553 - If a node has no transitions used we mark its base as 0 and do not
1554 advance the pos pointer.
1555
1556 - If a node only has one transition we use a second pointer into the
1557 structure to fill in allocated fail transitions from other states.
1558 This pointer is independent of the main pointer and scans forward
1559 looking for null transitions that are allocated to a state. When it
1560 finds one it writes the single transition into the "hole". If the
786e8c11 1561 pointer doesnt find one the single transition is appended as normal.
a3621e74
YO
1562
1563 - Once compressed we can Renew/realloc the structures to release the
1564 excess space.
1565
1566 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
1567 specifically Fig 3.47 and the associated pseudocode.
1568
1569 demq
1570 */
a3b680e6 1571 const U32 laststate = TRIE_NODENUM( next_alloc );
a28509cc 1572 U32 state, charid;
a3621e74 1573 U32 pos = 0, zp=0;
786e8c11 1574 trie->laststate = laststate;
a3621e74
YO
1575
1576 for ( state = 1 ; state < laststate ; state++ ) {
1577 U8 flag = 0;
a28509cc
AL
1578 const U32 stateidx = TRIE_NODEIDX( state );
1579 const U32 o_used = trie->trans[ stateidx ].check;
1580 U32 used = trie->trans[ stateidx ].check;
a3621e74
YO
1581 trie->trans[ stateidx ].check = 0;
1582
1583 for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
1584 if ( flag || trie->trans[ stateidx + charid ].next ) {
1585 if ( trie->trans[ stateidx + charid ].next ) {
1586 if (o_used == 1) {
1587 for ( ; zp < pos ; zp++ ) {
1588 if ( ! trie->trans[ zp ].next ) {
1589 break;
1590 }
1591 }
1592 trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
1593 trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1594 trie->trans[ zp ].check = state;
1595 if ( ++zp > pos ) pos = zp;
1596 break;
1597 }
1598 used--;
1599 }
1600 if ( !flag ) {
1601 flag = 1;
1602 trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
1603 }
1604 trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1605 trie->trans[ pos ].check = state;
1606 pos++;
1607 }
1608 }
1609 }
cc601c31 1610 trie->lasttrans = pos + 1;
a3621e74
YO
1611 Renew( trie->states, laststate + 1, reg_trie_state);
1612 DEBUG_TRIE_COMPILE_MORE_r(
e4584336 1613 PerlIO_printf( Perl_debug_log,
3dab1dad
YO
1614 "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
1615 (int)depth * 2 + 2,"",
1616 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
5d7488b2
AL
1617 (IV)next_alloc,
1618 (IV)pos,
a3621e74
YO
1619 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
1620 );
1621
1622 } /* end table compress */
1623 }
cc601c31
YO
1624 /* resize the trans array to remove unused space */
1625 Renew( trie->trans, trie->lasttrans, reg_trie_trans);
a3621e74 1626
3dab1dad
YO
1627 /* and now dump out the compressed format */
1628 DEBUG_TRIE_COMPILE_r(
1629 dump_trie(trie,depth+1)
1630 );
07be1b83 1631
3dab1dad
YO
1632 { /* Modify the program and insert the new TRIE node*/
1633 regnode *convert;
1634 U8 nodetype =(U8)(flags & 0xFF);
1635 char *str=NULL;
786e8c11 1636
07be1b83 1637#ifdef DEBUGGING
786e8c11 1638
07be1b83
YO
1639 U32 mjd_offset;
1640 U32 mjd_nodelen;
1641#endif
a3621e74 1642 /*
3dab1dad
YO
1643 This means we convert either the first branch or the first Exact,
1644 depending on whether the thing following (in 'last') is a branch
1645 or not and whther first is the startbranch (ie is it a sub part of
1646 the alternation or is it the whole thing.)
1647 Assuming its a sub part we conver the EXACT otherwise we convert
1648 the whole branch sequence, including the first.
a3621e74 1649 */
3dab1dad
YO
1650 /* Find the node we are going to overwrite */
1651 if ( first == startbranch && OP( last ) != BRANCH ) {
07be1b83 1652 /* whole branch chain */
3dab1dad 1653 convert = first;
07be1b83
YO
1654 DEBUG_r({
1655 const regnode *nop = NEXTOPER( convert );
1656 mjd_offset= Node_Offset((nop));
1657 mjd_nodelen= Node_Length((nop));
1658 });
1659 } else {
1660 /* branch sub-chain */
3dab1dad
YO
1661 convert = NEXTOPER( first );
1662 NEXT_OFF( first ) = (U16)(last - first);
07be1b83
YO
1663 DEBUG_r({
1664 mjd_offset= Node_Offset((convert));
1665 mjd_nodelen= Node_Length((convert));
1666 });
1667 }
1668 DEBUG_OPTIMISE_r(
1669 PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
1670 (int)depth * 2 + 2, "",
786e8c11 1671 (UV)mjd_offset, (UV)mjd_nodelen)
07be1b83 1672 );
a3621e74 1673
3dab1dad
YO
1674 /* But first we check to see if there is a common prefix we can
1675 split out as an EXACT and put in front of the TRIE node. */
1676 trie->startstate= 1;
786e8c11 1677 if ( trie->bitmap && !trie->widecharmap && !trie->jump ) {
3dab1dad
YO
1678 U32 state;
1679 DEBUG_OPTIMISE_r(
8e11feef
RGS
1680 PerlIO_printf(Perl_debug_log, "%*sLaststate:%"UVuf"\n",
1681 (int)depth * 2 + 2, "",
786e8c11 1682 (UV)trie->laststate)
8e11feef 1683 );
786e8c11 1684 for ( state = 1 ; state < trie->laststate-1 ; state++ ) {
a3621e74 1685 U32 ofs = 0;
8e11feef
RGS
1686 I32 idx = -1;
1687 U32 count = 0;
1688 const U32 base = trie->states[ state ].trans.base;
a3621e74 1689
3dab1dad 1690 if ( trie->states[state].wordnum )
8e11feef 1691 count = 1;
a3621e74 1692
8e11feef 1693 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
cc601c31
YO
1694 if ( ( base + ofs >= trie->uniquecharcount ) &&
1695 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
a3621e74
YO
1696 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1697 {
3dab1dad 1698 if ( ++count > 1 ) {
8e11feef 1699 SV **tmp = av_fetch( TRIE_REVCHARMAP(trie), ofs, 0);
07be1b83 1700 const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
8e11feef 1701 if ( state == 1 ) break;
3dab1dad
YO
1702 if ( count == 2 ) {
1703 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
1704 DEBUG_OPTIMISE_r(
8e11feef
RGS
1705 PerlIO_printf(Perl_debug_log,
1706 "%*sNew Start State=%"UVuf" Class: [",
1707 (int)depth * 2 + 2, "",
786e8c11 1708 (UV)state));
be8e71aa
YO
1709 if (idx >= 0) {
1710 SV ** const tmp = av_fetch( TRIE_REVCHARMAP(trie), idx, 0);
1711 const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
8e11feef 1712
3dab1dad 1713 TRIE_BITMAP_SET(trie,*ch);
8e11feef
RGS
1714 if ( folder )
1715 TRIE_BITMAP_SET(trie, folder[ *ch ]);
3dab1dad 1716 DEBUG_OPTIMISE_r(
07be1b83 1717 PerlIO_printf(Perl_debug_log, (char*)ch)
3dab1dad 1718 );
8e11feef
RGS
1719 }
1720 }
1721 TRIE_BITMAP_SET(trie,*ch);
1722 if ( folder )
1723 TRIE_BITMAP_SET(trie,folder[ *ch ]);
1724 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
1725 }
1726 idx = ofs;
1727 }
3dab1dad
YO
1728 }
1729 if ( count == 1 ) {
1730 SV **tmp = av_fetch( TRIE_REVCHARMAP(trie), idx, 0);
8e11feef 1731 const char *ch = SvPV_nolen_const( *tmp );
3dab1dad 1732 DEBUG_OPTIMISE_r(
8e11feef
RGS
1733 PerlIO_printf( Perl_debug_log,
1734 "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
1735 (int)depth * 2 + 2, "",
786e8c11 1736 (UV)state, (UV)idx, ch)
3dab1dad
YO
1737 );
1738 if ( state==1 ) {
1739 OP( convert ) = nodetype;
1740 str=STRING(convert);
1741 STR_LEN(convert)=0;
1742 }
1743 *str++=*ch;
1744 STR_LEN(convert)++;
a3621e74 1745
8e11feef 1746 } else {
f9049ba1 1747#ifdef DEBUGGING
8e11feef
RGS
1748 if (state>1)
1749 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
f9049ba1 1750#endif
8e11feef
RGS
1751 break;
1752 }
1753 }
3dab1dad 1754 if (str) {
8e11feef 1755 regnode *n = convert+NODE_SZ_STR(convert);
07be1b83 1756 NEXT_OFF(convert) = NODE_SZ_STR(convert);
8e11feef 1757 trie->startstate = state;
07be1b83
YO
1758 trie->minlen -= (state - 1);
1759 trie->maxlen -= (state - 1);
1760 DEBUG_r({
1761 regnode *fix = convert;
1762 mjd_nodelen++;
1763 Set_Node_Offset_Length(convert, mjd_offset, state - 1);
1764 while( ++fix < n ) {
1765 Set_Node_Offset_Length(fix, 0, 0);
1766 }
1767 });
8e11feef
RGS
1768 if (trie->maxlen) {
1769 convert = n;
1770 } else {
3dab1dad
YO
1771 NEXT_OFF(convert) = (U16)(tail - convert);
1772 }
1773 }
1774 }
1775 if ( trie->maxlen ) {
8e11feef
RGS
1776 NEXT_OFF( convert ) = (U16)(tail - convert);
1777 ARG_SET( convert, data_slot );
786e8c11
YO
1778 /* Store the offset to the first unabsorbed branch in
1779 jump[0], which is otherwise unused by the jump logic.
1780 We use this when dumping a trie and during optimisation. */
1781 if (trie->jump)
1782 trie->jump[0] = (U16)(tail - nextbranch);
1783 if (!jumper)
1784 jumper= last;
1785 /* XXXX */
1786 if ( !trie->states[trie->startstate].wordnum && trie->bitmap &&
1787 ( (char *)jumper - (char *)convert) >= sizeof(struct regnode_charclass) )
1788 {
1789 OP( convert ) = TRIEC;
1790 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
1791 Safefree(trie->bitmap);
1792 trie->bitmap= NULL;
1793 } else
1794 OP( convert ) = TRIE;
a3621e74 1795
3dab1dad
YO
1796 /* store the type in the flags */
1797 convert->flags = nodetype;
1798 /* XXX We really should free up the resource in trie now, as we wont use them */
1799 }
a3621e74
YO
1800 /* needed for dumping*/
1801 DEBUG_r({
786e8c11
YO
1802 regnode *optimize = convert
1803 + NODE_STEP_REGNODE
1804 + regarglen[ OP( convert ) ];
07be1b83
YO
1805 regnode *opt = convert;
1806 while (++opt<optimize) {
1807 Set_Node_Offset_Length(opt,0,0);
1808 }
786e8c11
YO
1809 /*
1810 Try to clean up some of the debris left after the
1811 optimisation.
a3621e74 1812 */
786e8c11 1813 while( optimize < jumper ) {
07be1b83 1814 mjd_nodelen += Node_Length((optimize));
a3621e74 1815 OP( optimize ) = OPTIMIZED;
07be1b83 1816 Set_Node_Offset_Length(optimize,0,0);
a3621e74
YO
1817 optimize++;
1818 }
07be1b83 1819 Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
a3621e74
YO
1820 });
1821 } /* end node insert */
07be1b83 1822#ifndef DEBUGGING
6e8b4190 1823 SvREFCNT_dec(TRIE_REVCHARMAP(trie));
07be1b83 1824#endif
786e8c11
YO
1825 return trie->jump
1826 ? MADE_JUMP_TRIE
1827 : trie->startstate>1
1828 ? MADE_EXACT_TRIE
1829 : MADE_TRIE;
1830}
1831
1832STATIC void
1833S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode *stclass, U32 depth)
1834{
1835/* The Trie is constructed and compressed now so we can build a fail array now if its needed
1836
1837 This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
1838 "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
1839 ISBN 0-201-10088-6
1840
1841 We find the fail state for each state in the trie, this state is the longest proper
1842 suffix of the current states 'word' that is also a proper prefix of another word in our
1843 trie. State 1 represents the word '' and is the thus the default fail state. This allows
1844 the DFA not to have to restart after its tried and failed a word at a given point, it
1845 simply continues as though it had been matching the other word in the first place.
1846 Consider
1847 'abcdgu'=~/abcdefg|cdgu/
1848 When we get to 'd' we are still matching the first word, we would encounter 'g' which would
1849 fail, which would bring use to the state representing 'd' in the second word where we would
1850 try 'g' and succeed, prodceding to match 'cdgu'.
1851 */
1852 /* add a fail transition */
1853 reg_trie_data *trie=(reg_trie_data *)RExC_rx->data->data[ARG(source)];
1854 U32 *q;
1855 const U32 ucharcount = trie->uniquecharcount;
1856 const U32 numstates = trie->laststate;
1857 const U32 ubound = trie->lasttrans + ucharcount;
1858 U32 q_read = 0;
1859 U32 q_write = 0;
1860 U32 charid;
1861 U32 base = trie->states[ 1 ].trans.base;
1862 U32 *fail;
1863 reg_ac_data *aho;
1864 const U32 data_slot = add_data( pRExC_state, 1, "T" );
1865 GET_RE_DEBUG_FLAGS_DECL;
1866#ifndef DEBUGGING
1867 PERL_UNUSED_ARG(depth);
1868#endif
1869
1870
1871 ARG_SET( stclass, data_slot );
1872 Newxz( aho, 1, reg_ac_data );
1873 RExC_rx->data->data[ data_slot ] = (void*)aho;
1874 aho->trie=trie;
1875 aho->states=(reg_trie_state *)savepvn((const char*)trie->states,
1876 (trie->laststate+1)*sizeof(reg_trie_state));
1877 Newxz( q, numstates, U32);
1878 Newxz( aho->fail, numstates, U32 );
1879 aho->refcount = 1;
1880 fail = aho->fail;
1881 /* initialize fail[0..1] to be 1 so that we always have
1882 a valid final fail state */
1883 fail[ 0 ] = fail[ 1 ] = 1;
1884
1885 for ( charid = 0; charid < ucharcount ; charid++ ) {
1886 const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
1887 if ( newstate ) {
1888 q[ q_write ] = newstate;
1889 /* set to point at the root */
1890 fail[ q[ q_write++ ] ]=1;
1891 }
1892 }
1893 while ( q_read < q_write) {
1894 const U32 cur = q[ q_read++ % numstates ];
1895 base = trie->states[ cur ].trans.base;
1896
1897 for ( charid = 0 ; charid < ucharcount ; charid++ ) {
1898 const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
1899 if (ch_state) {
1900 U32 fail_state = cur;
1901 U32 fail_base;
1902 do {
1903 fail_state = fail[ fail_state ];
1904 fail_base = aho->states[ fail_state ].trans.base;
1905 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
1906
1907 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
1908 fail[ ch_state ] = fail_state;
1909 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
1910 {
1911 aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum;
1912 }
1913 q[ q_write++ % numstates] = ch_state;
1914 }
1915 }
1916 }
1917 /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
1918 when we fail in state 1, this allows us to use the
1919 charclass scan to find a valid start char. This is based on the principle
1920 that theres a good chance the string being searched contains lots of stuff
1921 that cant be a start char.
1922 */
1923 fail[ 0 ] = fail[ 1 ] = 0;
1924 DEBUG_TRIE_COMPILE_r({
1925 PerlIO_printf(Perl_debug_log, "%*sStclass Failtable: 0", (int)(depth * 2), "");
1926 for( q_read=1; q_read<numstates; q_read++ ) {
1927 PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
1928 }
1929 PerlIO_printf(Perl_debug_log, "\n");
1930 });
1931 Safefree(q);
1932 /*RExC_seen |= REG_SEEN_TRIEDFA;*/
a3621e74
YO
1933}
1934
786e8c11 1935
a3621e74 1936/*
5d1c421c
JH
1937 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
1938 * These need to be revisited when a newer toolchain becomes available.
1939 */
1940#if defined(__sparc64__) && defined(__GNUC__)
1941# if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
1942# undef SPARC64_GCC_WORKAROUND
1943# define SPARC64_GCC_WORKAROUND 1
1944# endif
1945#endif
1946
07be1b83
YO
1947#define DEBUG_PEEP(str,scan,depth) \
1948 DEBUG_OPTIMISE_r({ \
1949 SV * const mysv=sv_newmortal(); \
1950 regnode *Next = regnext(scan); \
1951 regprop(RExC_rx, mysv, scan); \
1952 PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s [%d]\n", \
1953 (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
1954 Next ? (REG_NODE_NUM(Next)) : 0 ); \
1955 });
1956
1957#define JOIN_EXACT(scan,min,flags) \
1958 if (PL_regkind[OP(scan)] == EXACT) \
1959 join_exact(pRExC_state,(scan),(min),(flags),NULL,depth+1)
1960
be8e71aa 1961STATIC U32
07be1b83
YO
1962S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, I32 *min, U32 flags,regnode *val, U32 depth) {
1963 /* Merge several consecutive EXACTish nodes into one. */
1964 regnode *n = regnext(scan);
1965 U32 stringok = 1;
1966 regnode *next = scan + NODE_SZ_STR(scan);
1967 U32 merged = 0;
1968 U32 stopnow = 0;
1969#ifdef DEBUGGING
1970 regnode *stop = scan;
72f13be8 1971 GET_RE_DEBUG_FLAGS_DECL;
f9049ba1 1972#else
d47053eb
RGS
1973 PERL_UNUSED_ARG(depth);
1974#endif
1975#ifndef EXPERIMENTAL_INPLACESCAN
f9049ba1
SP
1976 PERL_UNUSED_ARG(flags);
1977 PERL_UNUSED_ARG(val);
07be1b83 1978#endif
07be1b83
YO
1979 DEBUG_PEEP("join",scan,depth);
1980
1981 /* Skip NOTHING, merge EXACT*. */
1982 while (n &&
1983 ( PL_regkind[OP(n)] == NOTHING ||
1984 (stringok && (OP(n) == OP(scan))))
1985 && NEXT_OFF(n)
1986 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
1987
1988 if (OP(n) == TAIL || n > next)
1989 stringok = 0;
1990 if (PL_regkind[OP(n)] == NOTHING) {
07be1b83
YO
1991 DEBUG_PEEP("skip:",n,depth);
1992 NEXT_OFF(scan) += NEXT_OFF(n);
1993 next = n + NODE_STEP_REGNODE;
1994#ifdef DEBUGGING
1995 if (stringok)
1996 stop = n;
1997#endif
1998 n = regnext(n);
1999 }
2000 else if (stringok) {
786e8c11 2001 const unsigned int oldl = STR_LEN(scan);
07be1b83
YO
2002 regnode * const nnext = regnext(n);
2003
2004 DEBUG_PEEP("merg",n,depth);
2005
2006 merged++;
2007 if (oldl + STR_LEN(n) > U8_MAX)
2008 break;
2009 NEXT_OFF(scan) += NEXT_OFF(n);
2010 STR_LEN(scan) += STR_LEN(n);
2011 next = n + NODE_SZ_STR(n);
2012 /* Now we can overwrite *n : */
2013 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
2014#ifdef DEBUGGING
2015 stop = next - 1;
2016#endif
2017 n = nnext;
2018 if (stopnow) break;
2019 }
2020
d47053eb
RGS
2021#ifdef EXPERIMENTAL_INPLACESCAN
2022 if (flags && !NEXT_OFF(n)) {
2023 DEBUG_PEEP("atch", val, depth);
2024 if (reg_off_by_arg[OP(n)]) {
2025 ARG_SET(n, val - n);
2026 }
2027 else {
2028 NEXT_OFF(n) = val - n;
2029 }
2030 stopnow = 1;
2031 }
07be1b83
YO
2032#endif
2033 }
2034
2035 if (UTF && ( OP(scan) == EXACTF ) && ( STR_LEN(scan) >= 6 ) ) {
2036 /*
2037 Two problematic code points in Unicode casefolding of EXACT nodes:
2038
2039 U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
2040 U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
2041
2042 which casefold to
2043
2044 Unicode UTF-8
2045
2046 U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
2047 U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
2048
2049 This means that in case-insensitive matching (or "loose matching",
2050 as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
2051 length of the above casefolded versions) can match a target string
2052 of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
2053 This would rather mess up the minimum length computation.
2054
2055 What we'll do is to look for the tail four bytes, and then peek
2056 at the preceding two bytes to see whether we need to decrease
2057 the minimum length by four (six minus two).
2058
2059 Thanks to the design of UTF-8, there cannot be false matches:
2060 A sequence of valid UTF-8 bytes cannot be a subsequence of
2061 another valid sequence of UTF-8 bytes.
2062
2063 */
2064 char * const s0 = STRING(scan), *s, *t;
2065 char * const s1 = s0 + STR_LEN(scan) - 1;
2066 char * const s2 = s1 - 4;
e294cc5d
JH
2067#ifdef EBCDIC /* RD tunifold greek 0390 and 03B0 */
2068 const char t0[] = "\xaf\x49\xaf\x42";
2069#else
07be1b83 2070 const char t0[] = "\xcc\x88\xcc\x81";
e294cc5d 2071#endif
07be1b83
YO
2072 const char * const t1 = t0 + 3;
2073
2074 for (s = s0 + 2;
2075 s < s2 && (t = ninstr(s, s1, t0, t1));
2076 s = t + 4) {
e294cc5d
JH
2077#ifdef EBCDIC
2078 if (((U8)t[-1] == 0x68 && (U8)t[-2] == 0xB4) ||
2079 ((U8)t[-1] == 0x46 && (U8)t[-2] == 0xB5))
2080#else
07be1b83
YO
2081 if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
2082 ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
e294cc5d 2083#endif
07be1b83
YO
2084 *min -= 4;
2085 }
2086 }
2087
2088#ifdef DEBUGGING
2089 /* Allow dumping */
2090 n = scan + NODE_SZ_STR(scan);
2091 while (n <= stop) {
2092 if (PL_regkind[OP(n)] != NOTHING || OP(n) == NOTHING) {
2093 OP(n) = OPTIMIZED;
2094 NEXT_OFF(n) = 0;
2095 }
2096 n++;
2097 }
2098#endif
2099 DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
2100 return stopnow;
2101}
2102
653099ff
GS
2103/* REx optimizer. Converts nodes into quickier variants "in place".
2104 Finds fixed substrings. */
2105
a0288114 2106/* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
c277df42
IZ
2107 to the position after last scanned or to NULL. */
2108
07be1b83
YO
2109
2110
76e3520e 2111STATIC I32
9a957fbc
AL
2112S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap,
2113 regnode *last, scan_data_t *data, U32 flags, U32 depth)
c277df42
IZ
2114 /* scanp: Start here (read-write). */
2115 /* deltap: Write maxlen-minlen here. */
2116 /* last: Stop before this one. */
2117{
97aff369 2118 dVAR;
c277df42
IZ
2119 I32 min = 0, pars = 0, code;
2120 regnode *scan = *scanp, *next;
2121 I32 delta = 0;
2122 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
aca2d497 2123 int is_inf_internal = 0; /* The studied chunk is infinite */
c277df42
IZ
2124 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
2125 scan_data_t data_fake;
653099ff 2126 struct regnode_charclass_class and_with; /* Valid if flags & SCF_DO_STCLASS_OR */
a3621e74 2127 SV *re_trie_maxbuff = NULL;
786e8c11
YO
2128 regnode *first_non_open = scan;
2129
a3621e74
YO
2130
2131 GET_RE_DEBUG_FLAGS_DECL;
13a24bad
YO
2132#ifdef DEBUGGING
2133 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
2134#endif
786e8c11
YO
2135 if ( depth == 0 ) {
2136 while (first_non_open && OP(first_non_open) == OPEN)
2137 first_non_open=regnext(first_non_open);
2138 }
2139
b81d288d 2140
c277df42
IZ
2141 while (scan && OP(scan) != END && scan < last) {
2142 /* Peephole optimizer: */
07be1b83 2143 DEBUG_PEEP("Peep",scan,depth);
a3621e74 2144
07be1b83 2145 JOIN_EXACT(scan,&min,0);
a3621e74 2146
653099ff
GS
2147 /* Follow the next-chain of the current node and optimize
2148 away all the NOTHINGs from it. */
c277df42 2149 if (OP(scan) != CURLYX) {
a3b680e6 2150 const int max = (reg_off_by_arg[OP(scan)]
048cfca1
GS
2151 ? I32_MAX
2152 /* I32 may be smaller than U16 on CRAYs! */
2153 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
c277df42
IZ
2154 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
2155 int noff;
2156 regnode *n = scan;
b81d288d 2157
c277df42
IZ
2158 /* Skip NOTHING and LONGJMP. */
2159 while ((n = regnext(n))
3dab1dad 2160 && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
c277df42
IZ
2161 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
2162 && off + noff < max)
2163 off += noff;
2164 if (reg_off_by_arg[OP(scan)])
2165 ARG(scan) = off;
b81d288d 2166 else
c277df42
IZ
2167 NEXT_OFF(scan) = off;
2168 }
a3621e74 2169
07be1b83 2170
3dab1dad 2171
653099ff
GS
2172 /* The principal pseudo-switch. Cannot be a switch, since we
2173 look into several different things. */
b81d288d 2174 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
c277df42
IZ
2175 || OP(scan) == IFTHEN || OP(scan) == SUSPEND) {
2176 next = regnext(scan);
2177 code = OP(scan);
a3621e74 2178 /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
b81d288d
AB
2179
2180 if (OP(next) == code || code == IFTHEN || code == SUSPEND) {
786e8c11
YO
2181 /* NOTE - There is similar code to this block below for handling
2182 TRIE nodes on a re-study. If you change stuff here check there
2183 too. */
c277df42 2184 I32 max1 = 0, min1 = I32_MAX, num = 0;
653099ff 2185 struct regnode_charclass_class accum;
d4c19fe8 2186 regnode * const startbranch=scan;
c277df42 2187
653099ff 2188 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
830247a4 2189 scan_commit(pRExC_state, data); /* Cannot merge strings after this. */
653099ff 2190 if (flags & SCF_DO_STCLASS)
830247a4 2191 cl_init_zero(pRExC_state, &accum);
a3621e74 2192
c277df42 2193 while (OP(scan) == code) {
830247a4 2194 I32 deltanext, minnext, f = 0, fake;
653099ff 2195 struct regnode_charclass_class this_class;
c277df42
IZ
2196
2197 num++;
2198 data_fake.flags = 0;
b81d288d 2199 if (data) {
2c2d71f5 2200 data_fake.whilem_c = data->whilem_c;
cb434fcc
IZ
2201 data_fake.last_closep = data->last_closep;
2202 }
2203 else
2204 data_fake.last_closep = &fake;
c277df42
IZ
2205 next = regnext(scan);
2206 scan = NEXTOPER(scan);
2207 if (code != BRANCH)
2208 scan = NEXTOPER(scan);
653099ff 2209 if (flags & SCF_DO_STCLASS) {
830247a4 2210 cl_init(pRExC_state, &this_class);
653099ff
GS
2211 data_fake.start_class = &this_class;
2212 f = SCF_DO_STCLASS_AND;
b81d288d 2213 }
e1901655
IZ
2214 if (flags & SCF_WHILEM_VISITED_POS)
2215 f |= SCF_WHILEM_VISITED_POS;
a3621e74 2216
653099ff 2217 /* we suppose the run is continuous, last=next...*/
830247a4 2218 minnext = study_chunk(pRExC_state, &scan, &deltanext,
a3621e74 2219 next, &data_fake, f,depth+1);
b81d288d 2220 if (min1 > minnext)
c277df42
IZ
2221 min1 = minnext;
2222 if (max1 < minnext + deltanext)
2223 max1 = minnext + deltanext;
2224 if (deltanext == I32_MAX)
aca2d497 2225 is_inf = is_inf_internal = 1;
c277df42
IZ
2226 scan = next;
2227 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
2228 pars++;
3dab1dad
YO
2229 if (data) {
2230 if (data_fake.flags & SF_HAS_EVAL)
07be1b83 2231 data->flags |= SF_HAS_EVAL;
2c2d71f5 2232 data->whilem_c = data_fake.whilem_c;
3dab1dad 2233 }
653099ff 2234 if (flags & SCF_DO_STCLASS)
830247a4 2235 cl_or(pRExC_state, &accum, &this_class);
b81d288d 2236 if (code == SUSPEND)
c277df42
IZ
2237 break;
2238 }
2239 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
2240 min1 = 0;
2241 if (flags & SCF_DO_SUBSTR) {
2242 data->pos_min += min1;
2243 data->pos_delta += max1 - min1;
2244 if (max1 != min1 || is_inf)
2245 data->longest = &(data->longest_float);
2246 }
2247 min += min1;
2248 delta += max1 - min1;
653099ff 2249 if (flags & SCF_DO_STCLASS_OR) {
830247a4 2250 cl_or(pRExC_state, data->start_class, &accum);
653099ff
GS
2251 if (min1) {
2252 cl_and(data->start_class, &and_with);
2253 flags &= ~SCF_DO_STCLASS;
2254 }
2255 }
2256 else if (flags & SCF_DO_STCLASS_AND) {
de0c8cb8
GS
2257 if (min1) {
2258 cl_and(data->start_class, &accum);
653099ff 2259 flags &= ~SCF_DO_STCLASS;
de0c8cb8
GS
2260 }
2261 else {
b81d288d 2262 /* Switch to OR mode: cache the old value of
de0c8cb8
GS
2263 * data->start_class */
2264 StructCopy(data->start_class, &and_with,
2265 struct regnode_charclass_class);
2266 flags &= ~SCF_DO_STCLASS_AND;
2267 StructCopy(&accum, data->start_class,
2268 struct regnode_charclass_class);
2269 flags |= SCF_DO_STCLASS_OR;
2270 data->start_class->flags |= ANYOF_EOS;
2271 }
653099ff 2272 }
a3621e74 2273
786e8c11 2274 if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) {
a3621e74
YO
2275 /* demq.
2276
2277 Assuming this was/is a branch we are dealing with: 'scan' now
2278 points at the item that follows the branch sequence, whatever
2279 it is. We now start at the beginning of the sequence and look
2280 for subsequences of
2281
786e8c11
YO
2282 BRANCH->EXACT=>x1
2283 BRANCH->EXACT=>x2
2284 tail
a3621e74
YO
2285
2286 which would be constructed from a pattern like /A|LIST|OF|WORDS/
2287
2288 If we can find such a subseqence we need to turn the first
2289 element into a trie and then add the subsequent branch exact
2290 strings to the trie.
2291
2292 We have two cases
2293
786e8c11 2294 1. patterns where the whole set of branch can be converted.
a3621e74 2295
786e8c11 2296 2. patterns where only a subset can be converted.
a3621e74
YO
2297
2298 In case 1 we can replace the whole set with a single regop
2299 for the trie. In case 2 we need to keep the start and end
2300 branchs so
2301
2302 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
2303 becomes BRANCH TRIE; BRANCH X;
2304
786e8c11
YO
2305 There is an additional case, that being where there is a
2306 common prefix, which gets split out into an EXACT like node
2307 preceding the TRIE node.
2308
2309 If x(1..n)==tail then we can do a simple trie, if not we make
2310 a "jump" trie, such that when we match the appropriate word
2311 we "jump" to the appopriate tail node. Essentailly we turn
2312 a nested if into a case structure of sorts.
a3621e74
YO
2313
2314 */
786e8c11 2315
3dab1dad 2316 int made=0;
0111c4fd
RGS
2317 if (!re_trie_maxbuff) {
2318 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
2319 if (!SvIOK(re_trie_maxbuff))
2320 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2321 }
786e8c11 2322 if ( SvIV(re_trie_maxbuff)>=0 ) {
a3621e74
YO
2323 regnode *cur;
2324 regnode *first = (regnode *)NULL;
2325 regnode *last = (regnode *)NULL;
2326 regnode *tail = scan;
2327 U8 optype = 0;
2328 U32 count=0;
2329
2330#ifdef DEBUGGING
c445ea15 2331 SV * const mysv = sv_newmortal(); /* for dumping */
a3621e74
YO
2332#endif
2333 /* var tail is used because there may be a TAIL
2334 regop in the way. Ie, the exacts will point to the
2335 thing following the TAIL, but the last branch will
2336 point at the TAIL. So we advance tail. If we
2337 have nested (?:) we may have to move through several
2338 tails.
2339 */
2340
2341 while ( OP( tail ) == TAIL ) {
2342 /* this is the TAIL generated by (?:) */
2343 tail = regnext( tail );
2344 }
2345
3dab1dad 2346
a3621e74 2347 DEBUG_OPTIMISE_r({
32fc9b6a 2348 regprop(RExC_rx, mysv, tail );
3dab1dad
YO
2349 PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
2350 (int)depth * 2 + 2, "",
2351 "Looking for TRIE'able sequences. Tail node is: ",
2352 SvPV_nolen_const( mysv )
a3621e74
YO
2353 );
2354 });
3dab1dad 2355
a3621e74
YO
2356 /*
2357
2358 step through the branches, cur represents each
2359 branch, noper is the first thing to be matched
2360 as part of that branch and noper_next is the
2361 regnext() of that node. if noper is an EXACT
2362 and noper_next is the same as scan (our current
2363 position in the regex) then the EXACT branch is
2364 a possible optimization target. Once we have
2365 two or more consequetive such branches we can
2366 create a trie of the EXACT's contents and stich
2367 it in place. If the sequence represents all of
2368 the branches we eliminate the whole thing and
2369 replace it with a single TRIE. If it is a
2370 subsequence then we need to stitch it in. This
2371 means the first branch has to remain, and needs
2372 to be repointed at the item on the branch chain
2373 following the last branch optimized. This could
2374 be either a BRANCH, in which case the
2375 subsequence is internal, or it could be the
2376 item following the branch sequence in which
2377 case the subsequence is at the end.
2378
2379 */
2380
2381 /* dont use tail as the end marker for this traverse */
2382 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
aec46f14
AL
2383 regnode * const noper = NEXTOPER( cur );
2384 regnode * const noper_next = regnext( noper );
a3621e74 2385
a3621e74 2386 DEBUG_OPTIMISE_r({
32fc9b6a 2387 regprop(RExC_rx, mysv, cur);
3dab1dad
YO
2388 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
2389 (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
a3621e74 2390
32fc9b6a 2391 regprop(RExC_rx, mysv, noper);
a3621e74 2392 PerlIO_printf( Perl_debug_log, " -> %s",
cfd0369c 2393 SvPV_nolen_const(mysv));
a3621e74
YO
2394
2395 if ( noper_next ) {
32fc9b6a 2396 regprop(RExC_rx, mysv, noper_next );
a3621e74 2397 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
cfd0369c 2398 SvPV_nolen_const(mysv));
a3621e74 2399 }
3dab1dad
YO
2400 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d)\n",
2401 REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur) );
a3621e74 2402 });
3dab1dad
YO
2403 if ( (((first && optype!=NOTHING) ? OP( noper ) == optype
2404 : PL_regkind[ OP( noper ) ] == EXACT )
2405 || OP(noper) == NOTHING )
786e8c11
YO
2406#ifdef NOJUMPTRIE
2407 && noper_next == tail
2408#endif
2409 && count < U16_MAX)
a3621e74
YO
2410 {
2411 count++;
3dab1dad
YO
2412 if ( !first || optype == NOTHING ) {
2413 if (!first) first = cur;
a3621e74
YO
2414 optype = OP( noper );
2415 } else {
a3621e74 2416 last = cur;
a3621e74
YO
2417 }
2418 } else {
2419 if ( last ) {
786e8c11
YO
2420 make_trie( pRExC_state,
2421 startbranch, first, cur, tail, count,
2422 optype, depth+1 );
a3621e74 2423 }
3dab1dad 2424 if ( PL_regkind[ OP( noper ) ] == EXACT
786e8c11
YO
2425#ifdef NOJUMPTRIE
2426 && noper_next == tail
2427#endif
2428 ){
a3621e74
YO
2429 count = 1;
2430 first = cur;
2431 optype = OP( noper );
2432 } else {
2433 count = 0;
2434 first = NULL;
2435 optype = 0;
2436 }
2437 last = NULL;
2438 }
2439 }
2440 DEBUG_OPTIMISE_r({
32fc9b6a 2441 regprop(RExC_rx, mysv, cur);
a3621e74 2442 PerlIO_printf( Perl_debug_log,
3dab1dad
YO
2443 "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
2444 "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
a3621e74
YO
2445
2446 });
2447 if ( last ) {
786e8c11 2448 made= make_trie( pRExC_state, startbranch, first, scan, tail, count, optype, depth+1 );
3dab1dad 2449#ifdef TRIE_STUDY_OPT
786e8c11
YO
2450 if ( ((made == MADE_EXACT_TRIE &&
2451 startbranch == first)
2452 || ( first_non_open == first )) &&
2453 depth==0 )
2454 flags |= SCF_TRIE_RESTUDY;
3dab1dad 2455#endif
07be1b83 2456 }
a3621e74 2457 }
3dab1dad
YO
2458
2459 } /* do trie */
786e8c11 2460
a0ed51b3 2461 }
a3621e74 2462 else if ( code == BRANCHJ ) { /* single branch is optimized. */
c277df42 2463 scan = NEXTOPER(NEXTOPER(scan));
a3621e74 2464 } else /* single branch is optimized. */
c277df42
IZ
2465 scan = NEXTOPER(scan);
2466 continue;
a0ed51b3
LW
2467 }
2468 else if (OP(scan) == EXACT) {
cd439c50 2469 I32 l = STR_LEN(scan);
c445ea15 2470 UV uc;
a0ed51b3 2471 if (UTF) {
a3b680e6 2472 const U8 * const s = (U8*)STRING(scan);
1aa99e6b 2473 l = utf8_length(s, s + l);
9041c2e3 2474 uc = utf8_to_uvchr(s, NULL);
c445ea15
AL
2475 } else {
2476 uc = *((U8*)STRING(scan));
a0ed51b3
LW
2477 }
2478 min += l;
c277df42 2479 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
c277df42
IZ
2480 /* The code below prefers earlier match for fixed
2481 offset, later match for variable offset. */
2482 if (data->last_end == -1) { /* Update the start info. */
2483 data->last_start_min = data->pos_min;
2484 data->last_start_max = is_inf
b81d288d 2485 ? I32_MAX : data->pos_min + data->pos_delta;
c277df42 2486 }
cd439c50 2487 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
45f47268
NC
2488 if (UTF)
2489 SvUTF8_on(data->last_found);
0eda9292 2490 {
9a957fbc 2491 SV * const sv = data->last_found;
a28509cc 2492 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
0eda9292
JH
2493 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2494 if (mg && mg->mg_len >= 0)
5e43f467
JH
2495 mg->mg_len += utf8_length((U8*)STRING(scan),
2496 (U8*)STRING(scan)+STR_LEN(scan));
0eda9292 2497 }
c277df42
IZ
2498 data->last_end = data->pos_min + l;
2499 data->pos_min += l; /* As in the first entry. */
2500 data->flags &= ~SF_BEFORE_EOL;
2501 }
653099ff
GS
2502 if (flags & SCF_DO_STCLASS_AND) {
2503 /* Check whether it is compatible with what we know already! */
2504 int compat = 1;
2505
1aa99e6b 2506 if (uc >= 0x100 ||
516a5887 2507 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
1aa99e6b 2508 && !ANYOF_BITMAP_TEST(data->start_class, uc)
653099ff 2509 && (!(data->start_class->flags & ANYOF_FOLD)
1aa99e6b 2510 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
516a5887 2511 )
653099ff
GS
2512 compat = 0;
2513 ANYOF_CLASS_ZERO(data->start_class);
2514 ANYOF_BITMAP_ZERO(data->start_class);
2515 if (compat)
1aa99e6b 2516 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff 2517 data->start_class->flags &= ~ANYOF_EOS;
9b877dbb
IH
2518 if (uc < 0x100)
2519 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
653099ff
GS
2520 }
2521 else if (flags & SCF_DO_STCLASS_OR) {
2522 /* false positive possible if the class is case-folded */
1aa99e6b 2523 if (uc < 0x100)
9b877dbb
IH
2524 ANYOF_BITMAP_SET(data->start_class, uc);
2525 else
2526 data->start_class->flags |= ANYOF_UNICODE_ALL;
653099ff
GS
2527 data->start_class->flags &= ~ANYOF_EOS;
2528 cl_and(data->start_class, &and_with);
2529 }
2530 flags &= ~SCF_DO_STCLASS;
a0ed51b3 2531 }
3dab1dad 2532 else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
cd439c50 2533 I32 l = STR_LEN(scan);
1aa99e6b 2534 UV uc = *((U8*)STRING(scan));
653099ff
GS
2535
2536 /* Search for fixed substrings supports EXACT only. */
ecaa9b9c
NC
2537 if (flags & SCF_DO_SUBSTR) {
2538 assert(data);
830247a4 2539 scan_commit(pRExC_state, data);
ecaa9b9c 2540 }
a0ed51b3 2541 if (UTF) {
6136c704 2542 const U8 * const s = (U8 *)STRING(scan);
1aa99e6b 2543 l = utf8_length(s, s + l);
9041c2e3 2544 uc = utf8_to_uvchr(s, NULL);
a0ed51b3
LW
2545 }
2546 min += l;
ecaa9b9c 2547 if (flags & SCF_DO_SUBSTR)
a0ed51b3 2548 data->pos_min += l;
653099ff
GS
2549 if (flags & SCF_DO_STCLASS_AND) {
2550 /* Check whether it is compatible with what we know already! */
2551 int compat = 1;
2552
1aa99e6b 2553 if (uc >= 0x100 ||
516a5887 2554 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
1aa99e6b 2555 && !ANYOF_BITMAP_TEST(data->start_class, uc)
516a5887 2556 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
653099ff
GS
2557 compat = 0;
2558 ANYOF_CLASS_ZERO(data->start_class);
2559 ANYOF_BITMAP_ZERO(data->start_class);
2560 if (compat) {
1aa99e6b 2561 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff
GS
2562 data->start_class->flags &= ~ANYOF_EOS;
2563 data->start_class->flags |= ANYOF_FOLD;
2564 if (OP(scan) == EXACTFL)
2565 data->start_class->flags |= ANYOF_LOCALE;
2566 }
2567 }
2568 else if (flags & SCF_DO_STCLASS_OR) {
2569 if (data->start_class->flags & ANYOF_FOLD) {
2570 /* false positive possible if the class is case-folded.
2571 Assume that the locale settings are the same... */
1aa99e6b
IH
2572 if (uc < 0x100)
2573 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff
GS
2574 data->start_class->flags &= ~ANYOF_EOS;
2575 }
2576 cl_and(data->start_class, &and_with);
2577 }
2578 flags &= ~SCF_DO_STCLASS;
a0ed51b3 2579 }
bfed75c6 2580 else if (strchr((const char*)PL_varies,OP(scan))) {
9c5ffd7c 2581 I32 mincount, maxcount, minnext, deltanext, fl = 0;
aa7a4b56 2582 I32 f = flags, pos_before = 0;
d4c19fe8 2583 regnode * const oscan = scan;
653099ff
GS
2584 struct regnode_charclass_class this_class;
2585 struct regnode_charclass_class *oclass = NULL;
727f22e3 2586 I32 next_is_eval = 0;
653099ff 2587
3dab1dad 2588 switch (PL_regkind[OP(scan)]) {
653099ff 2589 case WHILEM: /* End of (?:...)* . */
c277df42
IZ
2590 scan = NEXTOPER(scan);
2591 goto finish;
2592 case PLUS:
653099ff 2593 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
c277df42 2594 next = NEXTOPER(scan);
653099ff 2595 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
b81d288d
AB
2596 mincount = 1;
2597 maxcount = REG_INFTY;
c277df42
IZ
2598 next = regnext(scan);
2599 scan = NEXTOPER(scan);
2600 goto do_curly;
2601 }
2602 }
2603 if (flags & SCF_DO_SUBSTR)
2604 data->pos_min++;
2605 min++;
2606 /* Fall through. */
2607 case STAR:
653099ff
GS
2608 if (flags & SCF_DO_STCLASS) {
2609 mincount = 0;
b81d288d 2610 maxcount = REG_INFTY;
653099ff
GS
2611 next = regnext(scan);
2612 scan = NEXTOPER(scan);
2613 goto do_curly;
2614 }
b81d288d 2615 is_inf = is_inf_internal = 1;
c277df42
IZ
2616 scan = regnext(scan);
2617 if (flags & SCF_DO_SUBSTR) {
830247a4 2618 scan_commit(pRExC_state, data); /* Cannot extend fixed substrings */
c277df42
IZ
2619 data->longest = &(data->longest_float);
2620 }
2621 goto optimize_curly_tail;
2622 case CURLY:
b81d288d 2623 mincount = ARG1(scan);
c277df42
IZ
2624 maxcount = ARG2(scan);
2625 next = regnext(scan);
cb434fcc
IZ
2626 if (OP(scan) == CURLYX) {
2627 I32 lp = (data ? *(data->last_closep) : 0);
786e8c11 2628 scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
cb434fcc 2629 }
c277df42 2630 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
727f22e3 2631 next_is_eval = (OP(scan) == EVAL);
c277df42
IZ
2632 do_curly:
2633 if (flags & SCF_DO_SUBSTR) {
830247a4 2634 if (mincount == 0) scan_commit(pRExC_state,data); /* Cannot extend fixed substrings */
c277df42
IZ
2635 pos_before = data->pos_min;
2636 }
2637 if (data) {
2638 fl = data->flags;
2639 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
2640 if (is_inf)
2641 data->flags |= SF_IS_INF;
2642 }
653099ff 2643 if (flags & SCF_DO_STCLASS) {
830247a4 2644 cl_init(pRExC_state, &this_class);
653099ff
GS
2645 oclass = data->start_class;
2646 data->start_class = &this_class;
2647 f |= SCF_DO_STCLASS_AND;
2648 f &= ~SCF_DO_STCLASS_OR;
2649 }
e1901655
IZ
2650 /* These are the cases when once a subexpression
2651 fails at a particular position, it cannot succeed
2652 even after backtracking at the enclosing scope.
b81d288d 2653
e1901655
IZ
2654 XXXX what if minimal match and we are at the
2655 initial run of {n,m}? */
2656 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
2657 f &= ~SCF_WHILEM_VISITED_POS;
653099ff 2658
c277df42 2659 /* This will finish on WHILEM, setting scan, or on NULL: */
b81d288d 2660 minnext = study_chunk(pRExC_state, &scan, &deltanext, last, data,
a3621e74
YO
2661 (mincount == 0
2662 ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
653099ff
GS
2663
2664 if (flags & SCF_DO_STCLASS)
2665 data->start_class = oclass;
2666 if (mincount == 0 || minnext == 0) {
2667 if (flags & SCF_DO_STCLASS_OR) {
830247a4 2668 cl_or(pRExC_state, data->start_class, &this_class);
653099ff
GS
2669 }
2670 else if (flags & SCF_DO_STCLASS_AND) {
b81d288d 2671 /* Switch to OR mode: cache the old value of
653099ff
GS
2672 * data->start_class */
2673 StructCopy(data->start_class, &and_with,
2674 struct regnode_charclass_class);
2675 flags &= ~SCF_DO_STCLASS_AND;
2676 StructCopy(&this_class, data->start_class,
2677 struct regnode_charclass_class);
2678 flags |= SCF_DO_STCLASS_OR;
2679 data->start_class->flags |= ANYOF_EOS;
2680 }
2681 } else { /* Non-zero len */
2682 if (flags & SCF_DO_STCLASS_OR) {
830247a4 2683 cl_or(pRExC_state, data->start_class, &this_class);
653099ff
GS
2684 cl_and(data->start_class, &and_with);
2685 }
2686 else if (flags & SCF_DO_STCLASS_AND)
2687 cl_and(data->start_class, &this_class);
2688 flags &= ~SCF_DO_STCLASS;
2689 }
c277df42
IZ
2690 if (!scan) /* It was not CURLYX, but CURLY. */
2691 scan = next;
041457d9
DM
2692 if ( /* ? quantifier ok, except for (?{ ... }) */
2693 (next_is_eval || !(mincount == 0 && maxcount == 1))
84037bb0 2694 && (minnext == 0) && (deltanext == 0)
99799961 2695 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
041457d9
DM
2696 && maxcount <= REG_INFTY/3 /* Complement check for big count */
2697 && ckWARN(WARN_REGEXP))
b45f050a 2698 {
830247a4 2699 vWARN(RExC_parse,
b45f050a
JF
2700 "Quantifier unexpected on zero-length expression");
2701 }
2702
c277df42 2703 min += minnext * mincount;
b81d288d 2704 is_inf_internal |= ((maxcount == REG_INFTY
155aba94
GS
2705 && (minnext + deltanext) > 0)
2706 || deltanext == I32_MAX);
aca2d497 2707 is_inf |= is_inf_internal;
c277df42
IZ
2708 delta += (minnext + deltanext) * maxcount - minnext * mincount;
2709
2710 /* Try powerful optimization CURLYX => CURLYN. */
b81d288d 2711 if ( OP(oscan) == CURLYX && data
c277df42
IZ
2712 && data->flags & SF_IN_PAR
2713 && !(data->flags & SF_HAS_EVAL)
2714 && !deltanext && minnext == 1 ) {
2715 /* Try to optimize to CURLYN. */
2716 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
d4c19fe8 2717 regnode * const nxt1 = nxt;
497b47a8
JH
2718#ifdef DEBUGGING
2719 regnode *nxt2;
2720#endif
c277df42
IZ
2721
2722 /* Skip open. */
2723 nxt = regnext(nxt);
bfed75c6 2724 if (!strchr((const char*)PL_simple,OP(nxt))
3dab1dad 2725 && !(PL_regkind[OP(nxt)] == EXACT
b81d288d 2726 && STR_LEN(nxt) == 1))
c277df42 2727 goto nogo;
497b47a8 2728#ifdef DEBUGGING
c277df42 2729 nxt2 = nxt;
497b47a8 2730#endif
c277df42 2731 nxt = regnext(nxt);
b81d288d 2732 if (OP(nxt) != CLOSE)
c277df42
IZ
2733 goto nogo;
2734 /* Now we know that nxt2 is the only contents: */
eb160463 2735 oscan->flags = (U8)ARG(nxt);
c277df42
IZ
2736 OP(oscan) = CURLYN;
2737 OP(nxt1) = NOTHING; /* was OPEN. */
2738#ifdef DEBUGGING
2739 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
2740 NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
2741 NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
2742 OP(nxt) = OPTIMIZED; /* was CLOSE. */
2743 OP(nxt + 1) = OPTIMIZED; /* was count. */
2744 NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
b81d288d 2745#endif
c277df42 2746 }
c277df42
IZ
2747 nogo:
2748
2749 /* Try optimization CURLYX => CURLYM. */
b81d288d 2750 if ( OP(oscan) == CURLYX && data
c277df42 2751 && !(data->flags & SF_HAS_PAR)
c277df42 2752 && !(data->flags & SF_HAS_EVAL)
0e788c72
HS
2753 && !deltanext /* atom is fixed width */
2754 && minnext != 0 /* CURLYM can't handle zero width */
2755 ) {
c277df42
IZ
2756 /* XXXX How to optimize if data == 0? */
2757 /* Optimize to a simpler form. */
2758 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
2759 regnode *nxt2;
2760
2761 OP(oscan) = CURLYM;
2762 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
b81d288d 2763 && (OP(nxt2) != WHILEM))
c277df42
IZ
2764 nxt = nxt2;
2765 OP(nxt2) = SUCCEED; /* Whas WHILEM */
c277df42
IZ
2766 /* Need to optimize away parenths. */
2767 if (data->flags & SF_IN_PAR) {
2768 /* Set the parenth number. */
2769 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
2770
b81d288d 2771 if (OP(nxt) != CLOSE)
b45f050a 2772 FAIL("Panic opt close");
eb160463 2773 oscan->flags = (U8)ARG(nxt);
c277df42
IZ
2774 OP(nxt1) = OPTIMIZED; /* was OPEN. */
2775 OP(nxt) = OPTIMIZED; /* was CLOSE. */
2776#ifdef DEBUGGING
2777 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
2778 OP(nxt + 1) = OPTIMIZED; /* was count. */
2779 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
2780 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
b81d288d 2781#endif
c277df42
IZ
2782#if 0
2783 while ( nxt1 && (OP(nxt1) != WHILEM)) {
2784 regnode *nnxt = regnext(nxt1);
b81d288d 2785
c277df42
IZ
2786 if (nnxt == nxt) {
2787 if (reg_off_by_arg[OP(nxt1)])
2788 ARG_SET(nxt1, nxt2 - nxt1);
2789 else if (nxt2 - nxt1 < U16_MAX)
2790 NEXT_OFF(nxt1) = nxt2 - nxt1;
2791 else
2792 OP(nxt) = NOTHING; /* Cannot beautify */
2793 }
2794 nxt1 = nnxt;
2795 }
2796#endif
2797 /* Optimize again: */
b81d288d 2798 study_chunk(pRExC_state, &nxt1, &deltanext, nxt,
a3621e74 2799 NULL, 0,depth+1);
a0ed51b3
LW
2800 }
2801 else
c277df42 2802 oscan->flags = 0;
c277df42 2803 }
e1901655
IZ
2804 else if ((OP(oscan) == CURLYX)
2805 && (flags & SCF_WHILEM_VISITED_POS)
2806 /* See the comment on a similar expression above.
2807 However, this time it not a subexpression
2808 we care about, but the expression itself. */
2809 && (maxcount == REG_INFTY)
2810 && data && ++data->whilem_c < 16) {
2811 /* This stays as CURLYX, we can put the count/of pair. */
2c2d71f5
JH
2812 /* Find WHILEM (as in regexec.c) */
2813 regnode *nxt = oscan + NEXT_OFF(oscan);
2814
2815 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
2816 nxt += ARG(nxt);
eb160463
GS
2817 PREVOPER(nxt)->flags = (U8)(data->whilem_c
2818 | (RExC_whilem_seen << 4)); /* On WHILEM */
2c2d71f5 2819 }
b81d288d 2820 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
c277df42
IZ
2821 pars++;
2822 if (flags & SCF_DO_SUBSTR) {
c445ea15 2823 SV *last_str = NULL;
c277df42
IZ
2824 int counted = mincount != 0;
2825
2826 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
5d1c421c
JH
2827#if defined(SPARC64_GCC_WORKAROUND)
2828 I32 b = 0;
2829 STRLEN l = 0;
cfd0369c 2830 const char *s = NULL;
5d1c421c
JH
2831 I32 old = 0;
2832
2833 if (pos_before >= data->last_start_min)
2834 b = pos_before;
2835 else
2836 b = data->last_start_min;
2837
2838 l = 0;
cfd0369c 2839 s = SvPV_const(data->last_found, l);
5d1c421c
JH
2840 old = b - data->last_start_min;
2841
2842#else
b81d288d 2843 I32 b = pos_before >= data->last_start_min
c277df42
IZ
2844 ? pos_before : data->last_start_min;
2845 STRLEN l;
d4c19fe8 2846 const char * const s = SvPV_const(data->last_found, l);
a0ed51b3 2847 I32 old = b - data->last_start_min;
5d1c421c 2848#endif
a0ed51b3
LW
2849
2850 if (UTF)
2851 old = utf8_hop((U8*)s, old) - (U8*)s;
c277df42 2852
a0ed51b3 2853 l -= old;
c277df42 2854 /* Get the added string: */
79cb57f6 2855 last_str = newSVpvn(s + old, l);
0e933229
IH
2856 if (UTF)
2857 SvUTF8_on(last_str);
c277df42
IZ
2858 if (deltanext == 0 && pos_before == b) {
2859 /* What was added is a constant string */
2860 if (mincount > 1) {
2861 SvGROW(last_str, (mincount * l) + 1);
b81d288d 2862 repeatcpy(SvPVX(last_str) + l,
3f7c398e 2863 SvPVX_const(last_str), l, mincount - 1);
b162af07 2864 SvCUR_set(last_str, SvCUR(last_str) * mincount);
c277df42 2865 /* Add additional parts. */
b81d288d 2866 SvCUR_set(data->last_found,
c277df42
IZ
2867 SvCUR(data->last_found) - l);
2868 sv_catsv(data->last_found, last_str);
0eda9292
JH
2869 {
2870 SV * sv = data->last_found;
2871 MAGIC *mg =
2872 SvUTF8(sv) && SvMAGICAL(sv) ?
2873 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2874 if (mg && mg->mg_len >= 0)
2875 mg->mg_len += CHR_SVLEN(last_str);
2876 }
c277df42
IZ
2877 data->last_end += l * (mincount - 1);
2878 }
2a8d9689
HS
2879 } else {
2880 /* start offset must point into the last copy */
2881 data->last_start_min += minnext * (mincount - 1);
c152dc43
HS
2882 data->last_start_max += is_inf ? I32_MAX
2883 : (maxcount - 1) * (minnext + data->pos_delta);
c277df42
IZ
2884 }
2885 }
2886 /* It is counted once already... */
2887 data->pos_min += minnext * (mincount - counted);
2888 data->pos_delta += - counted * deltanext +
2889 (minnext + deltanext) * maxcount - minnext * mincount;
2890 if (mincount != maxcount) {
653099ff
GS
2891 /* Cannot extend fixed substrings found inside
2892 the group. */
830247a4 2893 scan_commit(pRExC_state,data);
c277df42 2894 if (mincount && last_str) {
d4c19fe8
AL
2895 SV * const sv = data->last_found;
2896 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
45f47268
NC
2897 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2898
2899 if (mg)
2900 mg->mg_len = -1;
2901 sv_setsv(sv, last_str);
c277df42 2902 data->last_end = data->pos_min;
b81d288d 2903 data->last_start_min =
a0ed51b3 2904 data->pos_min - CHR_SVLEN(last_str);
b81d288d
AB
2905 data->last_start_max = is_inf
2906 ? I32_MAX
c277df42 2907 : data->pos_min + data->pos_delta
a0ed51b3 2908 - CHR_SVLEN(last_str);
c277df42
IZ
2909 }
2910 data->longest = &(data->longest_float);
2911 }
aca2d497 2912 SvREFCNT_dec(last_str);
c277df42 2913 }
405ff068 2914 if (data && (fl & SF_HAS_EVAL))
c277df42
IZ
2915 data->flags |= SF_HAS_EVAL;
2916 optimize_curly_tail:
c277df42 2917 if (OP(oscan) != CURLYX) {
3dab1dad 2918 while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
c277df42
IZ
2919 && NEXT_OFF(next))
2920 NEXT_OFF(oscan) += NEXT_OFF(next);
2921 }
c277df42 2922 continue;
653099ff 2923 default: /* REF and CLUMP only? */
c277df42 2924 if (flags & SCF_DO_SUBSTR) {
830247a4 2925 scan_commit(pRExC_state,data); /* Cannot expect anything... */
c277df42
IZ
2926 data->longest = &(data->longest_float);
2927 }
aca2d497 2928 is_inf = is_inf_internal = 1;
653099ff 2929 if (flags & SCF_DO_STCLASS_OR)
830247a4 2930 cl_anything(pRExC_state, data->start_class);
653099ff 2931 flags &= ~SCF_DO_STCLASS;
c277df42
IZ
2932 break;
2933 }
a0ed51b3 2934 }
bfed75c6 2935 else if (strchr((const char*)PL_simple,OP(scan))) {
9c5ffd7c 2936 int value = 0;
653099ff 2937
c277df42 2938 if (flags & SCF_DO_SUBSTR) {
830247a4 2939 scan_commit(pRExC_state,data);
c277df42
IZ
2940 data->pos_min++;
2941 }
2942 min++;
653099ff
GS
2943 if (flags & SCF_DO_STCLASS) {
2944 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
2945
2946 /* Some of the logic below assumes that switching
2947 locale on will only add false positives. */
3dab1dad 2948 switch (PL_regkind[OP(scan)]) {
653099ff 2949 case SANY:
653099ff
GS
2950 default:
2951 do_default:
2952 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
2953 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
830247a4 2954 cl_anything(pRExC_state, data->start_class);
653099ff
GS
2955 break;
2956 case REG_ANY:
2957 if (OP(scan) == SANY)
2958 goto do_default;
2959 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
2960 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
2961 || (data->start_class->flags & ANYOF_CLASS));
830247a4 2962 cl_anything(pRExC_state, data->start_class);
653099ff
GS
2963 }
2964 if (flags & SCF_DO_STCLASS_AND || !value)
2965 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
2966 break;
2967 case ANYOF:
2968 if (flags & SCF_DO_STCLASS_AND)
2969 cl_and(data->start_class,
2970 (struct regnode_charclass_class*)scan);
2971 else
830247a4 2972 cl_or(pRExC_state, data->start_class,
653099ff
GS
2973 (struct regnode_charclass_class*)scan);
2974 break;
2975 case ALNUM:
2976 if (flags & SCF_DO_STCLASS_AND) {
2977 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2978 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
2979 for (value = 0; value < 256; value++)
2980 if (!isALNUM(value))
2981 ANYOF_BITMAP_CLEAR(data->start_class, value);
2982 }
2983 }
2984 else {
2985 if (data->start_class->flags & ANYOF_LOCALE)
2986 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
2987 else {
2988 for (value = 0; value < 256; value++)
2989 if (isALNUM(value))
b81d288d 2990 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
2991 }
2992 }
2993 break;
2994 case ALNUML:
2995 if (flags & SCF_DO_STCLASS_AND) {
2996 if (data->start_class->flags & ANYOF_LOCALE)
2997 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
2998 }
2999 else {
3000 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3001 data->start_class->flags |= ANYOF_LOCALE;
3002 }
3003 break;
3004 case NALNUM:
3005 if (flags & SCF_DO_STCLASS_AND) {
3006 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3007 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
3008 for (value = 0; value < 256; value++)
3009 if (isALNUM(value))
3010 ANYOF_BITMAP_CLEAR(data->start_class, value);
3011 }
3012 }
3013 else {
3014 if (data->start_class->flags & ANYOF_LOCALE)
3015 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3016 else {
3017 for (value = 0; value < 256; value++)
3018 if (!isALNUM(value))
b81d288d 3019 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
3020 }
3021 }
3022 break;
3023 case NALNUML:
3024 if (flags & SCF_DO_STCLASS_AND) {
3025 if (data->start_class->flags & ANYOF_LOCALE)
3026 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
3027 }
3028 else {
3029 data->start_class->flags |= ANYOF_LOCALE;
3030 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3031 }
3032 break;
3033 case SPACE:
3034 if (flags & SCF_DO_STCLASS_AND) {
3035 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3036 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3037 for (value = 0; value < 256; value++)
3038 if (!isSPACE(value))
3039 ANYOF_BITMAP_CLEAR(data->start_class, value);
3040 }
3041 }
3042 else {
3043 if (data->start_class->flags & ANYOF_LOCALE)
3044 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3045 else {
3046 for (value = 0; value < 256; value++)
3047 if (isSPACE(value))
b81d288d 3048 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
3049 }
3050 }
3051 break;
3052 case SPACEL:
3053 if (flags & SCF_DO_STCLASS_AND) {
3054 if (data->start_class->flags & ANYOF_LOCALE)
3055 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3056 }
3057 else {
3058 data->start_class->flags |= ANYOF_LOCALE;
3059 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3060 }
3061 break;
3062 case NSPACE:
3063 if (flags & SCF_DO_STCLASS_AND) {
3064 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3065 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3066 for (value = 0; value < 256; value++)
3067 if (isSPACE(value))
3068 ANYOF_BITMAP_CLEAR(data->start_class, value);
3069 }
3070 }
3071 else {
3072 if (data->start_class->flags & ANYOF_LOCALE)
3073 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3074 else {
3075 for (value = 0; value < 256; value++)
3076 if (!isSPACE(value))
b81d288d 3077 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
3078 }
3079 }
3080 break;
3081 case NSPACEL:
3082 if (flags & SCF_DO_STCLASS_AND) {
3083 if (data->start_class->flags & ANYOF_LOCALE) {
3084 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3085 for (value = 0; value < 256; value++)
3086 if (!isSPACE(value))
3087 ANYOF_BITMAP_CLEAR(data->start_class, value);
3088 }
3089 }
3090 else {
3091 data->start_class->flags |= ANYOF_LOCALE;
3092 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3093 }
3094 break;
3095 case DIGIT:
3096 if (flags & SCF_DO_STCLASS_AND) {
3097 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
3098 for (value = 0; value < 256; value++)
3099 if (!isDIGIT(value))
3100 ANYOF_BITMAP_CLEAR(data->start_class, value);
3101 }
3102 else {
3103 if (data->start_class->flags & ANYOF_LOCALE)
3104 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
3105 else {
3106 for (value = 0; value < 256; value++)
3107 if (isDIGIT(value))
b81d288d 3108 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
3109 }
3110 }
3111 break;
3112 case NDIGIT:
3113 if (flags & SCF_DO_STCLASS_AND) {
3114 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
3115 for (value = 0; value < 256; value++)
3116 if (isDIGIT(value))
3117 ANYOF_BITMAP_CLEAR(data->start_class, value);
3118 }
3119 else {
3120 if (data->start_class->flags & ANYOF_LOCALE)
3121 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
3122 else {
3123 for (value = 0; value < 256; value++)
3124 if (!isDIGIT(value))
b81d288d 3125 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
3126 }
3127 }
3128 break;
3129 }
3130 if (flags & SCF_DO_STCLASS_OR)
3131 cl_and(data->start_class, &and_with);
3132 flags &= ~SCF_DO_STCLASS;
3133 }
a0ed51b3 3134 }
3dab1dad 3135 else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
c277df42
IZ
3136 data->flags |= (OP(scan) == MEOL
3137 ? SF_BEFORE_MEOL
3138 : SF_BEFORE_SEOL);
a0ed51b3 3139 }
3dab1dad 3140 else if ( PL_regkind[OP(scan)] == BRANCHJ
653099ff
GS
3141 /* Lookbehind, or need to calculate parens/evals/stclass: */
3142 && (scan->flags || data || (flags & SCF_DO_STCLASS))
c277df42 3143 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
653099ff 3144 /* Lookahead/lookbehind */
cb434fcc 3145 I32 deltanext, minnext, fake = 0;
c277df42 3146 regnode *nscan;
653099ff
GS
3147 struct regnode_charclass_class intrnl;
3148 int f = 0;
c277df42
IZ
3149
3150 data_fake.flags = 0;
b81d288d 3151 if (data) {
2c2d71f5 3152 data_fake.whilem_c = data->whilem_c;
cb434fcc
IZ
3153 data_fake.last_closep = data->last_closep;
3154 }
3155 else
3156 data_fake.last_closep = &fake;
653099ff
GS
3157 if ( flags & SCF_DO_STCLASS && !scan->flags
3158 && OP(scan) == IFMATCH ) { /* Lookahead */
830247a4 3159 cl_init(pRExC_state, &intrnl);
653099ff 3160 data_fake.start_class = &intrnl;
e1901655 3161 f |= SCF_DO_STCLASS_AND;
653099ff 3162 }
e1901655
IZ
3163 if (flags & SCF_WHILEM_VISITED_POS)
3164 f |= SCF_WHILEM_VISITED_POS;
c277df42
IZ
3165 next = regnext(scan);
3166 nscan = NEXTOPER(NEXTOPER(scan));
a3621e74 3167 minnext = study_chunk(pRExC_state, &nscan, &deltanext, last, &data_fake, f,depth+1);
c277df42
IZ
3168 if (scan->flags) {
3169 if (deltanext) {
9baa0206 3170 vFAIL("Variable length lookbehind not implemented");
a0ed51b3 3171 }
786e8c11 3172 else if (minnext > (I32)U8_MAX) {
9baa0206 3173 vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
c277df42 3174 }
eb160463 3175 scan->flags = (U8)minnext;
c277df42 3176 }
be8e71aa
YO
3177 if (data) {
3178 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3179 pars++;
3180 if (data_fake.flags & SF_HAS_EVAL)
3181 data->flags |= SF_HAS_EVAL;
2c2d71f5 3182 data->whilem_c = data_fake.whilem_c;
be8e71aa 3183 }
e1901655 3184 if (f & SCF_DO_STCLASS_AND) {
a28509cc 3185 const int was = (data->start_class->flags & ANYOF_EOS);
653099ff
GS
3186
3187 cl_and(data->start_class, &intrnl);
3188 if (was)
3189 data->start_class->flags |= ANYOF_EOS;
3190 }
a0ed51b3
LW
3191 }
3192 else if (OP(scan) == OPEN) {
c277df42 3193 pars++;
a0ed51b3 3194 }
cb434fcc 3195 else if (OP(scan) == CLOSE) {
eb160463 3196 if ((I32)ARG(scan) == is_par) {
cb434fcc 3197 next = regnext(scan);
c277df42 3198
cb434fcc
IZ
3199 if ( next && (OP(next) != WHILEM) && next < last)
3200 is_par = 0; /* Disable optimization */
3201 }
3202 if (data)
3203 *(data->last_closep) = ARG(scan);
a0ed51b3
LW
3204 }
3205 else if (OP(scan) == EVAL) {
c277df42
IZ
3206 if (data)
3207 data->flags |= SF_HAS_EVAL;
3208 }
96776eda 3209 else if (OP(scan) == LOGICAL && scan->flags == 2) { /* Embedded follows */
0f5d15d6 3210 if (flags & SCF_DO_SUBSTR) {
830247a4 3211 scan_commit(pRExC_state,data);
0f5d15d6
IZ
3212 data->longest = &(data->longest_float);
3213 }
3214 is_inf = is_inf_internal = 1;
653099ff 3215 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
830247a4 3216 cl_anything(pRExC_state, data->start_class);
96776eda 3217 flags &= ~SCF_DO_STCLASS;
0f5d15d6 3218 }
786e8c11
YO
3219#ifdef TRIE_STUDY_OPT
3220#ifdef FULL_TRIE_STUDY
3221 else if (PL_regkind[OP(scan)] == TRIE) {
3222 /* NOTE - There is similar code to this block above for handling
3223 BRANCH nodes on the initial study. If you change stuff here
3224 check there too. */
3225 regnode *tail= regnext(scan);
3226 reg_trie_data *trie = (reg_trie_data*)RExC_rx->data->data[ ARG(scan) ];
3227 I32 max1 = 0, min1 = I32_MAX;
3228 struct regnode_charclass_class accum;
3229
3230 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
3231 scan_commit(pRExC_state, data); /* Cannot merge strings after this. */
3232 if (flags & SCF_DO_STCLASS)
3233 cl_init_zero(pRExC_state, &accum);
3234
3235 if (!trie->jump) {
3236 min1= trie->minlen;
3237 max1= trie->maxlen;
3238 } else {
3239 const regnode *nextbranch= NULL;
3240 U32 word;
3241
3242 for ( word=1 ; word <= trie->wordcount ; word++)
3243 {
3244 I32 deltanext=0, minnext=0, f = 0, fake;
3245 struct regnode_charclass_class this_class;
3246
3247 data_fake.flags = 0;
3248 if (data) {
3249 data_fake.whilem_c = data->whilem_c;
3250 data_fake.last_closep = data->last_closep;
3251 }
3252 else
3253 data_fake.last_closep = &fake;
3254
3255 if (flags & SCF_DO_STCLASS) {
3256 cl_init(pRExC_state, &this_class);
3257 data_fake.start_class = &this_class;
3258 f = SCF_DO_STCLASS_AND;
3259 }
3260 if (flags & SCF_WHILEM_VISITED_POS)
3261 f |= SCF_WHILEM_VISITED_POS;
3262
3263 if (trie->jump[word]) {
3264 if (!nextbranch)
3265 nextbranch = tail - trie->jump[0];
3266 scan= tail - trie->jump[word];
3267 /* We go from the jump point to the branch that follows
3268 it. Note this means we need the vestigal unused branches
3269 even though they arent otherwise used.
3270 */
3271 minnext = study_chunk(pRExC_state, &scan, &deltanext,
3272 (regnode *)nextbranch, &data_fake, f,depth+1);
3273 }
3274 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
3275 nextbranch= regnext((regnode*)nextbranch);
3276
3277 if (min1 > (I32)(minnext + trie->minlen))
3278 min1 = minnext + trie->minlen;
3279 if (max1 < (I32)(minnext + deltanext + trie->maxlen))
3280 max1 = minnext + deltanext + trie->maxlen;
3281 if (deltanext == I32_MAX)
3282 is_inf = is_inf_internal = 1;
3283
3284 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3285 pars++;
3286
3287 if (data) {
3288 if (data_fake.flags & SF_HAS_EVAL)
3289 data->flags |= SF_HAS_EVAL;
3290 data->whilem_c = data_fake.whilem_c;
3291 }
3292 if (flags & SCF_DO_STCLASS)
3293 cl_or(pRExC_state, &accum, &this_class);
3294 }
3295 }
3296 if (flags & SCF_DO_SUBSTR) {
3297 data->pos_min += min1;
3298 data->pos_delta += max1 - min1;
3299 if (max1 != min1 || is_inf)
3300 data->longest = &(data->longest_float);
3301 }
3302 min += min1;
3303 delta += max1 - min1;
3304 if (flags & SCF_DO_STCLASS_OR) {
3305 cl_or(pRExC_state, data->start_class, &accum);
3306 if (min1) {
3307 cl_and(data->start_class, &and_with);
3308 flags &= ~SCF_DO_STCLASS;
3309 }
3310 }
3311 else if (flags & SCF_DO_STCLASS_AND) {
3312 if (min1) {
3313 cl_and(data->start_class, &accum);
3314 flags &= ~SCF_DO_STCLASS;
3315 }
3316 else {
3317 /* Switch to OR mode: cache the old value of
3318 * data->start_class */
3319 StructCopy(data->start_class, &and_with,
3320 struct regnode_charclass_class);
3321 flags &= ~SCF_DO_STCLASS_AND;
3322 StructCopy(&accum, data->start_class,
3323 struct regnode_charclass_class);
3324 flags |= SCF_DO_STCLASS_OR;
3325 data->start_class->flags |= ANYOF_EOS;
3326 }
3327 }
3328 scan= tail;
3329 continue;
3330 }
3331#else
3332 else if (PL_regkind[OP(scan)] == TRIE) {
3333 reg_trie_data *trie = (reg_trie_data*)RExC_rx->data->data[ ARG(scan) ];
3334 U8*bang=NULL;
3335
3336 min += trie->minlen;
3337 delta += (trie->maxlen - trie->minlen);
3338 flags &= ~SCF_DO_STCLASS; /* xxx */
3339 if (flags & SCF_DO_SUBSTR) {
3340 scan_commit(pRExC_state,data); /* Cannot expect anything... */
3341 data->pos_min += trie->minlen;
3342 data->pos_delta += (trie->maxlen - trie->minlen);
3343 if (trie->maxlen != trie->minlen)
3344 data->longest = &(data->longest_float);
3345 }
3346 if (trie->jump) /* no more substrings -- for now /grr*/
3347 flags &= ~SCF_DO_SUBSTR;
3348 }
3349#endif /* old or new */
3350#endif /* TRIE_STUDY_OPT */
c277df42
IZ
3351 /* Else: zero-length, ignore. */
3352 scan = regnext(scan);
3353 }
3354
3355 finish:
3356 *scanp = scan;
aca2d497 3357 *deltap = is_inf_internal ? I32_MAX : delta;
b81d288d 3358 if (flags & SCF_DO_SUBSTR && is_inf)
c277df42 3359 data->pos_delta = I32_MAX - data->pos_min;
786e8c11 3360 if (is_par > (I32)U8_MAX)
c277df42
IZ
3361 is_par = 0;
3362 if (is_par && pars==1 && data) {
3363 data->flags |= SF_IN_PAR;
3364 data->flags &= ~SF_HAS_PAR;
a0ed51b3
LW
3365 }
3366 else if (pars && data) {
c277df42
IZ
3367 data->flags |= SF_HAS_PAR;
3368 data->flags &= ~SF_IN_PAR;
3369 }
653099ff
GS
3370 if (flags & SCF_DO_STCLASS_OR)
3371 cl_and(data->start_class, &and_with);
786e8c11
YO
3372 if (flags & SCF_TRIE_RESTUDY)
3373 data->flags |= SCF_TRIE_RESTUDY;
c277df42
IZ
3374 return min;
3375}
3376
76e3520e 3377STATIC I32
5f66b61c 3378S_add_data(RExC_state_t *pRExC_state, I32 n, const char *s)
c277df42 3379{
830247a4 3380 if (RExC_rx->data) {
b81d288d
AB
3381 Renewc(RExC_rx->data,
3382 sizeof(*RExC_rx->data) + sizeof(void*) * (RExC_rx->data->count + n - 1),
c277df42 3383 char, struct reg_data);
830247a4
IZ
3384 Renew(RExC_rx->data->what, RExC_rx->data->count + n, U8);
3385 RExC_rx->data->count += n;
a0ed51b3
LW
3386 }
3387 else {
a02a5408 3388 Newxc(RExC_rx->data, sizeof(*RExC_rx->data) + sizeof(void*) * (n - 1),
c277df42 3389 char, struct reg_data);
a02a5408 3390 Newx(RExC_rx->data->what, n, U8);
830247a4 3391 RExC_rx->data->count = n;
c277df42 3392 }
830247a4
IZ
3393 Copy(s, RExC_rx->data->what + RExC_rx->data->count - n, n, U8);
3394 return RExC_rx->data->count - n;
c277df42
IZ
3395}
3396
76234dfb 3397#ifndef PERL_IN_XSUB_RE
d88dccdf 3398void
864dbfa3 3399Perl_reginitcolors(pTHX)
d88dccdf 3400{
97aff369 3401 dVAR;
1df70142 3402 const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
d88dccdf 3403 if (s) {
1df70142
AL
3404 char *t = savepv(s);
3405 int i = 0;
3406 PL_colors[0] = t;
d88dccdf 3407 while (++i < 6) {
1df70142
AL
3408 t = strchr(t, '\t');
3409 if (t) {
3410 *t = '\0';
3411 PL_colors[i] = ++t;
d88dccdf
IZ
3412 }
3413 else
1df70142 3414 PL_colors[i] = t = (char *)"";
d88dccdf
IZ
3415 }
3416 } else {
1df70142 3417 int i = 0;
b81d288d 3418 while (i < 6)
06b5626a 3419 PL_colors[i++] = (char *)"";
d88dccdf
IZ
3420 }
3421 PL_colorset = 1;
3422}
76234dfb 3423#endif
8615cb43 3424
07be1b83 3425
786e8c11
YO
3426#ifdef TRIE_STUDY_OPT
3427#define CHECK_RESTUDY_GOTO \
3428 if ( \
3429 (data.flags & SCF_TRIE_RESTUDY) \
3430 && ! restudied++ \
3431 ) goto reStudy
3432#else
3433#define CHECK_RESTUDY_GOTO
3434#endif
a687059c 3435/*
e50aee73 3436 - pregcomp - compile a regular expression into internal code
a687059c
LW
3437 *
3438 * We can't allocate space until we know how big the compiled form will be,
3439 * but we can't compile it (and thus know how big it is) until we've got a
3440 * place to put the code. So we cheat: we compile it twice, once with code
3441 * generation turned off and size counting turned on, and once "for real".
3442 * This also means that we don't allocate space until we are sure that the
3443 * thing really will compile successfully, and we never have to move the
3444 * code and thus invalidate pointers into it. (Note that it has to be in
3445 * one piece because free() must be able to free it all.) [NB: not true in perl]
3446 *
3447 * Beware that the optimization-preparation code in here knows about some
3448 * of the structure of the compiled regexp. [I'll say.]
3449 */
3450regexp *
864dbfa3 3451Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
a687059c 3452{
97aff369 3453 dVAR;
a0d0e21e 3454 register regexp *r;
c277df42 3455 regnode *scan;
c277df42 3456 regnode *first;
a0d0e21e 3457 I32 flags;
a0d0e21e
LW
3458 I32 minlen = 0;
3459 I32 sawplus = 0;
3460 I32 sawopen = 0;
2c2d71f5 3461 scan_data_t data;
830247a4 3462 RExC_state_t RExC_state;
be8e71aa 3463 RExC_state_t * const pRExC_state = &RExC_state;
07be1b83
YO
3464#ifdef TRIE_STUDY_OPT
3465 int restudied= 0;
3466 RExC_state_t copyRExC_state;
3467#endif
a0d0e21e 3468
a3621e74
YO
3469 GET_RE_DEBUG_FLAGS_DECL;
3470
a0d0e21e 3471 if (exp == NULL)
c277df42 3472 FAIL("NULL regexp argument");
a0d0e21e 3473
a5961de5 3474 RExC_utf8 = pm->op_pmdynflags & PMdf_CMP_UTF8;
a0ed51b3 3475
5cfc7842 3476 RExC_precomp = exp;
a3621e74
YO
3477 DEBUG_r(if (!PL_colorset) reginitcolors());
3478 DEBUG_COMPILE_r({
ab3bbdeb
YO
3479 SV *dsv= sv_newmortal();
3480 RE_PV_QUOTED_DECL(s, RExC_utf8,
3481 dsv, RExC_precomp, (xend - exp), 60);
3482 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
3483 PL_colors[4],PL_colors[5],s);
a5961de5 3484 });
e2509266 3485 RExC_flags = pm->op_pmflags;
830247a4 3486 RExC_sawback = 0;
bbce6d69 3487
830247a4
IZ
3488 RExC_seen = 0;
3489 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
3490 RExC_seen_evals = 0;
3491 RExC_extralen = 0;
c277df42 3492
bbce6d69 3493 /* First pass: determine size, legality. */
830247a4 3494 RExC_parse = exp;
fac92740 3495 RExC_start = exp;
830247a4
IZ
3496 RExC_end = xend;
3497 RExC_naughty = 0;
3498 RExC_npar = 1;
3499 RExC_size = 0L;
3500 RExC_emit = &PL_regdummy;
3501 RExC_whilem_seen = 0;
85ddcde9
JH
3502#if 0 /* REGC() is (currently) a NOP at the first pass.
3503 * Clever compilers notice this and complain. --jhi */
830247a4 3504 REGC((U8)REG_MAGIC, (char*)RExC_emit);
85ddcde9 3505#endif
3dab1dad
YO
3506 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n"));
3507 if (reg(pRExC_state, 0, &flags,1) == NULL) {
c445ea15 3508 RExC_precomp = NULL;
a0d0e21e
LW
3509 return(NULL);
3510 }
3dab1dad
YO
3511 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "Required "));
3512 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "size %"IVdf" nodes ", (IV)RExC_size));
3513 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nStarting second pass (creation)\n"));
07be1b83
YO
3514 DEBUG_PARSE_r({
3515 RExC_lastnum=0;
3516 RExC_lastparse=NULL;
3517 });
c277df42 3518
07be1b83 3519
c277df42
IZ
3520 /* Small enough for pointer-storage convention?
3521 If extralen==0, this means that we will not need long jumps. */
830247a4
IZ
3522 if (RExC_size >= 0x10000L && RExC_extralen)
3523 RExC_size += RExC_extralen;
c277df42 3524 else
830247a4
IZ
3525 RExC_extralen = 0;
3526 if (RExC_whilem_seen > 15)
3527 RExC_whilem_seen = 15;
a0d0e21e 3528
bbce6d69 3529 /* Allocate space and initialize. */
a02a5408 3530 Newxc(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode),
c277df42 3531 char, regexp);
a0d0e21e 3532 if (r == NULL)
b45f050a
JF
3533 FAIL("Regexp out of space");
3534
0f79a09d
GS
3535#ifdef DEBUGGING
3536 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
830247a4 3537 Zero(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode), char);
0f79a09d 3538#endif
c277df42 3539 r->refcnt = 1;
bbce6d69 3540 r->prelen = xend - exp;
5cfc7842 3541 r->precomp = savepvn(RExC_precomp, r->prelen);
cf93c79d 3542 r->subbeg = NULL;
f8c7b90f 3543#ifdef PERL_OLD_COPY_ON_WRITE
c445ea15 3544 r->saved_copy = NULL;
ed252734 3545#endif
cf93c79d 3546 r->reganch = pm->op_pmflags & PMf_COMPILETIME;
830247a4 3547 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
38d1b06f 3548 r->lastparen = 0; /* mg.c reads this. */
4327152a
IZ
3549
3550 r->substrs = 0; /* Useful during FAIL. */
3551 r->startp = 0; /* Useful during FAIL. */
3552 r->endp = 0; /* Useful during FAIL. */
3553
a02a5408 3554 Newxz(r->offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
fac92740 3555 if (r->offsets) {
2af232bd 3556 r->offsets[0] = RExC_size;
fac92740 3557 }
a3621e74 3558 DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
2af232bd
SS
3559 "%s %"UVuf" bytes for offset annotations.\n",
3560 r->offsets ? "Got" : "Couldn't get",
392fbf5d 3561 (UV)((2*RExC_size+1) * sizeof(U32))));
fac92740 3562
830247a4 3563 RExC_rx = r;
bbce6d69 3564
3565 /* Second pass: emit code. */
e2509266 3566 RExC_flags = pm->op_pmflags; /* don't let top level (?i) bleed */
830247a4
IZ
3567 RExC_parse = exp;
3568 RExC_end = xend;
3569 RExC_naughty = 0;
3570 RExC_npar = 1;
fac92740 3571 RExC_emit_start = r->program;
830247a4 3572 RExC_emit = r->program;
2cd61cdb 3573 /* Store the count of eval-groups for security checks: */
786e8c11 3574 RExC_emit->next_off = (RExC_seen_evals > (I32)U16_MAX) ? U16_MAX : (U16)RExC_seen_evals;
830247a4 3575 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
c277df42 3576 r->data = 0;
3dab1dad 3577 if (reg(pRExC_state, 0, &flags,1) == NULL)
a0d0e21e 3578 return(NULL);
07be1b83
YO
3579 /* XXXX To minimize changes to RE engine we always allocate
3580 3-units-long substrs field. */
3581 Newx(r->substrs, 1, struct reg_substr_data);
a0d0e21e 3582
07be1b83 3583reStudy:
786e8c11 3584 minlen=sawplus=sawopen=0;
07be1b83
YO
3585 Zero(r->substrs, 1, struct reg_substr_data);
3586 StructCopy(&zero_scan_data, &data, scan_data_t);
a3621e74 3587
07be1b83
YO
3588#ifdef TRIE_STUDY_OPT
3589 if ( restudied ) {
3590 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
3591 RExC_state=copyRExC_state;
3592 if (data.longest_fixed)
3593 SvREFCNT_dec(data.longest_fixed);
3594 if (data.longest_float)
3595 SvREFCNT_dec(data.longest_float);
3596 if (data.last_found)
3597 SvREFCNT_dec(data.last_found);
3598 } else {
3599 copyRExC_state=RExC_state;
3600 }
3601#endif
a0d0e21e 3602 /* Dig out information for optimizations. */
cf93c79d 3603 r->reganch = pm->op_pmflags & PMf_COMPILETIME; /* Again? */
e2509266 3604 pm->op_pmflags = RExC_flags;
a0ed51b3 3605 if (UTF)
5ff6fc6d 3606 r->reganch |= ROPT_UTF8; /* Unicode in it? */
c277df42 3607 r->regstclass = NULL;
830247a4 3608 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
a0ed51b3 3609 r->reganch |= ROPT_NAUGHTY;
c277df42 3610 scan = r->program + 1; /* First BRANCH. */
2779dcf1 3611
653099ff 3612 /* XXXX Should not we check for something else? Usually it is OPEN1... */
c277df42 3613 if (OP(scan) != BRANCH) { /* Only one top-level choice. */
c277df42 3614 I32 fake;
c5254dd6 3615 STRLEN longest_float_length, longest_fixed_length;
07be1b83 3616 struct regnode_charclass_class ch_class; /* pointed to by data */
653099ff 3617 int stclass_flag;
07be1b83 3618 I32 last_close = 0; /* pointed to by data */
a0d0e21e
LW
3619
3620 first = scan;
c277df42 3621 /* Skip introductions and multiplicators >= 1. */
a0d0e21e 3622 while ((OP(first) == OPEN && (sawopen = 1)) ||
653099ff 3623 /* An OR of *one* alternative - should not happen now. */
a0d0e21e 3624 (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) ||
07be1b83
YO
3625 /* for now we can't handle lookbehind IFMATCH*/
3626 (OP(first) == IFMATCH && !first->flags) ||
a0d0e21e
LW
3627 (OP(first) == PLUS) ||
3628 (OP(first) == MINMOD) ||
653099ff 3629 /* An {n,m} with n>0 */
07be1b83
YO
3630 (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) )
3631 {
786e8c11 3632
a0d0e21e
LW
3633 if (OP(first) == PLUS)
3634 sawplus = 1;
3635 else
3dab1dad 3636 first += regarglen[OP(first)];
07be1b83
YO
3637 if (OP(first) == IFMATCH) {
3638 first = NEXTOPER(first);
3639 first += EXTRA_STEP_2ARGS;
7c167cea 3640 } else /* XXX possible optimisation for /(?=)/ */
07be1b83 3641 first = NEXTOPER(first);
a687059c
LW
3642 }
3643
a0d0e21e
LW
3644 /* Starting-point info. */
3645 again:
786e8c11 3646 DEBUG_PEEP("first:",first,0);
07be1b83 3647 /* Ignore EXACT as we deal with it later. */
3dab1dad 3648 if (PL_regkind[OP(first)] == EXACT) {
1aa99e6b 3649 if (OP(first) == EXACT)
6f207bd3 3650 NOOP; /* Empty, get anchored substr later. */
1aa99e6b 3651 else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
b3c9acc1
IZ
3652 r->regstclass = first;
3653 }
07be1b83 3654#ifdef TRIE_STCLASS
786e8c11 3655 else if (PL_regkind[OP(first)] == TRIE &&
07be1b83
YO
3656 ((reg_trie_data *)r->data->data[ ARG(first) ])->minlen>0)
3657 {
786e8c11 3658 regnode *trie_op;
07be1b83 3659 /* this can happen only on restudy */
786e8c11
YO
3660 if ( OP(first) == TRIE ) {
3661 struct regnode_1 *trieop;
3662 Newxz(trieop,1,struct regnode_1);
3663 StructCopy(first,trieop,struct regnode_1);
3664 trie_op=(regnode *)trieop;
3665 } else {
3666 struct regnode_charclass *trieop;
3667 Newxz(trieop,1,struct regnode_charclass);
3668 StructCopy(first,trieop,struct regnode_charclass);
3669 trie_op=(regnode *)trieop;
3670 }
3671 make_trie_failtable(pRExC_state, (regnode *)first, trie_op, 0);
3672 r->regstclass = trie_op;
07be1b83
YO
3673 }
3674#endif
bfed75c6 3675 else if (strchr((const char*)PL_simple,OP(first)))
a0d0e21e 3676 r->regstclass = first;
3dab1dad
YO
3677 else if (PL_regkind[OP(first)] == BOUND ||
3678 PL_regkind[OP(first)] == NBOUND)
a0d0e21e 3679 r->regstclass = first;
3dab1dad 3680 else if (PL_regkind[OP(first)] == BOL) {
cad2e5aa
JH
3681 r->reganch |= (OP(first) == MBOL
3682 ? ROPT_ANCH_MBOL
3683 : (OP(first) == SBOL
3684 ? ROPT_ANCH_SBOL
3685 : ROPT_ANCH_BOL));
a0d0e21e 3686 first = NEXTOPER(first);
774d564b 3687 goto again;
3688 }
3689 else if (OP(first) == GPOS) {
3690 r->reganch |= ROPT_ANCH_GPOS;
3691 first = NEXTOPER(first);
3692 goto again;
a0d0e21e 3693 }
e09294f4 3694 else if (!sawopen && (OP(first) == STAR &&
3dab1dad 3695 PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
a0d0e21e
LW
3696 !(r->reganch & ROPT_ANCH) )
3697 {
3698 /* turn .* into ^.* with an implied $*=1 */
1df70142
AL
3699 const int type =
3700 (OP(NEXTOPER(first)) == REG_ANY)
3701 ? ROPT_ANCH_MBOL
3702 : ROPT_ANCH_SBOL;
cad2e5aa 3703 r->reganch |= type | ROPT_IMPLICIT;
a0d0e21e 3704 first = NEXTOPER(first);
774d564b 3705 goto again;
a0d0e21e 3706 }
b81d288d 3707 if (sawplus && (!sawopen || !RExC_sawback)
830247a4 3708 && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
cad2e5aa
JH
3709 /* x+ must match at the 1st pos of run of x's */
3710 r->reganch |= ROPT_SKIP;
a0d0e21e 3711
c277df42 3712 /* Scan is after the zeroth branch, first is atomic matcher. */
be8e71aa
YO
3713#ifdef TRIE_STUDY_OPT
3714 DEBUG_COMPILE_r(
3715 if (!restudied)
3716 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
3717 (IV)(first - scan + 1))
3718 );
3719#else
3720 DEBUG_COMPILE_r(
3721 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
3722 (IV)(first - scan + 1))
3723 );
3724#endif
3725
3726
a0d0e21e
LW
3727 /*
3728 * If there's something expensive in the r.e., find the
3729 * longest literal string that must appear and make it the
3730 * regmust. Resolve ties in favor of later strings, since
3731 * the regstart check works with the beginning of the r.e.
3732 * and avoiding duplication strengthens checking. Not a
3733 * strong reason, but sufficient in the absence of others.
3734 * [Now we resolve ties in favor of the earlier string if
c277df42 3735 * it happens that c_offset_min has been invalidated, since the
a0d0e21e
LW
3736 * earlier string may buy us something the later one won't.]
3737 */
a0d0e21e 3738 minlen = 0;
a687059c 3739
396482e1
GA
3740 data.longest_fixed = newSVpvs("");
3741 data.longest_float = newSVpvs("");
3742 data.last_found = newSVpvs("");
c277df42
IZ
3743 data.longest = &(data.longest_fixed);
3744 first = scan;
653099ff 3745 if (!r->regstclass) {
830247a4 3746 cl_init(pRExC_state, &ch_class);
653099ff
GS
3747 data.start_class = &ch_class;
3748 stclass_flag = SCF_DO_STCLASS_AND;
3749 } else /* XXXX Check for BOUND? */
3750 stclass_flag = 0;
cb434fcc 3751 data.last_closep = &last_close;
653099ff 3752
830247a4 3753 minlen = study_chunk(pRExC_state, &first, &fake, scan + RExC_size, /* Up to end */
a3621e74 3754 &data, SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
07be1b83 3755
07be1b83 3756
786e8c11
YO
3757 CHECK_RESTUDY_GOTO;
3758
3759
830247a4 3760 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
b81d288d 3761 && data.last_start_min == 0 && data.last_end > 0
830247a4
IZ
3762 && !RExC_seen_zerolen
3763 && (!(RExC_seen & REG_SEEN_GPOS) || (r->reganch & ROPT_ANCH_GPOS)))
c277df42 3764 r->reganch |= ROPT_CHECK_ALL;
830247a4 3765 scan_commit(pRExC_state, &data);
c277df42
IZ
3766 SvREFCNT_dec(data.last_found);
3767
a0ed51b3 3768 longest_float_length = CHR_SVLEN(data.longest_float);
c5254dd6 3769 if (longest_float_length
c277df42
IZ
3770 || (data.flags & SF_FL_BEFORE_EOL
3771 && (!(data.flags & SF_FL_BEFORE_MEOL)
e2509266 3772 || (RExC_flags & PMf_MULTILINE)))) {
cf93c79d
IZ
3773 int t;
3774
a0ed51b3 3775 if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
aca2d497
IZ
3776 && data.offset_fixed == data.offset_float_min
3777 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
3778 goto remove_float; /* As in (a)+. */
3779
33b8afdf
JH
3780 if (SvUTF8(data.longest_float)) {
3781 r->float_utf8 = data.longest_float;
c445ea15 3782 r->float_substr = NULL;
33b8afdf
JH
3783 } else {
3784 r->float_substr = data.longest_float;
c445ea15 3785 r->float_utf8 = NULL;
33b8afdf 3786 }
c277df42
IZ
3787 r->float_min_offset = data.offset_float_min;
3788 r->float_max_offset = data.offset_float_max;
cf93c79d
IZ
3789 t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
3790 && (!(data.flags & SF_FL_BEFORE_MEOL)
e2509266 3791 || (RExC_flags & PMf_MULTILINE)));
33b8afdf 3792 fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
a0ed51b3
LW
3793 }
3794 else {
aca2d497 3795 remove_float:
c445ea15 3796 r->float_substr = r->float_utf8 = NULL;
c277df42 3797 SvREFCNT_dec(data.longest_float);
c5254dd6 3798 longest_float_length = 0;
a0d0e21e 3799 }
c277df42 3800
a0ed51b3 3801 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
c5254dd6 3802 if (longest_fixed_length
c277df42
IZ
3803 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
3804 && (!(data.flags & SF_FIX_BEFORE_MEOL)
e2509266 3805 || (RExC_flags & PMf_MULTILINE)))) {
cf93c79d
IZ
3806 int t;
3807
33b8afdf
JH
3808 if (SvUTF8(data.longest_fixed)) {
3809 r->anchored_utf8 = data.longest_fixed;
c445ea15 3810 r->anchored_substr = NULL;
33b8afdf
JH
3811 } else {
3812 r->anchored_substr = data.longest_fixed;
c445ea15 3813 r->anchored_utf8 = NULL;
33b8afdf 3814 }
c277df42 3815 r->anchored_offset = data.offset_fixed;
cf93c79d
IZ
3816 t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
3817 && (!(data.flags & SF_FIX_BEFORE_MEOL)
e2509266 3818 || (RExC_flags & PMf_MULTILINE)));
33b8afdf 3819 fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
a0ed51b3
LW
3820 }
3821 else {
c445ea15 3822 r->anchored_substr = r->anchored_utf8 = NULL;
c277df42 3823 SvREFCNT_dec(data.longest_fixed);
c5254dd6 3824 longest_fixed_length = 0;
a0d0e21e 3825 }
b81d288d 3826 if (r->regstclass
ffc61ed2 3827 && (OP(r->regstclass) == REG_ANY || OP(r->regstclass) == SANY))
653099ff 3828 r->regstclass = NULL;
33b8afdf
JH
3829 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
3830 && stclass_flag
653099ff 3831 && !(data.start_class->flags & ANYOF_EOS)
eb160463
GS
3832 && !cl_is_anything(data.start_class))
3833 {
1df70142 3834 const I32 n = add_data(pRExC_state, 1, "f");
653099ff 3835
a02a5408 3836 Newx(RExC_rx->data->data[n], 1,
653099ff
GS
3837 struct regnode_charclass_class);
3838 StructCopy(data.start_class,
830247a4 3839 (struct regnode_charclass_class*)RExC_rx->data->data[n],
653099ff 3840 struct regnode_charclass_class);
830247a4 3841 r->regstclass = (regnode*)RExC_rx->data->data[n];
653099ff 3842 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
a3621e74 3843 DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
32fc9b6a 3844 regprop(r, sv, (regnode*)data.start_class);
9c5ffd7c 3845 PerlIO_printf(Perl_debug_log,
a0288114 3846 "synthetic stclass \"%s\".\n",
3f7c398e 3847 SvPVX_const(sv));});
653099ff 3848 }
c277df42
IZ
3849
3850 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
c5254dd6 3851 if (longest_fixed_length > longest_float_length) {
c277df42 3852 r->check_substr = r->anchored_substr;
33b8afdf 3853 r->check_utf8 = r->anchored_utf8;
c277df42
IZ
3854 r->check_offset_min = r->check_offset_max = r->anchored_offset;
3855 if (r->reganch & ROPT_ANCH_SINGLE)
3856 r->reganch |= ROPT_NOSCAN;
a0ed51b3
LW
3857 }
3858 else {
c277df42 3859 r->check_substr = r->float_substr;
33b8afdf 3860 r->check_utf8 = r->float_utf8;
c277df42
IZ
3861 r->check_offset_min = data.offset_float_min;
3862 r->check_offset_max = data.offset_float_max;
a0d0e21e 3863 }
30382c73
IZ
3864 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
3865 This should be changed ASAP! */
33b8afdf 3866 if ((r->check_substr || r->check_utf8) && !(r->reganch & ROPT_ANCH_GPOS)) {
cad2e5aa 3867 r->reganch |= RE_USE_INTUIT;
33b8afdf 3868 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
cad2e5aa
JH
3869 r->reganch |= RE_INTUIT_TAIL;
3870 }
a0ed51b3
LW
3871 }
3872 else {
c277df42
IZ
3873 /* Several toplevels. Best we can is to set minlen. */
3874 I32 fake;
653099ff 3875 struct regnode_charclass_class ch_class;
cb434fcc 3876 I32 last_close = 0;
c277df42 3877
a3621e74 3878 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "\n"));
07be1b83 3879
c277df42 3880 scan = r->program + 1;
830247a4 3881 cl_init(pRExC_state, &ch_class);
653099ff 3882 data.start_class = &ch_class;
cb434fcc 3883 data.last_closep = &last_close;
07be1b83
YO
3884
3885 minlen = study_chunk(pRExC_state, &scan, &fake, scan + RExC_size,
3886 &data, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
3887
786e8c11 3888 CHECK_RESTUDY_GOTO;
07be1b83 3889
33b8afdf 3890 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
c445ea15 3891 = r->float_substr = r->float_utf8 = NULL;
653099ff 3892 if (!(data.start_class->flags & ANYOF_EOS)
eb160463
GS
3893 && !cl_is_anything(data.start_class))
3894 {
1df70142 3895 const I32 n = add_data(pRExC_state, 1, "f");
653099ff 3896
a02a5408 3897 Newx(RExC_rx->data->data[n], 1,
653099ff
GS
3898 struct regnode_charclass_class);
3899 StructCopy(data.start_class,
830247a4 3900 (struct regnode_charclass_class*)RExC_rx->data->data[n],
653099ff 3901 struct regnode_charclass_class);
830247a4 3902 r->regstclass = (regnode*)RExC_rx->data->data[n];
653099ff 3903 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
a3621e74 3904 DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
32fc9b6a 3905 regprop(r, sv, (regnode*)data.start_class);
9c5ffd7c 3906 PerlIO_printf(Perl_debug_log,
a0288114 3907 "synthetic stclass \"%s\".\n",
3f7c398e 3908 SvPVX_const(sv));});
653099ff 3909 }
a0d0e21e
LW
3910 }
3911
a0d0e21e 3912 r->minlen = minlen;
b81d288d 3913 if (RExC_seen & REG_SEEN_GPOS)
c277df42 3914 r->reganch |= ROPT_GPOS_SEEN;
830247a4 3915 if (RExC_seen & REG_SEEN_LOOKBEHIND)
c277df42 3916 r->reganch |= ROPT_LOOKBEHIND_SEEN;
830247a4 3917 if (RExC_seen & REG_SEEN_EVAL)
ce862d02 3918 r->reganch |= ROPT_EVAL_SEEN;
f33976b4
DB
3919 if (RExC_seen & REG_SEEN_CANY)
3920 r->reganch |= ROPT_CANY_SEEN;
a02a5408
JC
3921 Newxz(r->startp, RExC_npar, I32);
3922 Newxz(r->endp, RExC_npar, I32);
07be1b83 3923
f2278c82 3924 DEBUG_r( RX_DEBUG_on(r) );
be8e71aa
YO
3925 DEBUG_DUMP_r({
3926 PerlIO_printf(Perl_debug_log,"Final program:\n");
3dab1dad
YO
3927 regdump(r);
3928 });
8e9a8a48
YO
3929 DEBUG_OFFSETS_r(if (r->offsets) {
3930 const U32 len = r->offsets[0];
3931 U32 i;
3932 GET_RE_DEBUG_FLAGS_DECL;
3933 PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)r->offsets[0]);
3934 for (i = 1; i <= len; i++) {
3935 if (r->offsets[i*2-1] || r->offsets[i*2])
3936 PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
786e8c11 3937 (UV)i, (UV)r->offsets[i*2-1], (UV)r->offsets[i*2]);
8e9a8a48
YO
3938 }
3939 PerlIO_printf(Perl_debug_log, "\n");
3940 });
a0d0e21e 3941 return(r);
a687059c
LW
3942}
3943
3dab1dad
YO
3944
3945#define DEBUG_PARSE_MSG(funcname) DEBUG_PARSE_r({ \
3946 int rem=(int)(RExC_end - RExC_parse); \
3947 int cut; \
3948 int num; \
3949 int iscut=0; \
3950 if (rem>10) { \
3951 rem=10; \
3952 iscut=1; \
3953 } \
3954 cut=10-rem; \
3955 if (RExC_lastparse!=RExC_parse) \
3956 PerlIO_printf(Perl_debug_log," >%.*s%-*s", \
3957 rem, RExC_parse, \
3958 cut + 4, \
3959 iscut ? "..." : "<" \
3960 ); \
3961 else \
3962 PerlIO_printf(Perl_debug_log,"%16s",""); \
3963 \
3964 if (SIZE_ONLY) \
3965 num=RExC_size; \
3966 else \
3967 num=REG_NODE_NUM(RExC_emit); \
3968 if (RExC_lastnum!=num) \
be8e71aa 3969 PerlIO_printf(Perl_debug_log,"|%4d",num); \
3dab1dad 3970 else \
be8e71aa
YO
3971 PerlIO_printf(Perl_debug_log,"|%4s",""); \
3972 PerlIO_printf(Perl_debug_log,"|%*s%-4s", \
3973 (int)((depth*2)), "", \
3dab1dad
YO
3974 (funcname) \
3975 ); \
3976 RExC_lastnum=num; \
3977 RExC_lastparse=RExC_parse; \
3978})
3979
07be1b83
YO
3980
3981
3dab1dad
YO
3982#define DEBUG_PARSE(funcname) DEBUG_PARSE_r({ \
3983 DEBUG_PARSE_MSG((funcname)); \
3984 PerlIO_printf(Perl_debug_log,"%4s","\n"); \
3985})
a687059c
LW
3986/*
3987 - reg - regular expression, i.e. main body or parenthesized thing
3988 *
3989 * Caller must absorb opening parenthesis.
3990 *
3991 * Combining parenthesis handling with the base level of regular expression
3992 * is a trifle forced, but the need to tie the tails of the branches to what
3993 * follows makes it hard to avoid.
3994 */
07be1b83
YO
3995#define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
3996#ifdef DEBUGGING
3997#define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
3998#else
3999#define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
4000#endif
3dab1dad 4001
76e3520e 4002STATIC regnode *
3dab1dad 4003S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
c277df42 4004 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
a687059c 4005{
27da23d5 4006 dVAR;
c277df42
IZ
4007 register regnode *ret; /* Will be the head of the group. */
4008 register regnode *br;
4009 register regnode *lastbr;
cbbf8932 4010 register regnode *ender = NULL;
a0d0e21e 4011 register I32 parno = 0;
cbbf8932
AL
4012 I32 flags;
4013 const I32 oregflags = RExC_flags;
6136c704
AL
4014 bool have_branch = 0;
4015 bool is_open = 0;
9d1d55b5
JP
4016
4017 /* for (?g), (?gc), and (?o) warnings; warning
4018 about (?c) will warn about (?g) -- japhy */
4019
6136c704
AL
4020#define WASTED_O 0x01
4021#define WASTED_G 0x02
4022#define WASTED_C 0x04
4023#define WASTED_GC (0x02|0x04)
cbbf8932 4024 I32 wastedflags = 0x00;
9d1d55b5 4025
fac92740 4026 char * parse_start = RExC_parse; /* MJD */
a28509cc 4027 char * const oregcomp_parse = RExC_parse;
a0d0e21e 4028
3dab1dad
YO
4029 GET_RE_DEBUG_FLAGS_DECL;
4030 DEBUG_PARSE("reg ");
4031
4032
821b33a5 4033 *flagp = 0; /* Tentatively. */
a0d0e21e 4034
9d1d55b5 4035
a0d0e21e
LW
4036 /* Make an OPEN node, if parenthesized. */
4037 if (paren) {
fac92740 4038 if (*RExC_parse == '?') { /* (?...) */
2b36a5a0
JH
4039 U32 posflags = 0, negflags = 0;
4040 U32 *flagsp = &posflags;
6136c704 4041 bool is_logical = 0;
a28509cc 4042 const char * const seqstart = RExC_parse;
ca9dfc88 4043
830247a4
IZ
4044 RExC_parse++;
4045 paren = *RExC_parse++;
c277df42 4046 ret = NULL; /* For look-ahead/behind. */
a0d0e21e 4047 switch (paren) {
fac92740 4048 case '<': /* (?<...) */
830247a4 4049 RExC_seen |= REG_SEEN_LOOKBEHIND;
b81d288d 4050 if (*RExC_parse == '!')
c277df42 4051 paren = ',';
b81d288d 4052 if (*RExC_parse != '=' && *RExC_parse != '!')
c277df42 4053 goto unknown;
830247a4 4054 RExC_parse++;
fac92740
MJD
4055 case '=': /* (?=...) */
4056 case '!': /* (?!...) */
830247a4 4057 RExC_seen_zerolen++;
fac92740
MJD
4058 case ':': /* (?:...) */
4059 case '>': /* (?>...) */
a0d0e21e 4060 break;
fac92740
MJD
4061 case '$': /* (?$...) */
4062 case '@': /* (?@...) */
8615cb43 4063 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
a0d0e21e 4064 break;
fac92740 4065 case '#': /* (?#...) */
830247a4
IZ
4066 while (*RExC_parse && *RExC_parse != ')')
4067 RExC_parse++;
4068 if (*RExC_parse != ')')
c277df42 4069 FAIL("Sequence (?#... not terminated");
830247a4 4070 nextchar(pRExC_state);
a0d0e21e
LW
4071 *flagp = TRYAGAIN;
4072 return NULL;
fac92740 4073 case 'p': /* (?p...) */
9014280d 4074 if (SIZE_ONLY && ckWARN2(WARN_DEPRECATED, WARN_REGEXP))
f9373011 4075 vWARNdep(RExC_parse, "(?p{}) is deprecated - use (??{})");
8c8ad484 4076 /* FALL THROUGH*/
fac92740 4077 case '?': /* (??...) */
6136c704 4078 is_logical = 1;
438a3801
YST
4079 if (*RExC_parse != '{')
4080 goto unknown;
830247a4 4081 paren = *RExC_parse++;
0f5d15d6 4082 /* FALL THROUGH */
fac92740 4083 case '{': /* (?{...}) */
c277df42 4084 {
c277df42
IZ
4085 I32 count = 1, n = 0;
4086 char c;
830247a4 4087 char *s = RExC_parse;
c277df42 4088
830247a4
IZ
4089 RExC_seen_zerolen++;
4090 RExC_seen |= REG_SEEN_EVAL;
4091 while (count && (c = *RExC_parse)) {
6136c704
AL
4092 if (c == '\\') {
4093 if (RExC_parse[1])
4094 RExC_parse++;
4095 }
b81d288d 4096 else if (c == '{')
c277df42 4097 count++;
b81d288d 4098 else if (c == '}')
c277df42 4099 count--;
830247a4 4100 RExC_parse++;
c277df42 4101 }
6136c704 4102 if (*RExC_parse != ')') {
b81d288d 4103 RExC_parse = s;
b45f050a
JF
4104 vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
4105 }
c277df42 4106 if (!SIZE_ONLY) {
f3548bdc 4107 PAD *pad;
6136c704
AL
4108 OP_4tree *sop, *rop;
4109 SV * const sv = newSVpvn(s, RExC_parse - 1 - s);
c277df42 4110
569233ed
SB
4111 ENTER;
4112 Perl_save_re_context(aTHX);
f3548bdc 4113 rop = sv_compile_2op(sv, &sop, "re", &pad);
9b978d73
DM
4114 sop->op_private |= OPpREFCOUNTED;
4115 /* re_dup will OpREFCNT_inc */
4116 OpREFCNT_set(sop, 1);
569233ed 4117 LEAVE;
c277df42 4118
830247a4
IZ
4119 n = add_data(pRExC_state, 3, "nop");
4120 RExC_rx->data->data[n] = (void*)rop;
4121 RExC_rx->data->data[n+1] = (void*)sop;
f3548bdc 4122 RExC_rx->data->data[n+2] = (void*)pad;
c277df42 4123 SvREFCNT_dec(sv);
a0ed51b3 4124 }
e24b16f9 4125 else { /* First pass */
830247a4 4126 if (PL_reginterp_cnt < ++RExC_seen_evals
923e4eb5 4127 && IN_PERL_RUNTIME)
2cd61cdb
IZ
4128 /* No compiled RE interpolated, has runtime
4129 components ===> unsafe. */
4130 FAIL("Eval-group not allowed at runtime, use re 'eval'");
5b61d3f7 4131 if (PL_tainting && PL_tainted)
cc6b7395 4132 FAIL("Eval-group in insecure regular expression");
54df2634 4133#if PERL_VERSION > 8
923e4eb5 4134 if (IN_PERL_COMPILETIME)
b5c19bd7 4135 PL_cv_has_eval = 1;
54df2634 4136#endif
c277df42 4137 }
b5c19bd7 4138
830247a4 4139 nextchar(pRExC_state);
6136c704 4140 if (is_logical) {
830247a4 4141 ret = reg_node(pRExC_state, LOGICAL);
0f5d15d6
IZ
4142 if (!SIZE_ONLY)
4143 ret->flags = 2;
3dab1dad 4144 REGTAIL(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
fac92740 4145 /* deal with the length of this later - MJD */
0f5d15d6
IZ
4146 return ret;
4147 }
ccb2c380
MP
4148 ret = reganode(pRExC_state, EVAL, n);
4149 Set_Node_Length(ret, RExC_parse - parse_start + 1);
4150 Set_Node_Offset(ret, parse_start);
4151 return ret;
c277df42 4152 }
fac92740 4153 case '(': /* (?(?{...})...) and (?(?=...)...) */
c277df42 4154 {
fac92740 4155 if (RExC_parse[0] == '?') { /* (?(?...)) */
b81d288d
AB
4156 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
4157 || RExC_parse[1] == '<'
830247a4 4158 || RExC_parse[1] == '{') { /* Lookahead or eval. */
c277df42
IZ
4159 I32 flag;
4160
830247a4 4161 ret = reg_node(pRExC_state, LOGICAL);
0f5d15d6
IZ
4162 if (!SIZE_ONLY)
4163 ret->flags = 1;
3dab1dad 4164 REGTAIL(pRExC_state, ret, reg(pRExC_state, 1, &flag,depth+1));
c277df42 4165 goto insert_if;
b81d288d 4166 }
a0ed51b3 4167 }
830247a4 4168 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
fac92740 4169 /* (?(1)...) */
6136c704 4170 char c;
830247a4 4171 parno = atoi(RExC_parse++);
c277df42 4172
830247a4
IZ
4173 while (isDIGIT(*RExC_parse))
4174 RExC_parse++;
fac92740 4175 ret = reganode(pRExC_state, GROUPP, parno);
2af232bd 4176
830247a4 4177 if ((c = *nextchar(pRExC_state)) != ')')
b45f050a 4178 vFAIL("Switch condition not recognized");
c277df42 4179 insert_if:
3dab1dad
YO
4180 REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
4181 br = regbranch(pRExC_state, &flags, 1,depth+1);
c277df42 4182 if (br == NULL)
830247a4 4183 br = reganode(pRExC_state, LONGJMP, 0);
c277df42 4184 else
3dab1dad 4185 REGTAIL(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
830247a4 4186 c = *nextchar(pRExC_state);
d1b80229
IZ
4187 if (flags&HASWIDTH)
4188 *flagp |= HASWIDTH;
c277df42 4189 if (c == '|') {
830247a4 4190 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
3dab1dad
YO
4191 regbranch(pRExC_state, &flags, 1,depth+1);
4192 REGTAIL(pRExC_state, ret, lastbr);
d1b80229
IZ
4193 if (flags&HASWIDTH)
4194 *flagp |= HASWIDTH;
830247a4 4195 c = *nextchar(pRExC_state);
a0ed51b3
LW
4196 }
4197 else
c277df42
IZ
4198 lastbr = NULL;
4199 if (c != ')')
8615cb43 4200 vFAIL("Switch (?(condition)... contains too many branches");
830247a4 4201 ender = reg_node(pRExC_state, TAIL);
3dab1dad 4202 REGTAIL(pRExC_state, br, ender);
c277df42 4203 if (lastbr) {
3dab1dad
YO
4204 REGTAIL(pRExC_state, lastbr, ender);
4205 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
a0ed51b3
LW
4206 }
4207 else
3dab1dad 4208 REGTAIL(pRExC_state, ret, ender);
c277df42 4209 return ret;
a0ed51b3
LW
4210 }
4211 else {
830247a4 4212 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
c277df42
IZ
4213 }
4214 }
1b1626e4 4215 case 0:
830247a4 4216 RExC_parse--; /* for vFAIL to print correctly */
8615cb43 4217 vFAIL("Sequence (? incomplete");
1b1626e4 4218 break;
a0d0e21e 4219 default:
830247a4 4220 --RExC_parse;
fac92740 4221 parse_flags: /* (?i) */
830247a4 4222 while (*RExC_parse && strchr("iogcmsx", *RExC_parse)) {
9d1d55b5
JP
4223 /* (?g), (?gc) and (?o) are useless here
4224 and must be globally applied -- japhy */
4225
4226 if (*RExC_parse == 'o' || *RExC_parse == 'g') {
4227 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6136c704 4228 const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G;
9d1d55b5
JP
4229 if (! (wastedflags & wflagbit) ) {
4230 wastedflags |= wflagbit;
4231 vWARN5(
4232 RExC_parse + 1,
4233 "Useless (%s%c) - %suse /%c modifier",
4234 flagsp == &negflags ? "?-" : "?",
4235 *RExC_parse,
4236 flagsp == &negflags ? "don't " : "",
4237 *RExC_parse
4238 );
4239 }
4240 }
4241 }
4242 else if (*RExC_parse == 'c') {
4243 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6136c704
AL
4244 if (! (wastedflags & WASTED_C) ) {
4245 wastedflags |= WASTED_GC;
9d1d55b5
JP
4246 vWARN3(
4247 RExC_parse + 1,
4248 "Useless (%sc) - %suse /gc modifier",
4249 flagsp == &negflags ? "?-" : "?",
4250 flagsp == &negflags ? "don't " : ""
4251 );
4252 }
4253 }
4254 }
4255 else { pmflag(flagsp, *RExC_parse); }
4256
830247a4 4257 ++RExC_parse;
ca9dfc88 4258 }
830247a4 4259 if (*RExC_parse == '-') {
ca9dfc88 4260 flagsp = &negflags;
9d1d55b5 4261 wastedflags = 0; /* reset so (?g-c) warns twice */
830247a4 4262 ++RExC_parse;
ca9dfc88 4263 goto parse_flags;
48c036b1 4264 }
e2509266
JH
4265 RExC_flags |= posflags;
4266 RExC_flags &= ~negflags;
830247a4
IZ
4267 if (*RExC_parse == ':') {
4268 RExC_parse++;
ca9dfc88
IZ
4269 paren = ':';
4270 break;
4271 }
c277df42 4272 unknown:
830247a4
IZ
4273 if (*RExC_parse != ')') {
4274 RExC_parse++;
4275 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
b45f050a 4276 }
830247a4 4277 nextchar(pRExC_state);
a0d0e21e
LW
4278 *flagp = TRYAGAIN;
4279 return NULL;
4280 }
4281 }
fac92740 4282 else { /* (...) */
830247a4
IZ
4283 parno = RExC_npar;
4284 RExC_npar++;
4285 ret = reganode(pRExC_state, OPEN, parno);
fac92740
MJD
4286 Set_Node_Length(ret, 1); /* MJD */
4287 Set_Node_Offset(ret, RExC_parse); /* MJD */
6136c704 4288 is_open = 1;
a0d0e21e 4289 }
a0ed51b3 4290 }
fac92740 4291 else /* ! paren */
a0d0e21e
LW
4292 ret = NULL;
4293
4294 /* Pick up the branches, linking them together. */
fac92740 4295 parse_start = RExC_parse; /* MJD */
3dab1dad 4296 br = regbranch(pRExC_state, &flags, 1,depth+1);
fac92740 4297 /* branch_len = (paren != 0); */
2af232bd 4298
a0d0e21e
LW
4299 if (br == NULL)
4300 return(NULL);
830247a4
IZ
4301 if (*RExC_parse == '|') {
4302 if (!SIZE_ONLY && RExC_extralen) {
4303 reginsert(pRExC_state, BRANCHJ, br);
a0ed51b3 4304 }
fac92740 4305 else { /* MJD */
830247a4 4306 reginsert(pRExC_state, BRANCH, br);
fac92740
MJD
4307 Set_Node_Length(br, paren != 0);
4308 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
4309 }
c277df42
IZ
4310 have_branch = 1;
4311 if (SIZE_ONLY)
830247a4 4312 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
a0ed51b3
LW
4313 }
4314 else if (paren == ':') {
c277df42
IZ
4315 *flagp |= flags&SIMPLE;
4316 }
6136c704 4317 if (is_open) { /* Starts with OPEN. */
3dab1dad 4318 REGTAIL(pRExC_state, ret, br); /* OPEN -> first. */
a0ed51b3
LW
4319 }
4320 else if (paren != '?') /* Not Conditional */
a0d0e21e 4321 ret = br;
32a0ca98 4322 *flagp |= flags & (SPSTART | HASWIDTH);
c277df42 4323 lastbr = br;
830247a4
IZ
4324 while (*RExC_parse == '|') {
4325 if (!SIZE_ONLY && RExC_extralen) {
4326 ender = reganode(pRExC_state, LONGJMP,0);
3dab1dad 4327 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
c277df42
IZ
4328 }
4329 if (SIZE_ONLY)
830247a4
IZ
4330 RExC_extralen += 2; /* Account for LONGJMP. */
4331 nextchar(pRExC_state);
3dab1dad 4332 br = regbranch(pRExC_state, &flags, 0, depth+1);
2af232bd 4333
a687059c 4334 if (br == NULL)
a0d0e21e 4335 return(NULL);
3dab1dad 4336 REGTAIL(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
c277df42 4337 lastbr = br;
821b33a5
IZ
4338 if (flags&HASWIDTH)
4339 *flagp |= HASWIDTH;
a687059c 4340 *flagp |= flags&SPSTART;
a0d0e21e
LW
4341 }
4342
c277df42
IZ
4343 if (have_branch || paren != ':') {
4344 /* Make a closing node, and hook it on the end. */
4345 switch (paren) {
4346 case ':':
830247a4 4347 ender = reg_node(pRExC_state, TAIL);
c277df42
IZ
4348 break;
4349 case 1:
830247a4 4350 ender = reganode(pRExC_state, CLOSE, parno);
fac92740
MJD
4351 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
4352 Set_Node_Length(ender,1); /* MJD */
c277df42
IZ
4353 break;
4354 case '<':
c277df42
IZ
4355 case ',':
4356 case '=':
4357 case '!':
c277df42 4358 *flagp &= ~HASWIDTH;
821b33a5
IZ
4359 /* FALL THROUGH */
4360 case '>':
830247a4 4361 ender = reg_node(pRExC_state, SUCCEED);
c277df42
IZ
4362 break;
4363 case 0:
830247a4 4364 ender = reg_node(pRExC_state, END);
c277df42
IZ
4365 break;
4366 }
07be1b83 4367 REGTAIL_STUDY(pRExC_state, lastbr, ender);
a0d0e21e 4368
9674d46a 4369 if (have_branch && !SIZE_ONLY) {
c277df42 4370 /* Hook the tails of the branches to the closing node. */
9674d46a
AL
4371 for (br = ret; br; br = regnext(br)) {
4372 const U8 op = PL_regkind[OP(br)];
4373 if (op == BRANCH) {
07be1b83 4374 REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
9674d46a
AL
4375 }
4376 else if (op == BRANCHJ) {
07be1b83 4377 REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
9674d46a 4378 }
c277df42
IZ
4379 }
4380 }
a0d0e21e 4381 }
c277df42
IZ
4382
4383 {
e1ec3a88
AL
4384 const char *p;
4385 static const char parens[] = "=!<,>";
c277df42
IZ
4386
4387 if (paren && (p = strchr(parens, paren))) {
eb160463 4388 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
c277df42
IZ
4389 int flag = (p - parens) > 1;
4390
4391 if (paren == '>')
4392 node = SUSPEND, flag = 0;
830247a4 4393 reginsert(pRExC_state, node,ret);
45948336
EP
4394 Set_Node_Cur_Length(ret);
4395 Set_Node_Offset(ret, parse_start + 1);
c277df42 4396 ret->flags = flag;
07be1b83 4397 REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
c277df42 4398 }
a0d0e21e
LW
4399 }
4400
4401 /* Check for proper termination. */
ce3e6498 4402 if (paren) {
e2509266 4403 RExC_flags = oregflags;
830247a4
IZ
4404 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
4405 RExC_parse = oregcomp_parse;
380a0633 4406 vFAIL("Unmatched (");
ce3e6498 4407 }
a0ed51b3 4408 }
830247a4
IZ
4409 else if (!paren && RExC_parse < RExC_end) {
4410 if (*RExC_parse == ')') {
4411 RExC_parse++;
380a0633 4412 vFAIL("Unmatched )");
a0ed51b3
LW
4413 }
4414 else
b45f050a 4415 FAIL("Junk on end of regexp"); /* "Can't happen". */
a0d0e21e
LW
4416 /* NOTREACHED */
4417 }
a687059c 4418
a0d0e21e 4419 return(ret);
a687059c
LW
4420}
4421
4422/*
4423 - regbranch - one alternative of an | operator
4424 *
4425 * Implements the concatenation operator.
4426 */
76e3520e 4427STATIC regnode *
3dab1dad 4428S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
a687059c 4429{
97aff369 4430 dVAR;
c277df42
IZ
4431 register regnode *ret;
4432 register regnode *chain = NULL;
4433 register regnode *latest;
4434 I32 flags = 0, c = 0;
3dab1dad
YO
4435 GET_RE_DEBUG_FLAGS_DECL;
4436 DEBUG_PARSE("brnc");
b81d288d 4437 if (first)
c277df42
IZ
4438 ret = NULL;
4439 else {
b81d288d 4440 if (!SIZE_ONLY && RExC_extralen)
830247a4 4441 ret = reganode(pRExC_state, BRANCHJ,0);
fac92740 4442 else {
830247a4 4443 ret = reg_node(pRExC_state, BRANCH);
fac92740
MJD
4444 Set_Node_Length(ret, 1);
4445 }
c277df42
IZ
4446 }
4447
b81d288d 4448 if (!first && SIZE_ONLY)
830247a4 4449 RExC_extralen += 1; /* BRANCHJ */
b81d288d 4450
c277df42 4451 *flagp = WORST; /* Tentatively. */
a0d0e21e 4452
830247a4
IZ
4453 RExC_parse--;
4454 nextchar(pRExC_state);
4455 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
a0d0e21e 4456 flags &= ~TRYAGAIN;
3dab1dad 4457 latest = regpiece(pRExC_state, &flags,depth+1);
a0d0e21e
LW
4458 if (latest == NULL) {
4459 if (flags & TRYAGAIN)
4460 continue;
4461 return(NULL);
a0ed51b3
LW
4462 }
4463 else if (ret == NULL)
c277df42 4464 ret = latest;
a0d0e21e 4465 *flagp |= flags&HASWIDTH;
c277df42 4466 if (chain == NULL) /* First piece. */
a0d0e21e
LW
4467 *flagp |= flags&SPSTART;
4468 else {
830247a4 4469 RExC_naughty++;
3dab1dad 4470 REGTAIL(pRExC_state, chain, latest);
a687059c 4471 }
a0d0e21e 4472 chain = latest;
c277df42
IZ
4473 c++;
4474 }
4475 if (chain == NULL) { /* Loop ran zero times. */
830247a4 4476 chain = reg_node(pRExC_state, NOTHING);
c277df42
IZ
4477 if (ret == NULL)
4478 ret = chain;
4479 }
4480 if (c == 1) {
4481 *flagp |= flags&SIMPLE;
a0d0e21e 4482 }
a687059c 4483
d4c19fe8 4484 return ret;
a687059c
LW
4485}
4486
4487/*
4488 - regpiece - something followed by possible [*+?]
4489 *
4490 * Note that the branching code sequences used for ? and the general cases
4491 * of * and + are somewhat optimized: they use the same NOTHING node as
4492 * both the endmarker for their branch list and the body of the last branch.
4493 * It might seem that this node could be dispensed with entirely, but the
4494 * endmarker role is not redundant.
4495 */
76e3520e 4496STATIC regnode *
3dab1dad 4497S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
a687059c 4498{
97aff369 4499 dVAR;
c277df42 4500 register regnode *ret;
a0d0e21e
LW
4501 register char op;
4502 register char *next;
4503 I32 flags;
1df70142 4504 const char * const origparse = RExC_parse;
a0d0e21e 4505 I32 min;
c277df42 4506 I32 max = REG_INFTY;
fac92740 4507 char *parse_start;
10edeb5d 4508 const char *maxpos = NULL;
3dab1dad
YO
4509 GET_RE_DEBUG_FLAGS_DECL;
4510 DEBUG_PARSE("piec");
a0d0e21e 4511
3dab1dad 4512 ret = regatom(pRExC_state, &flags,depth+1);
a0d0e21e
LW
4513 if (ret == NULL) {
4514 if (flags & TRYAGAIN)
4515 *flagp |= TRYAGAIN;
4516 return(NULL);
4517 }
4518
830247a4 4519 op = *RExC_parse;
a0d0e21e 4520
830247a4 4521 if (op == '{' && regcurly(RExC_parse)) {
10edeb5d 4522 maxpos = NULL;
fac92740 4523 parse_start = RExC_parse; /* MJD */
830247a4 4524 next = RExC_parse + 1;
a0d0e21e
LW
4525 while (isDIGIT(*next) || *next == ',') {
4526 if (*next == ',') {
4527 if (maxpos)
4528 break;
4529 else
4530 maxpos = next;
a687059c 4531 }
a0d0e21e
LW
4532 next++;
4533 }
4534 if (*next == '}') { /* got one */
4535 if (!maxpos)
4536 maxpos = next;
830247a4
IZ
4537 RExC_parse++;
4538 min = atoi(RExC_parse);
a0d0e21e
LW
4539 if (*maxpos == ',')
4540 maxpos++;
4541 else
830247a4 4542 maxpos = RExC_parse;
a0d0e21e
LW
4543 max = atoi(maxpos);
4544 if (!max && *maxpos != '0')
c277df42
IZ
4545 max = REG_INFTY; /* meaning "infinity" */
4546 else if (max >= REG_INFTY)
8615cb43 4547 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
830247a4
IZ
4548 RExC_parse = next;
4549 nextchar(pRExC_state);
a0d0e21e
LW
4550
4551 do_curly:
4552 if ((flags&SIMPLE)) {
830247a4
IZ
4553 RExC_naughty += 2 + RExC_naughty / 2;
4554 reginsert(pRExC_state, CURLY, ret);
fac92740
MJD
4555 Set_Node_Offset(ret, parse_start+1); /* MJD */
4556 Set_Node_Cur_Length(ret);
a0d0e21e
LW
4557 }
4558 else {
3dab1dad 4559 regnode * const w = reg_node(pRExC_state, WHILEM);
2c2d71f5
JH
4560
4561 w->flags = 0;
3dab1dad 4562 REGTAIL(pRExC_state, ret, w);
830247a4
IZ
4563 if (!SIZE_ONLY && RExC_extralen) {
4564 reginsert(pRExC_state, LONGJMP,ret);
4565 reginsert(pRExC_state, NOTHING,ret);
c277df42
IZ
4566 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
4567 }
830247a4 4568 reginsert(pRExC_state, CURLYX,ret);
fac92740
MJD
4569 /* MJD hk */
4570 Set_Node_Offset(ret, parse_start+1);
2af232bd 4571 Set_Node_Length(ret,
fac92740 4572 op == '{' ? (RExC_parse - parse_start) : 1);
2af232bd 4573
830247a4 4574 if (!SIZE_ONLY && RExC_extralen)
c277df42 4575 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
3dab1dad 4576 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
c277df42 4577 if (SIZE_ONLY)
830247a4
IZ
4578 RExC_whilem_seen++, RExC_extralen += 3;
4579 RExC_naughty += 4 + RExC_naughty; /* compound interest */
a0d0e21e 4580 }
c277df42 4581 ret->flags = 0;
a0d0e21e
LW
4582
4583 if (min > 0)
821b33a5
IZ
4584 *flagp = WORST;
4585 if (max > 0)
4586 *flagp |= HASWIDTH;
a0d0e21e 4587 if (max && max < min)
8615cb43 4588 vFAIL("Can't do {n,m} with n > m");
c277df42 4589 if (!SIZE_ONLY) {
eb160463
GS
4590 ARG1_SET(ret, (U16)min);
4591 ARG2_SET(ret, (U16)max);
a687059c 4592 }
a687059c 4593
a0d0e21e 4594 goto nest_check;
a687059c 4595 }
a0d0e21e 4596 }
a687059c 4597
a0d0e21e
LW
4598 if (!ISMULT1(op)) {
4599 *flagp = flags;
a687059c 4600 return(ret);
a0d0e21e 4601 }
bb20fd44 4602
c277df42 4603#if 0 /* Now runtime fix should be reliable. */
b45f050a
JF
4604
4605 /* if this is reinstated, don't forget to put this back into perldiag:
4606
4607 =item Regexp *+ operand could be empty at {#} in regex m/%s/
4608
4609 (F) The part of the regexp subject to either the * or + quantifier
4610 could match an empty string. The {#} shows in the regular
4611 expression about where the problem was discovered.
4612
4613 */
4614
bb20fd44 4615 if (!(flags&HASWIDTH) && op != '?')
b45f050a 4616 vFAIL("Regexp *+ operand could be empty");
b81d288d 4617#endif
bb20fd44 4618
fac92740 4619 parse_start = RExC_parse;
830247a4 4620 nextchar(pRExC_state);
a0d0e21e 4621
821b33a5 4622 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
a0d0e21e
LW
4623
4624 if (op == '*' && (flags&SIMPLE)) {
830247a4 4625 reginsert(pRExC_state, STAR, ret);
c277df42 4626 ret->flags = 0;
830247a4 4627 RExC_naughty += 4;
a0d0e21e
LW
4628 }
4629 else if (op == '*') {
4630 min = 0;
4631 goto do_curly;
a0ed51b3
LW
4632 }
4633 else if (op == '+' && (flags&SIMPLE)) {
830247a4 4634 reginsert(pRExC_state, PLUS, ret);
c277df42 4635 ret->flags = 0;
830247a4 4636 RExC_naughty += 3;
a0d0e21e
LW
4637 }
4638 else if (op == '+') {
4639 min = 1;
4640 goto do_curly;
a0ed51b3
LW
4641 }
4642 else if (op == '?') {
a0d0e21e
LW
4643 min = 0; max = 1;
4644 goto do_curly;
4645 }
4646 nest_check:
041457d9 4647 if (!SIZE_ONLY && !(flags&HASWIDTH) && max > REG_INFTY/3 && ckWARN(WARN_REGEXP)) {
830247a4 4648 vWARN3(RExC_parse,
b45f050a 4649 "%.*s matches null string many times",
afd78fd5 4650 (int)(RExC_parse >= origparse ? RExC_parse - origparse : 0),
b45f050a 4651 origparse);
a0d0e21e
LW
4652 }
4653
830247a4
IZ
4654 if (*RExC_parse == '?') {
4655 nextchar(pRExC_state);
4656 reginsert(pRExC_state, MINMOD, ret);
3dab1dad 4657 REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
a0d0e21e 4658 }
830247a4
IZ
4659 if (ISMULT2(RExC_parse)) {
4660 RExC_parse++;
b45f050a
JF
4661 vFAIL("Nested quantifiers");
4662 }
a0d0e21e
LW
4663
4664 return(ret);
a687059c
LW
4665}
4666
4667/*
4668 - regatom - the lowest level
4669 *
4670 * Optimization: gobbles an entire sequence of ordinary characters so that
4671 * it can turn them into a single node, which is smaller to store and
4672 * faster to run. Backslashed characters are exceptions, each becoming a
4673 * separate node; the code is simpler that way and it's not worth fixing.
4674 *
7f6f358c
YO
4675 * [Yes, it is worth fixing, some scripts can run twice the speed.]
4676 * [It looks like its ok, as in S_study_chunk we merge adjacent EXACT nodes]
4677 */
76e3520e 4678STATIC regnode *
3dab1dad 4679S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
a687059c 4680{
97aff369 4681 dVAR;
cbbf8932 4682 register regnode *ret = NULL;
a0d0e21e 4683 I32 flags;
45948336 4684 char *parse_start = RExC_parse;
3dab1dad
YO
4685 GET_RE_DEBUG_FLAGS_DECL;
4686 DEBUG_PARSE("atom");
a0d0e21e
LW
4687 *flagp = WORST; /* Tentatively. */
4688
4689tryagain:
830247a4 4690 switch (*RExC_parse) {
a0d0e21e 4691 case '^':
830247a4
IZ
4692 RExC_seen_zerolen++;
4693 nextchar(pRExC_state);
e2509266 4694 if (RExC_flags & PMf_MULTILINE)
830247a4 4695 ret = reg_node(pRExC_state, MBOL);
e2509266 4696 else if (RExC_flags & PMf_SINGLELINE)
830247a4 4697 ret = reg_node(pRExC_state, SBOL);
a0d0e21e 4698 else
830247a4 4699 ret = reg_node(pRExC_state, BOL);
fac92740 4700 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
4701 break;
4702 case '$':
830247a4 4703 nextchar(pRExC_state);
b81d288d 4704 if (*RExC_parse)
830247a4 4705 RExC_seen_zerolen++;
e2509266 4706 if (RExC_flags & PMf_MULTILINE)
830247a4 4707 ret = reg_node(pRExC_state, MEOL);
e2509266 4708 else if (RExC_flags & PMf_SINGLELINE)
830247a4 4709 ret = reg_node(pRExC_state, SEOL);
a0d0e21e 4710 else
830247a4 4711 ret = reg_node(pRExC_state, EOL);
fac92740 4712 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
4713 break;
4714 case '.':
830247a4 4715 nextchar(pRExC_state);
e2509266 4716 if (RExC_flags & PMf_SINGLELINE)
ffc61ed2
JH
4717 ret = reg_node(pRExC_state, SANY);
4718 else
4719 ret = reg_node(pRExC_state, REG_ANY);
4720 *flagp |= HASWIDTH|SIMPLE;
830247a4 4721 RExC_naughty++;
fac92740 4722 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
4723 break;
4724 case '[':
b45f050a 4725 {
3dab1dad
YO
4726 char * const oregcomp_parse = ++RExC_parse;
4727 ret = regclass(pRExC_state,depth+1);
830247a4
IZ
4728 if (*RExC_parse != ']') {
4729 RExC_parse = oregcomp_parse;
b45f050a
JF
4730 vFAIL("Unmatched [");
4731 }
830247a4 4732 nextchar(pRExC_state);
a0d0e21e 4733 *flagp |= HASWIDTH|SIMPLE;
fac92740 4734 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
a0d0e21e 4735 break;
b45f050a 4736 }
a0d0e21e 4737 case '(':
830247a4 4738 nextchar(pRExC_state);
3dab1dad 4739 ret = reg(pRExC_state, 1, &flags,depth+1);
a0d0e21e 4740 if (ret == NULL) {
bf93d4cc 4741 if (flags & TRYAGAIN) {
830247a4 4742 if (RExC_parse == RExC_end) {
bf93d4cc
GS
4743 /* Make parent create an empty node if needed. */
4744 *flagp |= TRYAGAIN;
4745 return(NULL);
4746 }
a0d0e21e 4747 goto tryagain;
bf93d4cc 4748 }
a0d0e21e
LW
4749 return(NULL);
4750 }
c277df42 4751 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE);
a0d0e21e
LW
4752 break;
4753 case '|':
4754 case ')':
4755 if (flags & TRYAGAIN) {
4756 *flagp |= TRYAGAIN;
4757 return NULL;
4758 }
b45f050a 4759 vFAIL("Internal urp");
a0d0e21e
LW
4760 /* Supposed to be caught earlier. */
4761 break;
85afd4ae 4762 case '{':
830247a4
IZ
4763 if (!regcurly(RExC_parse)) {
4764 RExC_parse++;
85afd4ae
CS
4765 goto defchar;
4766 }
4767 /* FALL THROUGH */
a0d0e21e
LW
4768 case '?':
4769 case '+':
4770 case '*':
830247a4 4771 RExC_parse++;
b45f050a 4772 vFAIL("Quantifier follows nothing");
a0d0e21e
LW
4773 break;
4774 case '\\':
830247a4 4775 switch (*++RExC_parse) {
a0d0e21e 4776 case 'A':
830247a4
IZ
4777 RExC_seen_zerolen++;
4778 ret = reg_node(pRExC_state, SBOL);
a0d0e21e 4779 *flagp |= SIMPLE;
830247a4 4780 nextchar(pRExC_state);
fac92740 4781 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
4782 break;
4783 case 'G':
830247a4
IZ
4784 ret = reg_node(pRExC_state, GPOS);
4785 RExC_seen |= REG_SEEN_GPOS;
a0d0e21e 4786 *flagp |= SIMPLE;
830247a4 4787 nextchar(pRExC_state);
fac92740 4788 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
4789 break;
4790 case 'Z':
830247a4 4791 ret = reg_node(pRExC_state, SEOL);
a0d0e21e 4792 *flagp |= SIMPLE;
a1917ab9 4793 RExC_seen_zerolen++; /* Do not optimize RE away */
830247a4 4794 nextchar(pRExC_state);
a0d0e21e 4795 break;
b85d18e9 4796 case 'z':
830247a4 4797 ret = reg_node(pRExC_state, EOS);
b85d18e9 4798 *flagp |= SIMPLE;
830247a4
IZ
4799 RExC_seen_zerolen++; /* Do not optimize RE away */
4800 nextchar(pRExC_state);
fac92740 4801 Set_Node_Length(ret, 2); /* MJD */
b85d18e9 4802 break;
4a2d328f 4803 case 'C':
f33976b4
DB
4804 ret = reg_node(pRExC_state, CANY);
4805 RExC_seen |= REG_SEEN_CANY;
a0ed51b3 4806 *flagp |= HASWIDTH|SIMPLE;
830247a4 4807 nextchar(pRExC_state);
fac92740 4808 Set_Node_Length(ret, 2); /* MJD */
a0ed51b3
LW
4809 break;
4810 case 'X':
830247a4 4811 ret = reg_node(pRExC_state, CLUMP);
a0ed51b3 4812 *flagp |= HASWIDTH;
830247a4 4813 nextchar(pRExC_state);
fac92740 4814 Set_Node_Length(ret, 2); /* MJD */
a0ed51b3 4815 break;
a0d0e21e 4816 case 'w':
eb160463 4817 ret = reg_node(pRExC_state, (U8)(LOC ? ALNUML : ALNUM));
a0d0e21e 4818 *flagp |= HASWIDTH|SIMPLE;
830247a4 4819 nextchar(pRExC_state);
fac92740 4820 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
4821 break;
4822 case 'W':
eb160463 4823 ret = reg_node(pRExC_state, (U8)(LOC ? NALNUML : NALNUM));
a0d0e21e 4824 *flagp |= HASWIDTH|SIMPLE;
830247a4 4825 nextchar(pRExC_state);
fac92740 4826 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
4827 break;
4828 case 'b':
830247a4
IZ
4829 RExC_seen_zerolen++;
4830 RExC_seen |= REG_SEEN_LOOKBEHIND;
eb160463 4831 ret = reg_node(pRExC_state, (U8)(LOC ? BOUNDL : BOUND));
a0d0e21e 4832 *flagp |= SIMPLE;
830247a4 4833 nextchar(pRExC_state);
fac92740 4834 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
4835 break;
4836 case 'B':
830247a4
IZ
4837 RExC_seen_zerolen++;
4838 RExC_seen |= REG_SEEN_LOOKBEHIND;
eb160463 4839 ret = reg_node(pRExC_state, (U8)(LOC ? NBOUNDL : NBOUND));
a0d0e21e 4840 *flagp |= SIMPLE;
830247a4 4841 nextchar(pRExC_state);
fac92740 4842 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
4843 break;
4844 case 's':
eb160463 4845 ret = reg_node(pRExC_state, (U8)(LOC ? SPACEL : SPACE));
a0d0e21e 4846 *flagp |= HASWIDTH|SIMPLE;
830247a4 4847 nextchar(pRExC_state);
fac92740 4848 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
4849 break;
4850 case 'S':
eb160463 4851 ret = reg_node(pRExC_state, (U8)(LOC ? NSPACEL : NSPACE));
a0d0e21e 4852 *flagp |= HASWIDTH|SIMPLE;
830247a4 4853 nextchar(pRExC_state);
fac92740 4854 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
4855 break;
4856 case 'd':
ffc61ed2 4857 ret = reg_node(pRExC_state, DIGIT);
a0d0e21e 4858 *flagp |= HASWIDTH|SIMPLE;
830247a4 4859 nextchar(pRExC_state);
fac92740 4860 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
4861 break;
4862 case 'D':
ffc61ed2 4863 ret = reg_node(pRExC_state, NDIGIT);
a0d0e21e 4864 *flagp |= HASWIDTH|SIMPLE;
830247a4 4865 nextchar(pRExC_state);
fac92740 4866 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e 4867 break;
a14b48bc
LW
4868 case 'p':
4869 case 'P':
3568d838 4870 {
3dab1dad 4871 char* const oldregxend = RExC_end;
ccb2c380 4872 char* parse_start = RExC_parse - 2;
a14b48bc 4873
830247a4 4874 if (RExC_parse[1] == '{') {
3568d838 4875 /* a lovely hack--pretend we saw [\pX] instead */
830247a4
IZ
4876 RExC_end = strchr(RExC_parse, '}');
4877 if (!RExC_end) {
3dab1dad 4878 const U8 c = (U8)*RExC_parse;
830247a4
IZ
4879 RExC_parse += 2;
4880 RExC_end = oldregxend;
0da60cf5 4881 vFAIL2("Missing right brace on \\%c{}", c);
b45f050a 4882 }
830247a4 4883 RExC_end++;
a14b48bc 4884 }
af6f566e 4885 else {
830247a4 4886 RExC_end = RExC_parse + 2;
af6f566e
HS
4887 if (RExC_end > oldregxend)
4888 RExC_end = oldregxend;
4889 }
830247a4 4890 RExC_parse--;
a14b48bc 4891
3dab1dad 4892 ret = regclass(pRExC_state,depth+1);
a14b48bc 4893
830247a4
IZ
4894 RExC_end = oldregxend;
4895 RExC_parse--;
ccb2c380
MP
4896
4897 Set_Node_Offset(ret, parse_start + 2);
4898 Set_Node_Cur_Length(ret);
830247a4 4899 nextchar(pRExC_state);
a14b48bc
LW
4900 *flagp |= HASWIDTH|SIMPLE;
4901 }
4902 break;
a0d0e21e
LW
4903 case 'n':
4904 case 'r':
4905 case 't':
4906 case 'f':
4907 case 'e':
4908 case 'a':
4909 case 'x':
4910 case 'c':
4911 case '0':
4912 goto defchar;
4913 case '1': case '2': case '3': case '4':
4914 case '5': case '6': case '7': case '8': case '9':
4915 {
1df70142 4916 const I32 num = atoi(RExC_parse);
a0d0e21e 4917
830247a4 4918 if (num > 9 && num >= RExC_npar)
a0d0e21e
LW
4919 goto defchar;
4920 else {
3dab1dad 4921 char * const parse_start = RExC_parse - 1; /* MJD */
830247a4
IZ
4922 while (isDIGIT(*RExC_parse))
4923 RExC_parse++;
b45f050a 4924
eb160463 4925 if (!SIZE_ONLY && num > (I32)RExC_rx->nparens)
9baa0206 4926 vFAIL("Reference to nonexistent group");
830247a4 4927 RExC_sawback = 1;
eb160463
GS
4928 ret = reganode(pRExC_state,
4929 (U8)(FOLD ? (LOC ? REFFL : REFF) : REF),
4930 num);
a0d0e21e 4931 *flagp |= HASWIDTH;
2af232bd 4932
fac92740 4933 /* override incorrect value set in reganode MJD */
2af232bd 4934 Set_Node_Offset(ret, parse_start+1);
fac92740 4935 Set_Node_Cur_Length(ret); /* MJD */
830247a4
IZ
4936 RExC_parse--;
4937 nextchar(pRExC_state);
a0d0e21e
LW
4938 }
4939 }
4940 break;
4941 case '\0':
830247a4 4942 if (RExC_parse >= RExC_end)
b45f050a 4943 FAIL("Trailing \\");
a0d0e21e
LW
4944 /* FALL THROUGH */
4945 default:
a0288114 4946 /* Do not generate "unrecognized" warnings here, we fall
c9f97d15 4947 back into the quick-grab loop below */
45948336 4948 parse_start--;
a0d0e21e
LW
4949 goto defchar;
4950 }
4951 break;
4633a7c4
LW
4952
4953 case '#':
e2509266 4954 if (RExC_flags & PMf_EXTENDED) {
3dab1dad
YO
4955 while (RExC_parse < RExC_end && *RExC_parse != '\n')
4956 RExC_parse++;
830247a4 4957 if (RExC_parse < RExC_end)
4633a7c4
LW
4958 goto tryagain;
4959 }
4960 /* FALL THROUGH */
4961
a0d0e21e 4962 default: {
ba210ebe 4963 register STRLEN len;
58ae7d3f 4964 register UV ender;
a0d0e21e 4965 register char *p;
3dab1dad 4966 char *s;
80aecb99 4967 STRLEN foldlen;
89ebb4a3 4968 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
f06dbbb7
JH
4969
4970 parse_start = RExC_parse - 1;
a0d0e21e 4971
830247a4 4972 RExC_parse++;
a0d0e21e
LW
4973
4974 defchar:
58ae7d3f 4975 ender = 0;
eb160463
GS
4976 ret = reg_node(pRExC_state,
4977 (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
cd439c50 4978 s = STRING(ret);
830247a4
IZ
4979 for (len = 0, p = RExC_parse - 1;
4980 len < 127 && p < RExC_end;
a0d0e21e
LW
4981 len++)
4982 {
3dab1dad 4983 char * const oldp = p;
5b5a24f7 4984
e2509266 4985 if (RExC_flags & PMf_EXTENDED)
830247a4 4986 p = regwhite(p, RExC_end);
a0d0e21e
LW
4987 switch (*p) {
4988 case '^':
4989 case '$':
4990 case '.':
4991 case '[':
4992 case '(':
4993 case ')':
4994 case '|':
4995 goto loopdone;
4996 case '\\':
4997 switch (*++p) {
4998 case 'A':
1ed8eac0
JF
4999 case 'C':
5000 case 'X':
a0d0e21e
LW
5001 case 'G':
5002 case 'Z':
b85d18e9 5003 case 'z':
a0d0e21e
LW
5004 case 'w':
5005 case 'W':
5006 case 'b':
5007 case 'B':
5008 case 's':
5009 case 'S':
5010 case 'd':
5011 case 'D':
a14b48bc
LW
5012 case 'p':
5013 case 'P':
a0d0e21e
LW
5014 --p;
5015 goto loopdone;
5016 case 'n':
5017 ender = '\n';
5018 p++;
a687059c 5019 break;
a0d0e21e
LW
5020 case 'r':
5021 ender = '\r';
5022 p++;
a687059c 5023 break;
a0d0e21e
LW
5024 case 't':
5025 ender = '\t';
5026 p++;
a687059c 5027 break;
a0d0e21e
LW
5028 case 'f':
5029 ender = '\f';
5030 p++;
a687059c 5031 break;
a0d0e21e 5032 case 'e':
c7f1f016 5033 ender = ASCII_TO_NATIVE('\033');
a0d0e21e 5034 p++;
a687059c 5035 break;
a0d0e21e 5036 case 'a':
c7f1f016 5037 ender = ASCII_TO_NATIVE('\007');
a0d0e21e 5038 p++;
a687059c 5039 break;
a0d0e21e 5040 case 'x':
a0ed51b3 5041 if (*++p == '{') {
1df70142 5042 char* const e = strchr(p, '}');
b81d288d 5043
b45f050a 5044 if (!e) {
830247a4 5045 RExC_parse = p + 1;
b45f050a
JF
5046 vFAIL("Missing right brace on \\x{}");
5047 }
de5f0749 5048 else {
a4c04bdc
NC
5049 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
5050 | PERL_SCAN_DISALLOW_PREFIX;
1df70142 5051 STRLEN numlen = e - p - 1;
53305cf1 5052 ender = grok_hex(p + 1, &numlen, &flags, NULL);
aaa80028
JH
5053 if (ender > 0xff)
5054 RExC_utf8 = 1;
a0ed51b3
LW
5055 p = e + 1;
5056 }
a0ed51b3
LW
5057 }
5058 else {
a4c04bdc 5059 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
1df70142 5060 STRLEN numlen = 2;
53305cf1 5061 ender = grok_hex(p, &numlen, &flags, NULL);
a0ed51b3
LW
5062 p += numlen;
5063 }
a687059c 5064 break;
a0d0e21e
LW
5065 case 'c':
5066 p++;
bbce6d69 5067 ender = UCHARAT(p++);
5068 ender = toCTRL(ender);
a687059c 5069 break;
a0d0e21e
LW
5070 case '0': case '1': case '2': case '3':case '4':
5071 case '5': case '6': case '7': case '8':case '9':
5072 if (*p == '0' ||
830247a4 5073 (isDIGIT(p[1]) && atoi(p) >= RExC_npar) ) {
53305cf1 5074 I32 flags = 0;
1df70142 5075 STRLEN numlen = 3;
53305cf1 5076 ender = grok_oct(p, &numlen, &flags, NULL);
a0d0e21e
LW
5077 p += numlen;
5078 }
5079 else {
5080 --p;
5081 goto loopdone;
a687059c
LW
5082 }
5083 break;
a0d0e21e 5084 case '\0':
830247a4 5085 if (p >= RExC_end)
b45f050a 5086 FAIL("Trailing \\");
a687059c 5087 /* FALL THROUGH */
a0d0e21e 5088 default:
041457d9 5089 if (!SIZE_ONLY&& isALPHA(*p) && ckWARN(WARN_REGEXP))
4193bef7 5090 vWARN2(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
a0ed51b3 5091 goto normal_default;
a0d0e21e
LW
5092 }
5093 break;
a687059c 5094 default:
a0ed51b3 5095 normal_default:
fd400ab9 5096 if (UTF8_IS_START(*p) && UTF) {
1df70142 5097 STRLEN numlen;
5e12f4fb 5098 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
9f7f3913 5099 &numlen, UTF8_ALLOW_DEFAULT);
a0ed51b3
LW
5100 p += numlen;
5101 }
5102 else
5103 ender = *p++;
a0d0e21e 5104 break;
a687059c 5105 }
e2509266 5106 if (RExC_flags & PMf_EXTENDED)
830247a4 5107 p = regwhite(p, RExC_end);
60a8b682
JH
5108 if (UTF && FOLD) {
5109 /* Prime the casefolded buffer. */
ac7e0132 5110 ender = toFOLD_uni(ender, tmpbuf, &foldlen);
60a8b682 5111 }
a0d0e21e
LW
5112 if (ISMULT2(p)) { /* Back off on ?+*. */
5113 if (len)
5114 p = oldp;
16ea2a2e 5115 else if (UTF) {
80aecb99 5116 if (FOLD) {
60a8b682 5117 /* Emit all the Unicode characters. */
1df70142 5118 STRLEN numlen;
80aecb99
JH
5119 for (foldbuf = tmpbuf;
5120 foldlen;
5121 foldlen -= numlen) {
5122 ender = utf8_to_uvchr(foldbuf, &numlen);
9dc45d57 5123 if (numlen > 0) {
71207a34 5124 const STRLEN unilen = reguni(pRExC_state, ender, s);
0ebc6274
JH
5125 s += unilen;
5126 len += unilen;
5127 /* In EBCDIC the numlen
5128 * and unilen can differ. */
9dc45d57 5129 foldbuf += numlen;
47654450
JH
5130 if (numlen >= foldlen)
5131 break;
9dc45d57
JH
5132 }
5133 else
5134 break; /* "Can't happen." */
80aecb99
JH
5135 }
5136 }
5137 else {
71207a34 5138 const STRLEN unilen = reguni(pRExC_state, ender, s);
9ede7db1 5139 if (unilen > 0) {
0ebc6274
JH
5140 s += unilen;
5141 len += unilen;
9dc45d57 5142 }
80aecb99 5143 }
a0ed51b3 5144 }
a0d0e21e
LW
5145 else {
5146 len++;
eb160463 5147 REGC((char)ender, s++);
a0d0e21e
LW
5148 }
5149 break;
a687059c 5150 }
16ea2a2e 5151 if (UTF) {
80aecb99 5152 if (FOLD) {
60a8b682 5153 /* Emit all the Unicode characters. */
1df70142 5154 STRLEN numlen;
80aecb99
JH
5155 for (foldbuf = tmpbuf;
5156 foldlen;
5157 foldlen -= numlen) {
5158 ender = utf8_to_uvchr(foldbuf, &numlen);
9dc45d57 5159 if (numlen > 0) {
71207a34 5160 const STRLEN unilen = reguni(pRExC_state, ender, s);
0ebc6274
JH
5161 len += unilen;
5162 s += unilen;
5163 /* In EBCDIC the numlen
5164 * and unilen can differ. */
9dc45d57 5165 foldbuf += numlen;
47654450
JH
5166 if (numlen >= foldlen)
5167 break;
9dc45d57
JH
5168 }
5169 else
5170 break;
80aecb99
JH
5171 }
5172 }
5173 else {
71207a34 5174 const STRLEN unilen = reguni(pRExC_state, ender, s);
9ede7db1 5175 if (unilen > 0) {
0ebc6274
JH
5176 s += unilen;
5177 len += unilen;
9dc45d57 5178 }
80aecb99
JH
5179 }
5180 len--;
a0ed51b3
LW
5181 }
5182 else
eb160463 5183 REGC((char)ender, s++);
a0d0e21e
LW
5184 }
5185 loopdone:
830247a4 5186 RExC_parse = p - 1;
fac92740 5187 Set_Node_Cur_Length(ret); /* MJD */
830247a4 5188 nextchar(pRExC_state);
793db0cb
JH
5189 {
5190 /* len is STRLEN which is unsigned, need to copy to signed */
5191 IV iv = len;
5192 if (iv < 0)
5193 vFAIL("Internal disaster");
5194 }
a0d0e21e
LW
5195 if (len > 0)
5196 *flagp |= HASWIDTH;
090f7165 5197 if (len == 1 && UNI_IS_INVARIANT(ender))
a0d0e21e 5198 *flagp |= SIMPLE;
3dab1dad 5199
cd439c50 5200 if (SIZE_ONLY)
830247a4 5201 RExC_size += STR_SZ(len);
3dab1dad
YO
5202 else {
5203 STR_LEN(ret) = len;
830247a4 5204 RExC_emit += STR_SZ(len);
07be1b83 5205 }
3dab1dad 5206 }
a0d0e21e
LW
5207 break;
5208 }
a687059c 5209
60a8b682
JH
5210 /* If the encoding pragma is in effect recode the text of
5211 * any EXACT-kind nodes. */
3dab1dad
YO
5212 if (PL_encoding && PL_regkind[OP(ret)] == EXACT) {
5213 const STRLEN oldlen = STR_LEN(ret);
5214 SV * const sv = sv_2mortal(newSVpvn(STRING(ret), oldlen));
d0063567
DK
5215
5216 if (RExC_utf8)
5217 SvUTF8_on(sv);
5218 if (sv_utf8_downgrade(sv, TRUE)) {
1df70142
AL
5219 const char * const s = sv_recode_to_utf8(sv, PL_encoding);
5220 const STRLEN newlen = SvCUR(sv);
d0063567
DK
5221
5222 if (SvUTF8(sv))
5223 RExC_utf8 = 1;
5224 if (!SIZE_ONLY) {
a3621e74
YO
5225 GET_RE_DEBUG_FLAGS_DECL;
5226 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "recode %*s to %*s\n",
d0063567
DK
5227 (int)oldlen, STRING(ret),
5228 (int)newlen, s));
5229 Copy(s, STRING(ret), newlen, char);
5230 STR_LEN(ret) += newlen - oldlen;
5231 RExC_emit += STR_SZ(newlen) - STR_SZ(oldlen);
5232 } else
5233 RExC_size += STR_SZ(newlen) - STR_SZ(oldlen);
5234 }
a72c7584
JH
5235 }
5236
a0d0e21e 5237 return(ret);
a687059c
LW
5238}
5239
873ef191 5240STATIC char *
5f66b61c 5241S_regwhite(char *p, const char *e)
5b5a24f7
CS
5242{
5243 while (p < e) {
5244 if (isSPACE(*p))
5245 ++p;
5246 else if (*p == '#') {
5247 do {
5248 p++;
5249 } while (p < e && *p != '\n');
5250 }
5251 else
5252 break;
5253 }
5254 return p;
5255}
5256
b8c5462f
JH
5257/* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
5258 Character classes ([:foo:]) can also be negated ([:^foo:]).
5259 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
5260 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
beeb77fc 5261 but trigger failures because they are currently unimplemented. */
9a86a77b
JH
5262
5263#define POSIXCC_DONE(c) ((c) == ':')
5264#define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
5265#define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
5266
b8c5462f 5267STATIC I32
830247a4 5268S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
620e46c5 5269{
97aff369 5270 dVAR;
936ed897 5271 I32 namedclass = OOB_NAMEDCLASS;
620e46c5 5272
830247a4 5273 if (value == '[' && RExC_parse + 1 < RExC_end &&
620e46c5 5274 /* I smell either [: or [= or [. -- POSIX has been here, right? */
9a86a77b 5275 POSIXCC(UCHARAT(RExC_parse))) {
1df70142 5276 const char c = UCHARAT(RExC_parse);
097eb12c 5277 char* const s = RExC_parse++;
b81d288d 5278
9a86a77b 5279 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
830247a4
IZ
5280 RExC_parse++;
5281 if (RExC_parse == RExC_end)
620e46c5 5282 /* Grandfather lone [:, [=, [. */
830247a4 5283 RExC_parse = s;
620e46c5 5284 else {
3dab1dad 5285 const char* const t = RExC_parse++; /* skip over the c */
80916619
NC
5286 assert(*t == c);
5287
9a86a77b 5288 if (UCHARAT(RExC_parse) == ']') {
3dab1dad 5289 const char *posixcc = s + 1;
830247a4 5290 RExC_parse++; /* skip over the ending ] */
3dab1dad 5291
b8c5462f 5292 if (*s == ':') {
1df70142
AL
5293 const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
5294 const I32 skip = t - posixcc;
80916619
NC
5295
5296 /* Initially switch on the length of the name. */
5297 switch (skip) {
5298 case 4:
3dab1dad
YO
5299 if (memEQ(posixcc, "word", 4)) /* this is not POSIX, this is the Perl \w */
5300 namedclass = complement ? ANYOF_NALNUM : ANYOF_ALNUM;
cc4319de 5301 break;
80916619
NC
5302 case 5:
5303 /* Names all of length 5. */
5304 /* alnum alpha ascii blank cntrl digit graph lower
5305 print punct space upper */
5306 /* Offset 4 gives the best switch position. */
5307 switch (posixcc[4]) {
5308 case 'a':
3dab1dad
YO
5309 if (memEQ(posixcc, "alph", 4)) /* alpha */
5310 namedclass = complement ? ANYOF_NALPHA : ANYOF_ALPHA;
80916619
NC
5311 break;
5312 case 'e':
3dab1dad
YO
5313 if (memEQ(posixcc, "spac", 4)) /* space */
5314 namedclass = complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
80916619
NC
5315 break;
5316 case 'h':
3dab1dad
YO
5317 if (memEQ(posixcc, "grap", 4)) /* graph */
5318 namedclass = complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
80916619
NC
5319 break;
5320 case 'i':
3dab1dad
YO
5321 if (memEQ(posixcc, "asci", 4)) /* ascii */
5322 namedclass = complement ? ANYOF_NASCII : ANYOF_ASCII;
80916619
NC
5323 break;
5324 case 'k':
3dab1dad
YO
5325 if (memEQ(posixcc, "blan", 4)) /* blank */
5326 namedclass = complement ? ANYOF_NBLANK : ANYOF_BLANK;
80916619
NC
5327 break;
5328 case 'l':
3dab1dad
YO
5329 if (memEQ(posixcc, "cntr", 4)) /* cntrl */
5330 namedclass = complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
80916619
NC
5331 break;
5332 case 'm':
3dab1dad
YO
5333 if (memEQ(posixcc, "alnu", 4)) /* alnum */
5334 namedclass = complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
80916619
NC
5335 break;
5336 case 'r':
3dab1dad
YO
5337 if (memEQ(posixcc, "lowe", 4)) /* lower */
5338 namedclass = complement ? ANYOF_NLOWER : ANYOF_LOWER;
5339 else if (memEQ(posixcc, "uppe", 4)) /* upper */
5340 namedclass = complement ? ANYOF_NUPPER : ANYOF_UPPER;
80916619
NC
5341 break;
5342 case 't':
3dab1dad
YO
5343 if (memEQ(posixcc, "digi", 4)) /* digit */
5344 namedclass = complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
5345 else if (memEQ(posixcc, "prin", 4)) /* print */
5346 namedclass = complement ? ANYOF_NPRINT : ANYOF_PRINT;
5347 else if (memEQ(posixcc, "punc", 4)) /* punct */
5348 namedclass = complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
80916619 5349 break;
b8c5462f
JH
5350 }
5351 break;
80916619 5352 case 6:
3dab1dad
YO
5353 if (memEQ(posixcc, "xdigit", 6))
5354 namedclass = complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
b8c5462f
JH
5355 break;
5356 }
80916619
NC
5357
5358 if (namedclass == OOB_NAMEDCLASS)
b45f050a
JF
5359 Simple_vFAIL3("POSIX class [:%.*s:] unknown",
5360 t - s - 1, s + 1);
80916619
NC
5361 assert (posixcc[skip] == ':');
5362 assert (posixcc[skip+1] == ']');
b45f050a 5363 } else if (!SIZE_ONLY) {
b8c5462f 5364 /* [[=foo=]] and [[.foo.]] are still future. */
b45f050a 5365
830247a4 5366 /* adjust RExC_parse so the warning shows after
b45f050a 5367 the class closes */
9a86a77b 5368 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
830247a4 5369 RExC_parse++;
b45f050a
JF
5370 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
5371 }
b8c5462f
JH
5372 } else {
5373 /* Maternal grandfather:
5374 * "[:" ending in ":" but not in ":]" */
830247a4 5375 RExC_parse = s;
767d463e 5376 }
620e46c5
JH
5377 }
5378 }
5379
b8c5462f
JH
5380 return namedclass;
5381}
5382
5383STATIC void
830247a4 5384S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
b8c5462f 5385{
97aff369 5386 dVAR;
3dab1dad 5387 if (POSIXCC(UCHARAT(RExC_parse))) {
1df70142
AL
5388 const char *s = RExC_parse;
5389 const char c = *s++;
b8c5462f 5390
3dab1dad 5391 while (isALNUM(*s))
b8c5462f
JH
5392 s++;
5393 if (*s && c == *s && s[1] == ']') {
cd84f5b2
RGS
5394 if (ckWARN(WARN_REGEXP))
5395 vWARN3(s+2,
5396 "POSIX syntax [%c %c] belongs inside character classes",
5397 c, c);
b45f050a
JF
5398
5399 /* [[=foo=]] and [[.foo.]] are still future. */
9a86a77b 5400 if (POSIXCC_NOTYET(c)) {
830247a4 5401 /* adjust RExC_parse so the error shows after
b45f050a 5402 the class closes */
9a86a77b 5403 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
3dab1dad 5404 NOOP;
b45f050a
JF
5405 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
5406 }
b8c5462f
JH
5407 }
5408 }
620e46c5
JH
5409}
5410
7f6f358c
YO
5411
5412/*
5413 parse a class specification and produce either an ANYOF node that
5414 matches the pattern. If the pattern matches a single char only and
5415 that char is < 256 then we produce an EXACT node instead.
5416*/
76e3520e 5417STATIC regnode *
3dab1dad 5418S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth)
a687059c 5419{
97aff369 5420 dVAR;
07be1b83 5421 register UV value;
9a86a77b 5422 register UV nextvalue;
3568d838 5423 register IV prevvalue = OOB_UNICODE;
ffc61ed2 5424 register IV range = 0;
c277df42 5425 register regnode *ret;
ba210ebe 5426 STRLEN numlen;
ffc61ed2 5427 IV namedclass;
cbbf8932 5428 char *rangebegin = NULL;
936ed897 5429 bool need_class = 0;
c445ea15 5430 SV *listsv = NULL;
ffc61ed2 5431 UV n;
9e55ce06 5432 bool optimize_invert = TRUE;
cbbf8932 5433 AV* unicode_alternate = NULL;
1b2d223b
JH
5434#ifdef EBCDIC
5435 UV literal_endpoint = 0;
5436#endif
7f6f358c 5437 UV stored = 0; /* number of chars stored in the class */
ffc61ed2 5438
3dab1dad 5439 regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
7f6f358c 5440 case we need to change the emitted regop to an EXACT. */
07be1b83 5441 const char * orig_parse = RExC_parse;
72f13be8 5442 GET_RE_DEBUG_FLAGS_DECL;
76e84362
SH
5443#ifndef DEBUGGING
5444 PERL_UNUSED_ARG(depth);
5445#endif
72f13be8 5446
3dab1dad 5447 DEBUG_PARSE("clas");
7f6f358c
YO
5448
5449 /* Assume we are going to generate an ANYOF node. */
ffc61ed2
JH
5450 ret = reganode(pRExC_state, ANYOF, 0);
5451
5452 if (!SIZE_ONLY)
5453 ANYOF_FLAGS(ret) = 0;
5454
9a86a77b 5455 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
ffc61ed2
JH
5456 RExC_naughty++;
5457 RExC_parse++;
5458 if (!SIZE_ONLY)
5459 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
5460 }
a0d0e21e 5461
73060fc4 5462 if (SIZE_ONLY) {
830247a4 5463 RExC_size += ANYOF_SKIP;
73060fc4
JH
5464 listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
5465 }
936ed897 5466 else {
830247a4 5467 RExC_emit += ANYOF_SKIP;
936ed897
IZ
5468 if (FOLD)
5469 ANYOF_FLAGS(ret) |= ANYOF_FOLD;
5470 if (LOC)
5471 ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
ffc61ed2 5472 ANYOF_BITMAP_ZERO(ret);
396482e1 5473 listsv = newSVpvs("# comment\n");
a0d0e21e 5474 }
b8c5462f 5475
9a86a77b
JH
5476 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
5477
b938889d 5478 if (!SIZE_ONLY && POSIXCC(nextvalue))
830247a4 5479 checkposixcc(pRExC_state);
b8c5462f 5480
f064b6ad
HS
5481 /* allow 1st char to be ] (allowing it to be - is dealt with later) */
5482 if (UCHARAT(RExC_parse) == ']')
5483 goto charclassloop;
ffc61ed2 5484
9a86a77b 5485 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
ffc61ed2
JH
5486
5487 charclassloop:
5488
5489 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
5490
73b437c8 5491 if (!range)
830247a4 5492 rangebegin = RExC_parse;
ffc61ed2 5493 if (UTF) {
5e12f4fb 5494 value = utf8n_to_uvchr((U8*)RExC_parse,
3568d838 5495 RExC_end - RExC_parse,
9f7f3913 5496 &numlen, UTF8_ALLOW_DEFAULT);
ffc61ed2
JH
5497 RExC_parse += numlen;
5498 }
5499 else
5500 value = UCHARAT(RExC_parse++);
7f6f358c 5501
9a86a77b
JH
5502 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
5503 if (value == '[' && POSIXCC(nextvalue))
830247a4 5504 namedclass = regpposixcc(pRExC_state, value);
620e46c5 5505 else if (value == '\\') {
ffc61ed2 5506 if (UTF) {
5e12f4fb 5507 value = utf8n_to_uvchr((U8*)RExC_parse,
ffc61ed2 5508 RExC_end - RExC_parse,
9f7f3913 5509 &numlen, UTF8_ALLOW_DEFAULT);
ffc61ed2
JH
5510 RExC_parse += numlen;
5511 }
5512 else
5513 value = UCHARAT(RExC_parse++);
470c3474 5514 /* Some compilers cannot handle switching on 64-bit integer
ffc61ed2 5515 * values, therefore value cannot be an UV. Yes, this will
e2962f66
JH
5516 * be a problem later if we want switch on Unicode.
5517 * A similar issue a little bit later when switching on
5518 * namedclass. --jhi */
ffc61ed2 5519 switch ((I32)value) {
b8c5462f
JH
5520 case 'w': namedclass = ANYOF_ALNUM; break;
5521 case 'W': namedclass = ANYOF_NALNUM; break;
5522 case 's': namedclass = ANYOF_SPACE; break;
5523 case 'S': namedclass = ANYOF_NSPACE; break;
5524 case 'd': namedclass = ANYOF_DIGIT; break;
5525 case 'D': namedclass = ANYOF_NDIGIT; break;
ffc61ed2
JH
5526 case 'p':
5527 case 'P':
3dab1dad
YO
5528 {
5529 char *e;
af6f566e 5530 if (RExC_parse >= RExC_end)
2a4859cd 5531 vFAIL2("Empty \\%c{}", (U8)value);
ffc61ed2 5532 if (*RExC_parse == '{') {
1df70142 5533 const U8 c = (U8)value;
ffc61ed2
JH
5534 e = strchr(RExC_parse++, '}');
5535 if (!e)
0da60cf5 5536 vFAIL2("Missing right brace on \\%c{}", c);
ab13f0c7
JH
5537 while (isSPACE(UCHARAT(RExC_parse)))
5538 RExC_parse++;
5539 if (e == RExC_parse)
0da60cf5 5540 vFAIL2("Empty \\%c{}", c);
ffc61ed2 5541 n = e - RExC_parse;
ab13f0c7
JH
5542 while (isSPACE(UCHARAT(RExC_parse + n - 1)))
5543 n--;
ffc61ed2
JH
5544 }
5545 else {
5546 e = RExC_parse;
5547 n = 1;
5548 }
5549 if (!SIZE_ONLY) {
ab13f0c7
JH
5550 if (UCHARAT(RExC_parse) == '^') {
5551 RExC_parse++;
5552 n--;
5553 value = value == 'p' ? 'P' : 'p'; /* toggle */
5554 while (isSPACE(UCHARAT(RExC_parse))) {
5555 RExC_parse++;
5556 n--;
5557 }
5558 }
097eb12c
AL
5559 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%.*s\n",
5560 (value=='p' ? '+' : '!'), (int)n, RExC_parse);
ffc61ed2
JH
5561 }
5562 RExC_parse = e + 1;
5563 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
f81125e2 5564 namedclass = ANYOF_MAX; /* no official name, but it's named */
3dab1dad 5565 }
f81125e2 5566 break;
b8c5462f
JH
5567 case 'n': value = '\n'; break;
5568 case 'r': value = '\r'; break;
5569 case 't': value = '\t'; break;
5570 case 'f': value = '\f'; break;
5571 case 'b': value = '\b'; break;
c7f1f016
NIS
5572 case 'e': value = ASCII_TO_NATIVE('\033');break;
5573 case 'a': value = ASCII_TO_NATIVE('\007');break;
b8c5462f 5574 case 'x':
ffc61ed2 5575 if (*RExC_parse == '{') {
a4c04bdc
NC
5576 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
5577 | PERL_SCAN_DISALLOW_PREFIX;
3dab1dad 5578 char * const e = strchr(RExC_parse++, '}');
b81d288d 5579 if (!e)
ffc61ed2 5580 vFAIL("Missing right brace on \\x{}");
53305cf1
NC
5581
5582 numlen = e - RExC_parse;
5583 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
ffc61ed2
JH
5584 RExC_parse = e + 1;
5585 }
5586 else {
a4c04bdc 5587 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
53305cf1
NC
5588 numlen = 2;
5589 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
ffc61ed2
JH
5590 RExC_parse += numlen;
5591 }
b8c5462f
JH
5592 break;
5593 case 'c':
830247a4 5594 value = UCHARAT(RExC_parse++);
b8c5462f
JH
5595 value = toCTRL(value);
5596 break;
5597 case '0': case '1': case '2': case '3': case '4':
5598 case '5': case '6': case '7': case '8': case '9':
53305cf1
NC
5599 {
5600 I32 flags = 0;
5601 numlen = 3;
5602 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
830247a4 5603 RExC_parse += numlen;
b8c5462f 5604 break;
53305cf1 5605 }
1028017a 5606 default:
041457d9 5607 if (!SIZE_ONLY && isALPHA(value) && ckWARN(WARN_REGEXP))
ffc61ed2
JH
5608 vWARN2(RExC_parse,
5609 "Unrecognized escape \\%c in character class passed through",
5610 (int)value);
1028017a 5611 break;
b8c5462f 5612 }
ffc61ed2 5613 } /* end of \blah */
1b2d223b
JH
5614#ifdef EBCDIC
5615 else
5616 literal_endpoint++;
5617#endif
ffc61ed2
JH
5618
5619 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
5620
5621 if (!SIZE_ONLY && !need_class)
936ed897 5622 ANYOF_CLASS_ZERO(ret);
ffc61ed2 5623
936ed897 5624 need_class = 1;
ffc61ed2
JH
5625
5626 /* a bad range like a-\d, a-[:digit:] ? */
5627 if (range) {
73b437c8 5628 if (!SIZE_ONLY) {
afd78fd5 5629 if (ckWARN(WARN_REGEXP)) {
097eb12c 5630 const int w =
afd78fd5
JH
5631 RExC_parse >= rangebegin ?
5632 RExC_parse - rangebegin : 0;
830247a4 5633 vWARN4(RExC_parse,
b45f050a 5634 "False [] range \"%*.*s\"",
097eb12c 5635 w, w, rangebegin);
afd78fd5 5636 }
3568d838
JH
5637 if (prevvalue < 256) {
5638 ANYOF_BITMAP_SET(ret, prevvalue);
ffc61ed2
JH
5639 ANYOF_BITMAP_SET(ret, '-');
5640 }
5641 else {
5642 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
5643 Perl_sv_catpvf(aTHX_ listsv,
3568d838 5644 "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
ffc61ed2 5645 }
b8c5462f 5646 }
ffc61ed2
JH
5647
5648 range = 0; /* this was not a true range */
73b437c8 5649 }
ffc61ed2 5650
73b437c8 5651 if (!SIZE_ONLY) {
c49a72a9
NC
5652 const char *what = NULL;
5653 char yesno = 0;
5654
3568d838
JH
5655 if (namedclass > OOB_NAMEDCLASS)
5656 optimize_invert = FALSE;
e2962f66
JH
5657 /* Possible truncation here but in some 64-bit environments
5658 * the compiler gets heartburn about switch on 64-bit values.
5659 * A similar issue a little earlier when switching on value.
98f323fa 5660 * --jhi */
e2962f66 5661 switch ((I32)namedclass) {
73b437c8
JH
5662 case ANYOF_ALNUM:
5663 if (LOC)
936ed897 5664 ANYOF_CLASS_SET(ret, ANYOF_ALNUM);
73b437c8
JH
5665 else {
5666 for (value = 0; value < 256; value++)
5667 if (isALNUM(value))
936ed897 5668 ANYOF_BITMAP_SET(ret, value);
73b437c8 5669 }
c49a72a9
NC
5670 yesno = '+';
5671 what = "Word";
73b437c8
JH
5672 break;
5673 case ANYOF_NALNUM:
5674 if (LOC)
936ed897 5675 ANYOF_CLASS_SET(ret, ANYOF_NALNUM);
73b437c8
JH
5676 else {
5677 for (value = 0; value < 256; value++)
5678 if (!isALNUM(value))
936ed897 5679 ANYOF_BITMAP_SET(ret, value);
73b437c8 5680 }
c49a72a9
NC
5681 yesno = '!';
5682 what = "Word";
73b437c8 5683 break;
ffc61ed2 5684 case ANYOF_ALNUMC:
73b437c8 5685 if (LOC)
ffc61ed2 5686 ANYOF_CLASS_SET(ret, ANYOF_ALNUMC);
73b437c8
JH
5687 else {
5688 for (value = 0; value < 256; value++)
ffc61ed2 5689 if (isALNUMC(value))
936ed897 5690 ANYOF_BITMAP_SET(ret, value);
73b437c8 5691 }
c49a72a9
NC
5692 yesno = '+';
5693 what = "Alnum";
73b437c8
JH
5694 break;
5695 case ANYOF_NALNUMC:
5696 if (LOC)
936ed897 5697 ANYOF_CLASS_SET(ret, ANYOF_NALNUMC);
73b437c8
JH
5698 else {
5699 for (value = 0; value < 256; value++)
5700 if (!isALNUMC(value))
936ed897 5701 ANYOF_BITMAP_SET(ret, value);
73b437c8 5702 }
c49a72a9
NC
5703 yesno = '!';
5704 what = "Alnum";
73b437c8
JH
5705 break;
5706 case ANYOF_ALPHA:
5707 if (LOC)
936ed897 5708 ANYOF_CLASS_SET(ret, ANYOF_ALPHA);
73b437c8
JH
5709 else {
5710 for (value = 0; value < 256; value++)
5711 if (isALPHA(value))
936ed897 5712 ANYOF_BITMAP_SET(ret, value);
73b437c8 5713 }
c49a72a9
NC
5714 yesno = '+';
5715 what = "Alpha";
73b437c8
JH
5716 break;
5717 case ANYOF_NALPHA:
5718 if (LOC)
936ed897 5719 ANYOF_CLASS_SET(ret, ANYOF_NALPHA);
73b437c8
JH
5720 else {
5721 for (value = 0; value < 256; value++)
5722 if (!isALPHA(value))
936ed897 5723 ANYOF_BITMAP_SET(ret, value);
73b437c8 5724 }
c49a72a9
NC
5725 yesno = '!';
5726 what = "Alpha";
73b437c8
JH
5727 break;
5728 case ANYOF_ASCII:
5729 if (LOC)
936ed897 5730 ANYOF_CLASS_SET(ret, ANYOF_ASCII);
73b437c8 5731 else {
c7f1f016 5732#ifndef EBCDIC
1ba5c669
JH
5733 for (value = 0; value < 128; value++)
5734 ANYOF_BITMAP_SET(ret, value);
5735#else /* EBCDIC */
ffbc6a93 5736 for (value = 0; value < 256; value++) {
3a3c4447
JH
5737 if (isASCII(value))
5738 ANYOF_BITMAP_SET(ret, value);
ffbc6a93 5739 }
1ba5c669 5740#endif /* EBCDIC */
73b437c8 5741 }
c49a72a9
NC
5742 yesno = '+';
5743 what = "ASCII";
73b437c8
JH
5744 break;
5745 case ANYOF_NASCII:
5746 if (LOC)
936ed897 5747 ANYOF_CLASS_SET(ret, ANYOF_NASCII);
73b437c8 5748 else {
c7f1f016 5749#ifndef EBCDIC
1ba5c669
JH
5750 for (value = 128; value < 256; value++)
5751 ANYOF_BITMAP_SET(ret, value);
5752#else /* EBCDIC */
ffbc6a93 5753 for (value = 0; value < 256; value++) {
3a3c4447
JH
5754 if (!isASCII(value))
5755 ANYOF_BITMAP_SET(ret, value);
ffbc6a93 5756 }
1ba5c669 5757#endif /* EBCDIC */
73b437c8 5758 }
c49a72a9
NC
5759 yesno = '!';
5760 what = "ASCII";
73b437c8 5761 break;
aaa51d5e
JF
5762 case ANYOF_BLANK:
5763 if (LOC)
5764 ANYOF_CLASS_SET(ret, ANYOF_BLANK);
5765 else {
5766 for (value = 0; value < 256; value++)
5767 if (isBLANK(value))
5768 ANYOF_BITMAP_SET(ret, value);
5769 }
c49a72a9
NC
5770 yesno = '+';
5771 what = "Blank";
aaa51d5e
JF
5772 break;
5773 case ANYOF_NBLANK:
5774 if (LOC)
5775 ANYOF_CLASS_SET(ret, ANYOF_NBLANK);
5776 else {
5777 for (value = 0; value < 256; value++)
5778 if (!isBLANK(value))
5779 ANYOF_BITMAP_SET(ret, value);
5780 }
c49a72a9
NC
5781 yesno = '!';
5782 what = "Blank";
aaa51d5e 5783 break;
73b437c8
JH
5784 case ANYOF_CNTRL:
5785 if (LOC)
936ed897 5786 ANYOF_CLASS_SET(ret, ANYOF_CNTRL);
73b437c8
JH
5787 else {
5788 for (value = 0; value < 256; value++)
5789 if (isCNTRL(value))
936ed897 5790 ANYOF_BITMAP_SET(ret, value);
73b437c8 5791 }
c49a72a9
NC
5792 yesno = '+';
5793 what = "Cntrl";
73b437c8
JH
5794 break;
5795 case ANYOF_NCNTRL:
5796 if (LOC)
936ed897 5797 ANYOF_CLASS_SET(ret, ANYOF_NCNTRL);
73b437c8
JH
5798 else {
5799 for (value = 0; value < 256; value++)
5800 if (!isCNTRL(value))
936ed897 5801 ANYOF_BITMAP_SET(ret, value);
73b437c8 5802 }
c49a72a9
NC
5803 yesno = '!';
5804 what = "Cntrl";
ffc61ed2
JH
5805 break;
5806 case ANYOF_DIGIT:
5807 if (LOC)
5808 ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
5809 else {
5810 /* consecutive digits assumed */
5811 for (value = '0'; value <= '9'; value++)
5812 ANYOF_BITMAP_SET(ret, value);
5813 }
c49a72a9
NC
5814 yesno = '+';
5815 what = "Digit";
ffc61ed2
JH
5816 break;
5817 case ANYOF_NDIGIT:
5818 if (LOC)
5819 ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
5820 else {
5821 /* consecutive digits assumed */
5822 for (value = 0; value < '0'; value++)
5823 ANYOF_BITMAP_SET(ret, value);
5824 for (value = '9' + 1; value < 256; value++)
5825 ANYOF_BITMAP_SET(ret, value);
5826 }
c49a72a9
NC
5827 yesno = '!';
5828 what = "Digit";
73b437c8
JH
5829 break;
5830 case ANYOF_GRAPH:
5831 if (LOC)
936ed897 5832 ANYOF_CLASS_SET(ret, ANYOF_GRAPH);
73b437c8
JH
5833 else {
5834 for (value = 0; value < 256; value++)
5835 if (isGRAPH(value))
936ed897 5836 ANYOF_BITMAP_SET(ret, value);
73b437c8 5837 }
c49a72a9
NC
5838 yesno = '+';
5839 what = "Graph";
73b437c8
JH
5840 break;
5841 case ANYOF_NGRAPH:
5842 if (LOC)
936ed897 5843 ANYOF_CLASS_SET(ret, ANYOF_NGRAPH);
73b437c8
JH
5844 else {
5845 for (value = 0; value < 256; value++)
5846 if (!isGRAPH(value))
936ed897 5847 ANYOF_BITMAP_SET(ret, value);
73b437c8 5848 }
c49a72a9
NC
5849 yesno = '!';
5850 what = "Graph";
73b437c8
JH
5851 break;
5852 case ANYOF_LOWER:
5853 if (LOC)
936ed897 5854 ANYOF_CLASS_SET(ret, ANYOF_LOWER);
73b437c8
JH
5855 else {
5856 for (value = 0; value < 256; value++)
5857 if (isLOWER(value))
936ed897 5858 ANYOF_BITMAP_SET(ret, value);
73b437c8 5859 }
c49a72a9
NC
5860 yesno = '+';
5861 what = "Lower";
73b437c8
JH
5862 break;
5863 case ANYOF_NLOWER:
5864 if (LOC)
936ed897 5865 ANYOF_CLASS_SET(ret, ANYOF_NLOWER);
73b437c8
JH
5866 else {
5867 for (value = 0; value < 256; value++)
5868 if (!isLOWER(value))
936ed897 5869 ANYOF_BITMAP_SET(ret, value);
73b437c8 5870 }
c49a72a9
NC
5871 yesno = '!';
5872 what = "Lower";
73b437c8
JH
5873 break;
5874 case ANYOF_PRINT:
5875 if (LOC)
936ed897 5876 ANYOF_CLASS_SET(ret, ANYOF_PRINT);
73b437c8
JH
5877 else {
5878 for (value = 0; value < 256; value++)
5879 if (isPRINT(value))
936ed897 5880 ANYOF_BITMAP_SET(ret, value);
73b437c8 5881 }
c49a72a9
NC
5882 yesno = '+';
5883 what = "Print";
73b437c8
JH
5884 break;
5885 case ANYOF_NPRINT:
5886 if (LOC)
936ed897 5887 ANYOF_CLASS_SET(ret, ANYOF_NPRINT);
73b437c8
JH
5888 else {
5889 for (value = 0; value < 256; value++)
5890 if (!isPRINT(value))
936ed897 5891 ANYOF_BITMAP_SET(ret, value);
73b437c8 5892 }
c49a72a9
NC
5893 yesno = '!';
5894 what = "Print";
73b437c8 5895 break;
aaa51d5e
JF
5896 case ANYOF_PSXSPC:
5897 if (LOC)
5898 ANYOF_CLASS_SET(ret, ANYOF_PSXSPC);
5899 else {
5900 for (value = 0; value < 256; value++)
5901 if (isPSXSPC(value))
5902 ANYOF_BITMAP_SET(ret, value);
5903 }
c49a72a9
NC
5904 yesno = '+';
5905 what = "Space";
aaa51d5e
JF
5906 break;
5907 case ANYOF_NPSXSPC:
5908 if (LOC)
5909 ANYOF_CLASS_SET(ret, ANYOF_NPSXSPC);
5910 else {
5911 for (value = 0; value < 256; value++)
5912 if (!isPSXSPC(value))
5913 ANYOF_BITMAP_SET(ret, value);
5914 }
c49a72a9
NC
5915 yesno = '!';
5916 what = "Space";
aaa51d5e 5917 break;
73b437c8
JH
5918 case ANYOF_PUNCT:
5919 if (LOC)
936ed897 5920 ANYOF_CLASS_SET(ret, ANYOF_PUNCT);
73b437c8
JH
5921 else {
5922 for (value = 0; value < 256; value++)
5923 if (isPUNCT(value))
936ed897 5924 ANYOF_BITMAP_SET(ret, value);
73b437c8 5925 }
c49a72a9
NC
5926 yesno = '+';
5927 what = "Punct";
73b437c8
JH
5928 break;
5929 case ANYOF_NPUNCT:
5930 if (LOC)
936ed897 5931 ANYOF_CLASS_SET(ret, ANYOF_NPUNCT);
73b437c8
JH
5932 else {
5933 for (value = 0; value < 256; value++)
5934 if (!isPUNCT(value))
936ed897 5935 ANYOF_BITMAP_SET(ret, value);
73b437c8 5936 }
c49a72a9
NC
5937 yesno = '!';
5938 what = "Punct";
ffc61ed2
JH
5939 break;
5940 case ANYOF_SPACE:
5941 if (LOC)
5942 ANYOF_CLASS_SET(ret, ANYOF_SPACE);
5943 else {
5944 for (value = 0; value < 256; value++)
5945 if (isSPACE(value))
5946 ANYOF_BITMAP_SET(ret, value);
5947 }
c49a72a9
NC
5948 yesno = '+';
5949 what = "SpacePerl";
ffc61ed2
JH
5950 break;
5951 case ANYOF_NSPACE:
5952 if (LOC)
5953 ANYOF_CLASS_SET(ret, ANYOF_NSPACE);
5954 else {
5955 for (value = 0; value < 256; value++)
5956 if (!isSPACE(value))
5957 ANYOF_BITMAP_SET(ret, value);
5958 }
c49a72a9
NC
5959 yesno = '!';
5960 what = "SpacePerl";
73b437c8
JH
5961 break;
5962 case ANYOF_UPPER:
5963 if (LOC)
936ed897 5964 ANYOF_CLASS_SET(ret, ANYOF_UPPER);
73b437c8
JH
5965 else {
5966 for (value = 0; value < 256; value++)
5967 if (isUPPER(value))
936ed897 5968 ANYOF_BITMAP_SET(ret, value);
73b437c8 5969 }
c49a72a9
NC
5970 yesno = '+';
5971 what = "Upper";
73b437c8
JH
5972 break;
5973 case ANYOF_NUPPER:
5974 if (LOC)
936ed897 5975 ANYOF_CLASS_SET(ret, ANYOF_NUPPER);
73b437c8
JH
5976 else {
5977 for (value = 0; value < 256; value++)
5978 if (!isUPPER(value))
936ed897 5979 ANYOF_BITMAP_SET(ret, value);
73b437c8 5980 }
c49a72a9
NC
5981 yesno = '!';
5982 what = "Upper";
73b437c8
JH
5983 break;
5984 case ANYOF_XDIGIT:
5985 if (LOC)
936ed897 5986 ANYOF_CLASS_SET(ret, ANYOF_XDIGIT);
73b437c8
JH
5987 else {
5988 for (value = 0; value < 256; value++)
5989 if (isXDIGIT(value))
936ed897 5990 ANYOF_BITMAP_SET(ret, value);
73b437c8 5991 }
c49a72a9
NC
5992 yesno = '+';
5993 what = "XDigit";
73b437c8
JH
5994 break;
5995 case ANYOF_NXDIGIT:
5996 if (LOC)
936ed897 5997 ANYOF_CLASS_SET(ret, ANYOF_NXDIGIT);
73b437c8
JH
5998 else {
5999 for (value = 0; value < 256; value++)
6000 if (!isXDIGIT(value))
936ed897 6001 ANYOF_BITMAP_SET(ret, value);
73b437c8 6002 }
c49a72a9
NC
6003 yesno = '!';
6004 what = "XDigit";
73b437c8 6005 break;
f81125e2
JP
6006 case ANYOF_MAX:
6007 /* this is to handle \p and \P */
6008 break;
73b437c8 6009 default:
b45f050a 6010 vFAIL("Invalid [::] class");
73b437c8 6011 break;
b8c5462f 6012 }
c49a72a9
NC
6013 if (what) {
6014 /* Strings such as "+utf8::isWord\n" */
6015 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::Is%s\n", yesno, what);
6016 }
b8c5462f 6017 if (LOC)
936ed897 6018 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
73b437c8 6019 continue;
a0d0e21e 6020 }
ffc61ed2
JH
6021 } /* end of namedclass \blah */
6022
a0d0e21e 6023 if (range) {
eb160463 6024 if (prevvalue > (IV)value) /* b-a */ {
d4c19fe8
AL
6025 const int w = RExC_parse - rangebegin;
6026 Simple_vFAIL4("Invalid [] range \"%*.*s\"", w, w, rangebegin);
3568d838 6027 range = 0; /* not a valid range */
73b437c8 6028 }
a0d0e21e
LW
6029 }
6030 else {
3568d838 6031 prevvalue = value; /* save the beginning of the range */
830247a4
IZ
6032 if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
6033 RExC_parse[1] != ']') {
6034 RExC_parse++;
ffc61ed2
JH
6035
6036 /* a bad range like \w-, [:word:]- ? */
6037 if (namedclass > OOB_NAMEDCLASS) {
afd78fd5 6038 if (ckWARN(WARN_REGEXP)) {
d4c19fe8 6039 const int w =
afd78fd5
JH
6040 RExC_parse >= rangebegin ?
6041 RExC_parse - rangebegin : 0;
830247a4 6042 vWARN4(RExC_parse,
b45f050a 6043 "False [] range \"%*.*s\"",
097eb12c 6044 w, w, rangebegin);
afd78fd5 6045 }
73b437c8 6046 if (!SIZE_ONLY)
936ed897 6047 ANYOF_BITMAP_SET(ret, '-');
73b437c8 6048 } else
ffc61ed2
JH
6049 range = 1; /* yeah, it's a range! */
6050 continue; /* but do it the next time */
a0d0e21e 6051 }
a687059c 6052 }
ffc61ed2 6053
93733859 6054 /* now is the next time */
07be1b83 6055 /*stored += (value - prevvalue + 1);*/
ae5c130c 6056 if (!SIZE_ONLY) {
3568d838 6057 if (prevvalue < 256) {
1df70142 6058 const IV ceilvalue = value < 256 ? value : 255;
3dab1dad 6059 IV i;
3568d838 6060#ifdef EBCDIC
1b2d223b
JH
6061 /* In EBCDIC [\x89-\x91] should include
6062 * the \x8e but [i-j] should not. */
6063 if (literal_endpoint == 2 &&
6064 ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
6065 (isUPPER(prevvalue) && isUPPER(ceilvalue))))
ffc61ed2 6066 {
3568d838
JH
6067 if (isLOWER(prevvalue)) {
6068 for (i = prevvalue; i <= ceilvalue; i++)
ffc61ed2
JH
6069 if (isLOWER(i))
6070 ANYOF_BITMAP_SET(ret, i);
6071 } else {
3568d838 6072 for (i = prevvalue; i <= ceilvalue; i++)
ffc61ed2
JH
6073 if (isUPPER(i))
6074 ANYOF_BITMAP_SET(ret, i);
6075 }
8ada0baa 6076 }
ffc61ed2 6077 else
8ada0baa 6078#endif
07be1b83
YO
6079 for (i = prevvalue; i <= ceilvalue; i++) {
6080 if (!ANYOF_BITMAP_TEST(ret,i)) {
6081 stored++;
6082 ANYOF_BITMAP_SET(ret, i);
6083 }
6084 }
3568d838 6085 }
a5961de5 6086 if (value > 255 || UTF) {
1df70142
AL
6087 const UV prevnatvalue = NATIVE_TO_UNI(prevvalue);
6088 const UV natvalue = NATIVE_TO_UNI(value);
07be1b83 6089 stored+=2; /* can't optimize this class */
ffc61ed2 6090 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
b08decb7 6091 if (prevnatvalue < natvalue) { /* what about > ? */
ffc61ed2 6092 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
b08decb7
JH
6093 prevnatvalue, natvalue);
6094 }
6095 else if (prevnatvalue == natvalue) {
6096 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue);
09091399 6097 if (FOLD) {
89ebb4a3 6098 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
254ba52a 6099 STRLEN foldlen;
1df70142 6100 const UV f = to_uni_fold(natvalue, foldbuf, &foldlen);
254ba52a 6101
e294cc5d
JH
6102#ifdef EBCDIC /* RD t/uni/fold ff and 6b */
6103 if (RExC_precomp[0] == ':' &&
6104 RExC_precomp[1] == '[' &&
6105 (f == 0xDF || f == 0x92)) {
6106 f = NATIVE_TO_UNI(f);
6107 }
6108#endif
c840d2a2
JH
6109 /* If folding and foldable and a single
6110 * character, insert also the folded version
6111 * to the charclass. */
9e55ce06 6112 if (f != value) {
e294cc5d
JH
6113#ifdef EBCDIC /* RD tunifold ligatures s,t fb05, fb06 */
6114 if ((RExC_precomp[0] == ':' &&
6115 RExC_precomp[1] == '[' &&
6116 (f == 0xA2 &&
6117 (value == 0xFB05 || value == 0xFB06))) ?
6118 foldlen == ((STRLEN)UNISKIP(f) - 1) :
6119 foldlen == (STRLEN)UNISKIP(f) )
6120#else
eb160463 6121 if (foldlen == (STRLEN)UNISKIP(f))
e294cc5d 6122#endif
9e55ce06
JH
6123 Perl_sv_catpvf(aTHX_ listsv,
6124 "%04"UVxf"\n", f);
6125 else {
6126 /* Any multicharacter foldings
6127 * require the following transform:
6128 * [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst)
6129 * where E folds into "pq" and F folds
6130 * into "rst", all other characters
6131 * fold to single characters. We save
6132 * away these multicharacter foldings,
6133 * to be later saved as part of the
6134 * additional "s" data. */
6135 SV *sv;
6136
6137 if (!unicode_alternate)
6138 unicode_alternate = newAV();
6139 sv = newSVpvn((char*)foldbuf, foldlen);
6140 SvUTF8_on(sv);
6141 av_push(unicode_alternate, sv);
6142 }
6143 }
254ba52a 6144
60a8b682
JH
6145 /* If folding and the value is one of the Greek
6146 * sigmas insert a few more sigmas to make the
6147 * folding rules of the sigmas to work right.
6148 * Note that not all the possible combinations
6149 * are handled here: some of them are handled
9e55ce06
JH
6150 * by the standard folding rules, and some of
6151 * them (literal or EXACTF cases) are handled
6152 * during runtime in regexec.c:S_find_byclass(). */
09091399
JH
6153 if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) {
6154 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 6155 (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA);
09091399 6156 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 6157 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
09091399
JH
6158 }
6159 else if (value == UNICODE_GREEK_CAPITAL_LETTER_SIGMA)
6160 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 6161 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
09091399
JH
6162 }
6163 }
ffc61ed2 6164 }
1b2d223b
JH
6165#ifdef EBCDIC
6166 literal_endpoint = 0;
6167#endif
8ada0baa 6168 }
ffc61ed2
JH
6169
6170 range = 0; /* this range (if it was one) is done now */
a0d0e21e 6171 }
ffc61ed2 6172
936ed897 6173 if (need_class) {
4f66b38d 6174 ANYOF_FLAGS(ret) |= ANYOF_LARGE;
936ed897 6175 if (SIZE_ONLY)
830247a4 6176 RExC_size += ANYOF_CLASS_ADD_SKIP;
936ed897 6177 else
830247a4 6178 RExC_emit += ANYOF_CLASS_ADD_SKIP;
936ed897 6179 }
ffc61ed2 6180
7f6f358c
YO
6181
6182 if (SIZE_ONLY)
6183 return ret;
6184 /****** !SIZE_ONLY AFTER HERE *********/
6185
6186 if( stored == 1 && value < 256
6187 && !( ANYOF_FLAGS(ret) & ( ANYOF_FLAGS_ALL ^ ANYOF_FOLD ) )
6188 ) {
6189 /* optimize single char class to an EXACT node
6190 but *only* when its not a UTF/high char */
07be1b83
YO
6191 const char * cur_parse= RExC_parse;
6192 RExC_emit = (regnode *)orig_emit;
6193 RExC_parse = (char *)orig_parse;
7f6f358c
YO
6194 ret = reg_node(pRExC_state,
6195 (U8)((ANYOF_FLAGS(ret) & ANYOF_FOLD) ? EXACTF : EXACT));
07be1b83 6196 RExC_parse = (char *)cur_parse;
7f6f358c
YO
6197 *STRING(ret)= (char)value;
6198 STR_LEN(ret)= 1;
6199 RExC_emit += STR_SZ(1);
6200 return ret;
6201 }
ae5c130c 6202 /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
7f6f358c 6203 if ( /* If the only flag is folding (plus possibly inversion). */
516a5887
JH
6204 ((ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD)
6205 ) {
a0ed51b3 6206 for (value = 0; value < 256; ++value) {
936ed897 6207 if (ANYOF_BITMAP_TEST(ret, value)) {
eb160463 6208 UV fold = PL_fold[value];
ffc61ed2
JH
6209
6210 if (fold != value)
6211 ANYOF_BITMAP_SET(ret, fold);
ae5c130c
GS
6212 }
6213 }
936ed897 6214 ANYOF_FLAGS(ret) &= ~ANYOF_FOLD;
ae5c130c 6215 }
ffc61ed2 6216
ae5c130c 6217 /* optimize inverted simple patterns (e.g. [^a-z]) */
7f6f358c 6218 if (optimize_invert &&
ffc61ed2
JH
6219 /* If the only flag is inversion. */
6220 (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
b8c5462f 6221 for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
936ed897 6222 ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL;
1aa99e6b 6223 ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
ae5c130c 6224 }
7f6f358c 6225 {
097eb12c 6226 AV * const av = newAV();
ffc61ed2 6227 SV *rv;
9e55ce06 6228 /* The 0th element stores the character class description
6a0407ee 6229 * in its textual form: used later (regexec.c:Perl_regclass_swash())
9e55ce06
JH
6230 * to initialize the appropriate swash (which gets stored in
6231 * the 1st element), and also useful for dumping the regnode.
6232 * The 2nd element stores the multicharacter foldings,
6a0407ee 6233 * used later (regexec.c:S_reginclass()). */
ffc61ed2
JH
6234 av_store(av, 0, listsv);
6235 av_store(av, 1, NULL);
9e55ce06 6236 av_store(av, 2, (SV*)unicode_alternate);
ffc61ed2 6237 rv = newRV_noinc((SV*)av);
19860706 6238 n = add_data(pRExC_state, 1, "s");
830247a4 6239 RExC_rx->data->data[n] = (void*)rv;
ffc61ed2 6240 ARG_SET(ret, n);
a0ed51b3 6241 }
a0ed51b3
LW
6242 return ret;
6243}
6244
76e3520e 6245STATIC char*
830247a4 6246S_nextchar(pTHX_ RExC_state_t *pRExC_state)
a0d0e21e 6247{
097eb12c 6248 char* const retval = RExC_parse++;
a0d0e21e 6249
4633a7c4 6250 for (;;) {
830247a4
IZ
6251 if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
6252 RExC_parse[2] == '#') {
e994fd66
AE
6253 while (*RExC_parse != ')') {
6254 if (RExC_parse == RExC_end)
6255 FAIL("Sequence (?#... not terminated");
830247a4 6256 RExC_parse++;
e994fd66 6257 }
830247a4 6258 RExC_parse++;
4633a7c4
LW
6259 continue;
6260 }
e2509266 6261 if (RExC_flags & PMf_EXTENDED) {
830247a4
IZ
6262 if (isSPACE(*RExC_parse)) {
6263 RExC_parse++;
748a9306
LW
6264 continue;
6265 }
830247a4 6266 else if (*RExC_parse == '#') {
e994fd66
AE
6267 while (RExC_parse < RExC_end)
6268 if (*RExC_parse++ == '\n') break;
748a9306
LW
6269 continue;
6270 }
748a9306 6271 }
4633a7c4 6272 return retval;
a0d0e21e 6273 }
a687059c
LW
6274}
6275
6276/*
c277df42 6277- reg_node - emit a node
a0d0e21e 6278*/
76e3520e 6279STATIC regnode * /* Location. */
830247a4 6280S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
a687059c 6281{
97aff369 6282 dVAR;
c277df42 6283 register regnode *ptr;
504618e9 6284 regnode * const ret = RExC_emit;
07be1b83 6285 GET_RE_DEBUG_FLAGS_DECL;
a687059c 6286
c277df42 6287 if (SIZE_ONLY) {
830247a4
IZ
6288 SIZE_ALIGN(RExC_size);
6289 RExC_size += 1;
a0d0e21e
LW
6290 return(ret);
6291 }
c277df42 6292 NODE_ALIGN_FILL(ret);
a0d0e21e 6293 ptr = ret;
c277df42 6294 FILL_ADVANCE_NODE(ptr, op);
fac92740 6295 if (RExC_offsets) { /* MJD */
07be1b83 6296 MJD_OFFSET_DEBUG(("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
fac92740
MJD
6297 "reg_node", __LINE__,
6298 reg_name[op],
07be1b83
YO
6299 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
6300 ? "Overwriting end of array!\n" : "OK",
6301 (UV)(RExC_emit - RExC_emit_start),
6302 (UV)(RExC_parse - RExC_start),
6303 (UV)RExC_offsets[0]));
ccb2c380 6304 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
fac92740 6305 }
07be1b83 6306
830247a4 6307 RExC_emit = ptr;
a687059c 6308
a0d0e21e 6309 return(ret);
a687059c
LW
6310}
6311
6312/*
a0d0e21e
LW
6313- reganode - emit a node with an argument
6314*/
76e3520e 6315STATIC regnode * /* Location. */
830247a4 6316S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
fe14fcc3 6317{
97aff369 6318 dVAR;
c277df42 6319 register regnode *ptr;
504618e9 6320 regnode * const ret = RExC_emit;
07be1b83 6321 GET_RE_DEBUG_FLAGS_DECL;
fe14fcc3 6322
c277df42 6323 if (SIZE_ONLY) {
830247a4
IZ
6324 SIZE_ALIGN(RExC_size);
6325 RExC_size += 2;
a0d0e21e
LW
6326 return(ret);
6327 }
fe14fcc3 6328
c277df42 6329 NODE_ALIGN_FILL(ret);
a0d0e21e 6330 ptr = ret;
c277df42 6331 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
fac92740 6332 if (RExC_offsets) { /* MJD */
07be1b83 6333 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
fac92740 6334 "reganode",
ccb2c380
MP
6335 __LINE__,
6336 reg_name[op],
07be1b83 6337 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ?
fac92740 6338 "Overwriting end of array!\n" : "OK",
07be1b83
YO
6339 (UV)(RExC_emit - RExC_emit_start),
6340 (UV)(RExC_parse - RExC_start),
6341 (UV)RExC_offsets[0]));
ccb2c380 6342 Set_Cur_Node_Offset;
fac92740
MJD
6343 }
6344
830247a4 6345 RExC_emit = ptr;
fe14fcc3 6346
a0d0e21e 6347 return(ret);
fe14fcc3
LW
6348}
6349
6350/*
cd439c50 6351- reguni - emit (if appropriate) a Unicode character
a0ed51b3 6352*/
71207a34
AL
6353STATIC STRLEN
6354S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
a0ed51b3 6355{
97aff369 6356 dVAR;
71207a34 6357 return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
a0ed51b3
LW
6358}
6359
6360/*
a0d0e21e
LW
6361- reginsert - insert an operator in front of already-emitted operand
6362*
6363* Means relocating the operand.
6364*/
76e3520e 6365STATIC void
830247a4 6366S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd)
a687059c 6367{
97aff369 6368 dVAR;
c277df42
IZ
6369 register regnode *src;
6370 register regnode *dst;
6371 register regnode *place;
504618e9 6372 const int offset = regarglen[(U8)op];
07be1b83 6373 GET_RE_DEBUG_FLAGS_DECL;
22c35a8c 6374/* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
c277df42
IZ
6375
6376 if (SIZE_ONLY) {
830247a4 6377 RExC_size += NODE_STEP_REGNODE + offset;
a0d0e21e
LW
6378 return;
6379 }
a687059c 6380
830247a4
IZ
6381 src = RExC_emit;
6382 RExC_emit += NODE_STEP_REGNODE + offset;
6383 dst = RExC_emit;
fac92740 6384 while (src > opnd) {
c277df42 6385 StructCopy(--src, --dst, regnode);
fac92740 6386 if (RExC_offsets) { /* MJD 20010112 */
07be1b83 6387 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
fac92740 6388 "reg_insert",
ccb2c380
MP
6389 __LINE__,
6390 reg_name[op],
07be1b83
YO
6391 (UV)(dst - RExC_emit_start) > RExC_offsets[0]
6392 ? "Overwriting end of array!\n" : "OK",
6393 (UV)(src - RExC_emit_start),
6394 (UV)(dst - RExC_emit_start),
6395 (UV)RExC_offsets[0]));
ccb2c380
MP
6396 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
6397 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
fac92740
MJD
6398 }
6399 }
6400
a0d0e21e
LW
6401
6402 place = opnd; /* Op node, where operand used to be. */
fac92740 6403 if (RExC_offsets) { /* MJD */
07be1b83 6404 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
fac92740 6405 "reginsert",
ccb2c380
MP
6406 __LINE__,
6407 reg_name[op],
07be1b83 6408 (UV)(place - RExC_emit_start) > RExC_offsets[0]
fac92740 6409 ? "Overwriting end of array!\n" : "OK",
07be1b83
YO
6410 (UV)(place - RExC_emit_start),
6411 (UV)(RExC_parse - RExC_start),
786e8c11 6412 (UV)RExC_offsets[0]));
ccb2c380 6413 Set_Node_Offset(place, RExC_parse);
45948336 6414 Set_Node_Length(place, 1);
fac92740 6415 }
c277df42
IZ
6416 src = NEXTOPER(place);
6417 FILL_ADVANCE_NODE(place, op);
6418 Zero(src, offset, regnode);
a687059c
LW
6419}
6420
6421/*
c277df42 6422- regtail - set the next-pointer at the end of a node chain of p to val.
3dab1dad 6423- SEE ALSO: regtail_study
a0d0e21e 6424*/
097eb12c 6425/* TODO: All three parms should be const */
76e3520e 6426STATIC void
3dab1dad 6427S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
a687059c 6428{
97aff369 6429 dVAR;
c277df42 6430 register regnode *scan;
72f13be8 6431 GET_RE_DEBUG_FLAGS_DECL;
f9049ba1
SP
6432#ifndef DEBUGGING
6433 PERL_UNUSED_ARG(depth);
6434#endif
a0d0e21e 6435
c277df42 6436 if (SIZE_ONLY)
a0d0e21e
LW
6437 return;
6438
6439 /* Find last node. */
6440 scan = p;
6441 for (;;) {
504618e9 6442 regnode * const temp = regnext(scan);
3dab1dad
YO
6443 DEBUG_PARSE_r({
6444 SV * const mysv=sv_newmortal();
6445 DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
6446 regprop(RExC_rx, mysv, scan);
6447 PerlIO_printf(Perl_debug_log, "~ %s (%d)\n",
6448 SvPV_nolen_const(mysv), REG_NODE_NUM(scan));
6449 });
6450 if (temp == NULL)
6451 break;
6452 scan = temp;
6453 }
6454
6455 if (reg_off_by_arg[OP(scan)]) {
6456 ARG_SET(scan, val - scan);
6457 }
6458 else {
6459 NEXT_OFF(scan) = val - scan;
6460 }
6461}
6462
07be1b83 6463#ifdef DEBUGGING
3dab1dad
YO
6464/*
6465- regtail_study - set the next-pointer at the end of a node chain of p to val.
6466- Look for optimizable sequences at the same time.
6467- currently only looks for EXACT chains.
07be1b83
YO
6468
6469This is expermental code. The idea is to use this routine to perform
6470in place optimizations on branches and groups as they are constructed,
6471with the long term intention of removing optimization from study_chunk so
6472that it is purely analytical.
6473
6474Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
6475to control which is which.
6476
3dab1dad
YO
6477*/
6478/* TODO: All four parms should be const */
07be1b83 6479
3dab1dad
YO
6480STATIC U8
6481S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
6482{
6483 dVAR;
6484 register regnode *scan;
07be1b83
YO
6485 U8 exact = PSEUDO;
6486#ifdef EXPERIMENTAL_INPLACESCAN
6487 I32 min = 0;
6488#endif
6489
3dab1dad
YO
6490 GET_RE_DEBUG_FLAGS_DECL;
6491
07be1b83 6492
3dab1dad
YO
6493 if (SIZE_ONLY)
6494 return exact;
6495
6496 /* Find last node. */
6497
6498 scan = p;
6499 for (;;) {
6500 regnode * const temp = regnext(scan);
07be1b83
YO
6501#ifdef EXPERIMENTAL_INPLACESCAN
6502 if (PL_regkind[OP(scan)] == EXACT)
6503 if (join_exact(pRExC_state,scan,&min,1,val,depth+1))
6504 return EXACT;
6505#endif
3dab1dad
YO
6506 if ( exact ) {
6507 switch (OP(scan)) {
6508 case EXACT:
6509 case EXACTF:
6510 case EXACTFL:
6511 if( exact == PSEUDO )
6512 exact= OP(scan);
07be1b83
YO
6513 else if ( exact != OP(scan) )
6514 exact= 0;
3dab1dad
YO
6515 case NOTHING:
6516 break;
6517 default:
6518 exact= 0;
6519 }
6520 }
6521 DEBUG_PARSE_r({
6522 SV * const mysv=sv_newmortal();
6523 DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
6524 regprop(RExC_rx, mysv, scan);
6525 PerlIO_printf(Perl_debug_log, "~ %s (%s) (%d)\n",
6526 SvPV_nolen_const(mysv),
6527 reg_name[exact],
6528 REG_NODE_NUM(scan));
6529 });
a0d0e21e
LW
6530 if (temp == NULL)
6531 break;
6532 scan = temp;
6533 }
07be1b83
YO
6534 DEBUG_PARSE_r({
6535 SV * const mysv_val=sv_newmortal();
6536 DEBUG_PARSE_MSG("");
6537 regprop(RExC_rx, mysv_val, val);
6538 PerlIO_printf(Perl_debug_log, "~ attach to %s (%d) offset to %d\n",
6539 SvPV_nolen_const(mysv_val),
6540 REG_NODE_NUM(val),
6541 val - scan
6542 );
6543 });
c277df42
IZ
6544 if (reg_off_by_arg[OP(scan)]) {
6545 ARG_SET(scan, val - scan);
a0ed51b3
LW
6546 }
6547 else {
c277df42
IZ
6548 NEXT_OFF(scan) = val - scan;
6549 }
3dab1dad
YO
6550
6551 return exact;
a687059c 6552}
07be1b83 6553#endif
a687059c
LW
6554
6555/*
a687059c
LW
6556 - regcurly - a little FSA that accepts {\d+,?\d*}
6557 */
79072805 6558STATIC I32
5f66b61c 6559S_regcurly(register const char *s)
a687059c
LW
6560{
6561 if (*s++ != '{')
6562 return FALSE;
f0fcb552 6563 if (!isDIGIT(*s))
a687059c 6564 return FALSE;
f0fcb552 6565 while (isDIGIT(*s))
a687059c
LW
6566 s++;
6567 if (*s == ',')
6568 s++;
f0fcb552 6569 while (isDIGIT(*s))
a687059c
LW
6570 s++;
6571 if (*s != '}')
6572 return FALSE;
6573 return TRUE;
6574}
6575
a687059c
LW
6576
6577/*
fd181c75 6578 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
a687059c
LW
6579 */
6580void
097eb12c 6581Perl_regdump(pTHX_ const regexp *r)
a687059c 6582{
35ff7856 6583#ifdef DEBUGGING
97aff369 6584 dVAR;
c445ea15 6585 SV * const sv = sv_newmortal();
ab3bbdeb 6586 SV *dsv= sv_newmortal();
a687059c 6587
786e8c11 6588 (void)dumpuntil(r, r->program, r->program + 1, NULL, NULL, sv, 0, 0);
a0d0e21e
LW
6589
6590 /* Header fields of interest. */
ab3bbdeb
YO
6591 if (r->anchored_substr) {
6592 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
6593 RE_SV_DUMPLEN(r->anchored_substr), 30);
7b0972df 6594 PerlIO_printf(Perl_debug_log,
ab3bbdeb
YO
6595 "anchored %s%s at %"IVdf" ",
6596 s, RE_SV_TAIL(r->anchored_substr),
7b0972df 6597 (IV)r->anchored_offset);
ab3bbdeb
YO
6598 } else if (r->anchored_utf8) {
6599 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
6600 RE_SV_DUMPLEN(r->anchored_utf8), 30);
33b8afdf 6601 PerlIO_printf(Perl_debug_log,
ab3bbdeb
YO
6602 "anchored utf8 %s%s at %"IVdf" ",
6603 s, RE_SV_TAIL(r->anchored_utf8),
33b8afdf 6604 (IV)r->anchored_offset);
ab3bbdeb
YO
6605 }
6606 if (r->float_substr) {
6607 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
6608 RE_SV_DUMPLEN(r->float_substr), 30);
7b0972df 6609 PerlIO_printf(Perl_debug_log,
ab3bbdeb
YO
6610 "floating %s%s at %"IVdf"..%"UVuf" ",
6611 s, RE_SV_TAIL(r->float_substr),
7b0972df 6612 (IV)r->float_min_offset, (UV)r->float_max_offset);
ab3bbdeb
YO
6613 } else if (r->float_utf8) {
6614 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
6615 RE_SV_DUMPLEN(r->float_utf8), 30);
33b8afdf 6616 PerlIO_printf(Perl_debug_log,
ab3bbdeb
YO
6617 "floating utf8 %s%s at %"IVdf"..%"UVuf" ",
6618 s, RE_SV_TAIL(r->float_utf8),
33b8afdf 6619 (IV)r->float_min_offset, (UV)r->float_max_offset);
ab3bbdeb 6620 }
33b8afdf 6621 if (r->check_substr || r->check_utf8)
b81d288d 6622 PerlIO_printf(Perl_debug_log,
10edeb5d
JH
6623 (const char *)
6624 (r->check_substr == r->float_substr
6625 && r->check_utf8 == r->float_utf8
6626 ? "(checking floating" : "(checking anchored"));
c277df42
IZ
6627 if (r->reganch & ROPT_NOSCAN)
6628 PerlIO_printf(Perl_debug_log, " noscan");
6629 if (r->reganch & ROPT_CHECK_ALL)
6630 PerlIO_printf(Perl_debug_log, " isall");
33b8afdf 6631 if (r->check_substr || r->check_utf8)
c277df42
IZ
6632 PerlIO_printf(Perl_debug_log, ") ");
6633
46fc3d4c 6634 if (r->regstclass) {
32fc9b6a 6635 regprop(r, sv, r->regstclass);
3f7c398e 6636 PerlIO_printf(Perl_debug_log, "stclass \"%s\" ", SvPVX_const(sv));
46fc3d4c 6637 }
774d564b 6638 if (r->reganch & ROPT_ANCH) {
6639 PerlIO_printf(Perl_debug_log, "anchored");
6640 if (r->reganch & ROPT_ANCH_BOL)
6641 PerlIO_printf(Perl_debug_log, "(BOL)");
c277df42
IZ
6642 if (r->reganch & ROPT_ANCH_MBOL)
6643 PerlIO_printf(Perl_debug_log, "(MBOL)");
cad2e5aa
JH
6644 if (r->reganch & ROPT_ANCH_SBOL)
6645 PerlIO_printf(Perl_debug_log, "(SBOL)");
774d564b 6646 if (r->reganch & ROPT_ANCH_GPOS)
6647 PerlIO_printf(Perl_debug_log, "(GPOS)");
6648 PerlIO_putc(Perl_debug_log, ' ');
6649 }
c277df42
IZ
6650 if (r->reganch & ROPT_GPOS_SEEN)
6651 PerlIO_printf(Perl_debug_log, "GPOS ");
a0d0e21e 6652 if (r->reganch & ROPT_SKIP)
760ac839 6653 PerlIO_printf(Perl_debug_log, "plus ");
a0d0e21e 6654 if (r->reganch & ROPT_IMPLICIT)
760ac839 6655 PerlIO_printf(Perl_debug_log, "implicit ");
760ac839 6656 PerlIO_printf(Perl_debug_log, "minlen %ld ", (long) r->minlen);
ce862d02
IZ
6657 if (r->reganch & ROPT_EVAL_SEEN)
6658 PerlIO_printf(Perl_debug_log, "with eval ");
760ac839 6659 PerlIO_printf(Perl_debug_log, "\n");
65e66c80 6660#else
96a5add6 6661 PERL_UNUSED_CONTEXT;
65e66c80 6662 PERL_UNUSED_ARG(r);
17c3b450 6663#endif /* DEBUGGING */
a687059c
LW
6664}
6665
6666/*
a0d0e21e
LW
6667- regprop - printable representation of opcode
6668*/
46fc3d4c 6669void
32fc9b6a 6670Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
a687059c 6671{
35ff7856 6672#ifdef DEBUGGING
97aff369 6673 dVAR;
9b155405 6674 register int k;
a0d0e21e 6675
54dc92de 6676 sv_setpvn(sv, "", 0);
9b155405 6677 if (OP(o) >= reg_num) /* regnode.type is unsigned */
830247a4
IZ
6678 /* It would be nice to FAIL() here, but this may be called from
6679 regexec.c, and it would be hard to supply pRExC_state. */
6680 Perl_croak(aTHX_ "Corrupted regexp opcode");
bfed75c6 6681 sv_catpv(sv, reg_name[OP(o)]); /* Take off const! */
9b155405 6682
3dab1dad 6683 k = PL_regkind[OP(o)];
9b155405 6684
2a782b5b 6685 if (k == EXACT) {
396482e1 6686 SV * const dsv = sv_2mortal(newSVpvs(""));
ab3bbdeb
YO
6687 /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
6688 * is a crude hack but it may be the best for now since
6689 * we have no flag "this EXACTish node was UTF-8"
6690 * --jhi */
6691 const char * const s =
ddc5bc0f 6692 pv_pretty(dsv, STRING(o), STR_LEN(o), 60,
ab3bbdeb
YO
6693 PL_colors[0], PL_colors[1],
6694 PERL_PV_ESCAPE_UNI_DETECT |
6695 PERL_PV_PRETTY_ELIPSES |
6696 PERL_PV_PRETTY_LTGT
6697 );
6698 Perl_sv_catpvf(aTHX_ sv, " %s", s );
bb263b4e 6699 } else if (k == TRIE) {
3dab1dad
YO
6700 Perl_sv_catpvf(aTHX_ sv, "-%s",reg_name[o->flags]);
6701 /* print the details of the trie in dumpuntil instead, as
4f639d21 6702 * prog->data isn't available here */
a3621e74 6703 } else if (k == CURLY) {
cb434fcc 6704 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
cea2e8a9
GS
6705 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
6706 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
a0d0e21e 6707 }
2c2d71f5
JH
6708 else if (k == WHILEM && o->flags) /* Ordinal/of */
6709 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
9b155405 6710 else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP )
894356b3 6711 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
9b155405 6712 else if (k == LOGICAL)
04ebc1ab 6713 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
653099ff
GS
6714 else if (k == ANYOF) {
6715 int i, rangestart = -1;
2d03de9c 6716 const U8 flags = ANYOF_FLAGS(o);
0bd48802
AL
6717
6718 /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
6719 static const char * const anyofs[] = {
653099ff
GS
6720 "\\w",
6721 "\\W",
6722 "\\s",
6723 "\\S",
6724 "\\d",
6725 "\\D",
6726 "[:alnum:]",
6727 "[:^alnum:]",
6728 "[:alpha:]",
6729 "[:^alpha:]",
6730 "[:ascii:]",
6731 "[:^ascii:]",
6732 "[:ctrl:]",
6733 "[:^ctrl:]",
6734 "[:graph:]",
6735 "[:^graph:]",
6736 "[:lower:]",
6737 "[:^lower:]",
6738 "[:print:]",
6739 "[:^print:]",
6740 "[:punct:]",
6741 "[:^punct:]",
6742 "[:upper:]",
aaa51d5e 6743 "[:^upper:]",
653099ff 6744 "[:xdigit:]",
aaa51d5e
JF
6745 "[:^xdigit:]",
6746 "[:space:]",
6747 "[:^space:]",
6748 "[:blank:]",
6749 "[:^blank:]"
653099ff
GS
6750 };
6751
19860706 6752 if (flags & ANYOF_LOCALE)
396482e1 6753 sv_catpvs(sv, "{loc}");
19860706 6754 if (flags & ANYOF_FOLD)
396482e1 6755 sv_catpvs(sv, "{i}");
653099ff 6756 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
19860706 6757 if (flags & ANYOF_INVERT)
396482e1 6758 sv_catpvs(sv, "^");
ffc61ed2
JH
6759 for (i = 0; i <= 256; i++) {
6760 if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
6761 if (rangestart == -1)
6762 rangestart = i;
6763 } else if (rangestart != -1) {
6764 if (i <= rangestart + 3)
6765 for (; rangestart < i; rangestart++)
653099ff 6766 put_byte(sv, rangestart);
ffc61ed2
JH
6767 else {
6768 put_byte(sv, rangestart);
396482e1 6769 sv_catpvs(sv, "-");
ffc61ed2 6770 put_byte(sv, i - 1);
653099ff 6771 }
ffc61ed2 6772 rangestart = -1;
653099ff 6773 }
847a199f 6774 }
ffc61ed2
JH
6775
6776 if (o->flags & ANYOF_CLASS)
bb7a0f54 6777 for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++)
ffc61ed2
JH
6778 if (ANYOF_CLASS_TEST(o,i))
6779 sv_catpv(sv, anyofs[i]);
6780
6781 if (flags & ANYOF_UNICODE)
396482e1 6782 sv_catpvs(sv, "{unicode}");
1aa99e6b 6783 else if (flags & ANYOF_UNICODE_ALL)
396482e1 6784 sv_catpvs(sv, "{unicode_all}");
ffc61ed2
JH
6785
6786 {
6787 SV *lv;
32fc9b6a 6788 SV * const sw = regclass_swash(prog, o, FALSE, &lv, 0);
b81d288d 6789
ffc61ed2
JH
6790 if (lv) {
6791 if (sw) {
89ebb4a3 6792 U8 s[UTF8_MAXBYTES_CASE+1];
b81d288d 6793
ffc61ed2 6794 for (i = 0; i <= 256; i++) { /* just the first 256 */
1df70142 6795 uvchr_to_utf8(s, i);
ffc61ed2 6796
3568d838 6797 if (i < 256 && swash_fetch(sw, s, TRUE)) {
ffc61ed2
JH
6798 if (rangestart == -1)
6799 rangestart = i;
6800 } else if (rangestart != -1) {
ffc61ed2
JH
6801 if (i <= rangestart + 3)
6802 for (; rangestart < i; rangestart++) {
2d03de9c
AL
6803 const U8 * const e = uvchr_to_utf8(s,rangestart);
6804 U8 *p;
6805 for(p = s; p < e; p++)
ffc61ed2
JH
6806 put_byte(sv, *p);
6807 }
6808 else {
2d03de9c
AL
6809 const U8 *e = uvchr_to_utf8(s,rangestart);
6810 U8 *p;
6811 for (p = s; p < e; p++)
ffc61ed2 6812 put_byte(sv, *p);
396482e1 6813 sv_catpvs(sv, "-");
2d03de9c
AL
6814 e = uvchr_to_utf8(s, i-1);
6815 for (p = s; p < e; p++)
1df70142 6816 put_byte(sv, *p);
ffc61ed2
JH
6817 }
6818 rangestart = -1;
6819 }
19860706 6820 }
ffc61ed2 6821
396482e1 6822 sv_catpvs(sv, "..."); /* et cetera */
19860706 6823 }
fde631ed 6824
ffc61ed2 6825 {
2e0de35c 6826 char *s = savesvpv(lv);
c445ea15 6827 char * const origs = s;
b81d288d 6828
3dab1dad
YO
6829 while (*s && *s != '\n')
6830 s++;
b81d288d 6831
ffc61ed2 6832 if (*s == '\n') {
2d03de9c 6833 const char * const t = ++s;
ffc61ed2
JH
6834
6835 while (*s) {
6836 if (*s == '\n')
6837 *s = ' ';
6838 s++;
6839 }
6840 if (s[-1] == ' ')
6841 s[-1] = 0;
6842
6843 sv_catpv(sv, t);
fde631ed 6844 }
b81d288d 6845
ffc61ed2 6846 Safefree(origs);
fde631ed
JH
6847 }
6848 }
653099ff 6849 }
ffc61ed2 6850
653099ff
GS
6851 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
6852 }
9b155405 6853 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
07be1b83 6854 Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
65e66c80 6855#else
96a5add6 6856 PERL_UNUSED_CONTEXT;
65e66c80
SP
6857 PERL_UNUSED_ARG(sv);
6858 PERL_UNUSED_ARG(o);
f9049ba1 6859 PERL_UNUSED_ARG(prog);
17c3b450 6860#endif /* DEBUGGING */
35ff7856 6861}
a687059c 6862
cad2e5aa
JH
6863SV *
6864Perl_re_intuit_string(pTHX_ regexp *prog)
6865{ /* Assume that RE_INTUIT is set */
97aff369 6866 dVAR;
a3621e74 6867 GET_RE_DEBUG_FLAGS_DECL;
96a5add6
AL
6868 PERL_UNUSED_CONTEXT;
6869
a3621e74 6870 DEBUG_COMPILE_r(
cfd0369c 6871 {
2d03de9c 6872 const char * const s = SvPV_nolen_const(prog->check_substr
cfd0369c 6873 ? prog->check_substr : prog->check_utf8);
cad2e5aa
JH
6874
6875 if (!PL_colorset) reginitcolors();
6876 PerlIO_printf(Perl_debug_log,
a0288114 6877 "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
33b8afdf
JH
6878 PL_colors[4],
6879 prog->check_substr ? "" : "utf8 ",
6880 PL_colors[5],PL_colors[0],
cad2e5aa
JH
6881 s,
6882 PL_colors[1],
6883 (strlen(s) > 60 ? "..." : ""));
6884 } );
6885
33b8afdf 6886 return prog->check_substr ? prog->check_substr : prog->check_utf8;
cad2e5aa
JH
6887}
6888
2b69d0c2 6889void
864dbfa3 6890Perl_pregfree(pTHX_ struct regexp *r)
a687059c 6891{
27da23d5 6892 dVAR;
0df25f3d 6893
fc32ee4a 6894 GET_RE_DEBUG_FLAGS_DECL;
a3621e74 6895
7821416a
IZ
6896 if (!r || (--r->refcnt > 0))
6897 return;
ab3bbdeb 6898 DEBUG_COMPILE_r({
0df25f3d
YO
6899 if (!PL_colorset)
6900 reginitcolors();
ab3bbdeb
YO
6901 if (RX_DEBUG(r)){
6902 SV *dsv= sv_newmortal();
6903 RE_PV_QUOTED_DECL(s, (r->reganch & ROPT_UTF8),
6904 dsv, r->precomp, r->prelen, 60);
6905 PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
6906 PL_colors[4],PL_colors[5],s);
6907 }
9e55ce06 6908 });
cad2e5aa 6909
43c5f42d
NC
6910 /* gcov results gave these as non-null 100% of the time, so there's no
6911 optimisation in checking them before calling Safefree */
6912 Safefree(r->precomp);
6913 Safefree(r->offsets); /* 20010421 MJD */
ed252734 6914 RX_MATCH_COPY_FREE(r);
f8c7b90f 6915#ifdef PERL_OLD_COPY_ON_WRITE
ed252734
NC
6916 if (r->saved_copy)
6917 SvREFCNT_dec(r->saved_copy);
6918#endif
a193d654
GS
6919 if (r->substrs) {
6920 if (r->anchored_substr)
6921 SvREFCNT_dec(r->anchored_substr);
33b8afdf
JH
6922 if (r->anchored_utf8)
6923 SvREFCNT_dec(r->anchored_utf8);
a193d654
GS
6924 if (r->float_substr)
6925 SvREFCNT_dec(r->float_substr);
33b8afdf
JH
6926 if (r->float_utf8)
6927 SvREFCNT_dec(r->float_utf8);
2779dcf1 6928 Safefree(r->substrs);
a193d654 6929 }
c277df42
IZ
6930 if (r->data) {
6931 int n = r->data->count;
f3548bdc
DM
6932 PAD* new_comppad = NULL;
6933 PAD* old_comppad;
4026c95a 6934 PADOFFSET refcnt;
dfad63ad 6935
c277df42 6936 while (--n >= 0) {
261faec3 6937 /* If you add a ->what type here, update the comment in regcomp.h */
c277df42
IZ
6938 switch (r->data->what[n]) {
6939 case 's':
6940 SvREFCNT_dec((SV*)r->data->data[n]);
6941 break;
653099ff
GS
6942 case 'f':
6943 Safefree(r->data->data[n]);
6944 break;
dfad63ad
HS
6945 case 'p':
6946 new_comppad = (AV*)r->data->data[n];
6947 break;
c277df42 6948 case 'o':
dfad63ad 6949 if (new_comppad == NULL)
cea2e8a9 6950 Perl_croak(aTHX_ "panic: pregfree comppad");
f3548bdc
DM
6951 PAD_SAVE_LOCAL(old_comppad,
6952 /* Watch out for global destruction's random ordering. */
c445ea15 6953 (SvTYPE(new_comppad) == SVt_PVAV) ? new_comppad : NULL
f3548bdc 6954 );
b34c0dd4 6955 OP_REFCNT_LOCK;
4026c95a
SH
6956 refcnt = OpREFCNT_dec((OP_4tree*)r->data->data[n]);
6957 OP_REFCNT_UNLOCK;
6958 if (!refcnt)
9b978d73 6959 op_free((OP_4tree*)r->data->data[n]);
9b978d73 6960
f3548bdc 6961 PAD_RESTORE_LOCAL(old_comppad);
dfad63ad
HS
6962 SvREFCNT_dec((SV*)new_comppad);
6963 new_comppad = NULL;
c277df42
IZ
6964 break;
6965 case 'n':
9e55ce06 6966 break;
07be1b83 6967 case 'T':
be8e71aa
YO
6968 { /* Aho Corasick add-on structure for a trie node.
6969 Used in stclass optimization only */
07be1b83
YO
6970 U32 refcount;
6971 reg_ac_data *aho=(reg_ac_data*)r->data->data[n];
6972 OP_REFCNT_LOCK;
6973 refcount = --aho->refcount;
6974 OP_REFCNT_UNLOCK;
6975 if ( !refcount ) {
6976 Safefree(aho->states);
6977 Safefree(aho->fail);
6978 aho->trie=NULL; /* not necessary to free this as it is
6979 handled by the 't' case */
6980 Safefree(r->data->data[n]); /* do this last!!!! */
be8e71aa 6981 Safefree(r->regstclass);
07be1b83
YO
6982 }
6983 }
6984 break;
a3621e74 6985 case 't':
07be1b83 6986 {
be8e71aa 6987 /* trie structure. */
07be1b83
YO
6988 U32 refcount;
6989 reg_trie_data *trie=(reg_trie_data*)r->data->data[n];
6990 OP_REFCNT_LOCK;
6991 refcount = --trie->refcount;
6992 OP_REFCNT_UNLOCK;
6993 if ( !refcount ) {
6994 Safefree(trie->charmap);
6995 if (trie->widecharmap)
6996 SvREFCNT_dec((SV*)trie->widecharmap);
6997 Safefree(trie->states);
6998 Safefree(trie->trans);
6999 if (trie->bitmap)
7000 Safefree(trie->bitmap);
7001 if (trie->wordlen)
7002 Safefree(trie->wordlen);
786e8c11
YO
7003 if (trie->jump)
7004 Safefree(trie->jump);
7005 if (trie->nextword)
7006 Safefree(trie->nextword);
a3621e74 7007#ifdef DEBUGGING
be8e71aa
YO
7008 if (RX_DEBUG(r)) {
7009 if (trie->words)
7010 SvREFCNT_dec((SV*)trie->words);
7011 if (trie->revcharmap)
7012 SvREFCNT_dec((SV*)trie->revcharmap);
7013 }
a3621e74 7014#endif
07be1b83 7015 Safefree(r->data->data[n]); /* do this last!!!! */
a3621e74 7016 }
07be1b83
YO
7017 }
7018 break;
c277df42 7019 default:
830247a4 7020 Perl_croak(aTHX_ "panic: regfree data code '%c'", r->data->what[n]);
c277df42
IZ
7021 }
7022 }
7023 Safefree(r->data->what);
7024 Safefree(r->data);
a0d0e21e
LW
7025 }
7026 Safefree(r->startp);
7027 Safefree(r->endp);
7028 Safefree(r);
a687059c 7029}
c277df42 7030
76234dfb 7031#ifndef PERL_IN_XSUB_RE
c277df42
IZ
7032/*
7033 - regnext - dig the "next" pointer out of a node
c277df42
IZ
7034 */
7035regnode *
864dbfa3 7036Perl_regnext(pTHX_ register regnode *p)
c277df42 7037{
97aff369 7038 dVAR;
c277df42
IZ
7039 register I32 offset;
7040
3280af22 7041 if (p == &PL_regdummy)
c277df42
IZ
7042 return(NULL);
7043
7044 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
7045 if (offset == 0)
7046 return(NULL);
7047
c277df42 7048 return(p+offset);
c277df42 7049}
76234dfb 7050#endif
c277df42 7051
01f988be 7052STATIC void
cea2e8a9 7053S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
c277df42
IZ
7054{
7055 va_list args;
7056 STRLEN l1 = strlen(pat1);
7057 STRLEN l2 = strlen(pat2);
7058 char buf[512];
06bf62c7 7059 SV *msv;
73d840c0 7060 const char *message;
c277df42
IZ
7061
7062 if (l1 > 510)
7063 l1 = 510;
7064 if (l1 + l2 > 510)
7065 l2 = 510 - l1;
7066 Copy(pat1, buf, l1 , char);
7067 Copy(pat2, buf + l1, l2 , char);
3b818b81
GS
7068 buf[l1 + l2] = '\n';
7069 buf[l1 + l2 + 1] = '\0';
8736538c
AS
7070#ifdef I_STDARG
7071 /* ANSI variant takes additional second argument */
c277df42 7072 va_start(args, pat2);
8736538c
AS
7073#else
7074 va_start(args);
7075#endif
5a844595 7076 msv = vmess(buf, &args);
c277df42 7077 va_end(args);
cfd0369c 7078 message = SvPV_const(msv,l1);
c277df42
IZ
7079 if (l1 > 512)
7080 l1 = 512;
7081 Copy(message, buf, l1 , char);
197cf9b9 7082 buf[l1-1] = '\0'; /* Overwrite \n */
cea2e8a9 7083 Perl_croak(aTHX_ "%s", buf);
c277df42 7084}
a0ed51b3
LW
7085
7086/* XXX Here's a total kludge. But we need to re-enter for swash routines. */
7087
76234dfb 7088#ifndef PERL_IN_XSUB_RE
a0ed51b3 7089void
864dbfa3 7090Perl_save_re_context(pTHX)
b81d288d 7091{
97aff369 7092 dVAR;
1ade1aa1
NC
7093
7094 struct re_save_state *state;
7095
7096 SAVEVPTR(PL_curcop);
7097 SSGROW(SAVESTACK_ALLOC_FOR_RE_SAVE_STATE + 1);
7098
7099 state = (struct re_save_state *)(PL_savestack + PL_savestack_ix);
7100 PL_savestack_ix += SAVESTACK_ALLOC_FOR_RE_SAVE_STATE;
7101 SSPUSHINT(SAVEt_RE_STATE);
7102
46ab3289 7103 Copy(&PL_reg_state, state, 1, struct re_save_state);
1ade1aa1 7104
a0ed51b3 7105 PL_reg_start_tmp = 0;
a0ed51b3 7106 PL_reg_start_tmpl = 0;
c445ea15 7107 PL_reg_oldsaved = NULL;
a5db57d6 7108 PL_reg_oldsavedlen = 0;
a5db57d6 7109 PL_reg_maxiter = 0;
a5db57d6 7110 PL_reg_leftiter = 0;
c445ea15 7111 PL_reg_poscache = NULL;
a5db57d6 7112 PL_reg_poscache_size = 0;
1ade1aa1
NC
7113#ifdef PERL_OLD_COPY_ON_WRITE
7114 PL_nrs = NULL;
7115#endif
ada6e8a9 7116
c445ea15
AL
7117 /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
7118 if (PL_curpm) {
7119 const REGEXP * const rx = PM_GETRE(PL_curpm);
7120 if (rx) {
1df70142 7121 U32 i;
ada6e8a9 7122 for (i = 1; i <= rx->nparens; i++) {
1df70142 7123 char digits[TYPE_CHARS(long)];
d9fad198 7124 const STRLEN len = my_snprintf(digits, sizeof(digits), "%lu", (long)i);
49f27e4b
NC
7125 GV *const *const gvp
7126 = (GV**)hv_fetch(PL_defstash, digits, len, 0);
7127
b37c2d43
AL
7128 if (gvp) {
7129 GV * const gv = *gvp;
7130 if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
7131 save_scalar(gv);
49f27e4b 7132 }
ada6e8a9
AMS
7133 }
7134 }
7135 }
a0ed51b3 7136}
76234dfb 7137#endif
51371543 7138
51371543 7139static void
acfe0abc 7140clear_re(pTHX_ void *r)
51371543 7141{
97aff369 7142 dVAR;
51371543
GS
7143 ReREFCNT_dec((regexp *)r);
7144}
ffbc6a93 7145
a28509cc
AL
7146#ifdef DEBUGGING
7147
7148STATIC void
7149S_put_byte(pTHX_ SV *sv, int c)
7150{
7151 if (isCNTRL(c) || c == 255 || !isPRINT(c))
7152 Perl_sv_catpvf(aTHX_ sv, "\\%o", c);
7153 else if (c == '-' || c == ']' || c == '\\' || c == '^')
7154 Perl_sv_catpvf(aTHX_ sv, "\\%c", c);
7155 else
7156 Perl_sv_catpvf(aTHX_ sv, "%c", c);
7157}
7158
786e8c11 7159
3dab1dad
YO
7160#define CLEAR_OPTSTART \
7161 if (optstart) STMT_START { \
07be1b83 7162 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, " (%d nodes)\n", node - optstart)); \
3dab1dad
YO
7163 optstart=NULL; \
7164 } STMT_END
7165
786e8c11 7166#define DUMPUNTIL(b,e) CLEAR_OPTSTART; node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
3dab1dad 7167
b5a2f8d8
NC
7168STATIC const regnode *
7169S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
786e8c11
YO
7170 const regnode *last, const regnode *plast,
7171 SV* sv, I32 indent, U32 depth)
a28509cc 7172{
97aff369 7173 dVAR;
786e8c11 7174 register U8 op = PSEUDO; /* Arbitrary non-END op. */
b5a2f8d8 7175 register const regnode *next;
3dab1dad
YO
7176 const regnode *optstart= NULL;
7177 GET_RE_DEBUG_FLAGS_DECL;
a28509cc 7178
786e8c11
YO
7179#ifdef DEBUG_DUMPUNTIL
7180 PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
7181 last ? last-start : 0,plast ? plast-start : 0);
7182#endif
7183
7184 if (plast && plast < last)
7185 last= plast;
7186
7187 while (PL_regkind[op] != END && (!last || node < last)) {
a28509cc
AL
7188 /* While that wasn't END last time... */
7189
7190 NODE_ALIGN(node);
7191 op = OP(node);
7192 if (op == CLOSE)
786e8c11 7193 indent--;
b5a2f8d8 7194 next = regnext((regnode *)node);
07be1b83 7195
a28509cc 7196 /* Where, what. */
8e11feef 7197 if (OP(node) == OPTIMIZED) {
e68ec53f 7198 if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
8e11feef 7199 optstart = node;
3dab1dad 7200 else
8e11feef 7201 goto after_print;
3dab1dad
YO
7202 } else
7203 CLEAR_OPTSTART;
07be1b83 7204
32fc9b6a 7205 regprop(r, sv, node);
a28509cc 7206 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
786e8c11 7207 (int)(2*indent + 1), "", SvPVX_const(sv));
3dab1dad
YO
7208
7209 if (OP(node) != OPTIMIZED) {
8e11feef
RGS
7210 if (next == NULL) /* Next ptr. */
7211 PerlIO_printf(Perl_debug_log, "(0)");
786e8c11
YO
7212 else if (PL_regkind[(U8)op] == BRANCH && PL_regkind[OP(next)] != BRANCH )
7213 PerlIO_printf(Perl_debug_log, "(FAIL)");
8e11feef
RGS
7214 else
7215 PerlIO_printf(Perl_debug_log, "(%"IVdf")", (IV)(next - start));
786e8c11
YO
7216
7217 if (PL_regkind[(U8)op] != TRIE)
7218 (void)PerlIO_putc(Perl_debug_log, '\n');
3dab1dad
YO
7219 }
7220
a28509cc
AL
7221 after_print:
7222 if (PL_regkind[(U8)op] == BRANCHJ) {
be8e71aa
YO
7223 assert(next);
7224 {
7225 register const regnode *nnode = (OP(next) == LONGJMP
b5a2f8d8
NC
7226 ? regnext((regnode *)next)
7227 : next);
be8e71aa
YO
7228 if (last && nnode > last)
7229 nnode = last;
786e8c11 7230 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
be8e71aa 7231 }
a28509cc
AL
7232 }
7233 else if (PL_regkind[(U8)op] == BRANCH) {
be8e71aa 7234 assert(next);
786e8c11 7235 DUMPUNTIL(NEXTOPER(node), next);
a28509cc
AL
7236 }
7237 else if ( PL_regkind[(U8)op] == TRIE ) {
7238 const I32 n = ARG(node);
4f639d21 7239 const reg_trie_data * const trie = (reg_trie_data*)r->data->data[n];
786e8c11 7240 const regnode *nextbranch= NULL;
a28509cc 7241 I32 word_idx;
786e8c11
YO
7242
7243 DEBUG_TRIE_COMPILE_r(
7244 PerlIO_printf(Perl_debug_log,
7245 " S:%"UVuf"/%"IVdf" W:%d L:%d/%d C:%d/%d ",
7246 (UV)trie->startstate,
7247 (IV)trie->laststate-1,
7248 trie->wordcount,
7249 (int)trie->minlen,
7250 (int)trie->maxlen,
7251 (int)TRIE_CHARCOUNT(trie),
7252 trie->uniquecharcount
7253 );
7254 );
7255 if ( op==TRIEC || trie->bitmap ) {
3dab1dad
YO
7256 int i;
7257 int rangestart= -1;
786e8c11
YO
7258 U8* bitmap= op==TRIEC ? ANYOF_BITMAP(node) : TRIE_BITMAP(trie);
7259
3dab1dad
YO
7260 sv_setpvn(sv, "", 0);
7261 for (i = 0; i <= 256; i++) {
786e8c11 7262 if (i < 256 && BITMAP_TEST(bitmap,i)) {
3dab1dad
YO
7263 if (rangestart == -1)
7264 rangestart = i;
7265 } else if (rangestart != -1) {
7266 if (i <= rangestart + 3)
7267 for (; rangestart < i; rangestart++)
7268 put_byte(sv, rangestart);
7269 else {
7270 put_byte(sv, rangestart);
7271 sv_catpvs(sv, "-");
7272 put_byte(sv, i - 1);
7273 }
7274 rangestart = -1;
7275 }
7276 }
786e8c11 7277 PerlIO_printf(Perl_debug_log, "[%s]\n", SvPVX_const(sv));
3dab1dad 7278 } else
786e8c11 7279 PerlIO_printf(Perl_debug_log, "\n");
a28509cc 7280
786e8c11
YO
7281
7282
7283 for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
097eb12c 7284 SV ** const elem_ptr = av_fetch(trie->words,word_idx,0);
786e8c11
YO
7285
7286 PerlIO_printf(Perl_debug_log, "%*s%s ",
7287 (int)(2*(indent+3)), "",
7288 elem_ptr ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr), SvCUR(*elem_ptr), 60,
ab3bbdeb
YO
7289 PL_colors[0], PL_colors[1],
7290 (SvUTF8(*elem_ptr) ? PERL_PV_ESCAPE_UNI : 0) |
7291 PERL_PV_PRETTY_ELIPSES |
7292 PERL_PV_PRETTY_LTGT
786e8c11
YO
7293 )
7294 : "???"
7295 );
7296 if (trie->jump) {
7297 U16 dist= trie->jump[word_idx+1];
7298 PerlIO_printf(Perl_debug_log, "(%u)\n",(next - dist) - start);
7299 if (dist) {
7300 if (!nextbranch)
7301 nextbranch= next - trie->jump[0];
7302 DUMPUNTIL(next - dist, nextbranch);
7303 }
7304 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
7305 nextbranch= regnext((regnode *)nextbranch);
7306 } else {
7307 PerlIO_printf(Perl_debug_log, "\n");
a28509cc 7308 }
786e8c11
YO
7309 }
7310 if (last && next > last)
7311 node= last;
7312 else
7313 node= next;
a28509cc 7314 }
786e8c11
YO
7315 else if ( op == CURLY ) { /* "next" might be very big: optimizer */
7316 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
7317 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
a28509cc
AL
7318 }
7319 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
be8e71aa 7320 assert(next);
786e8c11 7321 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
a28509cc
AL
7322 }
7323 else if ( op == PLUS || op == STAR) {
786e8c11 7324 DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
a28509cc
AL
7325 }
7326 else if (op == ANYOF) {
7327 /* arglen 1 + class block */
7328 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_LARGE)
7329 ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
7330 node = NEXTOPER(node);
7331 }
7332 else if (PL_regkind[(U8)op] == EXACT) {
7333 /* Literal string, where present. */
7334 node += NODE_SZ_STR(node) - 1;
7335 node = NEXTOPER(node);
7336 }
7337 else {
7338 node = NEXTOPER(node);
7339 node += regarglen[(U8)op];
7340 }
7341 if (op == CURLYX || op == OPEN)
786e8c11 7342 indent++;
a28509cc 7343 else if (op == WHILEM)
786e8c11 7344 indent--;
a28509cc 7345 }
3dab1dad 7346 CLEAR_OPTSTART;
786e8c11
YO
7347#ifdef DEBUG_DUMPUNTIL
7348 PerlIO_printf(Perl_debug_log, "--- %d\n",indent);
7349#endif
7350 return last ? last : node;
a28509cc
AL
7351}
7352
7353#endif /* DEBUGGING */
7354
241d1a3b
NC
7355/*
7356 * Local variables:
7357 * c-indentation-style: bsd
7358 * c-basic-offset: 4
7359 * indent-tabs-mode: t
7360 * End:
7361 *
37442d52
RGS
7362 * ex: set ts=8 sts=4 sw=4 noet:
7363 */