This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Need to do some sort of die/warn to get the "global destruction"
[perl5.git] / regcomp.c
CommitLineData
a0d0e21e
LW
1/* regcomp.c
2 */
3
4/*
5 * "A fair jaw-cracker dwarf-language must be." --Samwise Gamgee
6 */
7
61296642
DM
8/* This file contains functions for compiling a regular expression. See
9 * also regexec.c which funnily enough, contains functions for executing
166f8a29 10 * a regular expression.
e4a054ea
DM
11 *
12 * This file is also copied at build time to ext/re/re_comp.c, where
13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14 * This causes the main functions to be compiled under new names and with
15 * debugging support added, which makes "use re 'debug'" work.
166f8a29
DM
16 */
17
a687059c
LW
18/* NOTE: this is derived from Henry Spencer's regexp code, and should not
19 * confused with the original package (see point 3 below). Thanks, Henry!
20 */
21
22/* Additional note: this code is very heavily munged from Henry's version
23 * in places. In some spots I've traded clarity for efficiency, so don't
24 * blame Henry for some of the lack of readability.
25 */
26
e50aee73
AD
27/* The names of the functions have been changed from regcomp and
28 * regexec to pregcomp and pregexec in order to avoid conflicts
29 * with the POSIX routines of the same names.
30*/
31
b9d5759e
AD
32#ifdef PERL_EXT_RE_BUILD
33/* need to replace pregcomp et al, so enable that */
34# ifndef PERL_IN_XSUB_RE
35# define PERL_IN_XSUB_RE
36# endif
37/* need access to debugger hooks */
cad2e5aa 38# if defined(PERL_EXT_RE_DEBUG) && !defined(DEBUGGING)
b9d5759e
AD
39# define DEBUGGING
40# endif
41#endif
42
43#ifdef PERL_IN_XSUB_RE
d06ea78c 44/* We *really* need to overwrite these symbols: */
56953603
IZ
45# define Perl_pregcomp my_regcomp
46# define Perl_regdump my_regdump
47# define Perl_regprop my_regprop
d06ea78c 48# define Perl_pregfree my_regfree
cad2e5aa
JH
49# define Perl_re_intuit_string my_re_intuit_string
50/* *These* symbols are masked to allow static link. */
d06ea78c 51# define Perl_regnext my_regnext
f0b8d043 52# define Perl_save_re_context my_save_re_context
b81d288d 53# define Perl_reginitcolors my_reginitcolors
c5be433b
GS
54
55# define PERL_NO_GET_CONTEXT
b81d288d 56#endif
56953603 57
a687059c 58/*
e50aee73 59 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
60 *
61 * Copyright (c) 1986 by University of Toronto.
62 * Written by Henry Spencer. Not derived from licensed software.
63 *
64 * Permission is granted to anyone to use this software for any
65 * purpose on any computer system, and to redistribute it freely,
66 * subject to the following restrictions:
67 *
68 * 1. The author is not responsible for the consequences of use of
69 * this software, no matter how awful, even if they arise
70 * from defects in it.
71 *
72 * 2. The origin of this software must not be misrepresented, either
73 * by explicit claim or by omission.
74 *
75 * 3. Altered versions must be plainly marked as such, and must not
76 * be misrepresented as being the original software.
77 *
78 *
79 **** Alterations to Henry's code are...
80 ****
4bb101f2 81 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
241d1a3b 82 **** 2000, 2001, 2002, 2003, 2004, 2005, by Larry Wall and others
a687059c 83 ****
9ef589d8
LW
84 **** You may distribute under the terms of either the GNU General Public
85 **** License or the Artistic License, as specified in the README file.
86
a687059c
LW
87 *
88 * Beware that some of this code is subtly aware of the way operator
89 * precedence is structured in regular expressions. Serious changes in
90 * regular-expression syntax might require a total rethink.
91 */
92#include "EXTERN.h"
864dbfa3 93#define PERL_IN_REGCOMP_C
a687059c 94#include "perl.h"
d06ea78c 95
acfe0abc 96#ifndef PERL_IN_XSUB_RE
d06ea78c
GS
97# include "INTERN.h"
98#endif
c277df42
IZ
99
100#define REG_COMP_C
a687059c
LW
101#include "regcomp.h"
102
d4cce5f1 103#ifdef op
11343788 104#undef op
d4cce5f1 105#endif /* op */
11343788 106
fe14fcc3 107#ifdef MSDOS
7e4e8c89 108# if defined(BUGGY_MSC6)
fe14fcc3 109 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
7e4e8c89 110# pragma optimize("a",off)
fe14fcc3 111 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
7e4e8c89
NC
112# pragma optimize("w",on )
113# endif /* BUGGY_MSC6 */
fe14fcc3
LW
114#endif /* MSDOS */
115
a687059c
LW
116#ifndef STATIC
117#define STATIC static
118#endif
119
830247a4 120typedef struct RExC_state_t {
e2509266 121 U32 flags; /* are we folding, multilining? */
830247a4
IZ
122 char *precomp; /* uncompiled string. */
123 regexp *rx;
fac92740 124 char *start; /* Start of input for compile */
830247a4
IZ
125 char *end; /* End of input for compile */
126 char *parse; /* Input-scan pointer. */
127 I32 whilem_seen; /* number of WHILEM in this expr */
fac92740 128 regnode *emit_start; /* Start of emitted-code area */
ffc61ed2 129 regnode *emit; /* Code-emit pointer; &regdummy = don't = compiling */
830247a4
IZ
130 I32 naughty; /* How bad is this pattern? */
131 I32 sawback; /* Did we see \1, ...? */
132 U32 seen;
133 I32 size; /* Code size. */
134 I32 npar; /* () count. */
135 I32 extralen;
136 I32 seen_zerolen;
137 I32 seen_evals;
1aa99e6b 138 I32 utf8;
830247a4
IZ
139#if ADD_TO_REGEXEC
140 char *starttry; /* -Dr: where regtry was called. */
141#define RExC_starttry (pRExC_state->starttry)
142#endif
143} RExC_state_t;
144
e2509266 145#define RExC_flags (pRExC_state->flags)
830247a4
IZ
146#define RExC_precomp (pRExC_state->precomp)
147#define RExC_rx (pRExC_state->rx)
fac92740 148#define RExC_start (pRExC_state->start)
830247a4
IZ
149#define RExC_end (pRExC_state->end)
150#define RExC_parse (pRExC_state->parse)
151#define RExC_whilem_seen (pRExC_state->whilem_seen)
fac92740 152#define RExC_offsets (pRExC_state->rx->offsets) /* I am not like the others */
830247a4 153#define RExC_emit (pRExC_state->emit)
fac92740 154#define RExC_emit_start (pRExC_state->emit_start)
830247a4
IZ
155#define RExC_naughty (pRExC_state->naughty)
156#define RExC_sawback (pRExC_state->sawback)
157#define RExC_seen (pRExC_state->seen)
158#define RExC_size (pRExC_state->size)
159#define RExC_npar (pRExC_state->npar)
160#define RExC_extralen (pRExC_state->extralen)
161#define RExC_seen_zerolen (pRExC_state->seen_zerolen)
162#define RExC_seen_evals (pRExC_state->seen_evals)
1aa99e6b 163#define RExC_utf8 (pRExC_state->utf8)
830247a4 164
a687059c
LW
165#define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
166#define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
167 ((*s) == '{' && regcurly(s)))
a687059c 168
35c8bce7
LW
169#ifdef SPSTART
170#undef SPSTART /* dratted cpp namespace... */
171#endif
a687059c
LW
172/*
173 * Flags to be passed up and down.
174 */
a687059c 175#define WORST 0 /* Worst case. */
821b33a5 176#define HASWIDTH 0x1 /* Known to match non-null strings. */
a0d0e21e
LW
177#define SIMPLE 0x2 /* Simple enough to be STAR/PLUS operand. */
178#define SPSTART 0x4 /* Starts with * or +. */
179#define TRYAGAIN 0x8 /* Weeded out a declaration. */
a687059c 180
2c2d71f5
JH
181/* Length of a variant. */
182
183typedef struct scan_data_t {
184 I32 len_min;
185 I32 len_delta;
186 I32 pos_min;
187 I32 pos_delta;
188 SV *last_found;
189 I32 last_end; /* min value, <0 unless valid. */
190 I32 last_start_min;
191 I32 last_start_max;
192 SV **longest; /* Either &l_fixed, or &l_float. */
193 SV *longest_fixed;
194 I32 offset_fixed;
195 SV *longest_float;
196 I32 offset_float_min;
197 I32 offset_float_max;
198 I32 flags;
199 I32 whilem_c;
cb434fcc 200 I32 *last_closep;
653099ff 201 struct regnode_charclass_class *start_class;
2c2d71f5
JH
202} scan_data_t;
203
a687059c 204/*
e50aee73 205 * Forward declarations for pregcomp()'s friends.
a687059c 206 */
a0d0e21e 207
27da23d5
JH
208static const scan_data_t zero_scan_data =
209 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
c277df42
IZ
210
211#define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
212#define SF_BEFORE_SEOL 0x1
213#define SF_BEFORE_MEOL 0x2
214#define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
215#define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
216
09b7f37c
CB
217#ifdef NO_UNARY_PLUS
218# define SF_FIX_SHIFT_EOL (0+2)
219# define SF_FL_SHIFT_EOL (0+4)
220#else
221# define SF_FIX_SHIFT_EOL (+2)
222# define SF_FL_SHIFT_EOL (+4)
223#endif
c277df42
IZ
224
225#define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
226#define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
227
228#define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
229#define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
230#define SF_IS_INF 0x40
231#define SF_HAS_PAR 0x80
232#define SF_IN_PAR 0x100
233#define SF_HAS_EVAL 0x200
4bfe0158 234#define SCF_DO_SUBSTR 0x400
653099ff
GS
235#define SCF_DO_STCLASS_AND 0x0800
236#define SCF_DO_STCLASS_OR 0x1000
237#define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
e1901655 238#define SCF_WHILEM_VISITED_POS 0x2000
c277df42 239
eb160463 240#define UTF (RExC_utf8 != 0)
e2509266
JH
241#define LOC ((RExC_flags & PMf_LOCALE) != 0)
242#define FOLD ((RExC_flags & PMf_FOLD) != 0)
a0ed51b3 243
ffc61ed2 244#define OOB_UNICODE 12345678
93733859 245#define OOB_NAMEDCLASS -1
b8c5462f 246
a0ed51b3
LW
247#define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
248#define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
249
8615cb43 250
b45f050a
JF
251/* length of regex to show in messages that don't mark a position within */
252#define RegexLengthToShowInErrorMessages 127
253
254/*
255 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
256 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
257 * op/pragma/warn/regcomp.
258 */
7253e4e3
RK
259#define MARKER1 "<-- HERE" /* marker as it appears in the description */
260#define MARKER2 " <-- HERE " /* marker as it appears within the regex */
b81d288d 261
7253e4e3 262#define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
b45f050a
JF
263
264/*
265 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
266 * arg. Show regex, up to a maximum length. If it's too long, chop and add
267 * "...".
268 */
ccb2c380 269#define FAIL(msg) STMT_START { \
bfed75c6 270 const char *ellipses = ""; \
ccb2c380
MP
271 IV len = RExC_end - RExC_precomp; \
272 \
273 if (!SIZE_ONLY) \
274 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
275 if (len > RegexLengthToShowInErrorMessages) { \
276 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
277 len = RegexLengthToShowInErrorMessages - 10; \
278 ellipses = "..."; \
279 } \
280 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
281 msg, (int)len, RExC_precomp, ellipses); \
282} STMT_END
8615cb43 283
b45f050a
JF
284/*
285 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
286 * args. Show regex, up to a maximum length. If it's too long, chop and add
287 * "...".
288 */
ccb2c380 289#define FAIL2(pat,msg) STMT_START { \
bfed75c6 290 const char *ellipses = ""; \
ccb2c380
MP
291 IV len = RExC_end - RExC_precomp; \
292 \
293 if (!SIZE_ONLY) \
294 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
295 if (len > RegexLengthToShowInErrorMessages) { \
296 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
297 len = RegexLengthToShowInErrorMessages - 10; \
298 ellipses = "..."; \
299 } \
300 S_re_croak2(aTHX_ pat, " in regex m/%.*s%s/", \
301 msg, (int)len, RExC_precomp, ellipses); \
302} STMT_END
b45f050a
JF
303
304
305/*
306 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
307 */
ccb2c380 308#define Simple_vFAIL(m) STMT_START { \
a28509cc 309 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
310 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
311 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
312} STMT_END
b45f050a
JF
313
314/*
315 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
316 */
ccb2c380
MP
317#define vFAIL(m) STMT_START { \
318 if (!SIZE_ONLY) \
319 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
320 Simple_vFAIL(m); \
321} STMT_END
b45f050a
JF
322
323/*
324 * Like Simple_vFAIL(), but accepts two arguments.
325 */
ccb2c380 326#define Simple_vFAIL2(m,a1) STMT_START { \
a28509cc 327 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
328 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
329 (int)offset, RExC_precomp, RExC_precomp + offset); \
330} STMT_END
b45f050a
JF
331
332/*
333 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
334 */
ccb2c380
MP
335#define vFAIL2(m,a1) STMT_START { \
336 if (!SIZE_ONLY) \
337 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
338 Simple_vFAIL2(m, a1); \
339} STMT_END
b45f050a
JF
340
341
342/*
343 * Like Simple_vFAIL(), but accepts three arguments.
344 */
ccb2c380 345#define Simple_vFAIL3(m, a1, a2) STMT_START { \
a28509cc 346 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
347 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
348 (int)offset, RExC_precomp, RExC_precomp + offset); \
349} STMT_END
b45f050a
JF
350
351/*
352 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
353 */
ccb2c380
MP
354#define vFAIL3(m,a1,a2) STMT_START { \
355 if (!SIZE_ONLY) \
356 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
357 Simple_vFAIL3(m, a1, a2); \
358} STMT_END
b45f050a
JF
359
360/*
361 * Like Simple_vFAIL(), but accepts four arguments.
362 */
ccb2c380 363#define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
a28509cc 364 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
365 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
366 (int)offset, RExC_precomp, RExC_precomp + offset); \
367} STMT_END
b45f050a 368
ccb2c380 369#define vWARN(loc,m) STMT_START { \
a28509cc 370 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
371 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \
372 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
373} STMT_END
374
375#define vWARNdep(loc,m) STMT_START { \
a28509cc 376 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
377 Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
378 "%s" REPORT_LOCATION, \
379 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
380} STMT_END
381
382
383#define vWARN2(loc, m, a1) STMT_START { \
a28509cc 384 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
385 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
386 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
387} STMT_END
388
389#define vWARN3(loc, m, a1, a2) STMT_START { \
a28509cc 390 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
391 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
392 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
393} STMT_END
394
395#define vWARN4(loc, m, a1, a2, a3) STMT_START { \
a28509cc 396 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
397 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
398 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
399} STMT_END
400
401#define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
a28509cc 402 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
403 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
404 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
405} STMT_END
9d1d55b5 406
8615cb43 407
cd439c50 408/* Allow for side effects in s */
ccb2c380
MP
409#define REGC(c,s) STMT_START { \
410 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
411} STMT_END
cd439c50 412
fac92740
MJD
413/* Macros for recording node offsets. 20001227 mjd@plover.com
414 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
415 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
416 * Element 0 holds the number n.
417 */
418
419#define MJD_OFFSET_DEBUG(x)
a3621e74 420/* #define MJD_OFFSET_DEBUG(x) DEBUG_r(Perl_warn_nocontext x) */
ccb2c380
MP
421
422
423#define Set_Node_Offset_To_R(node,byte) STMT_START { \
424 if (! SIZE_ONLY) { \
425 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
426 __LINE__, (node), (byte))); \
427 if((node) < 0) { \
428 Perl_croak(aTHX_ "value of node is %d in Offset macro", node); \
429 } else { \
430 RExC_offsets[2*(node)-1] = (byte); \
431 } \
432 } \
433} STMT_END
434
435#define Set_Node_Offset(node,byte) \
436 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
437#define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
438
439#define Set_Node_Length_To_R(node,len) STMT_START { \
440 if (! SIZE_ONLY) { \
441 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
442 __LINE__, (node), (len))); \
443 if((node) < 0) { \
444 Perl_croak(aTHX_ "value of node is %d in Length macro", node); \
445 } else { \
446 RExC_offsets[2*(node)] = (len); \
447 } \
448 } \
449} STMT_END
450
451#define Set_Node_Length(node,len) \
452 Set_Node_Length_To_R((node)-RExC_emit_start, len)
453#define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
454#define Set_Node_Cur_Length(node) \
455 Set_Node_Length(node, RExC_parse - parse_start)
fac92740
MJD
456
457/* Get offsets and lengths */
458#define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
459#define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
460
acfe0abc 461static void clear_re(pTHX_ void *r);
4327152a 462
653099ff
GS
463/* Mark that we cannot extend a found fixed substring at this point.
464 Updata the longest found anchored substring and the longest found
465 floating substrings if needed. */
466
4327152a 467STATIC void
830247a4 468S_scan_commit(pTHX_ RExC_state_t *pRExC_state, scan_data_t *data)
c277df42 469{
e1ec3a88
AL
470 const STRLEN l = CHR_SVLEN(data->last_found);
471 const STRLEN old_l = CHR_SVLEN(*data->longest);
b81d288d 472
c277df42 473 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
6b43b216 474 SvSetMagicSV(*data->longest, data->last_found);
c277df42
IZ
475 if (*data->longest == data->longest_fixed) {
476 data->offset_fixed = l ? data->last_start_min : data->pos_min;
477 if (data->flags & SF_BEFORE_EOL)
b81d288d 478 data->flags
c277df42
IZ
479 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
480 else
481 data->flags &= ~SF_FIX_BEFORE_EOL;
a0ed51b3
LW
482 }
483 else {
c277df42 484 data->offset_float_min = l ? data->last_start_min : data->pos_min;
b81d288d
AB
485 data->offset_float_max = (l
486 ? data->last_start_max
c277df42 487 : data->pos_min + data->pos_delta);
9051bda5
HS
488 if ((U32)data->offset_float_max > (U32)I32_MAX)
489 data->offset_float_max = I32_MAX;
c277df42 490 if (data->flags & SF_BEFORE_EOL)
b81d288d 491 data->flags
c277df42
IZ
492 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
493 else
494 data->flags &= ~SF_FL_BEFORE_EOL;
495 }
496 }
497 SvCUR_set(data->last_found, 0);
0eda9292 498 {
a28509cc
AL
499 SV * const sv = data->last_found;
500 MAGIC * const mg =
0eda9292
JH
501 SvUTF8(sv) && SvMAGICAL(sv) ? mg_find(sv, PERL_MAGIC_utf8) : NULL;
502 if (mg && mg->mg_len > 0)
503 mg->mg_len = 0;
504 }
c277df42
IZ
505 data->last_end = -1;
506 data->flags &= ~SF_BEFORE_EOL;
507}
508
653099ff
GS
509/* Can match anything (initialization) */
510STATIC void
830247a4 511S_cl_anything(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 512{
653099ff 513 ANYOF_CLASS_ZERO(cl);
f8bef550 514 ANYOF_BITMAP_SETALL(cl);
1aa99e6b 515 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
653099ff
GS
516 if (LOC)
517 cl->flags |= ANYOF_LOCALE;
518}
519
520/* Can match anything (initialization) */
521STATIC int
a28509cc 522S_cl_is_anything(pTHX_ const struct regnode_charclass_class *cl)
653099ff
GS
523{
524 int value;
525
aaa51d5e 526 for (value = 0; value <= ANYOF_MAX; value += 2)
653099ff
GS
527 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
528 return 1;
1aa99e6b
IH
529 if (!(cl->flags & ANYOF_UNICODE_ALL))
530 return 0;
f8bef550
NC
531 if (!ANYOF_BITMAP_TESTALLSET(cl))
532 return 0;
653099ff
GS
533 return 1;
534}
535
536/* Can match anything (initialization) */
537STATIC void
830247a4 538S_cl_init(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 539{
8ecf7187 540 Zero(cl, 1, struct regnode_charclass_class);
653099ff 541 cl->type = ANYOF;
830247a4 542 cl_anything(pRExC_state, cl);
653099ff
GS
543}
544
545STATIC void
830247a4 546S_cl_init_zero(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 547{
8ecf7187 548 Zero(cl, 1, struct regnode_charclass_class);
653099ff 549 cl->type = ANYOF;
830247a4 550 cl_anything(pRExC_state, cl);
653099ff
GS
551 if (LOC)
552 cl->flags |= ANYOF_LOCALE;
553}
554
555/* 'And' a given class with another one. Can create false positives */
556/* We assume that cl is not inverted */
557STATIC void
558S_cl_and(pTHX_ struct regnode_charclass_class *cl,
a28509cc 559 const struct regnode_charclass_class *and_with)
653099ff 560{
653099ff
GS
561 if (!(and_with->flags & ANYOF_CLASS)
562 && !(cl->flags & ANYOF_CLASS)
563 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
564 && !(and_with->flags & ANYOF_FOLD)
565 && !(cl->flags & ANYOF_FOLD)) {
566 int i;
567
568 if (and_with->flags & ANYOF_INVERT)
569 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
570 cl->bitmap[i] &= ~and_with->bitmap[i];
571 else
572 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
573 cl->bitmap[i] &= and_with->bitmap[i];
574 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
575 if (!(and_with->flags & ANYOF_EOS))
576 cl->flags &= ~ANYOF_EOS;
1aa99e6b 577
14ebb1a2
JH
578 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE &&
579 !(and_with->flags & ANYOF_INVERT)) {
1aa99e6b
IH
580 cl->flags &= ~ANYOF_UNICODE_ALL;
581 cl->flags |= ANYOF_UNICODE;
582 ARG_SET(cl, ARG(and_with));
583 }
14ebb1a2
JH
584 if (!(and_with->flags & ANYOF_UNICODE_ALL) &&
585 !(and_with->flags & ANYOF_INVERT))
1aa99e6b 586 cl->flags &= ~ANYOF_UNICODE_ALL;
14ebb1a2
JH
587 if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)) &&
588 !(and_with->flags & ANYOF_INVERT))
1aa99e6b 589 cl->flags &= ~ANYOF_UNICODE;
653099ff
GS
590}
591
592/* 'OR' a given class with another one. Can create false positives */
593/* We assume that cl is not inverted */
594STATIC void
a28509cc 595S_cl_or(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
653099ff 596{
653099ff
GS
597 if (or_with->flags & ANYOF_INVERT) {
598 /* We do not use
599 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
600 * <= (B1 | !B2) | (CL1 | !CL2)
601 * which is wasteful if CL2 is small, but we ignore CL2:
602 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
603 * XXXX Can we handle case-fold? Unclear:
604 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
605 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
606 */
607 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
608 && !(or_with->flags & ANYOF_FOLD)
609 && !(cl->flags & ANYOF_FOLD) ) {
610 int i;
611
612 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
613 cl->bitmap[i] |= ~or_with->bitmap[i];
614 } /* XXXX: logic is complicated otherwise */
615 else {
830247a4 616 cl_anything(pRExC_state, cl);
653099ff
GS
617 }
618 } else {
619 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
620 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
b81d288d 621 && (!(or_with->flags & ANYOF_FOLD)
653099ff
GS
622 || (cl->flags & ANYOF_FOLD)) ) {
623 int i;
624
625 /* OR char bitmap and class bitmap separately */
626 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
627 cl->bitmap[i] |= or_with->bitmap[i];
628 if (or_with->flags & ANYOF_CLASS) {
629 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
630 cl->classflags[i] |= or_with->classflags[i];
631 cl->flags |= ANYOF_CLASS;
632 }
633 }
634 else { /* XXXX: logic is complicated, leave it along for a moment. */
830247a4 635 cl_anything(pRExC_state, cl);
653099ff
GS
636 }
637 }
638 if (or_with->flags & ANYOF_EOS)
639 cl->flags |= ANYOF_EOS;
1aa99e6b
IH
640
641 if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
642 ARG(cl) != ARG(or_with)) {
643 cl->flags |= ANYOF_UNICODE_ALL;
644 cl->flags &= ~ANYOF_UNICODE;
645 }
646 if (or_with->flags & ANYOF_UNICODE_ALL) {
647 cl->flags |= ANYOF_UNICODE_ALL;
648 cl->flags &= ~ANYOF_UNICODE;
649 }
653099ff
GS
650}
651
5d1c421c 652/*
a3621e74
YO
653
654 make_trie(startbranch,first,last,tail,flags)
655 startbranch: the first branch in the whole branch sequence
656 first : start branch of sequence of branch-exact nodes.
657 May be the same as startbranch
658 last : Thing following the last branch.
659 May be the same as tail.
660 tail : item following the branch sequence
661 flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/
662
663Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
664
665A trie is an N'ary tree where the branches are determined by digital
666decomposition of the key. IE, at the root node you look up the 1st character and
667follow that branch repeat until you find the end of the branches. Nodes can be
668marked as "accepting" meaning they represent a complete word. Eg:
669
670 /he|she|his|hers/
671
672would convert into the following structure. Numbers represent states, letters
673following numbers represent valid transitions on the letter from that state, if
674the number is in square brackets it represents an accepting state, otherwise it
675will be in parenthesis.
676
677 +-h->+-e->[3]-+-r->(8)-+-s->[9]
678 | |
679 | (2)
680 | |
681 (1) +-i->(6)-+-s->[7]
682 |
683 +-s->(3)-+-h->(4)-+-e->[5]
684
685 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
686
687This shows that when matching against the string 'hers' we will begin at state 1
688read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
689then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
690is also accepting. Thus we know that we can match both 'he' and 'hers' with a
691single traverse. We store a mapping from accepting to state to which word was
692matched, and then when we have multiple possibilities we try to complete the
693rest of the regex in the order in which they occured in the alternation.
694
695The only prior NFA like behaviour that would be changed by the TRIE support is
696the silent ignoring of duplicate alternations which are of the form:
697
698 / (DUPE|DUPE) X? (?{ ... }) Y /x
699
700Thus EVAL blocks follwing a trie may be called a different number of times with
701and without the optimisation. With the optimisations dupes will be silently
702ignored. This inconsistant behaviour of EVAL type nodes is well established as
703the following demonstrates:
704
705 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
706
707which prints out 'word' three times, but
708
709 'words'=~/(word|word|word)(?{ print $1 })S/
710
711which doesnt print it out at all. This is due to other optimisations kicking in.
712
713Example of what happens on a structural level:
714
715The regexp /(ac|ad|ab)+/ will produce the folowing debug output:
716
717 1: CURLYM[1] {1,32767}(18)
718 5: BRANCH(8)
719 6: EXACT <ac>(16)
720 8: BRANCH(11)
721 9: EXACT <ad>(16)
722 11: BRANCH(14)
723 12: EXACT <ab>(16)
724 16: SUCCEED(0)
725 17: NOTHING(18)
726 18: END(0)
727
728This would be optimizable with startbranch=5, first=5, last=16, tail=16
729and should turn into:
730
731 1: CURLYM[1] {1,32767}(18)
732 5: TRIE(16)
733 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
734 <ac>
735 <ad>
736 <ab>
737 16: SUCCEED(0)
738 17: NOTHING(18)
739 18: END(0)
740
741Cases where tail != last would be like /(?foo|bar)baz/:
742
743 1: BRANCH(4)
744 2: EXACT <foo>(8)
745 4: BRANCH(7)
746 5: EXACT <bar>(8)
747 7: TAIL(8)
748 8: EXACT <baz>(10)
749 10: END(0)
750
751which would be optimizable with startbranch=1, first=1, last=7, tail=8
752and would end up looking like:
753
754 1: TRIE(8)
755 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
756 <foo>
757 <bar>
758 7: TAIL(8)
759 8: EXACT <baz>(10)
760 10: END(0)
761
762*/
763
764#define TRIE_DEBUG_CHAR \
765 DEBUG_TRIE_COMPILE_r({ \
766 SV *tmp; \
767 if ( UTF ) { \
768 tmp = newSVpv( "", 0 ); \
769 pv_uni_display( tmp, uc, len, 60, UNI_DISPLAY_REGEX ); \
770 } else { \
e4584336 771 tmp = Perl_newSVpvf_nocontext( "%c", (int)uvc ); \
a3621e74
YO
772 } \
773 av_push( trie->revcharmap, tmp ); \
774 })
775
776#define TRIE_READ_CHAR STMT_START { \
777 if ( UTF ) { \
778 if ( folder ) { \
779 if ( foldlen > 0 ) { \
780 uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags ); \
781 foldlen -= len; \
782 scan += len; \
783 len = 0; \
784 } else { \
e1ec3a88 785 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
a3621e74
YO
786 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
787 foldlen -= UNISKIP( uvc ); \
788 scan = foldbuf + UNISKIP( uvc ); \
789 } \
790 } else { \
e1ec3a88 791 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
a3621e74
YO
792 } \
793 } else { \
794 uvc = (U32)*uc; \
795 len = 1; \
796 } \
797} STMT_END
798
799
800#define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
801#define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
802#define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
803#define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
804
805#define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
806 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
807 TRIE_LIST_LEN( state ) *= 2; \
808 Renew( trie->states[ state ].trans.list, \
809 TRIE_LIST_LEN( state ), reg_trie_trans_le ); \
810 } \
811 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
812 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
813 TRIE_LIST_CUR( state )++; \
814} STMT_END
815
816#define TRIE_LIST_NEW(state) STMT_START { \
817 Newz( 1023, trie->states[ state ].trans.list, \
818 4, reg_trie_trans_le ); \
819 TRIE_LIST_CUR( state ) = 1; \
820 TRIE_LIST_LEN( state ) = 4; \
821} STMT_END
822
823STATIC I32
824S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 flags)
825{
27da23d5 826 dVAR;
a3621e74
YO
827 /* first pass, loop through and scan words */
828 reg_trie_data *trie;
829 regnode *cur;
e1ec3a88 830 const U32 uniflags = ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY;
a3621e74
YO
831 STRLEN len = 0;
832 UV uvc = 0;
833 U16 curword = 0;
834 U32 next_alloc = 0;
835 /* we just use folder as a flag in utf8 */
e1ec3a88 836 const U8 * const folder = ( flags == EXACTF
a3621e74
YO
837 ? PL_fold
838 : ( flags == EXACTFL
839 ? PL_fold_locale
840 : NULL
841 )
842 );
843
e1ec3a88 844 const U32 data_slot = add_data( pRExC_state, 1, "t" );
a3621e74
YO
845 SV *re_trie_maxbuff;
846
847 GET_RE_DEBUG_FLAGS_DECL;
848
849 Newz( 848200, trie, 1, reg_trie_data );
850 trie->refcount = 1;
851 RExC_rx->data->data[ data_slot ] = (void*)trie;
852 Newz( 848201, trie->charmap, 256, U16 );
853 DEBUG_r({
854 trie->words = newAV();
855 trie->revcharmap = newAV();
856 });
857
858
0111c4fd 859 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
a3621e74 860 if (!SvIOK(re_trie_maxbuff)) {
0111c4fd 861 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
a3621e74
YO
862 }
863
864 /* -- First loop and Setup --
865
866 We first traverse the branches and scan each word to determine if it
867 contains widechars, and how many unique chars there are, this is
868 important as we have to build a table with at least as many columns as we
869 have unique chars.
870
871 We use an array of integers to represent the character codes 0..255
872 (trie->charmap) and we use a an HV* to store unicode characters. We use the
873 native representation of the character value as the key and IV's for the
874 coded index.
875
876 *TODO* If we keep track of how many times each character is used we can
877 remap the columns so that the table compression later on is more
878 efficient in terms of memory by ensuring most common value is in the
879 middle and the least common are on the outside. IMO this would be better
880 than a most to least common mapping as theres a decent chance the most
881 common letter will share a node with the least common, meaning the node
882 will not be compressable. With a middle is most common approach the worst
883 case is when we have the least common nodes twice.
884
885 */
886
887
888 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
889 regnode *noper = NEXTOPER( cur );
e1ec3a88 890 const U8 *uc = (U8*)STRING( noper );
a28509cc 891 const U8 * const e = uc + STR_LEN( noper );
a3621e74
YO
892 STRLEN foldlen = 0;
893 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2af232bd 894 const U8 *scan = (U8*)NULL;
a3621e74
YO
895
896 for ( ; uc < e ; uc += len ) {
897 trie->charcount++;
898 TRIE_READ_CHAR;
899 if ( uvc < 256 ) {
900 if ( !trie->charmap[ uvc ] ) {
901 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
902 if ( folder )
903 trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
904 TRIE_DEBUG_CHAR;
905 }
906 } else {
907 SV** svpp;
908 if ( !trie->widecharmap )
909 trie->widecharmap = newHV();
910
911 svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 1 );
912
913 if ( !svpp )
e4584336 914 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
a3621e74
YO
915
916 if ( !SvTRUE( *svpp ) ) {
917 sv_setiv( *svpp, ++trie->uniquecharcount );
918 TRIE_DEBUG_CHAR;
919 }
920 }
921 }
922 trie->wordcount++;
923 } /* end first pass */
924 DEBUG_TRIE_COMPILE_r(
925 PerlIO_printf( Perl_debug_log, "TRIE(%s): W:%d C:%d Uq:%d \n",
926 ( trie->widecharmap ? "UTF8" : "NATIVE" ), trie->wordcount,
5d7488b2 927 (int)trie->charcount, trie->uniquecharcount )
a3621e74
YO
928 );
929
930
931 /*
932 We now know what we are dealing with in terms of unique chars and
933 string sizes so we can calculate how much memory a naive
0111c4fd
RGS
934 representation using a flat table will take. If it's over a reasonable
935 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
a3621e74
YO
936 conservative but potentially much slower representation using an array
937 of lists.
938
939 At the end we convert both representations into the same compressed
940 form that will be used in regexec.c for matching with. The latter
941 is a form that cannot be used to construct with but has memory
942 properties similar to the list form and access properties similar
943 to the table form making it both suitable for fast searches and
944 small enough that its feasable to store for the duration of a program.
945
946 See the comment in the code where the compressed table is produced
947 inplace from the flat tabe representation for an explanation of how
948 the compression works.
949
950 */
951
952
953 if ( (IV)( ( trie->charcount + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
954 /*
955 Second Pass -- Array Of Lists Representation
956
957 Each state will be represented by a list of charid:state records
958 (reg_trie_trans_le) the first such element holds the CUR and LEN
959 points of the allocated array. (See defines above).
960
961 We build the initial structure using the lists, and then convert
962 it into the compressed table form which allows faster lookups
963 (but cant be modified once converted).
964
965
966 */
967
968
969 STRLEN transcount = 1;
970
971 Newz( 848204, trie->states, trie->charcount + 2, reg_trie_state );
972 TRIE_LIST_NEW(1);
973 next_alloc = 2;
974
975 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
976
977 regnode *noper = NEXTOPER( cur );
978 U8 *uc = (U8*)STRING( noper );
a28509cc 979 const U8 * const e = uc + STR_LEN( noper );
a3621e74
YO
980 U32 state = 1; /* required init */
981 U16 charid = 0; /* sanity init */
982 U8 *scan = (U8*)NULL; /* sanity init */
983 STRLEN foldlen = 0; /* required init */
984 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
985
986
987 for ( ; uc < e ; uc += len ) {
988
989 TRIE_READ_CHAR;
990
991 if ( uvc < 256 ) {
992 charid = trie->charmap[ uvc ];
993 } else {
994 SV** svpp=(SV**)NULL;
995 svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 0);
996 if ( !svpp ) {
997 charid = 0;
998 } else {
999 charid=(U16)SvIV( *svpp );
1000 }
1001 }
1002 if ( charid ) {
1003
1004 U16 check;
1005 U32 newstate = 0;
1006
1007 charid--;
1008 if ( !trie->states[ state ].trans.list ) {
1009 TRIE_LIST_NEW( state );
1010 }
1011 for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1012 if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1013 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1014 break;
1015 }
a28509cc
AL
1016 }
1017 if ( ! newstate ) {
1018 newstate = next_alloc++;
1019 TRIE_LIST_PUSH( state, charid, newstate );
1020 transcount++;
1021 }
1022 state = newstate;
a3621e74 1023 } else {
e4584336 1024 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
a3621e74
YO
1025 }
1026 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1027 }
1028
1029 if ( !trie->states[ state ].wordnum ) {
1030 /* we havent inserted this word into the structure yet. */
1031 trie->states[ state ].wordnum = ++curword;
1032
1033 DEBUG_r({
1034 /* store the word for dumping */
1035 SV* tmp = newSVpvn( STRING( noper ), STR_LEN( noper ) );
1036 if ( UTF ) SvUTF8_on( tmp );
1037 av_push( trie->words, tmp );
1038 });
1039
1040 } else {
1041 /* Its a dupe. So ignore it. */
1042 }
1043
1044 } /* end second pass */
1045
1046 trie->laststate = next_alloc;
1047 Renew( trie->states, next_alloc, reg_trie_state );
1048
1049 DEBUG_TRIE_COMPILE_MORE_r({
1050 U32 state;
a3621e74 1051
a28509cc 1052 /* print out the table precompression. */
a3621e74
YO
1053
1054 PerlIO_printf( Perl_debug_log, "\nState :Word | Transition Data\n" );
1055 PerlIO_printf( Perl_debug_log, "------:-----+-----------------" );
1056
1057 for( state=1 ; state < next_alloc ; state ++ ) {
a28509cc 1058 U16 charid;
a3621e74 1059
e4584336 1060 PerlIO_printf( Perl_debug_log, "\n %04"UVXf" :", (UV)state );
a3621e74
YO
1061 if ( ! trie->states[ state ].wordnum ) {
1062 PerlIO_printf( Perl_debug_log, "%5s| ","");
1063 } else {
e4584336 1064 PerlIO_printf( Perl_debug_log, "W%04x| ",
a3621e74
YO
1065 trie->states[ state ].wordnum
1066 );
1067 }
1068 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
1069 SV **tmp = av_fetch( trie->revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
e4584336 1070 PerlIO_printf( Perl_debug_log, "%s:%3X=%04"UVXf" | ",
cfd0369c 1071 SvPV_nolen_const( *tmp ),
a3621e74 1072 TRIE_LIST_ITEM(state,charid).forid,
e4584336 1073 (UV)TRIE_LIST_ITEM(state,charid).newstate
a3621e74
YO
1074 );
1075 }
1076
1077 }
1078 PerlIO_printf( Perl_debug_log, "\n\n" );
1079 });
1080
1081 Newz( 848203, trie->trans, transcount ,reg_trie_trans );
1082 {
1083 U32 state;
a3621e74
YO
1084 U32 tp = 0;
1085 U32 zp = 0;
1086
1087
1088 for( state=1 ; state < next_alloc ; state ++ ) {
1089 U32 base=0;
1090
1091 /*
1092 DEBUG_TRIE_COMPILE_MORE_r(
1093 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1094 );
1095 */
1096
1097 if (trie->states[state].trans.list) {
1098 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1099 U16 maxid=minid;
a28509cc 1100 U16 idx;
a3621e74
YO
1101
1102 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1103 if ( TRIE_LIST_ITEM( state, idx).forid < minid ) {
1104 minid=TRIE_LIST_ITEM( state, idx).forid;
1105 } else if ( TRIE_LIST_ITEM( state, idx).forid > maxid ) {
1106 maxid=TRIE_LIST_ITEM( state, idx).forid;
1107 }
1108 }
1109 if ( transcount < tp + maxid - minid + 1) {
1110 transcount *= 2;
1111 Renew( trie->trans, transcount, reg_trie_trans );
1112 Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1113 }
1114 base = trie->uniquecharcount + tp - minid;
1115 if ( maxid == minid ) {
1116 U32 set = 0;
1117 for ( ; zp < tp ; zp++ ) {
1118 if ( ! trie->trans[ zp ].next ) {
1119 base = trie->uniquecharcount + zp - minid;
1120 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1121 trie->trans[ zp ].check = state;
1122 set = 1;
1123 break;
1124 }
1125 }
1126 if ( !set ) {
1127 trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1128 trie->trans[ tp ].check = state;
1129 tp++;
1130 zp = tp;
1131 }
1132 } else {
1133 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1134 U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
1135 trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1136 trie->trans[ tid ].check = state;
1137 }
1138 tp += ( maxid - minid + 1 );
1139 }
1140 Safefree(trie->states[ state ].trans.list);
1141 }
1142 /*
1143 DEBUG_TRIE_COMPILE_MORE_r(
1144 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1145 );
1146 */
1147 trie->states[ state ].trans.base=base;
1148 }
cc601c31 1149 trie->lasttrans = tp + 1;
a3621e74
YO
1150 }
1151 } else {
1152 /*
1153 Second Pass -- Flat Table Representation.
1154
1155 we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1156 We know that we will need Charcount+1 trans at most to store the data
1157 (one row per char at worst case) So we preallocate both structures
1158 assuming worst case.
1159
1160 We then construct the trie using only the .next slots of the entry
1161 structs.
1162
1163 We use the .check field of the first entry of the node temporarily to
1164 make compression both faster and easier by keeping track of how many non
1165 zero fields are in the node.
1166
1167 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1168 transition.
1169
1170 There are two terms at use here: state as a TRIE_NODEIDX() which is a
1171 number representing the first entry of the node, and state as a
1172 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1173 TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1174 are 2 entrys per node. eg:
1175
1176 A B A B
1177 1. 2 4 1. 3 7
1178 2. 0 3 3. 0 5
1179 3. 0 0 5. 0 0
1180 4. 0 0 7. 0 0
1181
1182 The table is internally in the right hand, idx form. However as we also
1183 have to deal with the states array which is indexed by nodenum we have to
1184 use TRIE_NODENUM() to convert.
1185
1186 */
1187
1188 Newz( 848203, trie->trans, ( trie->charcount + 1 ) * trie->uniquecharcount + 1,
1189 reg_trie_trans );
1190 Newz( 848204, trie->states, trie->charcount + 2, reg_trie_state );
1191 next_alloc = trie->uniquecharcount + 1;
1192
1193 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1194
1195 regnode *noper = NEXTOPER( cur );
a28509cc
AL
1196 const U8 *uc = (U8*)STRING( noper );
1197 const U8 * const e = uc + STR_LEN( noper );
a3621e74
YO
1198
1199 U32 state = 1; /* required init */
1200
1201 U16 charid = 0; /* sanity init */
1202 U32 accept_state = 0; /* sanity init */
1203 U8 *scan = (U8*)NULL; /* sanity init */
1204
1205 STRLEN foldlen = 0; /* required init */
1206 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1207
1208
1209 for ( ; uc < e ; uc += len ) {
1210
1211 TRIE_READ_CHAR;
1212
1213 if ( uvc < 256 ) {
1214 charid = trie->charmap[ uvc ];
1215 } else {
1216 SV** svpp=(SV**)NULL;
1217 svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 0);
1218 if ( !svpp ) {
1219 charid = 0;
1220 } else {
1221 charid=(U16)SvIV( *svpp );
1222 }
1223 }
1224 if ( charid ) {
1225 charid--;
1226 if ( !trie->trans[ state + charid ].next ) {
1227 trie->trans[ state + charid ].next = next_alloc;
1228 trie->trans[ state ].check++;
1229 next_alloc += trie->uniquecharcount;
1230 }
1231 state = trie->trans[ state + charid ].next;
1232 } else {
e4584336 1233 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
a3621e74
YO
1234 }
1235 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1236 }
1237
1238 accept_state = TRIE_NODENUM( state );
1239 if ( !trie->states[ accept_state ].wordnum ) {
1240 /* we havent inserted this word into the structure yet. */
1241 trie->states[ accept_state ].wordnum = ++curword;
1242
1243 DEBUG_r({
1244 /* store the word for dumping */
1245 SV* tmp = newSVpvn( STRING( noper ), STR_LEN( noper ) );
1246 if ( UTF ) SvUTF8_on( tmp );
1247 av_push( trie->words, tmp );
1248 });
1249
1250 } else {
1251 /* Its a dupe. So ignore it. */
1252 }
1253
1254 } /* end second pass */
1255
1256 DEBUG_TRIE_COMPILE_MORE_r({
1257 /*
1258 print out the table precompression so that we can do a visual check
1259 that they are identical.
1260 */
1261 U32 state;
1262 U16 charid;
1263 PerlIO_printf( Perl_debug_log, "\nChar : " );
1264
1265 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1266 SV **tmp = av_fetch( trie->revcharmap, charid, 0);
1267 if ( tmp ) {
cfd0369c 1268 PerlIO_printf( Perl_debug_log, "%4.4s ", SvPV_nolen_const( *tmp ) );
a3621e74
YO
1269 }
1270 }
1271
1272 PerlIO_printf( Perl_debug_log, "\nState+-" );
1273
1274 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
1275 PerlIO_printf( Perl_debug_log, "%4s-", "----" );
1276 }
1277
1278 PerlIO_printf( Perl_debug_log, "\n" );
1279
1280 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1281
e4584336 1282 PerlIO_printf( Perl_debug_log, "%04"UVXf" : ", (UV)TRIE_NODENUM( state ) );
a3621e74
YO
1283
1284 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
e4584336
RB
1285 PerlIO_printf( Perl_debug_log, "%04"UVXf" ",
1286 (UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next ) );
a3621e74
YO
1287 }
1288 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
e4584336 1289 PerlIO_printf( Perl_debug_log, " (%04"UVXf")\n", (UV)trie->trans[ state ].check );
a3621e74 1290 } else {
e4584336 1291 PerlIO_printf( Perl_debug_log, " (%04"UVXf") W%04X\n", (UV)trie->trans[ state ].check,
a3621e74
YO
1292 trie->states[ TRIE_NODENUM( state ) ].wordnum );
1293 }
1294 }
1295 PerlIO_printf( Perl_debug_log, "\n\n" );
1296 });
1297 {
1298 /*
1299 * Inplace compress the table.*
1300
1301 For sparse data sets the table constructed by the trie algorithm will
1302 be mostly 0/FAIL transitions or to put it another way mostly empty.
1303 (Note that leaf nodes will not contain any transitions.)
1304
1305 This algorithm compresses the tables by eliminating most such
1306 transitions, at the cost of a modest bit of extra work during lookup:
1307
1308 - Each states[] entry contains a .base field which indicates the
1309 index in the state[] array wheres its transition data is stored.
1310
1311 - If .base is 0 there are no valid transitions from that node.
1312
1313 - If .base is nonzero then charid is added to it to find an entry in
1314 the trans array.
1315
1316 -If trans[states[state].base+charid].check!=state then the
1317 transition is taken to be a 0/Fail transition. Thus if there are fail
1318 transitions at the front of the node then the .base offset will point
1319 somewhere inside the previous nodes data (or maybe even into a node
1320 even earlier), but the .check field determines if the transition is
1321 valid.
1322
1323 The following process inplace converts the table to the compressed
1324 table: We first do not compress the root node 1,and mark its all its
1325 .check pointers as 1 and set its .base pointer as 1 as well. This
1326 allows to do a DFA construction from the compressed table later, and
1327 ensures that any .base pointers we calculate later are greater than
1328 0.
1329
1330 - We set 'pos' to indicate the first entry of the second node.
1331
1332 - We then iterate over the columns of the node, finding the first and
1333 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
1334 and set the .check pointers accordingly, and advance pos
1335 appropriately and repreat for the next node. Note that when we copy
1336 the next pointers we have to convert them from the original
1337 NODEIDX form to NODENUM form as the former is not valid post
1338 compression.
1339
1340 - If a node has no transitions used we mark its base as 0 and do not
1341 advance the pos pointer.
1342
1343 - If a node only has one transition we use a second pointer into the
1344 structure to fill in allocated fail transitions from other states.
1345 This pointer is independent of the main pointer and scans forward
1346 looking for null transitions that are allocated to a state. When it
1347 finds one it writes the single transition into the "hole". If the
1348 pointer doesnt find one the single transition is appeneded as normal.
1349
1350 - Once compressed we can Renew/realloc the structures to release the
1351 excess space.
1352
1353 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
1354 specifically Fig 3.47 and the associated pseudocode.
1355
1356 demq
1357 */
a3b680e6 1358 const U32 laststate = TRIE_NODENUM( next_alloc );
a28509cc 1359 U32 state, charid;
a3621e74
YO
1360 U32 pos = 0, zp=0;
1361 trie->laststate = laststate;
1362
1363 for ( state = 1 ; state < laststate ; state++ ) {
1364 U8 flag = 0;
a28509cc
AL
1365 const U32 stateidx = TRIE_NODEIDX( state );
1366 const U32 o_used = trie->trans[ stateidx ].check;
1367 U32 used = trie->trans[ stateidx ].check;
a3621e74
YO
1368 trie->trans[ stateidx ].check = 0;
1369
1370 for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
1371 if ( flag || trie->trans[ stateidx + charid ].next ) {
1372 if ( trie->trans[ stateidx + charid ].next ) {
1373 if (o_used == 1) {
1374 for ( ; zp < pos ; zp++ ) {
1375 if ( ! trie->trans[ zp ].next ) {
1376 break;
1377 }
1378 }
1379 trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
1380 trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1381 trie->trans[ zp ].check = state;
1382 if ( ++zp > pos ) pos = zp;
1383 break;
1384 }
1385 used--;
1386 }
1387 if ( !flag ) {
1388 flag = 1;
1389 trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
1390 }
1391 trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1392 trie->trans[ pos ].check = state;
1393 pos++;
1394 }
1395 }
1396 }
cc601c31 1397 trie->lasttrans = pos + 1;
a3621e74
YO
1398 Renew( trie->states, laststate + 1, reg_trie_state);
1399 DEBUG_TRIE_COMPILE_MORE_r(
e4584336
RB
1400 PerlIO_printf( Perl_debug_log,
1401 " Alloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
5d7488b2
AL
1402 (int)( ( trie->charcount + 1 ) * trie->uniquecharcount + 1 ),
1403 (IV)next_alloc,
1404 (IV)pos,
a3621e74
YO
1405 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
1406 );
1407
1408 } /* end table compress */
1409 }
cc601c31
YO
1410 /* resize the trans array to remove unused space */
1411 Renew( trie->trans, trie->lasttrans, reg_trie_trans);
a3621e74
YO
1412
1413 DEBUG_TRIE_COMPILE_r({
1414 U32 state;
1415 /*
1416 Now we print it out again, in a slightly different form as there is additional
1417 info we want to be able to see when its compressed. They are close enough for
1418 visual comparison though.
1419 */
1420 PerlIO_printf( Perl_debug_log, "\nChar : %-6s%-6s%-4s ","Match","Base","Ofs" );
1421
1422 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
1423 SV **tmp = av_fetch( trie->revcharmap, state, 0);
1424 if ( tmp ) {
cfd0369c 1425 PerlIO_printf( Perl_debug_log, "%4.4s ", SvPV_nolen_const( *tmp ) );
a3621e74
YO
1426 }
1427 }
1428 PerlIO_printf( Perl_debug_log, "\n-----:-----------------------");
cc601c31 1429
a3621e74
YO
1430 for( state = 0 ; state < trie->uniquecharcount ; state++ )
1431 PerlIO_printf( Perl_debug_log, "-----");
1432 PerlIO_printf( Perl_debug_log, "\n");
cc601c31 1433
a3621e74 1434 for( state = 1 ; state < trie->laststate ; state++ ) {
a28509cc 1435 const U32 base = trie->states[ state ].trans.base;
a3621e74 1436
e4584336 1437 PerlIO_printf( Perl_debug_log, "#%04"UVXf" ", (UV)state);
a3621e74
YO
1438
1439 if ( trie->states[ state ].wordnum ) {
1440 PerlIO_printf( Perl_debug_log, " W%04X", trie->states[ state ].wordnum );
1441 } else {
1442 PerlIO_printf( Perl_debug_log, "%6s", "" );
1443 }
1444
e4584336 1445 PerlIO_printf( Perl_debug_log, " @%04"UVXf" ", (UV)base );
a3621e74
YO
1446
1447 if ( base ) {
1448 U32 ofs = 0;
1449
cc601c31
YO
1450 while( ( base + ofs < trie->uniquecharcount ) ||
1451 ( base + ofs - trie->uniquecharcount < trie->lasttrans
1452 && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
a3621e74
YO
1453 ofs++;
1454
e4584336 1455 PerlIO_printf( Perl_debug_log, "+%02"UVXf"[ ", (UV)ofs);
a3621e74
YO
1456
1457 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
cc601c31
YO
1458 if ( ( base + ofs >= trie->uniquecharcount ) &&
1459 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
a3621e74
YO
1460 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1461 {
e4584336
RB
1462 PerlIO_printf( Perl_debug_log, "%04"UVXf" ",
1463 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
a3621e74
YO
1464 } else {
1465 PerlIO_printf( Perl_debug_log, "%4s "," 0" );
1466 }
1467 }
1468
e4584336 1469 PerlIO_printf( Perl_debug_log, "]");
a3621e74
YO
1470
1471 }
1472 PerlIO_printf( Perl_debug_log, "\n" );
1473 }
1474 });
1475
1476 {
1477 /* now finally we "stitch in" the new TRIE node
1478 This means we convert either the first branch or the first Exact,
1479 depending on whether the thing following (in 'last') is a branch
1480 or not and whther first is the startbranch (ie is it a sub part of
1481 the alternation or is it the whole thing.)
1482 Assuming its a sub part we conver the EXACT otherwise we convert
1483 the whole branch sequence, including the first.
1484 */
1485 regnode *convert;
1486
1487
1488
1489
1490 if ( first == startbranch && OP( last ) != BRANCH ) {
1491 convert = first;
1492 } else {
1493 convert = NEXTOPER( first );
1494 NEXT_OFF( first ) = (U16)(last - first);
1495 }
1496
1497 OP( convert ) = TRIE + (U8)( flags - EXACT );
1498 NEXT_OFF( convert ) = (U16)(tail - convert);
1499 ARG_SET( convert, data_slot );
1500
1501 /* tells us if we need to handle accept buffers specially */
1502 convert->flags = ( RExC_seen_evals ? 1 : 0 );
1503
1504
1505 /* needed for dumping*/
1506 DEBUG_r({
1507 regnode *optimize = convert + NODE_STEP_REGNODE + regarglen[ TRIE ];
1508 /* We now need to mark all of the space originally used by the
1509 branches as optimized away. This keeps the dumpuntil from
1510 throwing a wobbly as it doesnt use regnext() to traverse the
1511 opcodes.
1512 */
1513 while( optimize < last ) {
1514 OP( optimize ) = OPTIMIZED;
1515 optimize++;
1516 }
1517 });
1518 } /* end node insert */
1519 return 1;
1520}
1521
1522
1523
1524/*
5d1c421c
JH
1525 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
1526 * These need to be revisited when a newer toolchain becomes available.
1527 */
1528#if defined(__sparc64__) && defined(__GNUC__)
1529# if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
1530# undef SPARC64_GCC_WORKAROUND
1531# define SPARC64_GCC_WORKAROUND 1
1532# endif
1533#endif
1534
653099ff
GS
1535/* REx optimizer. Converts nodes into quickier variants "in place".
1536 Finds fixed substrings. */
1537
a0288114 1538/* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
c277df42
IZ
1539 to the position after last scanned or to NULL. */
1540
a3621e74 1541
76e3520e 1542STATIC I32
a3621e74 1543S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, regnode *last, scan_data_t *data, U32 flags, U32 depth)
c277df42
IZ
1544 /* scanp: Start here (read-write). */
1545 /* deltap: Write maxlen-minlen here. */
1546 /* last: Stop before this one. */
1547{
1548 I32 min = 0, pars = 0, code;
1549 regnode *scan = *scanp, *next;
1550 I32 delta = 0;
1551 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
aca2d497 1552 int is_inf_internal = 0; /* The studied chunk is infinite */
c277df42
IZ
1553 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
1554 scan_data_t data_fake;
653099ff 1555 struct regnode_charclass_class and_with; /* Valid if flags & SCF_DO_STCLASS_OR */
a3621e74
YO
1556 SV *re_trie_maxbuff = NULL;
1557
1558 GET_RE_DEBUG_FLAGS_DECL;
b81d288d 1559
c277df42
IZ
1560 while (scan && OP(scan) != END && scan < last) {
1561 /* Peephole optimizer: */
a3621e74
YO
1562 DEBUG_OPTIMISE_r({
1563 SV *mysv=sv_newmortal();
1564 regprop( mysv, scan);
e4584336 1565 PerlIO_printf(Perl_debug_log, "%*speep: %s (0x%08"UVXf")\n",
cfd0369c 1566 (int)depth*2, "", SvPV_nolen_const(mysv), PTR2UV(scan));
a3621e74 1567 });
c277df42 1568
22c35a8c 1569 if (PL_regkind[(U8)OP(scan)] == EXACT) {
653099ff 1570 /* Merge several consecutive EXACTish nodes into one. */
c277df42
IZ
1571 regnode *n = regnext(scan);
1572 U32 stringok = 1;
1573#ifdef DEBUGGING
1574 regnode *stop = scan;
b81d288d 1575#endif
c277df42 1576
cd439c50 1577 next = scan + NODE_SZ_STR(scan);
c277df42
IZ
1578 /* Skip NOTHING, merge EXACT*. */
1579 while (n &&
b81d288d 1580 ( PL_regkind[(U8)OP(n)] == NOTHING ||
c277df42
IZ
1581 (stringok && (OP(n) == OP(scan))))
1582 && NEXT_OFF(n)
1583 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
1584 if (OP(n) == TAIL || n > next)
1585 stringok = 0;
22c35a8c 1586 if (PL_regkind[(U8)OP(n)] == NOTHING) {
c277df42
IZ
1587 NEXT_OFF(scan) += NEXT_OFF(n);
1588 next = n + NODE_STEP_REGNODE;
1589#ifdef DEBUGGING
1590 if (stringok)
1591 stop = n;
b81d288d 1592#endif
c277df42 1593 n = regnext(n);
a0ed51b3 1594 }
f49d4d0f 1595 else if (stringok) {
a3b680e6 1596 const int oldl = STR_LEN(scan);
c277df42 1597 regnode *nnext = regnext(n);
f49d4d0f 1598
b81d288d 1599 if (oldl + STR_LEN(n) > U8_MAX)
c277df42
IZ
1600 break;
1601 NEXT_OFF(scan) += NEXT_OFF(n);
cd439c50
IZ
1602 STR_LEN(scan) += STR_LEN(n);
1603 next = n + NODE_SZ_STR(n);
c277df42 1604 /* Now we can overwrite *n : */
f49d4d0f 1605 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
c277df42 1606#ifdef DEBUGGING
f49d4d0f 1607 stop = next - 1;
b81d288d 1608#endif
c277df42
IZ
1609 n = nnext;
1610 }
1611 }
61a36c01 1612
a3621e74 1613 if (UTF && ( OP(scan) == EXACTF ) && ( STR_LEN(scan) >= 6 ) ) {
61a36c01
JH
1614/*
1615 Two problematic code points in Unicode casefolding of EXACT nodes:
1616
1617 U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
1618 U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
1619
1620 which casefold to
1621
1622 Unicode UTF-8
1623
1624 U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
1625 U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
1626
1627 This means that in case-insensitive matching (or "loose matching",
1628 as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
1629 length of the above casefolded versions) can match a target string
1630 of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
1631 This would rather mess up the minimum length computation.
1632
1633 What we'll do is to look for the tail four bytes, and then peek
1634 at the preceding two bytes to see whether we need to decrease
1635 the minimum length by four (six minus two).
1636
1637 Thanks to the design of UTF-8, there cannot be false matches:
1638 A sequence of valid UTF-8 bytes cannot be a subsequence of
1639 another valid sequence of UTF-8 bytes.
1640
1641*/
1642 char *s0 = STRING(scan), *s, *t;
1643 char *s1 = s0 + STR_LEN(scan) - 1, *s2 = s1 - 4;
a28509cc
AL
1644 const char * const t0 = "\xcc\x88\xcc\x81";
1645 const char * const t1 = t0 + 3;
2af232bd 1646
61a36c01
JH
1647 for (s = s0 + 2;
1648 s < s2 && (t = ninstr(s, s1, t0, t1));
1649 s = t + 4) {
1650 if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
1651 ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
1652 min -= 4;
1653 }
1654 }
1655
c277df42
IZ
1656#ifdef DEBUGGING
1657 /* Allow dumping */
cd439c50 1658 n = scan + NODE_SZ_STR(scan);
c277df42 1659 while (n <= stop) {
22c35a8c 1660 if (PL_regkind[(U8)OP(n)] != NOTHING || OP(n) == NOTHING) {
c277df42
IZ
1661 OP(n) = OPTIMIZED;
1662 NEXT_OFF(n) = 0;
1663 }
1664 n++;
1665 }
653099ff 1666#endif
c277df42 1667 }
a3621e74
YO
1668
1669
1670
653099ff
GS
1671 /* Follow the next-chain of the current node and optimize
1672 away all the NOTHINGs from it. */
c277df42 1673 if (OP(scan) != CURLYX) {
a3b680e6 1674 const int max = (reg_off_by_arg[OP(scan)]
048cfca1
GS
1675 ? I32_MAX
1676 /* I32 may be smaller than U16 on CRAYs! */
1677 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
c277df42
IZ
1678 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
1679 int noff;
1680 regnode *n = scan;
b81d288d 1681
c277df42
IZ
1682 /* Skip NOTHING and LONGJMP. */
1683 while ((n = regnext(n))
22c35a8c 1684 && ((PL_regkind[(U8)OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
c277df42
IZ
1685 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
1686 && off + noff < max)
1687 off += noff;
1688 if (reg_off_by_arg[OP(scan)])
1689 ARG(scan) = off;
b81d288d 1690 else
c277df42
IZ
1691 NEXT_OFF(scan) = off;
1692 }
a3621e74 1693
653099ff
GS
1694 /* The principal pseudo-switch. Cannot be a switch, since we
1695 look into several different things. */
b81d288d 1696 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
c277df42
IZ
1697 || OP(scan) == IFTHEN || OP(scan) == SUSPEND) {
1698 next = regnext(scan);
1699 code = OP(scan);
a3621e74 1700 /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
b81d288d
AB
1701
1702 if (OP(next) == code || code == IFTHEN || code == SUSPEND) {
c277df42 1703 I32 max1 = 0, min1 = I32_MAX, num = 0;
653099ff 1704 struct regnode_charclass_class accum;
a3621e74 1705 regnode *startbranch=scan;
c277df42 1706
653099ff 1707 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
830247a4 1708 scan_commit(pRExC_state, data); /* Cannot merge strings after this. */
653099ff 1709 if (flags & SCF_DO_STCLASS)
830247a4 1710 cl_init_zero(pRExC_state, &accum);
a3621e74 1711
c277df42 1712 while (OP(scan) == code) {
830247a4 1713 I32 deltanext, minnext, f = 0, fake;
653099ff 1714 struct regnode_charclass_class this_class;
c277df42
IZ
1715
1716 num++;
1717 data_fake.flags = 0;
b81d288d 1718 if (data) {
2c2d71f5 1719 data_fake.whilem_c = data->whilem_c;
cb434fcc
IZ
1720 data_fake.last_closep = data->last_closep;
1721 }
1722 else
1723 data_fake.last_closep = &fake;
c277df42
IZ
1724 next = regnext(scan);
1725 scan = NEXTOPER(scan);
1726 if (code != BRANCH)
1727 scan = NEXTOPER(scan);
653099ff 1728 if (flags & SCF_DO_STCLASS) {
830247a4 1729 cl_init(pRExC_state, &this_class);
653099ff
GS
1730 data_fake.start_class = &this_class;
1731 f = SCF_DO_STCLASS_AND;
b81d288d 1732 }
e1901655
IZ
1733 if (flags & SCF_WHILEM_VISITED_POS)
1734 f |= SCF_WHILEM_VISITED_POS;
a3621e74 1735
653099ff 1736 /* we suppose the run is continuous, last=next...*/
830247a4 1737 minnext = study_chunk(pRExC_state, &scan, &deltanext,
a3621e74 1738 next, &data_fake, f,depth+1);
b81d288d 1739 if (min1 > minnext)
c277df42
IZ
1740 min1 = minnext;
1741 if (max1 < minnext + deltanext)
1742 max1 = minnext + deltanext;
1743 if (deltanext == I32_MAX)
aca2d497 1744 is_inf = is_inf_internal = 1;
c277df42
IZ
1745 scan = next;
1746 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
1747 pars++;
405ff068 1748 if (data && (data_fake.flags & SF_HAS_EVAL))
c277df42 1749 data->flags |= SF_HAS_EVAL;
2c2d71f5
JH
1750 if (data)
1751 data->whilem_c = data_fake.whilem_c;
653099ff 1752 if (flags & SCF_DO_STCLASS)
830247a4 1753 cl_or(pRExC_state, &accum, &this_class);
b81d288d 1754 if (code == SUSPEND)
c277df42
IZ
1755 break;
1756 }
1757 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
1758 min1 = 0;
1759 if (flags & SCF_DO_SUBSTR) {
1760 data->pos_min += min1;
1761 data->pos_delta += max1 - min1;
1762 if (max1 != min1 || is_inf)
1763 data->longest = &(data->longest_float);
1764 }
1765 min += min1;
1766 delta += max1 - min1;
653099ff 1767 if (flags & SCF_DO_STCLASS_OR) {
830247a4 1768 cl_or(pRExC_state, data->start_class, &accum);
653099ff
GS
1769 if (min1) {
1770 cl_and(data->start_class, &and_with);
1771 flags &= ~SCF_DO_STCLASS;
1772 }
1773 }
1774 else if (flags & SCF_DO_STCLASS_AND) {
de0c8cb8
GS
1775 if (min1) {
1776 cl_and(data->start_class, &accum);
653099ff 1777 flags &= ~SCF_DO_STCLASS;
de0c8cb8
GS
1778 }
1779 else {
b81d288d 1780 /* Switch to OR mode: cache the old value of
de0c8cb8
GS
1781 * data->start_class */
1782 StructCopy(data->start_class, &and_with,
1783 struct regnode_charclass_class);
1784 flags &= ~SCF_DO_STCLASS_AND;
1785 StructCopy(&accum, data->start_class,
1786 struct regnode_charclass_class);
1787 flags |= SCF_DO_STCLASS_OR;
1788 data->start_class->flags |= ANYOF_EOS;
1789 }
653099ff 1790 }
a3621e74
YO
1791
1792 /* demq.
1793
1794 Assuming this was/is a branch we are dealing with: 'scan' now
1795 points at the item that follows the branch sequence, whatever
1796 it is. We now start at the beginning of the sequence and look
1797 for subsequences of
1798
1799 BRANCH->EXACT=>X
1800 BRANCH->EXACT=>X
1801
1802 which would be constructed from a pattern like /A|LIST|OF|WORDS/
1803
1804 If we can find such a subseqence we need to turn the first
1805 element into a trie and then add the subsequent branch exact
1806 strings to the trie.
1807
1808 We have two cases
1809
1810 1. patterns where the whole set of branch can be converted to a trie,
1811
1812 2. patterns where only a subset of the alternations can be
1813 converted to a trie.
1814
1815 In case 1 we can replace the whole set with a single regop
1816 for the trie. In case 2 we need to keep the start and end
1817 branchs so
1818
1819 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
1820 becomes BRANCH TRIE; BRANCH X;
1821
1822 Hypthetically when we know the regex isnt anchored we can
1823 turn a case 1 into a DFA and let it rip... Every time it finds a match
1824 it would just call its tail, no WHILEM/CURLY needed.
1825
1826 */
0111c4fd
RGS
1827 if (DO_TRIE) {
1828 if (!re_trie_maxbuff) {
1829 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
1830 if (!SvIOK(re_trie_maxbuff))
1831 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
1832 }
a3621e74
YO
1833 if ( SvIV(re_trie_maxbuff)>=0 && OP( startbranch )==BRANCH ) {
1834 regnode *cur;
1835 regnode *first = (regnode *)NULL;
1836 regnode *last = (regnode *)NULL;
1837 regnode *tail = scan;
1838 U8 optype = 0;
1839 U32 count=0;
1840
1841#ifdef DEBUGGING
1842 SV *mysv = sv_newmortal(); /* for dumping */
1843#endif
1844 /* var tail is used because there may be a TAIL
1845 regop in the way. Ie, the exacts will point to the
1846 thing following the TAIL, but the last branch will
1847 point at the TAIL. So we advance tail. If we
1848 have nested (?:) we may have to move through several
1849 tails.
1850 */
1851
1852 while ( OP( tail ) == TAIL ) {
1853 /* this is the TAIL generated by (?:) */
1854 tail = regnext( tail );
1855 }
1856
1857 DEBUG_OPTIMISE_r({
1858 regprop( mysv, tail );
1859 PerlIO_printf( Perl_debug_log, "%*s%s%s%s\n",
cfd0369c 1860 (int)depth * 2 + 2, "", "Tail node is:", SvPV_nolen_const( mysv ),
a3621e74
YO
1861 (RExC_seen_evals) ? "[EVAL]" : ""
1862 );
1863 });
1864 /*
1865
1866 step through the branches, cur represents each
1867 branch, noper is the first thing to be matched
1868 as part of that branch and noper_next is the
1869 regnext() of that node. if noper is an EXACT
1870 and noper_next is the same as scan (our current
1871 position in the regex) then the EXACT branch is
1872 a possible optimization target. Once we have
1873 two or more consequetive such branches we can
1874 create a trie of the EXACT's contents and stich
1875 it in place. If the sequence represents all of
1876 the branches we eliminate the whole thing and
1877 replace it with a single TRIE. If it is a
1878 subsequence then we need to stitch it in. This
1879 means the first branch has to remain, and needs
1880 to be repointed at the item on the branch chain
1881 following the last branch optimized. This could
1882 be either a BRANCH, in which case the
1883 subsequence is internal, or it could be the
1884 item following the branch sequence in which
1885 case the subsequence is at the end.
1886
1887 */
1888
1889 /* dont use tail as the end marker for this traverse */
1890 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
1891 regnode *noper = NEXTOPER( cur );
1892 regnode *noper_next = regnext( noper );
1893
a3621e74
YO
1894 DEBUG_OPTIMISE_r({
1895 regprop( mysv, cur);
1896 PerlIO_printf( Perl_debug_log, "%*s%s",
cfd0369c 1897 (int)depth * 2 + 2," ", SvPV_nolen_const( mysv ) );
a3621e74
YO
1898
1899 regprop( mysv, noper);
1900 PerlIO_printf( Perl_debug_log, " -> %s",
cfd0369c 1901 SvPV_nolen_const(mysv));
a3621e74
YO
1902
1903 if ( noper_next ) {
1904 regprop( mysv, noper_next );
1905 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
cfd0369c 1906 SvPV_nolen_const(mysv));
a3621e74
YO
1907 }
1908 PerlIO_printf( Perl_debug_log, "0x%p,0x%p,0x%p)\n",
1909 first, last, cur );
1910 });
1911 if ( ( first ? OP( noper ) == optype
1912 : PL_regkind[ (U8)OP( noper ) ] == EXACT )
1913 && noper_next == tail && count<U16_MAX)
1914 {
1915 count++;
1916 if ( !first ) {
1917 first = cur;
1918 optype = OP( noper );
1919 } else {
1920 DEBUG_OPTIMISE_r(
1921 if (!last ) {
1922 regprop( mysv, first);
1923 PerlIO_printf( Perl_debug_log, "%*s%s",
cfd0369c 1924 (int)depth * 2 + 2, "F:", SvPV_nolen_const( mysv ) );
a3621e74
YO
1925 regprop( mysv, NEXTOPER(first) );
1926 PerlIO_printf( Perl_debug_log, " -> %s\n",
cfd0369c 1927 SvPV_nolen_const( mysv ) );
a3621e74
YO
1928 }
1929 );
1930 last = cur;
1931 DEBUG_OPTIMISE_r({
1932 regprop( mysv, cur);
1933 PerlIO_printf( Perl_debug_log, "%*s%s",
cfd0369c 1934 (int)depth * 2 + 2, "N:", SvPV_nolen_const( mysv ) );
a3621e74
YO
1935 regprop( mysv, noper );
1936 PerlIO_printf( Perl_debug_log, " -> %s\n",
cfd0369c 1937 SvPV_nolen_const( mysv ) );
a3621e74
YO
1938 });
1939 }
1940 } else {
1941 if ( last ) {
1942 DEBUG_OPTIMISE_r(
1943 PerlIO_printf( Perl_debug_log, "%*s%s\n",
e4584336 1944 (int)depth * 2 + 2, "E:", "**END**" );
a3621e74
YO
1945 );
1946 make_trie( pRExC_state, startbranch, first, cur, tail, optype );
1947 }
1948 if ( PL_regkind[ (U8)OP( noper ) ] == EXACT
1949 && noper_next == tail )
1950 {
1951 count = 1;
1952 first = cur;
1953 optype = OP( noper );
1954 } else {
1955 count = 0;
1956 first = NULL;
1957 optype = 0;
1958 }
1959 last = NULL;
1960 }
1961 }
1962 DEBUG_OPTIMISE_r({
1963 regprop( mysv, cur);
1964 PerlIO_printf( Perl_debug_log,
e4584336 1965 "%*s%s\t(0x%p,0x%p,0x%p)\n", (int)depth * 2 + 2,
cfd0369c 1966 " ", SvPV_nolen_const( mysv ), first, last, cur);
a3621e74
YO
1967
1968 });
1969 if ( last ) {
1970 DEBUG_OPTIMISE_r(
1971 PerlIO_printf( Perl_debug_log, "%*s%s\n",
e4584336 1972 (int)depth * 2 + 2, "E:", "==END==" );
a3621e74
YO
1973 );
1974 make_trie( pRExC_state, startbranch, first, scan, tail, optype );
1975 }
1976 }
1977 }
a0ed51b3 1978 }
a3621e74 1979 else if ( code == BRANCHJ ) { /* single branch is optimized. */
c277df42 1980 scan = NEXTOPER(NEXTOPER(scan));
a3621e74 1981 } else /* single branch is optimized. */
c277df42
IZ
1982 scan = NEXTOPER(scan);
1983 continue;
a0ed51b3
LW
1984 }
1985 else if (OP(scan) == EXACT) {
cd439c50 1986 I32 l = STR_LEN(scan);
1aa99e6b 1987 UV uc = *((U8*)STRING(scan));
a0ed51b3 1988 if (UTF) {
a3b680e6 1989 const U8 * const s = (U8*)STRING(scan);
1aa99e6b 1990 l = utf8_length(s, s + l);
9041c2e3 1991 uc = utf8_to_uvchr(s, NULL);
a0ed51b3
LW
1992 }
1993 min += l;
c277df42 1994 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
c277df42
IZ
1995 /* The code below prefers earlier match for fixed
1996 offset, later match for variable offset. */
1997 if (data->last_end == -1) { /* Update the start info. */
1998 data->last_start_min = data->pos_min;
1999 data->last_start_max = is_inf
b81d288d 2000 ? I32_MAX : data->pos_min + data->pos_delta;
c277df42 2001 }
cd439c50 2002 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
0eda9292
JH
2003 {
2004 SV * sv = data->last_found;
a28509cc 2005 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
0eda9292
JH
2006 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2007 if (mg && mg->mg_len >= 0)
5e43f467
JH
2008 mg->mg_len += utf8_length((U8*)STRING(scan),
2009 (U8*)STRING(scan)+STR_LEN(scan));
0eda9292 2010 }
33b8afdf
JH
2011 if (UTF)
2012 SvUTF8_on(data->last_found);
c277df42
IZ
2013 data->last_end = data->pos_min + l;
2014 data->pos_min += l; /* As in the first entry. */
2015 data->flags &= ~SF_BEFORE_EOL;
2016 }
653099ff
GS
2017 if (flags & SCF_DO_STCLASS_AND) {
2018 /* Check whether it is compatible with what we know already! */
2019 int compat = 1;
2020
1aa99e6b 2021 if (uc >= 0x100 ||
516a5887 2022 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
1aa99e6b 2023 && !ANYOF_BITMAP_TEST(data->start_class, uc)
653099ff 2024 && (!(data->start_class->flags & ANYOF_FOLD)
1aa99e6b 2025 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
516a5887 2026 )
653099ff
GS
2027 compat = 0;
2028 ANYOF_CLASS_ZERO(data->start_class);
2029 ANYOF_BITMAP_ZERO(data->start_class);
2030 if (compat)
1aa99e6b 2031 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff 2032 data->start_class->flags &= ~ANYOF_EOS;
9b877dbb
IH
2033 if (uc < 0x100)
2034 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
653099ff
GS
2035 }
2036 else if (flags & SCF_DO_STCLASS_OR) {
2037 /* false positive possible if the class is case-folded */
1aa99e6b 2038 if (uc < 0x100)
9b877dbb
IH
2039 ANYOF_BITMAP_SET(data->start_class, uc);
2040 else
2041 data->start_class->flags |= ANYOF_UNICODE_ALL;
653099ff
GS
2042 data->start_class->flags &= ~ANYOF_EOS;
2043 cl_and(data->start_class, &and_with);
2044 }
2045 flags &= ~SCF_DO_STCLASS;
a0ed51b3 2046 }
653099ff 2047 else if (PL_regkind[(U8)OP(scan)] == EXACT) { /* But OP != EXACT! */
cd439c50 2048 I32 l = STR_LEN(scan);
1aa99e6b 2049 UV uc = *((U8*)STRING(scan));
653099ff
GS
2050
2051 /* Search for fixed substrings supports EXACT only. */
b81d288d 2052 if (flags & SCF_DO_SUBSTR)
830247a4 2053 scan_commit(pRExC_state, data);
a0ed51b3 2054 if (UTF) {
1aa99e6b
IH
2055 U8 *s = (U8 *)STRING(scan);
2056 l = utf8_length(s, s + l);
9041c2e3 2057 uc = utf8_to_uvchr(s, NULL);
a0ed51b3
LW
2058 }
2059 min += l;
c277df42 2060 if (data && (flags & SCF_DO_SUBSTR))
a0ed51b3 2061 data->pos_min += l;
653099ff
GS
2062 if (flags & SCF_DO_STCLASS_AND) {
2063 /* Check whether it is compatible with what we know already! */
2064 int compat = 1;
2065
1aa99e6b 2066 if (uc >= 0x100 ||
516a5887 2067 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
1aa99e6b 2068 && !ANYOF_BITMAP_TEST(data->start_class, uc)
516a5887 2069 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
653099ff
GS
2070 compat = 0;
2071 ANYOF_CLASS_ZERO(data->start_class);
2072 ANYOF_BITMAP_ZERO(data->start_class);
2073 if (compat) {
1aa99e6b 2074 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff
GS
2075 data->start_class->flags &= ~ANYOF_EOS;
2076 data->start_class->flags |= ANYOF_FOLD;
2077 if (OP(scan) == EXACTFL)
2078 data->start_class->flags |= ANYOF_LOCALE;
2079 }
2080 }
2081 else if (flags & SCF_DO_STCLASS_OR) {
2082 if (data->start_class->flags & ANYOF_FOLD) {
2083 /* false positive possible if the class is case-folded.
2084 Assume that the locale settings are the same... */
1aa99e6b
IH
2085 if (uc < 0x100)
2086 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff
GS
2087 data->start_class->flags &= ~ANYOF_EOS;
2088 }
2089 cl_and(data->start_class, &and_with);
2090 }
2091 flags &= ~SCF_DO_STCLASS;
a0ed51b3 2092 }
bfed75c6 2093 else if (strchr((const char*)PL_varies,OP(scan))) {
9c5ffd7c 2094 I32 mincount, maxcount, minnext, deltanext, fl = 0;
aa7a4b56 2095 I32 f = flags, pos_before = 0;
c277df42 2096 regnode *oscan = scan;
653099ff
GS
2097 struct regnode_charclass_class this_class;
2098 struct regnode_charclass_class *oclass = NULL;
727f22e3 2099 I32 next_is_eval = 0;
653099ff 2100
22c35a8c 2101 switch (PL_regkind[(U8)OP(scan)]) {
653099ff 2102 case WHILEM: /* End of (?:...)* . */
c277df42
IZ
2103 scan = NEXTOPER(scan);
2104 goto finish;
2105 case PLUS:
653099ff 2106 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
c277df42 2107 next = NEXTOPER(scan);
653099ff 2108 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
b81d288d
AB
2109 mincount = 1;
2110 maxcount = REG_INFTY;
c277df42
IZ
2111 next = regnext(scan);
2112 scan = NEXTOPER(scan);
2113 goto do_curly;
2114 }
2115 }
2116 if (flags & SCF_DO_SUBSTR)
2117 data->pos_min++;
2118 min++;
2119 /* Fall through. */
2120 case STAR:
653099ff
GS
2121 if (flags & SCF_DO_STCLASS) {
2122 mincount = 0;
b81d288d 2123 maxcount = REG_INFTY;
653099ff
GS
2124 next = regnext(scan);
2125 scan = NEXTOPER(scan);
2126 goto do_curly;
2127 }
b81d288d 2128 is_inf = is_inf_internal = 1;
c277df42
IZ
2129 scan = regnext(scan);
2130 if (flags & SCF_DO_SUBSTR) {
830247a4 2131 scan_commit(pRExC_state, data); /* Cannot extend fixed substrings */
c277df42
IZ
2132 data->longest = &(data->longest_float);
2133 }
2134 goto optimize_curly_tail;
2135 case CURLY:
b81d288d 2136 mincount = ARG1(scan);
c277df42
IZ
2137 maxcount = ARG2(scan);
2138 next = regnext(scan);
cb434fcc
IZ
2139 if (OP(scan) == CURLYX) {
2140 I32 lp = (data ? *(data->last_closep) : 0);
a3621e74 2141 scan->flags = ((lp <= U8_MAX) ? (U8)lp : U8_MAX);
cb434fcc 2142 }
c277df42 2143 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
727f22e3 2144 next_is_eval = (OP(scan) == EVAL);
c277df42
IZ
2145 do_curly:
2146 if (flags & SCF_DO_SUBSTR) {
830247a4 2147 if (mincount == 0) scan_commit(pRExC_state,data); /* Cannot extend fixed substrings */
c277df42
IZ
2148 pos_before = data->pos_min;
2149 }
2150 if (data) {
2151 fl = data->flags;
2152 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
2153 if (is_inf)
2154 data->flags |= SF_IS_INF;
2155 }
653099ff 2156 if (flags & SCF_DO_STCLASS) {
830247a4 2157 cl_init(pRExC_state, &this_class);
653099ff
GS
2158 oclass = data->start_class;
2159 data->start_class = &this_class;
2160 f |= SCF_DO_STCLASS_AND;
2161 f &= ~SCF_DO_STCLASS_OR;
2162 }
e1901655
IZ
2163 /* These are the cases when once a subexpression
2164 fails at a particular position, it cannot succeed
2165 even after backtracking at the enclosing scope.
b81d288d 2166
e1901655
IZ
2167 XXXX what if minimal match and we are at the
2168 initial run of {n,m}? */
2169 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
2170 f &= ~SCF_WHILEM_VISITED_POS;
653099ff 2171
c277df42 2172 /* This will finish on WHILEM, setting scan, or on NULL: */
b81d288d 2173 minnext = study_chunk(pRExC_state, &scan, &deltanext, last, data,
a3621e74
YO
2174 (mincount == 0
2175 ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
653099ff
GS
2176
2177 if (flags & SCF_DO_STCLASS)
2178 data->start_class = oclass;
2179 if (mincount == 0 || minnext == 0) {
2180 if (flags & SCF_DO_STCLASS_OR) {
830247a4 2181 cl_or(pRExC_state, data->start_class, &this_class);
653099ff
GS
2182 }
2183 else if (flags & SCF_DO_STCLASS_AND) {
b81d288d 2184 /* Switch to OR mode: cache the old value of
653099ff
GS
2185 * data->start_class */
2186 StructCopy(data->start_class, &and_with,
2187 struct regnode_charclass_class);
2188 flags &= ~SCF_DO_STCLASS_AND;
2189 StructCopy(&this_class, data->start_class,
2190 struct regnode_charclass_class);
2191 flags |= SCF_DO_STCLASS_OR;
2192 data->start_class->flags |= ANYOF_EOS;
2193 }
2194 } else { /* Non-zero len */
2195 if (flags & SCF_DO_STCLASS_OR) {
830247a4 2196 cl_or(pRExC_state, data->start_class, &this_class);
653099ff
GS
2197 cl_and(data->start_class, &and_with);
2198 }
2199 else if (flags & SCF_DO_STCLASS_AND)
2200 cl_and(data->start_class, &this_class);
2201 flags &= ~SCF_DO_STCLASS;
2202 }
c277df42
IZ
2203 if (!scan) /* It was not CURLYX, but CURLY. */
2204 scan = next;
84037bb0 2205 if (ckWARN(WARN_REGEXP)
727f22e3
JP
2206 /* ? quantifier ok, except for (?{ ... }) */
2207 && (next_is_eval || !(mincount == 0 && maxcount == 1))
84037bb0 2208 && (minnext == 0) && (deltanext == 0)
99799961 2209 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
17feb5d5 2210 && maxcount <= REG_INFTY/3) /* Complement check for big count */
b45f050a 2211 {
830247a4 2212 vWARN(RExC_parse,
b45f050a
JF
2213 "Quantifier unexpected on zero-length expression");
2214 }
2215
c277df42 2216 min += minnext * mincount;
b81d288d 2217 is_inf_internal |= ((maxcount == REG_INFTY
155aba94
GS
2218 && (minnext + deltanext) > 0)
2219 || deltanext == I32_MAX);
aca2d497 2220 is_inf |= is_inf_internal;
c277df42
IZ
2221 delta += (minnext + deltanext) * maxcount - minnext * mincount;
2222
2223 /* Try powerful optimization CURLYX => CURLYN. */
b81d288d 2224 if ( OP(oscan) == CURLYX && data
c277df42
IZ
2225 && data->flags & SF_IN_PAR
2226 && !(data->flags & SF_HAS_EVAL)
2227 && !deltanext && minnext == 1 ) {
2228 /* Try to optimize to CURLYN. */
2229 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
497b47a8
JH
2230 regnode *nxt1 = nxt;
2231#ifdef DEBUGGING
2232 regnode *nxt2;
2233#endif
c277df42
IZ
2234
2235 /* Skip open. */
2236 nxt = regnext(nxt);
bfed75c6 2237 if (!strchr((const char*)PL_simple,OP(nxt))
22c35a8c 2238 && !(PL_regkind[(U8)OP(nxt)] == EXACT
b81d288d 2239 && STR_LEN(nxt) == 1))
c277df42 2240 goto nogo;
497b47a8 2241#ifdef DEBUGGING
c277df42 2242 nxt2 = nxt;
497b47a8 2243#endif
c277df42 2244 nxt = regnext(nxt);
b81d288d 2245 if (OP(nxt) != CLOSE)
c277df42
IZ
2246 goto nogo;
2247 /* Now we know that nxt2 is the only contents: */
eb160463 2248 oscan->flags = (U8)ARG(nxt);
c277df42
IZ
2249 OP(oscan) = CURLYN;
2250 OP(nxt1) = NOTHING; /* was OPEN. */
2251#ifdef DEBUGGING
2252 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
2253 NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
2254 NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
2255 OP(nxt) = OPTIMIZED; /* was CLOSE. */
2256 OP(nxt + 1) = OPTIMIZED; /* was count. */
2257 NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
b81d288d 2258#endif
c277df42 2259 }
c277df42
IZ
2260 nogo:
2261
2262 /* Try optimization CURLYX => CURLYM. */
b81d288d 2263 if ( OP(oscan) == CURLYX && data
c277df42 2264 && !(data->flags & SF_HAS_PAR)
c277df42 2265 && !(data->flags & SF_HAS_EVAL)
0e788c72
HS
2266 && !deltanext /* atom is fixed width */
2267 && minnext != 0 /* CURLYM can't handle zero width */
2268 ) {
c277df42
IZ
2269 /* XXXX How to optimize if data == 0? */
2270 /* Optimize to a simpler form. */
2271 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
2272 regnode *nxt2;
2273
2274 OP(oscan) = CURLYM;
2275 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
b81d288d 2276 && (OP(nxt2) != WHILEM))
c277df42
IZ
2277 nxt = nxt2;
2278 OP(nxt2) = SUCCEED; /* Whas WHILEM */
c277df42
IZ
2279 /* Need to optimize away parenths. */
2280 if (data->flags & SF_IN_PAR) {
2281 /* Set the parenth number. */
2282 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
2283
b81d288d 2284 if (OP(nxt) != CLOSE)
b45f050a 2285 FAIL("Panic opt close");
eb160463 2286 oscan->flags = (U8)ARG(nxt);
c277df42
IZ
2287 OP(nxt1) = OPTIMIZED; /* was OPEN. */
2288 OP(nxt) = OPTIMIZED; /* was CLOSE. */
2289#ifdef DEBUGGING
2290 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
2291 OP(nxt + 1) = OPTIMIZED; /* was count. */
2292 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
2293 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
b81d288d 2294#endif
c277df42
IZ
2295#if 0
2296 while ( nxt1 && (OP(nxt1) != WHILEM)) {
2297 regnode *nnxt = regnext(nxt1);
b81d288d 2298
c277df42
IZ
2299 if (nnxt == nxt) {
2300 if (reg_off_by_arg[OP(nxt1)])
2301 ARG_SET(nxt1, nxt2 - nxt1);
2302 else if (nxt2 - nxt1 < U16_MAX)
2303 NEXT_OFF(nxt1) = nxt2 - nxt1;
2304 else
2305 OP(nxt) = NOTHING; /* Cannot beautify */
2306 }
2307 nxt1 = nnxt;
2308 }
2309#endif
2310 /* Optimize again: */
b81d288d 2311 study_chunk(pRExC_state, &nxt1, &deltanext, nxt,
a3621e74 2312 NULL, 0,depth+1);
a0ed51b3
LW
2313 }
2314 else
c277df42 2315 oscan->flags = 0;
c277df42 2316 }
e1901655
IZ
2317 else if ((OP(oscan) == CURLYX)
2318 && (flags & SCF_WHILEM_VISITED_POS)
2319 /* See the comment on a similar expression above.
2320 However, this time it not a subexpression
2321 we care about, but the expression itself. */
2322 && (maxcount == REG_INFTY)
2323 && data && ++data->whilem_c < 16) {
2324 /* This stays as CURLYX, we can put the count/of pair. */
2c2d71f5
JH
2325 /* Find WHILEM (as in regexec.c) */
2326 regnode *nxt = oscan + NEXT_OFF(oscan);
2327
2328 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
2329 nxt += ARG(nxt);
eb160463
GS
2330 PREVOPER(nxt)->flags = (U8)(data->whilem_c
2331 | (RExC_whilem_seen << 4)); /* On WHILEM */
2c2d71f5 2332 }
b81d288d 2333 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
c277df42
IZ
2334 pars++;
2335 if (flags & SCF_DO_SUBSTR) {
2336 SV *last_str = Nullsv;
2337 int counted = mincount != 0;
2338
2339 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
5d1c421c
JH
2340#if defined(SPARC64_GCC_WORKAROUND)
2341 I32 b = 0;
2342 STRLEN l = 0;
cfd0369c 2343 const char *s = NULL;
5d1c421c
JH
2344 I32 old = 0;
2345
2346 if (pos_before >= data->last_start_min)
2347 b = pos_before;
2348 else
2349 b = data->last_start_min;
2350
2351 l = 0;
cfd0369c 2352 s = SvPV_const(data->last_found, l);
5d1c421c
JH
2353 old = b - data->last_start_min;
2354
2355#else
b81d288d 2356 I32 b = pos_before >= data->last_start_min
c277df42
IZ
2357 ? pos_before : data->last_start_min;
2358 STRLEN l;
cfd0369c 2359 const char *s = SvPV_const(data->last_found, l);
a0ed51b3 2360 I32 old = b - data->last_start_min;
5d1c421c 2361#endif
a0ed51b3
LW
2362
2363 if (UTF)
2364 old = utf8_hop((U8*)s, old) - (U8*)s;
c277df42 2365
a0ed51b3 2366 l -= old;
c277df42 2367 /* Get the added string: */
79cb57f6 2368 last_str = newSVpvn(s + old, l);
0e933229
IH
2369 if (UTF)
2370 SvUTF8_on(last_str);
c277df42
IZ
2371 if (deltanext == 0 && pos_before == b) {
2372 /* What was added is a constant string */
2373 if (mincount > 1) {
2374 SvGROW(last_str, (mincount * l) + 1);
b81d288d 2375 repeatcpy(SvPVX(last_str) + l,
3f7c398e 2376 SvPVX_const(last_str), l, mincount - 1);
b162af07 2377 SvCUR_set(last_str, SvCUR(last_str) * mincount);
c277df42 2378 /* Add additional parts. */
b81d288d 2379 SvCUR_set(data->last_found,
c277df42
IZ
2380 SvCUR(data->last_found) - l);
2381 sv_catsv(data->last_found, last_str);
0eda9292
JH
2382 {
2383 SV * sv = data->last_found;
2384 MAGIC *mg =
2385 SvUTF8(sv) && SvMAGICAL(sv) ?
2386 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2387 if (mg && mg->mg_len >= 0)
2388 mg->mg_len += CHR_SVLEN(last_str);
2389 }
c277df42
IZ
2390 data->last_end += l * (mincount - 1);
2391 }
2a8d9689
HS
2392 } else {
2393 /* start offset must point into the last copy */
2394 data->last_start_min += minnext * (mincount - 1);
c152dc43
HS
2395 data->last_start_max += is_inf ? I32_MAX
2396 : (maxcount - 1) * (minnext + data->pos_delta);
c277df42
IZ
2397 }
2398 }
2399 /* It is counted once already... */
2400 data->pos_min += minnext * (mincount - counted);
2401 data->pos_delta += - counted * deltanext +
2402 (minnext + deltanext) * maxcount - minnext * mincount;
2403 if (mincount != maxcount) {
653099ff
GS
2404 /* Cannot extend fixed substrings found inside
2405 the group. */
830247a4 2406 scan_commit(pRExC_state,data);
c277df42
IZ
2407 if (mincount && last_str) {
2408 sv_setsv(data->last_found, last_str);
2409 data->last_end = data->pos_min;
b81d288d 2410 data->last_start_min =
a0ed51b3 2411 data->pos_min - CHR_SVLEN(last_str);
b81d288d
AB
2412 data->last_start_max = is_inf
2413 ? I32_MAX
c277df42 2414 : data->pos_min + data->pos_delta
a0ed51b3 2415 - CHR_SVLEN(last_str);
c277df42
IZ
2416 }
2417 data->longest = &(data->longest_float);
2418 }
aca2d497 2419 SvREFCNT_dec(last_str);
c277df42 2420 }
405ff068 2421 if (data && (fl & SF_HAS_EVAL))
c277df42
IZ
2422 data->flags |= SF_HAS_EVAL;
2423 optimize_curly_tail:
c277df42 2424 if (OP(oscan) != CURLYX) {
22c35a8c 2425 while (PL_regkind[(U8)OP(next = regnext(oscan))] == NOTHING
c277df42
IZ
2426 && NEXT_OFF(next))
2427 NEXT_OFF(oscan) += NEXT_OFF(next);
2428 }
c277df42 2429 continue;
653099ff 2430 default: /* REF and CLUMP only? */
c277df42 2431 if (flags & SCF_DO_SUBSTR) {
830247a4 2432 scan_commit(pRExC_state,data); /* Cannot expect anything... */
c277df42
IZ
2433 data->longest = &(data->longest_float);
2434 }
aca2d497 2435 is_inf = is_inf_internal = 1;
653099ff 2436 if (flags & SCF_DO_STCLASS_OR)
830247a4 2437 cl_anything(pRExC_state, data->start_class);
653099ff 2438 flags &= ~SCF_DO_STCLASS;
c277df42
IZ
2439 break;
2440 }
a0ed51b3 2441 }
bfed75c6 2442 else if (strchr((const char*)PL_simple,OP(scan))) {
9c5ffd7c 2443 int value = 0;
653099ff 2444
c277df42 2445 if (flags & SCF_DO_SUBSTR) {
830247a4 2446 scan_commit(pRExC_state,data);
c277df42
IZ
2447 data->pos_min++;
2448 }
2449 min++;
653099ff
GS
2450 if (flags & SCF_DO_STCLASS) {
2451 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
2452
2453 /* Some of the logic below assumes that switching
2454 locale on will only add false positives. */
2455 switch (PL_regkind[(U8)OP(scan)]) {
653099ff 2456 case SANY:
653099ff
GS
2457 default:
2458 do_default:
2459 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
2460 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
830247a4 2461 cl_anything(pRExC_state, data->start_class);
653099ff
GS
2462 break;
2463 case REG_ANY:
2464 if (OP(scan) == SANY)
2465 goto do_default;
2466 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
2467 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
2468 || (data->start_class->flags & ANYOF_CLASS));
830247a4 2469 cl_anything(pRExC_state, data->start_class);
653099ff
GS
2470 }
2471 if (flags & SCF_DO_STCLASS_AND || !value)
2472 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
2473 break;
2474 case ANYOF:
2475 if (flags & SCF_DO_STCLASS_AND)
2476 cl_and(data->start_class,
2477 (struct regnode_charclass_class*)scan);
2478 else
830247a4 2479 cl_or(pRExC_state, data->start_class,
653099ff
GS
2480 (struct regnode_charclass_class*)scan);
2481 break;
2482 case ALNUM:
2483 if (flags & SCF_DO_STCLASS_AND) {
2484 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2485 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
2486 for (value = 0; value < 256; value++)
2487 if (!isALNUM(value))
2488 ANYOF_BITMAP_CLEAR(data->start_class, value);
2489 }
2490 }
2491 else {
2492 if (data->start_class->flags & ANYOF_LOCALE)
2493 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
2494 else {
2495 for (value = 0; value < 256; value++)
2496 if (isALNUM(value))
b81d288d 2497 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
2498 }
2499 }
2500 break;
2501 case ALNUML:
2502 if (flags & SCF_DO_STCLASS_AND) {
2503 if (data->start_class->flags & ANYOF_LOCALE)
2504 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
2505 }
2506 else {
2507 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
2508 data->start_class->flags |= ANYOF_LOCALE;
2509 }
2510 break;
2511 case NALNUM:
2512 if (flags & SCF_DO_STCLASS_AND) {
2513 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2514 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
2515 for (value = 0; value < 256; value++)
2516 if (isALNUM(value))
2517 ANYOF_BITMAP_CLEAR(data->start_class, value);
2518 }
2519 }
2520 else {
2521 if (data->start_class->flags & ANYOF_LOCALE)
2522 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
2523 else {
2524 for (value = 0; value < 256; value++)
2525 if (!isALNUM(value))
b81d288d 2526 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
2527 }
2528 }
2529 break;
2530 case NALNUML:
2531 if (flags & SCF_DO_STCLASS_AND) {
2532 if (data->start_class->flags & ANYOF_LOCALE)
2533 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
2534 }
2535 else {
2536 data->start_class->flags |= ANYOF_LOCALE;
2537 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
2538 }
2539 break;
2540 case SPACE:
2541 if (flags & SCF_DO_STCLASS_AND) {
2542 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2543 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
2544 for (value = 0; value < 256; value++)
2545 if (!isSPACE(value))
2546 ANYOF_BITMAP_CLEAR(data->start_class, value);
2547 }
2548 }
2549 else {
2550 if (data->start_class->flags & ANYOF_LOCALE)
2551 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
2552 else {
2553 for (value = 0; value < 256; value++)
2554 if (isSPACE(value))
b81d288d 2555 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
2556 }
2557 }
2558 break;
2559 case SPACEL:
2560 if (flags & SCF_DO_STCLASS_AND) {
2561 if (data->start_class->flags & ANYOF_LOCALE)
2562 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
2563 }
2564 else {
2565 data->start_class->flags |= ANYOF_LOCALE;
2566 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
2567 }
2568 break;
2569 case NSPACE:
2570 if (flags & SCF_DO_STCLASS_AND) {
2571 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2572 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
2573 for (value = 0; value < 256; value++)
2574 if (isSPACE(value))
2575 ANYOF_BITMAP_CLEAR(data->start_class, value);
2576 }
2577 }
2578 else {
2579 if (data->start_class->flags & ANYOF_LOCALE)
2580 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
2581 else {
2582 for (value = 0; value < 256; value++)
2583 if (!isSPACE(value))
b81d288d 2584 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
2585 }
2586 }
2587 break;
2588 case NSPACEL:
2589 if (flags & SCF_DO_STCLASS_AND) {
2590 if (data->start_class->flags & ANYOF_LOCALE) {
2591 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
2592 for (value = 0; value < 256; value++)
2593 if (!isSPACE(value))
2594 ANYOF_BITMAP_CLEAR(data->start_class, value);
2595 }
2596 }
2597 else {
2598 data->start_class->flags |= ANYOF_LOCALE;
2599 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
2600 }
2601 break;
2602 case DIGIT:
2603 if (flags & SCF_DO_STCLASS_AND) {
2604 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
2605 for (value = 0; value < 256; value++)
2606 if (!isDIGIT(value))
2607 ANYOF_BITMAP_CLEAR(data->start_class, value);
2608 }
2609 else {
2610 if (data->start_class->flags & ANYOF_LOCALE)
2611 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
2612 else {
2613 for (value = 0; value < 256; value++)
2614 if (isDIGIT(value))
b81d288d 2615 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
2616 }
2617 }
2618 break;
2619 case NDIGIT:
2620 if (flags & SCF_DO_STCLASS_AND) {
2621 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
2622 for (value = 0; value < 256; value++)
2623 if (isDIGIT(value))
2624 ANYOF_BITMAP_CLEAR(data->start_class, value);
2625 }
2626 else {
2627 if (data->start_class->flags & ANYOF_LOCALE)
2628 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
2629 else {
2630 for (value = 0; value < 256; value++)
2631 if (!isDIGIT(value))
b81d288d 2632 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
2633 }
2634 }
2635 break;
2636 }
2637 if (flags & SCF_DO_STCLASS_OR)
2638 cl_and(data->start_class, &and_with);
2639 flags &= ~SCF_DO_STCLASS;
2640 }
a0ed51b3 2641 }
22c35a8c 2642 else if (PL_regkind[(U8)OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
c277df42
IZ
2643 data->flags |= (OP(scan) == MEOL
2644 ? SF_BEFORE_MEOL
2645 : SF_BEFORE_SEOL);
a0ed51b3 2646 }
653099ff
GS
2647 else if ( PL_regkind[(U8)OP(scan)] == BRANCHJ
2648 /* Lookbehind, or need to calculate parens/evals/stclass: */
2649 && (scan->flags || data || (flags & SCF_DO_STCLASS))
c277df42 2650 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
653099ff 2651 /* Lookahead/lookbehind */
cb434fcc 2652 I32 deltanext, minnext, fake = 0;
c277df42 2653 regnode *nscan;
653099ff
GS
2654 struct regnode_charclass_class intrnl;
2655 int f = 0;
c277df42
IZ
2656
2657 data_fake.flags = 0;
b81d288d 2658 if (data) {
2c2d71f5 2659 data_fake.whilem_c = data->whilem_c;
cb434fcc
IZ
2660 data_fake.last_closep = data->last_closep;
2661 }
2662 else
2663 data_fake.last_closep = &fake;
653099ff
GS
2664 if ( flags & SCF_DO_STCLASS && !scan->flags
2665 && OP(scan) == IFMATCH ) { /* Lookahead */
830247a4 2666 cl_init(pRExC_state, &intrnl);
653099ff 2667 data_fake.start_class = &intrnl;
e1901655 2668 f |= SCF_DO_STCLASS_AND;
653099ff 2669 }
e1901655
IZ
2670 if (flags & SCF_WHILEM_VISITED_POS)
2671 f |= SCF_WHILEM_VISITED_POS;
c277df42
IZ
2672 next = regnext(scan);
2673 nscan = NEXTOPER(NEXTOPER(scan));
a3621e74 2674 minnext = study_chunk(pRExC_state, &nscan, &deltanext, last, &data_fake, f,depth+1);
c277df42
IZ
2675 if (scan->flags) {
2676 if (deltanext) {
9baa0206 2677 vFAIL("Variable length lookbehind not implemented");
a0ed51b3
LW
2678 }
2679 else if (minnext > U8_MAX) {
9baa0206 2680 vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
c277df42 2681 }
eb160463 2682 scan->flags = (U8)minnext;
c277df42
IZ
2683 }
2684 if (data && data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
2685 pars++;
405ff068 2686 if (data && (data_fake.flags & SF_HAS_EVAL))
c277df42 2687 data->flags |= SF_HAS_EVAL;
2c2d71f5
JH
2688 if (data)
2689 data->whilem_c = data_fake.whilem_c;
e1901655 2690 if (f & SCF_DO_STCLASS_AND) {
a28509cc 2691 const int was = (data->start_class->flags & ANYOF_EOS);
653099ff
GS
2692
2693 cl_and(data->start_class, &intrnl);
2694 if (was)
2695 data->start_class->flags |= ANYOF_EOS;
2696 }
a0ed51b3
LW
2697 }
2698 else if (OP(scan) == OPEN) {
c277df42 2699 pars++;
a0ed51b3 2700 }
cb434fcc 2701 else if (OP(scan) == CLOSE) {
eb160463 2702 if ((I32)ARG(scan) == is_par) {
cb434fcc 2703 next = regnext(scan);
c277df42 2704
cb434fcc
IZ
2705 if ( next && (OP(next) != WHILEM) && next < last)
2706 is_par = 0; /* Disable optimization */
2707 }
2708 if (data)
2709 *(data->last_closep) = ARG(scan);
a0ed51b3
LW
2710 }
2711 else if (OP(scan) == EVAL) {
c277df42
IZ
2712 if (data)
2713 data->flags |= SF_HAS_EVAL;
2714 }
96776eda 2715 else if (OP(scan) == LOGICAL && scan->flags == 2) { /* Embedded follows */
0f5d15d6 2716 if (flags & SCF_DO_SUBSTR) {
830247a4 2717 scan_commit(pRExC_state,data);
0f5d15d6
IZ
2718 data->longest = &(data->longest_float);
2719 }
2720 is_inf = is_inf_internal = 1;
653099ff 2721 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
830247a4 2722 cl_anything(pRExC_state, data->start_class);
96776eda 2723 flags &= ~SCF_DO_STCLASS;
0f5d15d6 2724 }
c277df42
IZ
2725 /* Else: zero-length, ignore. */
2726 scan = regnext(scan);
2727 }
2728
2729 finish:
2730 *scanp = scan;
aca2d497 2731 *deltap = is_inf_internal ? I32_MAX : delta;
b81d288d 2732 if (flags & SCF_DO_SUBSTR && is_inf)
c277df42
IZ
2733 data->pos_delta = I32_MAX - data->pos_min;
2734 if (is_par > U8_MAX)
2735 is_par = 0;
2736 if (is_par && pars==1 && data) {
2737 data->flags |= SF_IN_PAR;
2738 data->flags &= ~SF_HAS_PAR;
a0ed51b3
LW
2739 }
2740 else if (pars && data) {
c277df42
IZ
2741 data->flags |= SF_HAS_PAR;
2742 data->flags &= ~SF_IN_PAR;
2743 }
653099ff
GS
2744 if (flags & SCF_DO_STCLASS_OR)
2745 cl_and(data->start_class, &and_with);
c277df42
IZ
2746 return min;
2747}
2748
76e3520e 2749STATIC I32
bfed75c6 2750S_add_data(pTHX_ RExC_state_t *pRExC_state, I32 n, const char *s)
c277df42 2751{
830247a4 2752 if (RExC_rx->data) {
b81d288d
AB
2753 Renewc(RExC_rx->data,
2754 sizeof(*RExC_rx->data) + sizeof(void*) * (RExC_rx->data->count + n - 1),
c277df42 2755 char, struct reg_data);
830247a4
IZ
2756 Renew(RExC_rx->data->what, RExC_rx->data->count + n, U8);
2757 RExC_rx->data->count += n;
a0ed51b3
LW
2758 }
2759 else {
830247a4 2760 Newc(1207, RExC_rx->data, sizeof(*RExC_rx->data) + sizeof(void*) * (n - 1),
c277df42 2761 char, struct reg_data);
830247a4
IZ
2762 New(1208, RExC_rx->data->what, n, U8);
2763 RExC_rx->data->count = n;
c277df42 2764 }
830247a4
IZ
2765 Copy(s, RExC_rx->data->what + RExC_rx->data->count - n, n, U8);
2766 return RExC_rx->data->count - n;
c277df42
IZ
2767}
2768
d88dccdf 2769void
864dbfa3 2770Perl_reginitcolors(pTHX)
d88dccdf 2771{
1df70142 2772 const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
d88dccdf 2773 if (s) {
1df70142
AL
2774 char *t = savepv(s);
2775 int i = 0;
2776 PL_colors[0] = t;
d88dccdf 2777 while (++i < 6) {
1df70142
AL
2778 t = strchr(t, '\t');
2779 if (t) {
2780 *t = '\0';
2781 PL_colors[i] = ++t;
d88dccdf
IZ
2782 }
2783 else
1df70142 2784 PL_colors[i] = t = (char *)"";
d88dccdf
IZ
2785 }
2786 } else {
1df70142 2787 int i = 0;
b81d288d 2788 while (i < 6)
06b5626a 2789 PL_colors[i++] = (char *)"";
d88dccdf
IZ
2790 }
2791 PL_colorset = 1;
2792}
2793
8615cb43 2794
a687059c 2795/*
e50aee73 2796 - pregcomp - compile a regular expression into internal code
a687059c
LW
2797 *
2798 * We can't allocate space until we know how big the compiled form will be,
2799 * but we can't compile it (and thus know how big it is) until we've got a
2800 * place to put the code. So we cheat: we compile it twice, once with code
2801 * generation turned off and size counting turned on, and once "for real".
2802 * This also means that we don't allocate space until we are sure that the
2803 * thing really will compile successfully, and we never have to move the
2804 * code and thus invalidate pointers into it. (Note that it has to be in
2805 * one piece because free() must be able to free it all.) [NB: not true in perl]
2806 *
2807 * Beware that the optimization-preparation code in here knows about some
2808 * of the structure of the compiled regexp. [I'll say.]
2809 */
2810regexp *
864dbfa3 2811Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
a687059c 2812{
a0d0e21e 2813 register regexp *r;
c277df42 2814 regnode *scan;
c277df42 2815 regnode *first;
a0d0e21e 2816 I32 flags;
a0d0e21e
LW
2817 I32 minlen = 0;
2818 I32 sawplus = 0;
2819 I32 sawopen = 0;
2c2d71f5 2820 scan_data_t data;
830247a4
IZ
2821 RExC_state_t RExC_state;
2822 RExC_state_t *pRExC_state = &RExC_state;
a0d0e21e 2823
a3621e74
YO
2824 GET_RE_DEBUG_FLAGS_DECL;
2825
a0d0e21e 2826 if (exp == NULL)
c277df42 2827 FAIL("NULL regexp argument");
a0d0e21e 2828
a5961de5 2829 RExC_utf8 = pm->op_pmdynflags & PMdf_CMP_UTF8;
a0ed51b3 2830
5cfc7842 2831 RExC_precomp = exp;
a3621e74
YO
2832 DEBUG_r(if (!PL_colorset) reginitcolors());
2833 DEBUG_COMPILE_r({
2834 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s \"%s%*s%s\"\n",
a5961de5
JH
2835 PL_colors[4],PL_colors[5],PL_colors[0],
2836 (int)(xend - exp), RExC_precomp, PL_colors[1]);
2837 });
e2509266 2838 RExC_flags = pm->op_pmflags;
830247a4 2839 RExC_sawback = 0;
bbce6d69 2840
830247a4
IZ
2841 RExC_seen = 0;
2842 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
2843 RExC_seen_evals = 0;
2844 RExC_extralen = 0;
c277df42 2845
bbce6d69 2846 /* First pass: determine size, legality. */
830247a4 2847 RExC_parse = exp;
fac92740 2848 RExC_start = exp;
830247a4
IZ
2849 RExC_end = xend;
2850 RExC_naughty = 0;
2851 RExC_npar = 1;
2852 RExC_size = 0L;
2853 RExC_emit = &PL_regdummy;
2854 RExC_whilem_seen = 0;
85ddcde9
JH
2855#if 0 /* REGC() is (currently) a NOP at the first pass.
2856 * Clever compilers notice this and complain. --jhi */
830247a4 2857 REGC((U8)REG_MAGIC, (char*)RExC_emit);
85ddcde9 2858#endif
830247a4 2859 if (reg(pRExC_state, 0, &flags) == NULL) {
830247a4 2860 RExC_precomp = Nullch;
a0d0e21e
LW
2861 return(NULL);
2862 }
a3621e74 2863 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "size %"IVdf" ", (IV)RExC_size));
c277df42 2864
c277df42
IZ
2865 /* Small enough for pointer-storage convention?
2866 If extralen==0, this means that we will not need long jumps. */
830247a4
IZ
2867 if (RExC_size >= 0x10000L && RExC_extralen)
2868 RExC_size += RExC_extralen;
c277df42 2869 else
830247a4
IZ
2870 RExC_extralen = 0;
2871 if (RExC_whilem_seen > 15)
2872 RExC_whilem_seen = 15;
a0d0e21e 2873
bbce6d69 2874 /* Allocate space and initialize. */
830247a4 2875 Newc(1001, r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode),
c277df42 2876 char, regexp);
a0d0e21e 2877 if (r == NULL)
b45f050a
JF
2878 FAIL("Regexp out of space");
2879
0f79a09d
GS
2880#ifdef DEBUGGING
2881 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
830247a4 2882 Zero(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode), char);
0f79a09d 2883#endif
c277df42 2884 r->refcnt = 1;
bbce6d69 2885 r->prelen = xend - exp;
5cfc7842 2886 r->precomp = savepvn(RExC_precomp, r->prelen);
cf93c79d 2887 r->subbeg = NULL;
f8c7b90f 2888#ifdef PERL_OLD_COPY_ON_WRITE
ed252734
NC
2889 r->saved_copy = Nullsv;
2890#endif
cf93c79d 2891 r->reganch = pm->op_pmflags & PMf_COMPILETIME;
830247a4 2892 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
4327152a
IZ
2893
2894 r->substrs = 0; /* Useful during FAIL. */
2895 r->startp = 0; /* Useful during FAIL. */
2896 r->endp = 0; /* Useful during FAIL. */
2897
fac92740
MJD
2898 Newz(1304, r->offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
2899 if (r->offsets) {
2af232bd 2900 r->offsets[0] = RExC_size;
fac92740 2901 }
a3621e74 2902 DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
2af232bd
SS
2903 "%s %"UVuf" bytes for offset annotations.\n",
2904 r->offsets ? "Got" : "Couldn't get",
392fbf5d 2905 (UV)((2*RExC_size+1) * sizeof(U32))));
fac92740 2906
830247a4 2907 RExC_rx = r;
bbce6d69 2908
2909 /* Second pass: emit code. */
e2509266 2910 RExC_flags = pm->op_pmflags; /* don't let top level (?i) bleed */
830247a4
IZ
2911 RExC_parse = exp;
2912 RExC_end = xend;
2913 RExC_naughty = 0;
2914 RExC_npar = 1;
fac92740 2915 RExC_emit_start = r->program;
830247a4 2916 RExC_emit = r->program;
2cd61cdb 2917 /* Store the count of eval-groups for security checks: */
eb160463 2918 RExC_emit->next_off = (U16)((RExC_seen_evals > U16_MAX) ? U16_MAX : RExC_seen_evals);
830247a4 2919 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
c277df42 2920 r->data = 0;
830247a4 2921 if (reg(pRExC_state, 0, &flags) == NULL)
a0d0e21e
LW
2922 return(NULL);
2923
a3621e74 2924
a0d0e21e 2925 /* Dig out information for optimizations. */
cf93c79d 2926 r->reganch = pm->op_pmflags & PMf_COMPILETIME; /* Again? */
e2509266 2927 pm->op_pmflags = RExC_flags;
a0ed51b3 2928 if (UTF)
5ff6fc6d 2929 r->reganch |= ROPT_UTF8; /* Unicode in it? */
c277df42 2930 r->regstclass = NULL;
830247a4 2931 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
a0ed51b3 2932 r->reganch |= ROPT_NAUGHTY;
c277df42 2933 scan = r->program + 1; /* First BRANCH. */
2779dcf1
IZ
2934
2935 /* XXXX To minimize changes to RE engine we always allocate
2936 3-units-long substrs field. */
2937 Newz(1004, r->substrs, 1, struct reg_substr_data);
2938
2c2d71f5 2939 StructCopy(&zero_scan_data, &data, scan_data_t);
653099ff 2940 /* XXXX Should not we check for something else? Usually it is OPEN1... */
c277df42 2941 if (OP(scan) != BRANCH) { /* Only one top-level choice. */
c277df42 2942 I32 fake;
c5254dd6 2943 STRLEN longest_float_length, longest_fixed_length;
653099ff
GS
2944 struct regnode_charclass_class ch_class;
2945 int stclass_flag;
cb434fcc 2946 I32 last_close = 0;
a0d0e21e
LW
2947
2948 first = scan;
c277df42 2949 /* Skip introductions and multiplicators >= 1. */
a0d0e21e 2950 while ((OP(first) == OPEN && (sawopen = 1)) ||
653099ff 2951 /* An OR of *one* alternative - should not happen now. */
a0d0e21e
LW
2952 (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) ||
2953 (OP(first) == PLUS) ||
2954 (OP(first) == MINMOD) ||
653099ff 2955 /* An {n,m} with n>0 */
22c35a8c 2956 (PL_regkind[(U8)OP(first)] == CURLY && ARG1(first) > 0) ) {
a0d0e21e
LW
2957 if (OP(first) == PLUS)
2958 sawplus = 1;
2959 else
2960 first += regarglen[(U8)OP(first)];
2961 first = NEXTOPER(first);
a687059c
LW
2962 }
2963
a0d0e21e
LW
2964 /* Starting-point info. */
2965 again:
653099ff 2966 if (PL_regkind[(U8)OP(first)] == EXACT) {
1aa99e6b
IH
2967 if (OP(first) == EXACT)
2968 ; /* Empty, get anchored substr later. */
2969 else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
b3c9acc1
IZ
2970 r->regstclass = first;
2971 }
bfed75c6 2972 else if (strchr((const char*)PL_simple,OP(first)))
a0d0e21e 2973 r->regstclass = first;
22c35a8c
GS
2974 else if (PL_regkind[(U8)OP(first)] == BOUND ||
2975 PL_regkind[(U8)OP(first)] == NBOUND)
a0d0e21e 2976 r->regstclass = first;
22c35a8c 2977 else if (PL_regkind[(U8)OP(first)] == BOL) {
cad2e5aa
JH
2978 r->reganch |= (OP(first) == MBOL
2979 ? ROPT_ANCH_MBOL
2980 : (OP(first) == SBOL
2981 ? ROPT_ANCH_SBOL
2982 : ROPT_ANCH_BOL));
a0d0e21e 2983 first = NEXTOPER(first);
774d564b 2984 goto again;
2985 }
2986 else if (OP(first) == GPOS) {
2987 r->reganch |= ROPT_ANCH_GPOS;
2988 first = NEXTOPER(first);
2989 goto again;
a0d0e21e 2990 }
e09294f4 2991 else if (!sawopen && (OP(first) == STAR &&
22c35a8c 2992 PL_regkind[(U8)OP(NEXTOPER(first))] == REG_ANY) &&
a0d0e21e
LW
2993 !(r->reganch & ROPT_ANCH) )
2994 {
2995 /* turn .* into ^.* with an implied $*=1 */
1df70142
AL
2996 const int type =
2997 (OP(NEXTOPER(first)) == REG_ANY)
2998 ? ROPT_ANCH_MBOL
2999 : ROPT_ANCH_SBOL;
cad2e5aa 3000 r->reganch |= type | ROPT_IMPLICIT;
a0d0e21e 3001 first = NEXTOPER(first);
774d564b 3002 goto again;
a0d0e21e 3003 }
b81d288d 3004 if (sawplus && (!sawopen || !RExC_sawback)
830247a4 3005 && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
cad2e5aa
JH
3006 /* x+ must match at the 1st pos of run of x's */
3007 r->reganch |= ROPT_SKIP;
a0d0e21e 3008
c277df42 3009 /* Scan is after the zeroth branch, first is atomic matcher. */
a3621e74 3010 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
b900a521 3011 (IV)(first - scan + 1)));
a0d0e21e
LW
3012 /*
3013 * If there's something expensive in the r.e., find the
3014 * longest literal string that must appear and make it the
3015 * regmust. Resolve ties in favor of later strings, since
3016 * the regstart check works with the beginning of the r.e.
3017 * and avoiding duplication strengthens checking. Not a
3018 * strong reason, but sufficient in the absence of others.
3019 * [Now we resolve ties in favor of the earlier string if
c277df42 3020 * it happens that c_offset_min has been invalidated, since the
a0d0e21e
LW
3021 * earlier string may buy us something the later one won't.]
3022 */
a0d0e21e 3023 minlen = 0;
a687059c 3024
79cb57f6
GS
3025 data.longest_fixed = newSVpvn("",0);
3026 data.longest_float = newSVpvn("",0);
3027 data.last_found = newSVpvn("",0);
c277df42
IZ
3028 data.longest = &(data.longest_fixed);
3029 first = scan;
653099ff 3030 if (!r->regstclass) {
830247a4 3031 cl_init(pRExC_state, &ch_class);
653099ff
GS
3032 data.start_class = &ch_class;
3033 stclass_flag = SCF_DO_STCLASS_AND;
3034 } else /* XXXX Check for BOUND? */
3035 stclass_flag = 0;
cb434fcc 3036 data.last_closep = &last_close;
653099ff 3037
830247a4 3038 minlen = study_chunk(pRExC_state, &first, &fake, scan + RExC_size, /* Up to end */
a3621e74 3039 &data, SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
830247a4 3040 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
b81d288d 3041 && data.last_start_min == 0 && data.last_end > 0
830247a4
IZ
3042 && !RExC_seen_zerolen
3043 && (!(RExC_seen & REG_SEEN_GPOS) || (r->reganch & ROPT_ANCH_GPOS)))
c277df42 3044 r->reganch |= ROPT_CHECK_ALL;
830247a4 3045 scan_commit(pRExC_state, &data);
c277df42
IZ
3046 SvREFCNT_dec(data.last_found);
3047
a0ed51b3 3048 longest_float_length = CHR_SVLEN(data.longest_float);
c5254dd6 3049 if (longest_float_length
c277df42
IZ
3050 || (data.flags & SF_FL_BEFORE_EOL
3051 && (!(data.flags & SF_FL_BEFORE_MEOL)
e2509266 3052 || (RExC_flags & PMf_MULTILINE)))) {
cf93c79d
IZ
3053 int t;
3054
a0ed51b3 3055 if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
aca2d497
IZ
3056 && data.offset_fixed == data.offset_float_min
3057 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
3058 goto remove_float; /* As in (a)+. */
3059
33b8afdf
JH
3060 if (SvUTF8(data.longest_float)) {
3061 r->float_utf8 = data.longest_float;
3062 r->float_substr = Nullsv;
3063 } else {
3064 r->float_substr = data.longest_float;
3065 r->float_utf8 = Nullsv;
3066 }
c277df42
IZ
3067 r->float_min_offset = data.offset_float_min;
3068 r->float_max_offset = data.offset_float_max;
cf93c79d
IZ
3069 t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
3070 && (!(data.flags & SF_FL_BEFORE_MEOL)
e2509266 3071 || (RExC_flags & PMf_MULTILINE)));
33b8afdf 3072 fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
a0ed51b3
LW
3073 }
3074 else {
aca2d497 3075 remove_float:
33b8afdf 3076 r->float_substr = r->float_utf8 = Nullsv;
c277df42 3077 SvREFCNT_dec(data.longest_float);
c5254dd6 3078 longest_float_length = 0;
a0d0e21e 3079 }
c277df42 3080
a0ed51b3 3081 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
c5254dd6 3082 if (longest_fixed_length
c277df42
IZ
3083 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
3084 && (!(data.flags & SF_FIX_BEFORE_MEOL)
e2509266 3085 || (RExC_flags & PMf_MULTILINE)))) {
cf93c79d
IZ
3086 int t;
3087
33b8afdf
JH
3088 if (SvUTF8(data.longest_fixed)) {
3089 r->anchored_utf8 = data.longest_fixed;
3090 r->anchored_substr = Nullsv;
3091 } else {
3092 r->anchored_substr = data.longest_fixed;
3093 r->anchored_utf8 = Nullsv;
3094 }
c277df42 3095 r->anchored_offset = data.offset_fixed;
cf93c79d
IZ
3096 t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
3097 && (!(data.flags & SF_FIX_BEFORE_MEOL)
e2509266 3098 || (RExC_flags & PMf_MULTILINE)));
33b8afdf 3099 fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
a0ed51b3
LW
3100 }
3101 else {
33b8afdf 3102 r->anchored_substr = r->anchored_utf8 = Nullsv;
c277df42 3103 SvREFCNT_dec(data.longest_fixed);
c5254dd6 3104 longest_fixed_length = 0;
a0d0e21e 3105 }
b81d288d 3106 if (r->regstclass
ffc61ed2 3107 && (OP(r->regstclass) == REG_ANY || OP(r->regstclass) == SANY))
653099ff 3108 r->regstclass = NULL;
33b8afdf
JH
3109 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
3110 && stclass_flag
653099ff 3111 && !(data.start_class->flags & ANYOF_EOS)
eb160463
GS
3112 && !cl_is_anything(data.start_class))
3113 {
1df70142 3114 const I32 n = add_data(pRExC_state, 1, "f");
653099ff 3115
b81d288d 3116 New(1006, RExC_rx->data->data[n], 1,
653099ff
GS
3117 struct regnode_charclass_class);
3118 StructCopy(data.start_class,
830247a4 3119 (struct regnode_charclass_class*)RExC_rx->data->data[n],
653099ff 3120 struct regnode_charclass_class);
830247a4 3121 r->regstclass = (regnode*)RExC_rx->data->data[n];
653099ff 3122 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
ffc61ed2 3123 PL_regdata = r->data; /* for regprop() */
a3621e74 3124 DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
9c5ffd7c
JH
3125 regprop(sv, (regnode*)data.start_class);
3126 PerlIO_printf(Perl_debug_log,
a0288114 3127 "synthetic stclass \"%s\".\n",
3f7c398e 3128 SvPVX_const(sv));});
653099ff 3129 }
c277df42
IZ
3130
3131 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
c5254dd6 3132 if (longest_fixed_length > longest_float_length) {
c277df42 3133 r->check_substr = r->anchored_substr;
33b8afdf 3134 r->check_utf8 = r->anchored_utf8;
c277df42
IZ
3135 r->check_offset_min = r->check_offset_max = r->anchored_offset;
3136 if (r->reganch & ROPT_ANCH_SINGLE)
3137 r->reganch |= ROPT_NOSCAN;
a0ed51b3
LW
3138 }
3139 else {
c277df42 3140 r->check_substr = r->float_substr;
33b8afdf 3141 r->check_utf8 = r->float_utf8;
c277df42
IZ
3142 r->check_offset_min = data.offset_float_min;
3143 r->check_offset_max = data.offset_float_max;
a0d0e21e 3144 }
30382c73
IZ
3145 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
3146 This should be changed ASAP! */
33b8afdf 3147 if ((r->check_substr || r->check_utf8) && !(r->reganch & ROPT_ANCH_GPOS)) {
cad2e5aa 3148 r->reganch |= RE_USE_INTUIT;
33b8afdf 3149 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
cad2e5aa
JH
3150 r->reganch |= RE_INTUIT_TAIL;
3151 }
a0ed51b3
LW
3152 }
3153 else {
c277df42
IZ
3154 /* Several toplevels. Best we can is to set minlen. */
3155 I32 fake;
653099ff 3156 struct regnode_charclass_class ch_class;
cb434fcc 3157 I32 last_close = 0;
c277df42 3158
a3621e74 3159 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "\n"));
c277df42 3160 scan = r->program + 1;
830247a4 3161 cl_init(pRExC_state, &ch_class);
653099ff 3162 data.start_class = &ch_class;
cb434fcc 3163 data.last_closep = &last_close;
a3621e74 3164 minlen = study_chunk(pRExC_state, &scan, &fake, scan + RExC_size, &data, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
33b8afdf
JH
3165 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
3166 = r->float_substr = r->float_utf8 = Nullsv;
653099ff 3167 if (!(data.start_class->flags & ANYOF_EOS)
eb160463
GS
3168 && !cl_is_anything(data.start_class))
3169 {
1df70142 3170 const I32 n = add_data(pRExC_state, 1, "f");
653099ff 3171
b81d288d 3172 New(1006, RExC_rx->data->data[n], 1,
653099ff
GS
3173 struct regnode_charclass_class);
3174 StructCopy(data.start_class,
830247a4 3175 (struct regnode_charclass_class*)RExC_rx->data->data[n],
653099ff 3176 struct regnode_charclass_class);
830247a4 3177 r->regstclass = (regnode*)RExC_rx->data->data[n];
653099ff 3178 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
a3621e74 3179 DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
9c5ffd7c
JH
3180 regprop(sv, (regnode*)data.start_class);
3181 PerlIO_printf(Perl_debug_log,
a0288114 3182 "synthetic stclass \"%s\".\n",
3f7c398e 3183 SvPVX_const(sv));});
653099ff 3184 }
a0d0e21e
LW
3185 }
3186
a0d0e21e 3187 r->minlen = minlen;
b81d288d 3188 if (RExC_seen & REG_SEEN_GPOS)
c277df42 3189 r->reganch |= ROPT_GPOS_SEEN;
830247a4 3190 if (RExC_seen & REG_SEEN_LOOKBEHIND)
c277df42 3191 r->reganch |= ROPT_LOOKBEHIND_SEEN;
830247a4 3192 if (RExC_seen & REG_SEEN_EVAL)
ce862d02 3193 r->reganch |= ROPT_EVAL_SEEN;
f33976b4
DB
3194 if (RExC_seen & REG_SEEN_CANY)
3195 r->reganch |= ROPT_CANY_SEEN;
830247a4
IZ
3196 Newz(1002, r->startp, RExC_npar, I32);
3197 Newz(1002, r->endp, RExC_npar, I32);
ffc61ed2 3198 PL_regdata = r->data; /* for regprop() */
a3621e74 3199 DEBUG_COMPILE_r(regdump(r));
a0d0e21e 3200 return(r);
a687059c
LW
3201}
3202
3203/*
3204 - reg - regular expression, i.e. main body or parenthesized thing
3205 *
3206 * Caller must absorb opening parenthesis.
3207 *
3208 * Combining parenthesis handling with the base level of regular expression
3209 * is a trifle forced, but the need to tie the tails of the branches to what
3210 * follows makes it hard to avoid.
3211 */
76e3520e 3212STATIC regnode *
830247a4 3213S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp)
c277df42 3214 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
a687059c 3215{
27da23d5 3216 dVAR;
c277df42
IZ
3217 register regnode *ret; /* Will be the head of the group. */
3218 register regnode *br;
3219 register regnode *lastbr;
3220 register regnode *ender = 0;
a0d0e21e 3221 register I32 parno = 0;
e2509266 3222 I32 flags, oregflags = RExC_flags, have_branch = 0, open = 0;
9d1d55b5
JP
3223
3224 /* for (?g), (?gc), and (?o) warnings; warning
3225 about (?c) will warn about (?g) -- japhy */
3226
3227 I32 wastedflags = 0x00,
3228 wasted_o = 0x01,
3229 wasted_g = 0x02,
3230 wasted_gc = 0x02 | 0x04,
3231 wasted_c = 0x04;
3232
fac92740 3233 char * parse_start = RExC_parse; /* MJD */
a28509cc 3234 char * const oregcomp_parse = RExC_parse;
c277df42 3235 char c;
a0d0e21e 3236
821b33a5 3237 *flagp = 0; /* Tentatively. */
a0d0e21e 3238
9d1d55b5 3239
a0d0e21e
LW
3240 /* Make an OPEN node, if parenthesized. */
3241 if (paren) {
fac92740 3242 if (*RExC_parse == '?') { /* (?...) */
2b36a5a0
JH
3243 U32 posflags = 0, negflags = 0;
3244 U32 *flagsp = &posflags;
0f5d15d6 3245 int logical = 0;
a28509cc 3246 const char * const seqstart = RExC_parse;
ca9dfc88 3247
830247a4
IZ
3248 RExC_parse++;
3249 paren = *RExC_parse++;
c277df42 3250 ret = NULL; /* For look-ahead/behind. */
a0d0e21e 3251 switch (paren) {
fac92740 3252 case '<': /* (?<...) */
830247a4 3253 RExC_seen |= REG_SEEN_LOOKBEHIND;
b81d288d 3254 if (*RExC_parse == '!')
c277df42 3255 paren = ',';
b81d288d 3256 if (*RExC_parse != '=' && *RExC_parse != '!')
c277df42 3257 goto unknown;
830247a4 3258 RExC_parse++;
fac92740
MJD
3259 case '=': /* (?=...) */
3260 case '!': /* (?!...) */
830247a4 3261 RExC_seen_zerolen++;
fac92740
MJD
3262 case ':': /* (?:...) */
3263 case '>': /* (?>...) */
a0d0e21e 3264 break;
fac92740
MJD
3265 case '$': /* (?$...) */
3266 case '@': /* (?@...) */
8615cb43 3267 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
a0d0e21e 3268 break;
fac92740 3269 case '#': /* (?#...) */
830247a4
IZ
3270 while (*RExC_parse && *RExC_parse != ')')
3271 RExC_parse++;
3272 if (*RExC_parse != ')')
c277df42 3273 FAIL("Sequence (?#... not terminated");
830247a4 3274 nextchar(pRExC_state);
a0d0e21e
LW
3275 *flagp = TRYAGAIN;
3276 return NULL;
fac92740 3277 case 'p': /* (?p...) */
9014280d 3278 if (SIZE_ONLY && ckWARN2(WARN_DEPRECATED, WARN_REGEXP))
f9373011 3279 vWARNdep(RExC_parse, "(?p{}) is deprecated - use (??{})");
8c8ad484 3280 /* FALL THROUGH*/
fac92740 3281 case '?': /* (??...) */
0f5d15d6 3282 logical = 1;
438a3801
YST
3283 if (*RExC_parse != '{')
3284 goto unknown;
830247a4 3285 paren = *RExC_parse++;
0f5d15d6 3286 /* FALL THROUGH */
fac92740 3287 case '{': /* (?{...}) */
c277df42 3288 {
c277df42
IZ
3289 I32 count = 1, n = 0;
3290 char c;
830247a4 3291 char *s = RExC_parse;
c277df42
IZ
3292 SV *sv;
3293 OP_4tree *sop, *rop;
3294
830247a4
IZ
3295 RExC_seen_zerolen++;
3296 RExC_seen |= REG_SEEN_EVAL;
3297 while (count && (c = *RExC_parse)) {
3298 if (c == '\\' && RExC_parse[1])
3299 RExC_parse++;
b81d288d 3300 else if (c == '{')
c277df42 3301 count++;
b81d288d 3302 else if (c == '}')
c277df42 3303 count--;
830247a4 3304 RExC_parse++;
c277df42 3305 }
830247a4 3306 if (*RExC_parse != ')')
b45f050a 3307 {
b81d288d 3308 RExC_parse = s;
b45f050a
JF
3309 vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
3310 }
c277df42 3311 if (!SIZE_ONLY) {
f3548bdc 3312 PAD *pad;
b81d288d
AB
3313
3314 if (RExC_parse - 1 - s)
830247a4 3315 sv = newSVpvn(s, RExC_parse - 1 - s);
c277df42 3316 else
79cb57f6 3317 sv = newSVpvn("", 0);
c277df42 3318
569233ed
SB
3319 ENTER;
3320 Perl_save_re_context(aTHX);
f3548bdc 3321 rop = sv_compile_2op(sv, &sop, "re", &pad);
9b978d73
DM
3322 sop->op_private |= OPpREFCOUNTED;
3323 /* re_dup will OpREFCNT_inc */
3324 OpREFCNT_set(sop, 1);
569233ed 3325 LEAVE;
c277df42 3326
830247a4
IZ
3327 n = add_data(pRExC_state, 3, "nop");
3328 RExC_rx->data->data[n] = (void*)rop;
3329 RExC_rx->data->data[n+1] = (void*)sop;
f3548bdc 3330 RExC_rx->data->data[n+2] = (void*)pad;
c277df42 3331 SvREFCNT_dec(sv);
a0ed51b3 3332 }
e24b16f9 3333 else { /* First pass */
830247a4 3334 if (PL_reginterp_cnt < ++RExC_seen_evals
923e4eb5 3335 && IN_PERL_RUNTIME)
2cd61cdb
IZ
3336 /* No compiled RE interpolated, has runtime
3337 components ===> unsafe. */
3338 FAIL("Eval-group not allowed at runtime, use re 'eval'");
5b61d3f7 3339 if (PL_tainting && PL_tainted)
cc6b7395 3340 FAIL("Eval-group in insecure regular expression");
923e4eb5 3341 if (IN_PERL_COMPILETIME)
b5c19bd7 3342 PL_cv_has_eval = 1;
c277df42 3343 }
b5c19bd7 3344
830247a4 3345 nextchar(pRExC_state);
0f5d15d6 3346 if (logical) {
830247a4 3347 ret = reg_node(pRExC_state, LOGICAL);
0f5d15d6
IZ
3348 if (!SIZE_ONLY)
3349 ret->flags = 2;
830247a4 3350 regtail(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
fac92740 3351 /* deal with the length of this later - MJD */
0f5d15d6
IZ
3352 return ret;
3353 }
ccb2c380
MP
3354 ret = reganode(pRExC_state, EVAL, n);
3355 Set_Node_Length(ret, RExC_parse - parse_start + 1);
3356 Set_Node_Offset(ret, parse_start);
3357 return ret;
c277df42 3358 }
fac92740 3359 case '(': /* (?(?{...})...) and (?(?=...)...) */
c277df42 3360 {
fac92740 3361 if (RExC_parse[0] == '?') { /* (?(?...)) */
b81d288d
AB
3362 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
3363 || RExC_parse[1] == '<'
830247a4 3364 || RExC_parse[1] == '{') { /* Lookahead or eval. */
c277df42
IZ
3365 I32 flag;
3366
830247a4 3367 ret = reg_node(pRExC_state, LOGICAL);
0f5d15d6
IZ
3368 if (!SIZE_ONLY)
3369 ret->flags = 1;
830247a4 3370 regtail(pRExC_state, ret, reg(pRExC_state, 1, &flag));
c277df42 3371 goto insert_if;
b81d288d 3372 }
a0ed51b3 3373 }
830247a4 3374 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
fac92740 3375 /* (?(1)...) */
830247a4 3376 parno = atoi(RExC_parse++);
c277df42 3377
830247a4
IZ
3378 while (isDIGIT(*RExC_parse))
3379 RExC_parse++;
fac92740 3380 ret = reganode(pRExC_state, GROUPP, parno);
2af232bd 3381
830247a4 3382 if ((c = *nextchar(pRExC_state)) != ')')
b45f050a 3383 vFAIL("Switch condition not recognized");
c277df42 3384 insert_if:
830247a4
IZ
3385 regtail(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
3386 br = regbranch(pRExC_state, &flags, 1);
c277df42 3387 if (br == NULL)
830247a4 3388 br = reganode(pRExC_state, LONGJMP, 0);
c277df42 3389 else
830247a4
IZ
3390 regtail(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
3391 c = *nextchar(pRExC_state);
d1b80229
IZ
3392 if (flags&HASWIDTH)
3393 *flagp |= HASWIDTH;
c277df42 3394 if (c == '|') {
830247a4
IZ
3395 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
3396 regbranch(pRExC_state, &flags, 1);
3397 regtail(pRExC_state, ret, lastbr);
d1b80229
IZ
3398 if (flags&HASWIDTH)
3399 *flagp |= HASWIDTH;
830247a4 3400 c = *nextchar(pRExC_state);
a0ed51b3
LW
3401 }
3402 else
c277df42
IZ
3403 lastbr = NULL;
3404 if (c != ')')
8615cb43 3405 vFAIL("Switch (?(condition)... contains too many branches");
830247a4
IZ
3406 ender = reg_node(pRExC_state, TAIL);
3407 regtail(pRExC_state, br, ender);
c277df42 3408 if (lastbr) {
830247a4
IZ
3409 regtail(pRExC_state, lastbr, ender);
3410 regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
a0ed51b3
LW
3411 }
3412 else
830247a4 3413 regtail(pRExC_state, ret, ender);
c277df42 3414 return ret;
a0ed51b3
LW
3415 }
3416 else {
830247a4 3417 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
c277df42
IZ
3418 }
3419 }
1b1626e4 3420 case 0:
830247a4 3421 RExC_parse--; /* for vFAIL to print correctly */
8615cb43 3422 vFAIL("Sequence (? incomplete");
1b1626e4 3423 break;
a0d0e21e 3424 default:
830247a4 3425 --RExC_parse;
fac92740 3426 parse_flags: /* (?i) */
830247a4 3427 while (*RExC_parse && strchr("iogcmsx", *RExC_parse)) {
9d1d55b5
JP
3428 /* (?g), (?gc) and (?o) are useless here
3429 and must be globally applied -- japhy */
3430
3431 if (*RExC_parse == 'o' || *RExC_parse == 'g') {
3432 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
3433 I32 wflagbit = *RExC_parse == 'o' ? wasted_o : wasted_g;
3434 if (! (wastedflags & wflagbit) ) {
3435 wastedflags |= wflagbit;
3436 vWARN5(
3437 RExC_parse + 1,
3438 "Useless (%s%c) - %suse /%c modifier",
3439 flagsp == &negflags ? "?-" : "?",
3440 *RExC_parse,
3441 flagsp == &negflags ? "don't " : "",
3442 *RExC_parse
3443 );
3444 }
3445 }
3446 }
3447 else if (*RExC_parse == 'c') {
3448 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
3449 if (! (wastedflags & wasted_c) ) {
3450 wastedflags |= wasted_gc;
3451 vWARN3(
3452 RExC_parse + 1,
3453 "Useless (%sc) - %suse /gc modifier",
3454 flagsp == &negflags ? "?-" : "?",
3455 flagsp == &negflags ? "don't " : ""
3456 );
3457 }
3458 }
3459 }
3460 else { pmflag(flagsp, *RExC_parse); }
3461
830247a4 3462 ++RExC_parse;
ca9dfc88 3463 }
830247a4 3464 if (*RExC_parse == '-') {
ca9dfc88 3465 flagsp = &negflags;
9d1d55b5 3466 wastedflags = 0; /* reset so (?g-c) warns twice */
830247a4 3467 ++RExC_parse;
ca9dfc88 3468 goto parse_flags;
48c036b1 3469 }
e2509266
JH
3470 RExC_flags |= posflags;
3471 RExC_flags &= ~negflags;
830247a4
IZ
3472 if (*RExC_parse == ':') {
3473 RExC_parse++;
ca9dfc88
IZ
3474 paren = ':';
3475 break;
3476 }
c277df42 3477 unknown:
830247a4
IZ
3478 if (*RExC_parse != ')') {
3479 RExC_parse++;
3480 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
b45f050a 3481 }
830247a4 3482 nextchar(pRExC_state);
a0d0e21e
LW
3483 *flagp = TRYAGAIN;
3484 return NULL;
3485 }
3486 }
fac92740 3487 else { /* (...) */
830247a4
IZ
3488 parno = RExC_npar;
3489 RExC_npar++;
3490 ret = reganode(pRExC_state, OPEN, parno);
fac92740
MJD
3491 Set_Node_Length(ret, 1); /* MJD */
3492 Set_Node_Offset(ret, RExC_parse); /* MJD */
c277df42 3493 open = 1;
a0d0e21e 3494 }
a0ed51b3 3495 }
fac92740 3496 else /* ! paren */
a0d0e21e
LW
3497 ret = NULL;
3498
3499 /* Pick up the branches, linking them together. */
fac92740 3500 parse_start = RExC_parse; /* MJD */
830247a4 3501 br = regbranch(pRExC_state, &flags, 1);
fac92740 3502 /* branch_len = (paren != 0); */
2af232bd 3503
a0d0e21e
LW
3504 if (br == NULL)
3505 return(NULL);
830247a4
IZ
3506 if (*RExC_parse == '|') {
3507 if (!SIZE_ONLY && RExC_extralen) {
3508 reginsert(pRExC_state, BRANCHJ, br);
a0ed51b3 3509 }
fac92740 3510 else { /* MJD */
830247a4 3511 reginsert(pRExC_state, BRANCH, br);
fac92740
MJD
3512 Set_Node_Length(br, paren != 0);
3513 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
3514 }
c277df42
IZ
3515 have_branch = 1;
3516 if (SIZE_ONLY)
830247a4 3517 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
a0ed51b3
LW
3518 }
3519 else if (paren == ':') {
c277df42
IZ
3520 *flagp |= flags&SIMPLE;
3521 }
3522 if (open) { /* Starts with OPEN. */
830247a4 3523 regtail(pRExC_state, ret, br); /* OPEN -> first. */
a0ed51b3
LW
3524 }
3525 else if (paren != '?') /* Not Conditional */
a0d0e21e 3526 ret = br;
32a0ca98 3527 *flagp |= flags & (SPSTART | HASWIDTH);
c277df42 3528 lastbr = br;
830247a4
IZ
3529 while (*RExC_parse == '|') {
3530 if (!SIZE_ONLY && RExC_extralen) {
3531 ender = reganode(pRExC_state, LONGJMP,0);
3532 regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
c277df42
IZ
3533 }
3534 if (SIZE_ONLY)
830247a4
IZ
3535 RExC_extralen += 2; /* Account for LONGJMP. */
3536 nextchar(pRExC_state);
3537 br = regbranch(pRExC_state, &flags, 0);
2af232bd 3538
a687059c 3539 if (br == NULL)
a0d0e21e 3540 return(NULL);
830247a4 3541 regtail(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
c277df42 3542 lastbr = br;
821b33a5
IZ
3543 if (flags&HASWIDTH)
3544 *flagp |= HASWIDTH;
a687059c 3545 *flagp |= flags&SPSTART;
a0d0e21e
LW
3546 }
3547
c277df42
IZ
3548 if (have_branch || paren != ':') {
3549 /* Make a closing node, and hook it on the end. */
3550 switch (paren) {
3551 case ':':
830247a4 3552 ender = reg_node(pRExC_state, TAIL);
c277df42
IZ
3553 break;
3554 case 1:
830247a4 3555 ender = reganode(pRExC_state, CLOSE, parno);
fac92740
MJD
3556 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
3557 Set_Node_Length(ender,1); /* MJD */
c277df42
IZ
3558 break;
3559 case '<':
c277df42
IZ
3560 case ',':
3561 case '=':
3562 case '!':
c277df42 3563 *flagp &= ~HASWIDTH;
821b33a5
IZ
3564 /* FALL THROUGH */
3565 case '>':
830247a4 3566 ender = reg_node(pRExC_state, SUCCEED);
c277df42
IZ
3567 break;
3568 case 0:
830247a4 3569 ender = reg_node(pRExC_state, END);
c277df42
IZ
3570 break;
3571 }
830247a4 3572 regtail(pRExC_state, lastbr, ender);
a0d0e21e 3573
c277df42
IZ
3574 if (have_branch) {
3575 /* Hook the tails of the branches to the closing node. */
3576 for (br = ret; br != NULL; br = regnext(br)) {
830247a4 3577 regoptail(pRExC_state, br, ender);
c277df42
IZ
3578 }
3579 }
a0d0e21e 3580 }
c277df42
IZ
3581
3582 {
e1ec3a88
AL
3583 const char *p;
3584 static const char parens[] = "=!<,>";
c277df42
IZ
3585
3586 if (paren && (p = strchr(parens, paren))) {
eb160463 3587 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
c277df42
IZ
3588 int flag = (p - parens) > 1;
3589
3590 if (paren == '>')
3591 node = SUSPEND, flag = 0;
830247a4 3592 reginsert(pRExC_state, node,ret);
45948336
EP
3593 Set_Node_Cur_Length(ret);
3594 Set_Node_Offset(ret, parse_start + 1);
c277df42 3595 ret->flags = flag;
830247a4 3596 regtail(pRExC_state, ret, reg_node(pRExC_state, TAIL));
c277df42 3597 }
a0d0e21e
LW
3598 }
3599
3600 /* Check for proper termination. */
ce3e6498 3601 if (paren) {
e2509266 3602 RExC_flags = oregflags;
830247a4
IZ
3603 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
3604 RExC_parse = oregcomp_parse;
380a0633 3605 vFAIL("Unmatched (");
ce3e6498 3606 }
a0ed51b3 3607 }
830247a4
IZ
3608 else if (!paren && RExC_parse < RExC_end) {
3609 if (*RExC_parse == ')') {
3610 RExC_parse++;
380a0633 3611 vFAIL("Unmatched )");
a0ed51b3
LW
3612 }
3613 else
b45f050a 3614 FAIL("Junk on end of regexp"); /* "Can't happen". */
a0d0e21e
LW
3615 /* NOTREACHED */
3616 }
a687059c 3617
a0d0e21e 3618 return(ret);
a687059c
LW
3619}
3620
3621/*
3622 - regbranch - one alternative of an | operator
3623 *
3624 * Implements the concatenation operator.
3625 */
76e3520e 3626STATIC regnode *
830247a4 3627S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first)
a687059c 3628{
c277df42
IZ
3629 register regnode *ret;
3630 register regnode *chain = NULL;
3631 register regnode *latest;
3632 I32 flags = 0, c = 0;
a0d0e21e 3633
b81d288d 3634 if (first)
c277df42
IZ
3635 ret = NULL;
3636 else {
b81d288d 3637 if (!SIZE_ONLY && RExC_extralen)
830247a4 3638 ret = reganode(pRExC_state, BRANCHJ,0);
fac92740 3639 else {
830247a4 3640 ret = reg_node(pRExC_state, BRANCH);
fac92740
MJD
3641 Set_Node_Length(ret, 1);
3642 }
c277df42
IZ
3643 }
3644
b81d288d 3645 if (!first && SIZE_ONLY)
830247a4 3646 RExC_extralen += 1; /* BRANCHJ */
b81d288d 3647
c277df42 3648 *flagp = WORST; /* Tentatively. */
a0d0e21e 3649
830247a4
IZ
3650 RExC_parse--;
3651 nextchar(pRExC_state);
3652 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
a0d0e21e 3653 flags &= ~TRYAGAIN;
830247a4 3654 latest = regpiece(pRExC_state, &flags);
a0d0e21e
LW
3655 if (latest == NULL) {
3656 if (flags & TRYAGAIN)
3657 continue;
3658 return(NULL);
a0ed51b3
LW
3659 }
3660 else if (ret == NULL)
c277df42 3661 ret = latest;
a0d0e21e 3662 *flagp |= flags&HASWIDTH;
c277df42 3663 if (chain == NULL) /* First piece. */
a0d0e21e
LW
3664 *flagp |= flags&SPSTART;
3665 else {
830247a4
IZ
3666 RExC_naughty++;
3667 regtail(pRExC_state, chain, latest);
a687059c 3668 }
a0d0e21e 3669 chain = latest;
c277df42
IZ
3670 c++;
3671 }
3672 if (chain == NULL) { /* Loop ran zero times. */
830247a4 3673 chain = reg_node(pRExC_state, NOTHING);
c277df42
IZ
3674 if (ret == NULL)
3675 ret = chain;
3676 }
3677 if (c == 1) {
3678 *flagp |= flags&SIMPLE;
a0d0e21e 3679 }
a687059c 3680
a0d0e21e 3681 return(ret);
a687059c
LW
3682}
3683
3684/*
3685 - regpiece - something followed by possible [*+?]
3686 *
3687 * Note that the branching code sequences used for ? and the general cases
3688 * of * and + are somewhat optimized: they use the same NOTHING node as
3689 * both the endmarker for their branch list and the body of the last branch.
3690 * It might seem that this node could be dispensed with entirely, but the
3691 * endmarker role is not redundant.
3692 */
76e3520e 3693STATIC regnode *
830247a4 3694S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
a687059c 3695{
c277df42 3696 register regnode *ret;
a0d0e21e
LW
3697 register char op;
3698 register char *next;
3699 I32 flags;
1df70142 3700 const char * const origparse = RExC_parse;
a0d0e21e
LW
3701 char *maxpos;
3702 I32 min;
c277df42 3703 I32 max = REG_INFTY;
fac92740 3704 char *parse_start;
a0d0e21e 3705
830247a4 3706 ret = regatom(pRExC_state, &flags);
a0d0e21e
LW
3707 if (ret == NULL) {
3708 if (flags & TRYAGAIN)
3709 *flagp |= TRYAGAIN;
3710 return(NULL);
3711 }
3712
830247a4 3713 op = *RExC_parse;
a0d0e21e 3714
830247a4 3715 if (op == '{' && regcurly(RExC_parse)) {
fac92740 3716 parse_start = RExC_parse; /* MJD */
830247a4 3717 next = RExC_parse + 1;
a0d0e21e
LW
3718 maxpos = Nullch;
3719 while (isDIGIT(*next) || *next == ',') {
3720 if (*next == ',') {
3721 if (maxpos)
3722 break;
3723 else
3724 maxpos = next;
a687059c 3725 }
a0d0e21e
LW
3726 next++;
3727 }
3728 if (*next == '}') { /* got one */
3729 if (!maxpos)
3730 maxpos = next;
830247a4
IZ
3731 RExC_parse++;
3732 min = atoi(RExC_parse);
a0d0e21e
LW
3733 if (*maxpos == ',')
3734 maxpos++;
3735 else
830247a4 3736 maxpos = RExC_parse;
a0d0e21e
LW
3737 max = atoi(maxpos);
3738 if (!max && *maxpos != '0')
c277df42
IZ
3739 max = REG_INFTY; /* meaning "infinity" */
3740 else if (max >= REG_INFTY)
8615cb43 3741 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
830247a4
IZ
3742 RExC_parse = next;
3743 nextchar(pRExC_state);
a0d0e21e
LW
3744
3745 do_curly:
3746 if ((flags&SIMPLE)) {
830247a4
IZ
3747 RExC_naughty += 2 + RExC_naughty / 2;
3748 reginsert(pRExC_state, CURLY, ret);
fac92740
MJD
3749 Set_Node_Offset(ret, parse_start+1); /* MJD */
3750 Set_Node_Cur_Length(ret);
a0d0e21e
LW
3751 }
3752 else {
830247a4 3753 regnode *w = reg_node(pRExC_state, WHILEM);
2c2d71f5
JH
3754
3755 w->flags = 0;
830247a4
IZ
3756 regtail(pRExC_state, ret, w);
3757 if (!SIZE_ONLY && RExC_extralen) {
3758 reginsert(pRExC_state, LONGJMP,ret);
3759 reginsert(pRExC_state, NOTHING,ret);
c277df42
IZ
3760 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
3761 }
830247a4 3762 reginsert(pRExC_state, CURLYX,ret);
fac92740
MJD
3763 /* MJD hk */
3764 Set_Node_Offset(ret, parse_start+1);
2af232bd 3765 Set_Node_Length(ret,
fac92740 3766 op == '{' ? (RExC_parse - parse_start) : 1);
2af232bd 3767
830247a4 3768 if (!SIZE_ONLY && RExC_extralen)
c277df42 3769 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
830247a4 3770 regtail(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
c277df42 3771 if (SIZE_ONLY)
830247a4
IZ
3772 RExC_whilem_seen++, RExC_extralen += 3;
3773 RExC_naughty += 4 + RExC_naughty; /* compound interest */
a0d0e21e 3774 }
c277df42 3775 ret->flags = 0;
a0d0e21e
LW
3776
3777 if (min > 0)
821b33a5
IZ
3778 *flagp = WORST;
3779 if (max > 0)
3780 *flagp |= HASWIDTH;
a0d0e21e 3781 if (max && max < min)
8615cb43 3782 vFAIL("Can't do {n,m} with n > m");
c277df42 3783 if (!SIZE_ONLY) {
eb160463
GS
3784 ARG1_SET(ret, (U16)min);
3785 ARG2_SET(ret, (U16)max);
a687059c 3786 }
a687059c 3787
a0d0e21e 3788 goto nest_check;
a687059c 3789 }
a0d0e21e 3790 }
a687059c 3791
a0d0e21e
LW
3792 if (!ISMULT1(op)) {
3793 *flagp = flags;
a687059c 3794 return(ret);
a0d0e21e 3795 }
bb20fd44 3796
c277df42 3797#if 0 /* Now runtime fix should be reliable. */
b45f050a
JF
3798
3799 /* if this is reinstated, don't forget to put this back into perldiag:
3800
3801 =item Regexp *+ operand could be empty at {#} in regex m/%s/
3802
3803 (F) The part of the regexp subject to either the * or + quantifier
3804 could match an empty string. The {#} shows in the regular
3805 expression about where the problem was discovered.
3806
3807 */
3808
bb20fd44 3809 if (!(flags&HASWIDTH) && op != '?')
b45f050a 3810 vFAIL("Regexp *+ operand could be empty");
b81d288d 3811#endif
bb20fd44 3812
fac92740 3813 parse_start = RExC_parse;
830247a4 3814 nextchar(pRExC_state);
a0d0e21e 3815
821b33a5 3816 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
a0d0e21e
LW
3817
3818 if (op == '*' && (flags&SIMPLE)) {
830247a4 3819 reginsert(pRExC_state, STAR, ret);
c277df42 3820 ret->flags = 0;
830247a4 3821 RExC_naughty += 4;
a0d0e21e
LW
3822 }
3823 else if (op == '*') {
3824 min = 0;
3825 goto do_curly;
a0ed51b3
LW
3826 }
3827 else if (op == '+' && (flags&SIMPLE)) {
830247a4 3828 reginsert(pRExC_state, PLUS, ret);
c277df42 3829 ret->flags = 0;
830247a4 3830 RExC_naughty += 3;
a0d0e21e
LW
3831 }
3832 else if (op == '+') {
3833 min = 1;
3834 goto do_curly;
a0ed51b3
LW
3835 }
3836 else if (op == '?') {
a0d0e21e
LW
3837 min = 0; max = 1;
3838 goto do_curly;
3839 }
3840 nest_check:
e476b1b5 3841 if (ckWARN(WARN_REGEXP) && !SIZE_ONLY && !(flags&HASWIDTH) && max > REG_INFTY/3) {
830247a4 3842 vWARN3(RExC_parse,
b45f050a 3843 "%.*s matches null string many times",
830247a4 3844 RExC_parse - origparse,
b45f050a 3845 origparse);
a0d0e21e
LW
3846 }
3847
830247a4
IZ
3848 if (*RExC_parse == '?') {
3849 nextchar(pRExC_state);
3850 reginsert(pRExC_state, MINMOD, ret);
3851 regtail(pRExC_state, ret, ret + NODE_STEP_REGNODE);
a0d0e21e 3852 }
830247a4
IZ
3853 if (ISMULT2(RExC_parse)) {
3854 RExC_parse++;
b45f050a
JF
3855 vFAIL("Nested quantifiers");
3856 }
a0d0e21e
LW
3857
3858 return(ret);
a687059c
LW
3859}
3860
3861/*
3862 - regatom - the lowest level
3863 *
3864 * Optimization: gobbles an entire sequence of ordinary characters so that
3865 * it can turn them into a single node, which is smaller to store and
3866 * faster to run. Backslashed characters are exceptions, each becoming a
3867 * separate node; the code is simpler that way and it's not worth fixing.
3868 *
b45f050a 3869 * [Yes, it is worth fixing, some scripts can run twice the speed.] */
76e3520e 3870STATIC regnode *
830247a4 3871S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
a687059c 3872{
c277df42 3873 register regnode *ret = 0;
a0d0e21e 3874 I32 flags;
45948336 3875 char *parse_start = RExC_parse;
a0d0e21e
LW
3876
3877 *flagp = WORST; /* Tentatively. */
3878
3879tryagain:
830247a4 3880 switch (*RExC_parse) {
a0d0e21e 3881 case '^':
830247a4
IZ
3882 RExC_seen_zerolen++;
3883 nextchar(pRExC_state);
e2509266 3884 if (RExC_flags & PMf_MULTILINE)
830247a4 3885 ret = reg_node(pRExC_state, MBOL);
e2509266 3886 else if (RExC_flags & PMf_SINGLELINE)
830247a4 3887 ret = reg_node(pRExC_state, SBOL);
a0d0e21e 3888 else
830247a4 3889 ret = reg_node(pRExC_state, BOL);
fac92740 3890 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
3891 break;
3892 case '$':
830247a4 3893 nextchar(pRExC_state);
b81d288d 3894 if (*RExC_parse)
830247a4 3895 RExC_seen_zerolen++;
e2509266 3896 if (RExC_flags & PMf_MULTILINE)
830247a4 3897 ret = reg_node(pRExC_state, MEOL);
e2509266 3898 else if (RExC_flags & PMf_SINGLELINE)
830247a4 3899 ret = reg_node(pRExC_state, SEOL);
a0d0e21e 3900 else
830247a4 3901 ret = reg_node(pRExC_state, EOL);
fac92740 3902 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
3903 break;
3904 case '.':
830247a4 3905 nextchar(pRExC_state);
e2509266 3906 if (RExC_flags & PMf_SINGLELINE)
ffc61ed2
JH
3907 ret = reg_node(pRExC_state, SANY);
3908 else
3909 ret = reg_node(pRExC_state, REG_ANY);
3910 *flagp |= HASWIDTH|SIMPLE;
830247a4 3911 RExC_naughty++;
fac92740 3912 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
3913 break;
3914 case '[':
b45f050a 3915 {
830247a4 3916 char *oregcomp_parse = ++RExC_parse;
ffc61ed2 3917 ret = regclass(pRExC_state);
830247a4
IZ
3918 if (*RExC_parse != ']') {
3919 RExC_parse = oregcomp_parse;
b45f050a
JF
3920 vFAIL("Unmatched [");
3921 }
830247a4 3922 nextchar(pRExC_state);
a0d0e21e 3923 *flagp |= HASWIDTH|SIMPLE;
fac92740 3924 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
a0d0e21e 3925 break;
b45f050a 3926 }
a0d0e21e 3927 case '(':
830247a4
IZ
3928 nextchar(pRExC_state);
3929 ret = reg(pRExC_state, 1, &flags);
a0d0e21e 3930 if (ret == NULL) {
bf93d4cc 3931 if (flags & TRYAGAIN) {
830247a4 3932 if (RExC_parse == RExC_end) {
bf93d4cc
GS
3933 /* Make parent create an empty node if needed. */
3934 *flagp |= TRYAGAIN;
3935 return(NULL);
3936 }
a0d0e21e 3937 goto tryagain;
bf93d4cc 3938 }
a0d0e21e
LW
3939 return(NULL);
3940 }
c277df42 3941 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE);
a0d0e21e
LW
3942 break;
3943 case '|':
3944 case ')':
3945 if (flags & TRYAGAIN) {
3946 *flagp |= TRYAGAIN;
3947 return NULL;
3948 }
b45f050a 3949 vFAIL("Internal urp");
a0d0e21e
LW
3950 /* Supposed to be caught earlier. */
3951 break;
85afd4ae 3952 case '{':
830247a4
IZ
3953 if (!regcurly(RExC_parse)) {
3954 RExC_parse++;
85afd4ae
CS
3955 goto defchar;
3956 }
3957 /* FALL THROUGH */
a0d0e21e
LW
3958 case '?':
3959 case '+':
3960 case '*':
830247a4 3961 RExC_parse++;
b45f050a 3962 vFAIL("Quantifier follows nothing");
a0d0e21e
LW
3963 break;
3964 case '\\':
830247a4 3965 switch (*++RExC_parse) {
a0d0e21e 3966 case 'A':
830247a4
IZ
3967 RExC_seen_zerolen++;
3968 ret = reg_node(pRExC_state, SBOL);
a0d0e21e 3969 *flagp |= SIMPLE;
830247a4 3970 nextchar(pRExC_state);
fac92740 3971 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
3972 break;
3973 case 'G':
830247a4
IZ
3974 ret = reg_node(pRExC_state, GPOS);
3975 RExC_seen |= REG_SEEN_GPOS;
a0d0e21e 3976 *flagp |= SIMPLE;
830247a4 3977 nextchar(pRExC_state);
fac92740 3978 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
3979 break;
3980 case 'Z':
830247a4 3981 ret = reg_node(pRExC_state, SEOL);
a0d0e21e 3982 *flagp |= SIMPLE;
a1917ab9 3983 RExC_seen_zerolen++; /* Do not optimize RE away */
830247a4 3984 nextchar(pRExC_state);
a0d0e21e 3985 break;
b85d18e9 3986 case 'z':
830247a4 3987 ret = reg_node(pRExC_state, EOS);
b85d18e9 3988 *flagp |= SIMPLE;
830247a4
IZ
3989 RExC_seen_zerolen++; /* Do not optimize RE away */
3990 nextchar(pRExC_state);
fac92740 3991 Set_Node_Length(ret, 2); /* MJD */
b85d18e9 3992 break;
4a2d328f 3993 case 'C':
f33976b4
DB
3994 ret = reg_node(pRExC_state, CANY);
3995 RExC_seen |= REG_SEEN_CANY;
a0ed51b3 3996 *flagp |= HASWIDTH|SIMPLE;
830247a4 3997 nextchar(pRExC_state);
fac92740 3998 Set_Node_Length(ret, 2); /* MJD */
a0ed51b3
LW
3999 break;
4000 case 'X':
830247a4 4001 ret = reg_node(pRExC_state, CLUMP);
a0ed51b3 4002 *flagp |= HASWIDTH;
830247a4 4003 nextchar(pRExC_state);
fac92740 4004 Set_Node_Length(ret, 2); /* MJD */
a0ed51b3 4005 break;
a0d0e21e 4006 case 'w':
eb160463 4007 ret = reg_node(pRExC_state, (U8)(LOC ? ALNUML : ALNUM));
a0d0e21e 4008 *flagp |= HASWIDTH|SIMPLE;
830247a4 4009 nextchar(pRExC_state);
fac92740 4010 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
4011 break;
4012 case 'W':
eb160463 4013 ret = reg_node(pRExC_state, (U8)(LOC ? NALNUML : NALNUM));
a0d0e21e 4014 *flagp |= HASWIDTH|SIMPLE;
830247a4 4015 nextchar(pRExC_state);
fac92740 4016 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
4017 break;
4018 case 'b':
830247a4
IZ
4019 RExC_seen_zerolen++;
4020 RExC_seen |= REG_SEEN_LOOKBEHIND;
eb160463 4021 ret = reg_node(pRExC_state, (U8)(LOC ? BOUNDL : BOUND));
a0d0e21e 4022 *flagp |= SIMPLE;
830247a4 4023 nextchar(pRExC_state);
fac92740 4024 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
4025 break;
4026 case 'B':
830247a4
IZ
4027 RExC_seen_zerolen++;
4028 RExC_seen |= REG_SEEN_LOOKBEHIND;
eb160463 4029 ret = reg_node(pRExC_state, (U8)(LOC ? NBOUNDL : NBOUND));
a0d0e21e 4030 *flagp |= SIMPLE;
830247a4 4031 nextchar(pRExC_state);
fac92740 4032 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
4033 break;
4034 case 's':
eb160463 4035 ret = reg_node(pRExC_state, (U8)(LOC ? SPACEL : SPACE));
a0d0e21e 4036 *flagp |= HASWIDTH|SIMPLE;
830247a4 4037 nextchar(pRExC_state);
fac92740 4038 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
4039 break;
4040 case 'S':
eb160463 4041 ret = reg_node(pRExC_state, (U8)(LOC ? NSPACEL : NSPACE));
a0d0e21e 4042 *flagp |= HASWIDTH|SIMPLE;
830247a4 4043 nextchar(pRExC_state);
fac92740 4044 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
4045 break;
4046 case 'd':
ffc61ed2 4047 ret = reg_node(pRExC_state, DIGIT);
a0d0e21e 4048 *flagp |= HASWIDTH|SIMPLE;
830247a4 4049 nextchar(pRExC_state);
fac92740 4050 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
4051 break;
4052 case 'D':
ffc61ed2 4053 ret = reg_node(pRExC_state, NDIGIT);
a0d0e21e 4054 *flagp |= HASWIDTH|SIMPLE;
830247a4 4055 nextchar(pRExC_state);
fac92740 4056 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e 4057 break;
a14b48bc
LW
4058 case 'p':
4059 case 'P':
3568d838 4060 {
830247a4 4061 char* oldregxend = RExC_end;
ccb2c380 4062 char* parse_start = RExC_parse - 2;
a14b48bc 4063
830247a4 4064 if (RExC_parse[1] == '{') {
3568d838 4065 /* a lovely hack--pretend we saw [\pX] instead */
830247a4
IZ
4066 RExC_end = strchr(RExC_parse, '}');
4067 if (!RExC_end) {
0da60cf5 4068 U8 c = (U8)*RExC_parse;
830247a4
IZ
4069 RExC_parse += 2;
4070 RExC_end = oldregxend;
0da60cf5 4071 vFAIL2("Missing right brace on \\%c{}", c);
b45f050a 4072 }
830247a4 4073 RExC_end++;
a14b48bc 4074 }
af6f566e 4075 else {
830247a4 4076 RExC_end = RExC_parse + 2;
af6f566e
HS
4077 if (RExC_end > oldregxend)
4078 RExC_end = oldregxend;
4079 }
830247a4 4080 RExC_parse--;
a14b48bc 4081
ffc61ed2 4082 ret = regclass(pRExC_state);
a14b48bc 4083
830247a4
IZ
4084 RExC_end = oldregxend;
4085 RExC_parse--;
ccb2c380
MP
4086
4087 Set_Node_Offset(ret, parse_start + 2);
4088 Set_Node_Cur_Length(ret);
830247a4 4089 nextchar(pRExC_state);
a14b48bc
LW
4090 *flagp |= HASWIDTH|SIMPLE;
4091 }
4092 break;
a0d0e21e
LW
4093 case 'n':
4094 case 'r':
4095 case 't':
4096 case 'f':
4097 case 'e':
4098 case 'a':
4099 case 'x':
4100 case 'c':
4101 case '0':
4102 goto defchar;
4103 case '1': case '2': case '3': case '4':
4104 case '5': case '6': case '7': case '8': case '9':
4105 {
1df70142 4106 const I32 num = atoi(RExC_parse);
a0d0e21e 4107
830247a4 4108 if (num > 9 && num >= RExC_npar)
a0d0e21e
LW
4109 goto defchar;
4110 else {
fac92740 4111 char * parse_start = RExC_parse - 1; /* MJD */
830247a4
IZ
4112 while (isDIGIT(*RExC_parse))
4113 RExC_parse++;
b45f050a 4114
eb160463 4115 if (!SIZE_ONLY && num > (I32)RExC_rx->nparens)
9baa0206 4116 vFAIL("Reference to nonexistent group");
830247a4 4117 RExC_sawback = 1;
eb160463
GS
4118 ret = reganode(pRExC_state,
4119 (U8)(FOLD ? (LOC ? REFFL : REFF) : REF),
4120 num);
a0d0e21e 4121 *flagp |= HASWIDTH;
2af232bd 4122
fac92740 4123 /* override incorrect value set in reganode MJD */
2af232bd 4124 Set_Node_Offset(ret, parse_start+1);
fac92740 4125 Set_Node_Cur_Length(ret); /* MJD */
830247a4
IZ
4126 RExC_parse--;
4127 nextchar(pRExC_state);
a0d0e21e
LW
4128 }
4129 }
4130 break;
4131 case '\0':
830247a4 4132 if (RExC_parse >= RExC_end)
b45f050a 4133 FAIL("Trailing \\");
a0d0e21e
LW
4134 /* FALL THROUGH */
4135 default:
a0288114 4136 /* Do not generate "unrecognized" warnings here, we fall
c9f97d15 4137 back into the quick-grab loop below */
45948336 4138 parse_start--;
a0d0e21e
LW
4139 goto defchar;
4140 }
4141 break;
4633a7c4
LW
4142
4143 case '#':
e2509266 4144 if (RExC_flags & PMf_EXTENDED) {
830247a4
IZ
4145 while (RExC_parse < RExC_end && *RExC_parse != '\n') RExC_parse++;
4146 if (RExC_parse < RExC_end)
4633a7c4
LW
4147 goto tryagain;
4148 }
4149 /* FALL THROUGH */
4150
a0d0e21e 4151 default: {
ba210ebe 4152 register STRLEN len;
58ae7d3f 4153 register UV ender;
a0d0e21e 4154 register char *p;
c277df42 4155 char *oldp, *s;
80aecb99 4156 STRLEN foldlen;
89ebb4a3 4157 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
f06dbbb7
JH
4158
4159 parse_start = RExC_parse - 1;
a0d0e21e 4160
830247a4 4161 RExC_parse++;
a0d0e21e
LW
4162
4163 defchar:
58ae7d3f 4164 ender = 0;
eb160463
GS
4165 ret = reg_node(pRExC_state,
4166 (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
cd439c50 4167 s = STRING(ret);
830247a4
IZ
4168 for (len = 0, p = RExC_parse - 1;
4169 len < 127 && p < RExC_end;
a0d0e21e
LW
4170 len++)
4171 {
4172 oldp = p;
5b5a24f7 4173
e2509266 4174 if (RExC_flags & PMf_EXTENDED)
830247a4 4175 p = regwhite(p, RExC_end);
a0d0e21e
LW
4176 switch (*p) {
4177 case '^':
4178 case '$':
4179 case '.':
4180 case '[':
4181 case '(':
4182 case ')':
4183 case '|':
4184 goto loopdone;
4185 case '\\':
4186 switch (*++p) {
4187 case 'A':
1ed8eac0
JF
4188 case 'C':
4189 case 'X':
a0d0e21e
LW
4190 case 'G':
4191 case 'Z':
b85d18e9 4192 case 'z':
a0d0e21e
LW
4193 case 'w':
4194 case 'W':
4195 case 'b':
4196 case 'B':
4197 case 's':
4198 case 'S':
4199 case 'd':
4200 case 'D':
a14b48bc
LW
4201 case 'p':
4202 case 'P':
a0d0e21e
LW
4203 --p;
4204 goto loopdone;
4205 case 'n':
4206 ender = '\n';
4207 p++;
a687059c 4208 break;
a0d0e21e
LW
4209 case 'r':
4210 ender = '\r';
4211 p++;
a687059c 4212 break;
a0d0e21e
LW
4213 case 't':
4214 ender = '\t';
4215 p++;
a687059c 4216 break;
a0d0e21e
LW
4217 case 'f':
4218 ender = '\f';
4219 p++;
a687059c 4220 break;
a0d0e21e 4221 case 'e':
c7f1f016 4222 ender = ASCII_TO_NATIVE('\033');
a0d0e21e 4223 p++;
a687059c 4224 break;
a0d0e21e 4225 case 'a':
c7f1f016 4226 ender = ASCII_TO_NATIVE('\007');
a0d0e21e 4227 p++;
a687059c 4228 break;
a0d0e21e 4229 case 'x':
a0ed51b3 4230 if (*++p == '{') {
1df70142 4231 char* const e = strchr(p, '}');
b81d288d 4232
b45f050a 4233 if (!e) {
830247a4 4234 RExC_parse = p + 1;
b45f050a
JF
4235 vFAIL("Missing right brace on \\x{}");
4236 }
de5f0749 4237 else {
a4c04bdc
NC
4238 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
4239 | PERL_SCAN_DISALLOW_PREFIX;
1df70142 4240 STRLEN numlen = e - p - 1;
53305cf1 4241 ender = grok_hex(p + 1, &numlen, &flags, NULL);
aaa80028
JH
4242 if (ender > 0xff)
4243 RExC_utf8 = 1;
a0ed51b3
LW
4244 p = e + 1;
4245 }
a0ed51b3
LW
4246 }
4247 else {
a4c04bdc 4248 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
1df70142 4249 STRLEN numlen = 2;
53305cf1 4250 ender = grok_hex(p, &numlen, &flags, NULL);
a0ed51b3
LW
4251 p += numlen;
4252 }
a687059c 4253 break;
a0d0e21e
LW
4254 case 'c':
4255 p++;
bbce6d69 4256 ender = UCHARAT(p++);
4257 ender = toCTRL(ender);
a687059c 4258 break;
a0d0e21e
LW
4259 case '0': case '1': case '2': case '3':case '4':
4260 case '5': case '6': case '7': case '8':case '9':
4261 if (*p == '0' ||
830247a4 4262 (isDIGIT(p[1]) && atoi(p) >= RExC_npar) ) {
53305cf1 4263 I32 flags = 0;
1df70142 4264 STRLEN numlen = 3;
53305cf1 4265 ender = grok_oct(p, &numlen, &flags, NULL);
a0d0e21e
LW
4266 p += numlen;
4267 }
4268 else {
4269 --p;
4270 goto loopdone;
a687059c
LW
4271 }
4272 break;
a0d0e21e 4273 case '\0':
830247a4 4274 if (p >= RExC_end)
b45f050a 4275 FAIL("Trailing \\");
a687059c 4276 /* FALL THROUGH */
a0d0e21e 4277 default:
e476b1b5 4278 if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(*p))
4193bef7 4279 vWARN2(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
a0ed51b3 4280 goto normal_default;
a0d0e21e
LW
4281 }
4282 break;
a687059c 4283 default:
a0ed51b3 4284 normal_default:
fd400ab9 4285 if (UTF8_IS_START(*p) && UTF) {
1df70142 4286 STRLEN numlen;
5e12f4fb 4287 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
ba210ebe 4288 &numlen, 0);
a0ed51b3
LW
4289 p += numlen;
4290 }
4291 else
4292 ender = *p++;
a0d0e21e 4293 break;
a687059c 4294 }
e2509266 4295 if (RExC_flags & PMf_EXTENDED)
830247a4 4296 p = regwhite(p, RExC_end);
60a8b682
JH
4297 if (UTF && FOLD) {
4298 /* Prime the casefolded buffer. */
ac7e0132 4299 ender = toFOLD_uni(ender, tmpbuf, &foldlen);
60a8b682 4300 }
a0d0e21e
LW
4301 if (ISMULT2(p)) { /* Back off on ?+*. */
4302 if (len)
4303 p = oldp;
16ea2a2e 4304 else if (UTF) {
0ebc6274
JH
4305 STRLEN unilen;
4306
80aecb99 4307 if (FOLD) {
60a8b682 4308 /* Emit all the Unicode characters. */
1df70142 4309 STRLEN numlen;
80aecb99
JH
4310 for (foldbuf = tmpbuf;
4311 foldlen;
4312 foldlen -= numlen) {
4313 ender = utf8_to_uvchr(foldbuf, &numlen);
9dc45d57 4314 if (numlen > 0) {
0ebc6274
JH
4315 reguni(pRExC_state, ender, s, &unilen);
4316 s += unilen;
4317 len += unilen;
4318 /* In EBCDIC the numlen
4319 * and unilen can differ. */
9dc45d57 4320 foldbuf += numlen;
47654450
JH
4321 if (numlen >= foldlen)
4322 break;
9dc45d57
JH
4323 }
4324 else
4325 break; /* "Can't happen." */
80aecb99
JH
4326 }
4327 }
4328 else {
0ebc6274 4329 reguni(pRExC_state, ender, s, &unilen);
9ede7db1 4330 if (unilen > 0) {
0ebc6274
JH
4331 s += unilen;
4332 len += unilen;
9dc45d57 4333 }
80aecb99 4334 }
a0ed51b3 4335 }
a0d0e21e
LW
4336 else {
4337 len++;
eb160463 4338 REGC((char)ender, s++);
a0d0e21e
LW
4339 }
4340 break;
a687059c 4341 }
16ea2a2e 4342 if (UTF) {
0ebc6274
JH
4343 STRLEN unilen;
4344
80aecb99 4345 if (FOLD) {
60a8b682 4346 /* Emit all the Unicode characters. */
1df70142 4347 STRLEN numlen;
80aecb99
JH
4348 for (foldbuf = tmpbuf;
4349 foldlen;
4350 foldlen -= numlen) {
4351 ender = utf8_to_uvchr(foldbuf, &numlen);
9dc45d57 4352 if (numlen > 0) {
0ebc6274
JH
4353 reguni(pRExC_state, ender, s, &unilen);
4354 len += unilen;
4355 s += unilen;
4356 /* In EBCDIC the numlen
4357 * and unilen can differ. */
9dc45d57 4358 foldbuf += numlen;
47654450
JH
4359 if (numlen >= foldlen)
4360 break;
9dc45d57
JH
4361 }
4362 else
4363 break;
80aecb99
JH
4364 }
4365 }
4366 else {
0ebc6274 4367 reguni(pRExC_state, ender, s, &unilen);
9ede7db1 4368 if (unilen > 0) {
0ebc6274
JH
4369 s += unilen;
4370 len += unilen;
9dc45d57 4371 }
80aecb99
JH
4372 }
4373 len--;
a0ed51b3
LW
4374 }
4375 else
eb160463 4376 REGC((char)ender, s++);
a0d0e21e
LW
4377 }
4378 loopdone:
830247a4 4379 RExC_parse = p - 1;
fac92740 4380 Set_Node_Cur_Length(ret); /* MJD */
830247a4 4381 nextchar(pRExC_state);
793db0cb
JH
4382 {
4383 /* len is STRLEN which is unsigned, need to copy to signed */
4384 IV iv = len;
4385 if (iv < 0)
4386 vFAIL("Internal disaster");
4387 }
a0d0e21e
LW
4388 if (len > 0)
4389 *flagp |= HASWIDTH;
090f7165 4390 if (len == 1 && UNI_IS_INVARIANT(ender))
a0d0e21e 4391 *flagp |= SIMPLE;
c277df42 4392 if (!SIZE_ONLY)
cd439c50
IZ
4393 STR_LEN(ret) = len;
4394 if (SIZE_ONLY)
830247a4 4395 RExC_size += STR_SZ(len);
cd439c50 4396 else
830247a4 4397 RExC_emit += STR_SZ(len);
a687059c 4398 }
a0d0e21e
LW
4399 break;
4400 }
a687059c 4401
60a8b682
JH
4402 /* If the encoding pragma is in effect recode the text of
4403 * any EXACT-kind nodes. */
22c54be3 4404 if (PL_encoding && PL_regkind[(U8)OP(ret)] == EXACT) {
d0063567
DK
4405 STRLEN oldlen = STR_LEN(ret);
4406 SV *sv = sv_2mortal(newSVpvn(STRING(ret), oldlen));
4407
4408 if (RExC_utf8)
4409 SvUTF8_on(sv);
4410 if (sv_utf8_downgrade(sv, TRUE)) {
1df70142
AL
4411 const char * const s = sv_recode_to_utf8(sv, PL_encoding);
4412 const STRLEN newlen = SvCUR(sv);
d0063567
DK
4413
4414 if (SvUTF8(sv))
4415 RExC_utf8 = 1;
4416 if (!SIZE_ONLY) {
a3621e74
YO
4417 GET_RE_DEBUG_FLAGS_DECL;
4418 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "recode %*s to %*s\n",
d0063567
DK
4419 (int)oldlen, STRING(ret),
4420 (int)newlen, s));
4421 Copy(s, STRING(ret), newlen, char);
4422 STR_LEN(ret) += newlen - oldlen;
4423 RExC_emit += STR_SZ(newlen) - STR_SZ(oldlen);
4424 } else
4425 RExC_size += STR_SZ(newlen) - STR_SZ(oldlen);
4426 }
a72c7584
JH
4427 }
4428
a0d0e21e 4429 return(ret);
a687059c
LW
4430}
4431
873ef191 4432STATIC char *
504618e9 4433S_regwhite(pTHX_ char *p, const char *e)
5b5a24f7
CS
4434{
4435 while (p < e) {
4436 if (isSPACE(*p))
4437 ++p;
4438 else if (*p == '#') {
4439 do {
4440 p++;
4441 } while (p < e && *p != '\n');
4442 }
4443 else
4444 break;
4445 }
4446 return p;
4447}
4448
b8c5462f
JH
4449/* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
4450 Character classes ([:foo:]) can also be negated ([:^foo:]).
4451 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
4452 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
beeb77fc 4453 but trigger failures because they are currently unimplemented. */
9a86a77b
JH
4454
4455#define POSIXCC_DONE(c) ((c) == ':')
4456#define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
4457#define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
4458
b8c5462f 4459STATIC I32
830247a4 4460S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
620e46c5 4461{
936ed897 4462 I32 namedclass = OOB_NAMEDCLASS;
620e46c5 4463
830247a4 4464 if (value == '[' && RExC_parse + 1 < RExC_end &&
620e46c5 4465 /* I smell either [: or [= or [. -- POSIX has been here, right? */
9a86a77b 4466 POSIXCC(UCHARAT(RExC_parse))) {
1df70142 4467 const char c = UCHARAT(RExC_parse);
830247a4 4468 char* s = RExC_parse++;
b81d288d 4469
9a86a77b 4470 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
830247a4
IZ
4471 RExC_parse++;
4472 if (RExC_parse == RExC_end)
620e46c5 4473 /* Grandfather lone [:, [=, [. */
830247a4 4474 RExC_parse = s;
620e46c5 4475 else {
1df70142 4476 const char* t = RExC_parse++; /* skip over the c */
a28509cc 4477 const char *posixcc;
b8c5462f 4478
80916619
NC
4479 assert(*t == c);
4480
9a86a77b 4481 if (UCHARAT(RExC_parse) == ']') {
830247a4 4482 RExC_parse++; /* skip over the ending ] */
b8c5462f
JH
4483 posixcc = s + 1;
4484 if (*s == ':') {
1df70142
AL
4485 const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
4486 const I32 skip = t - posixcc;
80916619
NC
4487
4488 /* Initially switch on the length of the name. */
4489 switch (skip) {
4490 case 4:
4491 if (memEQ(posixcc, "word", 4)) {
4492 /* this is not POSIX, this is the Perl \w */;
4493 namedclass
4494 = complement ? ANYOF_NALNUM : ANYOF_ALNUM;
4495 }
cc4319de 4496 break;
80916619
NC
4497 case 5:
4498 /* Names all of length 5. */
4499 /* alnum alpha ascii blank cntrl digit graph lower
4500 print punct space upper */
4501 /* Offset 4 gives the best switch position. */
4502 switch (posixcc[4]) {
4503 case 'a':
4504 if (memEQ(posixcc, "alph", 4)) {
4505 /* a */
4506 namedclass
4507 = complement ? ANYOF_NALPHA : ANYOF_ALPHA;
4508 }
4509 break;
4510 case 'e':
4511 if (memEQ(posixcc, "spac", 4)) {
4512 /* e */
4513 namedclass
4514 = complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
4515 }
4516 break;
4517 case 'h':
4518 if (memEQ(posixcc, "grap", 4)) {
4519 /* h */
4520 namedclass
4521 = complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
4522 }
4523 break;
4524 case 'i':
4525 if (memEQ(posixcc, "asci", 4)) {
4526 /* i */
4527 namedclass
4528 = complement ? ANYOF_NASCII : ANYOF_ASCII;
4529 }
4530 break;
4531 case 'k':
4532 if (memEQ(posixcc, "blan", 4)) {
4533 /* k */
4534 namedclass
4535 = complement ? ANYOF_NBLANK : ANYOF_BLANK;
4536 }
4537 break;
4538 case 'l':
4539 if (memEQ(posixcc, "cntr", 4)) {
4540 /* l */
4541 namedclass
4542 = complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
4543 }
4544 break;
4545 case 'm':
4546 if (memEQ(posixcc, "alnu", 4)) {
4547 /* m */
4548 namedclass
4549 = complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
4550 }
4551 break;
4552 case 'r':
4553 if (memEQ(posixcc, "lowe", 4)) {
4554 /* r */
4555 namedclass
4556 = complement ? ANYOF_NLOWER : ANYOF_LOWER;
4557 }
4558 if (memEQ(posixcc, "uppe", 4)) {
8fdec511 4559 /* r */
80916619
NC
4560 namedclass
4561 = complement ? ANYOF_NUPPER : ANYOF_UPPER;
4562 }
4563 break;
4564 case 't':
4565 if (memEQ(posixcc, "digi", 4)) {
4566 /* t */
4567 namedclass
4568 = complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
4569 }
4570 if (memEQ(posixcc, "prin", 4)) {
8fdec511 4571 /* t */
80916619
NC
4572 namedclass
4573 = complement ? ANYOF_NPRINT : ANYOF_PRINT;
4574 }
4575 if (memEQ(posixcc, "punc", 4)) {
4576 /* t */
4577 namedclass
4578 = complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
4579 }
4580 break;
b8c5462f
JH
4581 }
4582 break;
80916619
NC
4583 case 6:
4584 if (memEQ(posixcc, "xdigit", 6)) {
4585 namedclass
4586 = complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
b8c5462f
JH
4587 }
4588 break;
4589 }
80916619
NC
4590
4591 if (namedclass == OOB_NAMEDCLASS)
b45f050a
JF
4592 {
4593 Simple_vFAIL3("POSIX class [:%.*s:] unknown",
4594 t - s - 1, s + 1);
4595 }
80916619
NC
4596 assert (posixcc[skip] == ':');
4597 assert (posixcc[skip+1] == ']');
b45f050a 4598 } else if (!SIZE_ONLY) {
b8c5462f 4599 /* [[=foo=]] and [[.foo.]] are still future. */
b45f050a 4600
830247a4 4601 /* adjust RExC_parse so the warning shows after
b45f050a 4602 the class closes */
9a86a77b 4603 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
830247a4 4604 RExC_parse++;
b45f050a
JF
4605 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
4606 }
b8c5462f
JH
4607 } else {
4608 /* Maternal grandfather:
4609 * "[:" ending in ":" but not in ":]" */
830247a4 4610 RExC_parse = s;
767d463e 4611 }
620e46c5
JH
4612 }
4613 }
4614
b8c5462f
JH
4615 return namedclass;
4616}
4617
4618STATIC void
830247a4 4619S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
b8c5462f 4620{
b938889d 4621 if (!SIZE_ONLY && POSIXCC(UCHARAT(RExC_parse))) {
1df70142
AL
4622 const char *s = RExC_parse;
4623 const char c = *s++;
b8c5462f
JH
4624
4625 while(*s && isALNUM(*s))
4626 s++;
4627 if (*s && c == *s && s[1] == ']') {
cd84f5b2
RGS
4628 if (ckWARN(WARN_REGEXP))
4629 vWARN3(s+2,
4630 "POSIX syntax [%c %c] belongs inside character classes",
4631 c, c);
b45f050a
JF
4632
4633 /* [[=foo=]] and [[.foo.]] are still future. */
9a86a77b 4634 if (POSIXCC_NOTYET(c)) {
830247a4 4635 /* adjust RExC_parse so the error shows after
b45f050a 4636 the class closes */
9a86a77b 4637 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
b45f050a
JF
4638 ;
4639 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
4640 }
b8c5462f
JH
4641 }
4642 }
620e46c5
JH
4643}
4644
76e3520e 4645STATIC regnode *
830247a4 4646S_regclass(pTHX_ RExC_state_t *pRExC_state)
a687059c 4647{
ffc61ed2 4648 register UV value;
9a86a77b 4649 register UV nextvalue;
3568d838 4650 register IV prevvalue = OOB_UNICODE;
ffc61ed2 4651 register IV range = 0;
c277df42 4652 register regnode *ret;
ba210ebe 4653 STRLEN numlen;
ffc61ed2 4654 IV namedclass;
9c5ffd7c 4655 char *rangebegin = 0;
936ed897 4656 bool need_class = 0;
9c5ffd7c 4657 SV *listsv = Nullsv;
ffc61ed2
JH
4658 register char *e;
4659 UV n;
9e55ce06
JH
4660 bool optimize_invert = TRUE;
4661 AV* unicode_alternate = 0;
1b2d223b
JH
4662#ifdef EBCDIC
4663 UV literal_endpoint = 0;
4664#endif
ffc61ed2
JH
4665
4666 ret = reganode(pRExC_state, ANYOF, 0);
4667
4668 if (!SIZE_ONLY)
4669 ANYOF_FLAGS(ret) = 0;
4670
9a86a77b 4671 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
ffc61ed2
JH
4672 RExC_naughty++;
4673 RExC_parse++;
4674 if (!SIZE_ONLY)
4675 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
4676 }
a0d0e21e 4677
936ed897 4678 if (SIZE_ONLY)
830247a4 4679 RExC_size += ANYOF_SKIP;
936ed897 4680 else {
830247a4 4681 RExC_emit += ANYOF_SKIP;
936ed897
IZ
4682 if (FOLD)
4683 ANYOF_FLAGS(ret) |= ANYOF_FOLD;
4684 if (LOC)
4685 ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
ffc61ed2
JH
4686 ANYOF_BITMAP_ZERO(ret);
4687 listsv = newSVpvn("# comment\n", 10);
a0d0e21e 4688 }
b8c5462f 4689
9a86a77b
JH
4690 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
4691
b938889d 4692 if (!SIZE_ONLY && POSIXCC(nextvalue))
830247a4 4693 checkposixcc(pRExC_state);
b8c5462f 4694
f064b6ad
HS
4695 /* allow 1st char to be ] (allowing it to be - is dealt with later) */
4696 if (UCHARAT(RExC_parse) == ']')
4697 goto charclassloop;
ffc61ed2 4698
9a86a77b 4699 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
ffc61ed2
JH
4700
4701 charclassloop:
4702
4703 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
4704
73b437c8 4705 if (!range)
830247a4 4706 rangebegin = RExC_parse;
ffc61ed2 4707 if (UTF) {
5e12f4fb 4708 value = utf8n_to_uvchr((U8*)RExC_parse,
3568d838
JH
4709 RExC_end - RExC_parse,
4710 &numlen, 0);
ffc61ed2
JH
4711 RExC_parse += numlen;
4712 }
4713 else
4714 value = UCHARAT(RExC_parse++);
9a86a77b
JH
4715 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
4716 if (value == '[' && POSIXCC(nextvalue))
830247a4 4717 namedclass = regpposixcc(pRExC_state, value);
620e46c5 4718 else if (value == '\\') {
ffc61ed2 4719 if (UTF) {
5e12f4fb 4720 value = utf8n_to_uvchr((U8*)RExC_parse,
ffc61ed2
JH
4721 RExC_end - RExC_parse,
4722 &numlen, 0);
4723 RExC_parse += numlen;
4724 }
4725 else
4726 value = UCHARAT(RExC_parse++);
470c3474 4727 /* Some compilers cannot handle switching on 64-bit integer
ffc61ed2 4728 * values, therefore value cannot be an UV. Yes, this will
e2962f66
JH
4729 * be a problem later if we want switch on Unicode.
4730 * A similar issue a little bit later when switching on
4731 * namedclass. --jhi */
ffc61ed2 4732 switch ((I32)value) {
b8c5462f
JH
4733 case 'w': namedclass = ANYOF_ALNUM; break;
4734 case 'W': namedclass = ANYOF_NALNUM; break;
4735 case 's': namedclass = ANYOF_SPACE; break;
4736 case 'S': namedclass = ANYOF_NSPACE; break;
4737 case 'd': namedclass = ANYOF_DIGIT; break;
4738 case 'D': namedclass = ANYOF_NDIGIT; break;
ffc61ed2
JH
4739 case 'p':
4740 case 'P':
af6f566e 4741 if (RExC_parse >= RExC_end)
2a4859cd 4742 vFAIL2("Empty \\%c{}", (U8)value);
ffc61ed2 4743 if (*RExC_parse == '{') {
1df70142 4744 const U8 c = (U8)value;
ffc61ed2
JH
4745 e = strchr(RExC_parse++, '}');
4746 if (!e)
0da60cf5 4747 vFAIL2("Missing right brace on \\%c{}", c);
ab13f0c7
JH
4748 while (isSPACE(UCHARAT(RExC_parse)))
4749 RExC_parse++;
4750 if (e == RExC_parse)
0da60cf5 4751 vFAIL2("Empty \\%c{}", c);
ffc61ed2 4752 n = e - RExC_parse;
ab13f0c7
JH
4753 while (isSPACE(UCHARAT(RExC_parse + n - 1)))
4754 n--;
ffc61ed2
JH
4755 }
4756 else {
4757 e = RExC_parse;
4758 n = 1;
4759 }
4760 if (!SIZE_ONLY) {
ab13f0c7
JH
4761 if (UCHARAT(RExC_parse) == '^') {
4762 RExC_parse++;
4763 n--;
4764 value = value == 'p' ? 'P' : 'p'; /* toggle */
4765 while (isSPACE(UCHARAT(RExC_parse))) {
4766 RExC_parse++;
4767 n--;
4768 }
4769 }
ffc61ed2 4770 if (value == 'p')
ab13f0c7
JH
4771 Perl_sv_catpvf(aTHX_ listsv,
4772 "+utf8::%.*s\n", (int)n, RExC_parse);
ffc61ed2 4773 else
ab13f0c7
JH
4774 Perl_sv_catpvf(aTHX_ listsv,
4775 "!utf8::%.*s\n", (int)n, RExC_parse);
ffc61ed2
JH
4776 }
4777 RExC_parse = e + 1;
4778 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
f81125e2
JP
4779 namedclass = ANYOF_MAX; /* no official name, but it's named */
4780 break;
b8c5462f
JH
4781 case 'n': value = '\n'; break;
4782 case 'r': value = '\r'; break;
4783 case 't': value = '\t'; break;
4784 case 'f': value = '\f'; break;
4785 case 'b': value = '\b'; break;
c7f1f016
NIS
4786 case 'e': value = ASCII_TO_NATIVE('\033');break;
4787 case 'a': value = ASCII_TO_NATIVE('\007');break;
b8c5462f 4788 case 'x':
ffc61ed2 4789 if (*RExC_parse == '{') {
a4c04bdc
NC
4790 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
4791 | PERL_SCAN_DISALLOW_PREFIX;
ffc61ed2 4792 e = strchr(RExC_parse++, '}');
b81d288d 4793 if (!e)
ffc61ed2 4794 vFAIL("Missing right brace on \\x{}");
53305cf1
NC
4795
4796 numlen = e - RExC_parse;
4797 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
ffc61ed2
JH
4798 RExC_parse = e + 1;
4799 }
4800 else {
a4c04bdc 4801 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
53305cf1
NC
4802 numlen = 2;
4803 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
ffc61ed2
JH
4804 RExC_parse += numlen;
4805 }
b8c5462f
JH
4806 break;
4807 case 'c':
830247a4 4808 value = UCHARAT(RExC_parse++);
b8c5462f
JH
4809 value = toCTRL(value);
4810 break;
4811 case '0': case '1': case '2': case '3': case '4':
4812 case '5': case '6': case '7': case '8': case '9':
53305cf1
NC
4813 {
4814 I32 flags = 0;
4815 numlen = 3;
4816 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
830247a4 4817 RExC_parse += numlen;
b8c5462f 4818 break;
53305cf1 4819 }
1028017a 4820 default:
e476b1b5 4821 if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(value))
ffc61ed2
JH
4822 vWARN2(RExC_parse,
4823 "Unrecognized escape \\%c in character class passed through",
4824 (int)value);
1028017a 4825 break;
b8c5462f 4826 }
ffc61ed2 4827 } /* end of \blah */
1b2d223b
JH
4828#ifdef EBCDIC
4829 else
4830 literal_endpoint++;
4831#endif
ffc61ed2
JH
4832
4833 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
4834
4835 if (!SIZE_ONLY && !need_class)
936ed897 4836 ANYOF_CLASS_ZERO(ret);
ffc61ed2 4837
936ed897 4838 need_class = 1;
ffc61ed2
JH
4839
4840 /* a bad range like a-\d, a-[:digit:] ? */
4841 if (range) {
73b437c8 4842 if (!SIZE_ONLY) {
e476b1b5 4843 if (ckWARN(WARN_REGEXP))
830247a4 4844 vWARN4(RExC_parse,
b45f050a 4845 "False [] range \"%*.*s\"",
830247a4
IZ
4846 RExC_parse - rangebegin,
4847 RExC_parse - rangebegin,
b45f050a 4848 rangebegin);
3568d838
JH
4849 if (prevvalue < 256) {
4850 ANYOF_BITMAP_SET(ret, prevvalue);
ffc61ed2
JH
4851 ANYOF_BITMAP_SET(ret, '-');
4852 }
4853 else {
4854 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
4855 Perl_sv_catpvf(aTHX_ listsv,
3568d838 4856 "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
ffc61ed2 4857 }
b8c5462f 4858 }
ffc61ed2
JH
4859
4860 range = 0; /* this was not a true range */
73b437c8 4861 }
ffc61ed2 4862
73b437c8 4863 if (!SIZE_ONLY) {
c49a72a9
NC
4864 const char *what = NULL;
4865 char yesno = 0;
4866
3568d838
JH
4867 if (namedclass > OOB_NAMEDCLASS)
4868 optimize_invert = FALSE;
e2962f66
JH
4869 /* Possible truncation here but in some 64-bit environments
4870 * the compiler gets heartburn about switch on 64-bit values.
4871 * A similar issue a little earlier when switching on value.
98f323fa 4872 * --jhi */
e2962f66 4873 switch ((I32)namedclass) {
73b437c8
JH
4874 case ANYOF_ALNUM:
4875 if (LOC)
936ed897 4876 ANYOF_CLASS_SET(ret, ANYOF_ALNUM);
73b437c8
JH
4877 else {
4878 for (value = 0; value < 256; value++)
4879 if (isALNUM(value))
936ed897 4880 ANYOF_BITMAP_SET(ret, value);
73b437c8 4881 }
c49a72a9
NC
4882 yesno = '+';
4883 what = "Word";
73b437c8
JH
4884 break;
4885 case ANYOF_NALNUM:
4886 if (LOC)
936ed897 4887 ANYOF_CLASS_SET(ret, ANYOF_NALNUM);
73b437c8
JH
4888 else {
4889 for (value = 0; value < 256; value++)
4890 if (!isALNUM(value))
936ed897 4891 ANYOF_BITMAP_SET(ret, value);
73b437c8 4892 }
c49a72a9
NC
4893 yesno = '!';
4894 what = "Word";
73b437c8 4895 break;
ffc61ed2 4896 case ANYOF_ALNUMC:
73b437c8 4897 if (LOC)
ffc61ed2 4898 ANYOF_CLASS_SET(ret, ANYOF_ALNUMC);
73b437c8
JH
4899 else {
4900 for (value = 0; value < 256; value++)
ffc61ed2 4901 if (isALNUMC(value))
936ed897 4902 ANYOF_BITMAP_SET(ret, value);
73b437c8 4903 }
c49a72a9
NC
4904 yesno = '+';
4905 what = "Alnum";
73b437c8
JH
4906 break;
4907 case ANYOF_NALNUMC:
4908 if (LOC)
936ed897 4909 ANYOF_CLASS_SET(ret, ANYOF_NALNUMC);
73b437c8
JH
4910 else {
4911 for (value = 0; value < 256; value++)
4912 if (!isALNUMC(value))
936ed897 4913 ANYOF_BITMAP_SET(ret, value);
73b437c8 4914 }
c49a72a9
NC
4915 yesno = '!';
4916 what = "Alnum";
73b437c8
JH
4917 break;
4918 case ANYOF_ALPHA:
4919 if (LOC)
936ed897 4920 ANYOF_CLASS_SET(ret, ANYOF_ALPHA);
73b437c8
JH
4921 else {
4922 for (value = 0; value < 256; value++)
4923 if (isALPHA(value))
936ed897 4924 ANYOF_BITMAP_SET(ret, value);
73b437c8 4925 }
c49a72a9
NC
4926 yesno = '+';
4927 what = "Alpha";
73b437c8
JH
4928 break;
4929 case ANYOF_NALPHA:
4930 if (LOC)
936ed897 4931 ANYOF_CLASS_SET(ret, ANYOF_NALPHA);
73b437c8
JH
4932 else {
4933 for (value = 0; value < 256; value++)
4934 if (!isALPHA(value))
936ed897 4935 ANYOF_BITMAP_SET(ret, value);
73b437c8 4936 }
c49a72a9
NC
4937 yesno = '!';
4938 what = "Alpha";
73b437c8
JH
4939 break;
4940 case ANYOF_ASCII:
4941 if (LOC)
936ed897 4942 ANYOF_CLASS_SET(ret, ANYOF_ASCII);
73b437c8 4943 else {
c7f1f016 4944#ifndef EBCDIC
1ba5c669
JH
4945 for (value = 0; value < 128; value++)
4946 ANYOF_BITMAP_SET(ret, value);
4947#else /* EBCDIC */
ffbc6a93 4948 for (value = 0; value < 256; value++) {
3a3c4447
JH
4949 if (isASCII(value))
4950 ANYOF_BITMAP_SET(ret, value);
ffbc6a93 4951 }
1ba5c669 4952#endif /* EBCDIC */
73b437c8 4953 }
c49a72a9
NC
4954 yesno = '+';
4955 what = "ASCII";
73b437c8
JH
4956 break;
4957 case ANYOF_NASCII:
4958 if (LOC)
936ed897 4959 ANYOF_CLASS_SET(ret, ANYOF_NASCII);
73b437c8 4960 else {
c7f1f016 4961#ifndef EBCDIC
1ba5c669
JH
4962 for (value = 128; value < 256; value++)
4963 ANYOF_BITMAP_SET(ret, value);
4964#else /* EBCDIC */
ffbc6a93 4965 for (value = 0; value < 256; value++) {
3a3c4447
JH
4966 if (!isASCII(value))
4967 ANYOF_BITMAP_SET(ret, value);
ffbc6a93 4968 }
1ba5c669 4969#endif /* EBCDIC */
73b437c8 4970 }
c49a72a9
NC
4971 yesno = '!';
4972 what = "ASCII";
73b437c8 4973 break;
aaa51d5e
JF
4974 case ANYOF_BLANK:
4975 if (LOC)
4976 ANYOF_CLASS_SET(ret, ANYOF_BLANK);
4977 else {
4978 for (value = 0; value < 256; value++)
4979 if (isBLANK(value))
4980 ANYOF_BITMAP_SET(ret, value);
4981 }
c49a72a9
NC
4982 yesno = '+';
4983 what = "Blank";
aaa51d5e
JF
4984 break;
4985 case ANYOF_NBLANK:
4986 if (LOC)
4987 ANYOF_CLASS_SET(ret, ANYOF_NBLANK);
4988 else {
4989 for (value = 0; value < 256; value++)
4990 if (!isBLANK(value))
4991 ANYOF_BITMAP_SET(ret, value);
4992 }
c49a72a9
NC
4993 yesno = '!';
4994 what = "Blank";
aaa51d5e 4995 break;
73b437c8
JH
4996 case ANYOF_CNTRL:
4997 if (LOC)
936ed897 4998 ANYOF_CLASS_SET(ret, ANYOF_CNTRL);
73b437c8
JH
4999 else {
5000 for (value = 0; value < 256; value++)
5001 if (isCNTRL(value))
936ed897 5002 ANYOF_BITMAP_SET(ret, value);
73b437c8 5003 }
c49a72a9
NC
5004 yesno = '+';
5005 what = "Cntrl";
73b437c8
JH
5006 break;
5007 case ANYOF_NCNTRL:
5008 if (LOC)
936ed897 5009 ANYOF_CLASS_SET(ret, ANYOF_NCNTRL);
73b437c8
JH
5010 else {
5011 for (value = 0; value < 256; value++)
5012 if (!isCNTRL(value))
936ed897 5013 ANYOF_BITMAP_SET(ret, value);
73b437c8 5014 }
c49a72a9
NC
5015 yesno = '!';
5016 what = "Cntrl";
ffc61ed2
JH
5017 break;
5018 case ANYOF_DIGIT:
5019 if (LOC)
5020 ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
5021 else {
5022 /* consecutive digits assumed */
5023 for (value = '0'; value <= '9'; value++)
5024 ANYOF_BITMAP_SET(ret, value);
5025 }
c49a72a9
NC
5026 yesno = '+';
5027 what = "Digit";
ffc61ed2
JH
5028 break;
5029 case ANYOF_NDIGIT:
5030 if (LOC)
5031 ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
5032 else {
5033 /* consecutive digits assumed */
5034 for (value = 0; value < '0'; value++)
5035 ANYOF_BITMAP_SET(ret, value);
5036 for (value = '9' + 1; value < 256; value++)
5037 ANYOF_BITMAP_SET(ret, value);
5038 }
c49a72a9
NC
5039 yesno = '!';
5040 what = "Digit";
73b437c8
JH
5041 break;
5042 case ANYOF_GRAPH:
5043 if (LOC)
936ed897 5044 ANYOF_CLASS_SET(ret, ANYOF_GRAPH);
73b437c8
JH
5045 else {
5046 for (value = 0; value < 256; value++)
5047 if (isGRAPH(value))
936ed897 5048 ANYOF_BITMAP_SET(ret, value);
73b437c8 5049 }
c49a72a9
NC
5050 yesno = '+';
5051 what = "Graph";
73b437c8
JH
5052 break;
5053 case ANYOF_NGRAPH:
5054 if (LOC)
936ed897 5055 ANYOF_CLASS_SET(ret, ANYOF_NGRAPH);
73b437c8
JH
5056 else {
5057 for (value = 0; value < 256; value++)
5058 if (!isGRAPH(value))
936ed897 5059 ANYOF_BITMAP_SET(ret, value);
73b437c8 5060 }
c49a72a9
NC
5061 yesno = '!';
5062 what = "Graph";
73b437c8
JH
5063 break;
5064 case ANYOF_LOWER:
5065 if (LOC)
936ed897 5066 ANYOF_CLASS_SET(ret, ANYOF_LOWER);
73b437c8
JH
5067 else {
5068 for (value = 0; value < 256; value++)
5069 if (isLOWER(value))
936ed897 5070 ANYOF_BITMAP_SET(ret, value);
73b437c8 5071 }
c49a72a9
NC
5072 yesno = '+';
5073 what = "Lower";
73b437c8
JH
5074 break;
5075 case ANYOF_NLOWER:
5076 if (LOC)
936ed897 5077 ANYOF_CLASS_SET(ret, ANYOF_NLOWER);
73b437c8
JH
5078 else {
5079 for (value = 0; value < 256; value++)
5080 if (!isLOWER(value))
936ed897 5081 ANYOF_BITMAP_SET(ret, value);
73b437c8 5082 }
c49a72a9
NC
5083 yesno = '!';
5084 what = "Lower";
73b437c8
JH
5085 break;
5086 case ANYOF_PRINT:
5087 if (LOC)
936ed897 5088 ANYOF_CLASS_SET(ret, ANYOF_PRINT);
73b437c8
JH
5089 else {
5090 for (value = 0; value < 256; value++)
5091 if (isPRINT(value))
936ed897 5092 ANYOF_BITMAP_SET(ret, value);
73b437c8 5093 }
c49a72a9
NC
5094 yesno = '+';
5095 what = "Print";
73b437c8
JH
5096 break;
5097 case ANYOF_NPRINT:
5098 if (LOC)
936ed897 5099 ANYOF_CLASS_SET(ret, ANYOF_NPRINT);
73b437c8
JH
5100 else {
5101 for (value = 0; value < 256; value++)
5102 if (!isPRINT(value))
936ed897 5103 ANYOF_BITMAP_SET(ret, value);
73b437c8 5104 }
c49a72a9
NC
5105 yesno = '!';
5106 what = "Print";
73b437c8 5107 break;
aaa51d5e
JF
5108 case ANYOF_PSXSPC:
5109 if (LOC)
5110 ANYOF_CLASS_SET(ret, ANYOF_PSXSPC);
5111 else {
5112 for (value = 0; value < 256; value++)
5113 if (isPSXSPC(value))
5114 ANYOF_BITMAP_SET(ret, value);
5115 }
c49a72a9
NC
5116 yesno = '+';
5117 what = "Space";
aaa51d5e
JF
5118 break;
5119 case ANYOF_NPSXSPC:
5120 if (LOC)
5121 ANYOF_CLASS_SET(ret, ANYOF_NPSXSPC);
5122 else {
5123 for (value = 0; value < 256; value++)
5124 if (!isPSXSPC(value))
5125 ANYOF_BITMAP_SET(ret, value);
5126 }
c49a72a9
NC
5127 yesno = '!';
5128 what = "Space";
aaa51d5e 5129 break;
73b437c8
JH
5130 case ANYOF_PUNCT:
5131 if (LOC)
936ed897 5132 ANYOF_CLASS_SET(ret, ANYOF_PUNCT);
73b437c8
JH
5133 else {
5134 for (value = 0; value < 256; value++)
5135 if (isPUNCT(value))
936ed897 5136 ANYOF_BITMAP_SET(ret, value);
73b437c8 5137 }
c49a72a9
NC
5138 yesno = '+';
5139 what = "Punct";
73b437c8
JH
5140 break;
5141 case ANYOF_NPUNCT:
5142 if (LOC)
936ed897 5143 ANYOF_CLASS_SET(ret, ANYOF_NPUNCT);
73b437c8
JH
5144 else {
5145 for (value = 0; value < 256; value++)
5146 if (!isPUNCT(value))
936ed897 5147 ANYOF_BITMAP_SET(ret, value);
73b437c8 5148 }
c49a72a9
NC
5149 yesno = '!';
5150 what = "Punct";
ffc61ed2
JH
5151 break;
5152 case ANYOF_SPACE:
5153 if (LOC)
5154 ANYOF_CLASS_SET(ret, ANYOF_SPACE);
5155 else {
5156 for (value = 0; value < 256; value++)
5157 if (isSPACE(value))
5158 ANYOF_BITMAP_SET(ret, value);
5159 }
c49a72a9
NC
5160 yesno = '+';
5161 what = "SpacePerl";
ffc61ed2
JH
5162 break;
5163 case ANYOF_NSPACE:
5164 if (LOC)
5165 ANYOF_CLASS_SET(ret, ANYOF_NSPACE);
5166 else {
5167 for (value = 0; value < 256; value++)
5168 if (!isSPACE(value))
5169 ANYOF_BITMAP_SET(ret, value);
5170 }
c49a72a9
NC
5171 yesno = '!';
5172 what = "SpacePerl";
73b437c8
JH
5173 break;
5174 case ANYOF_UPPER:
5175 if (LOC)
936ed897 5176 ANYOF_CLASS_SET(ret, ANYOF_UPPER);
73b437c8
JH
5177 else {
5178 for (value = 0; value < 256; value++)
5179 if (isUPPER(value))
936ed897 5180 ANYOF_BITMAP_SET(ret, value);
73b437c8 5181 }
c49a72a9
NC
5182 yesno = '+';
5183 what = "Upper";
73b437c8
JH
5184 break;
5185 case ANYOF_NUPPER:
5186 if (LOC)
936ed897 5187 ANYOF_CLASS_SET(ret, ANYOF_NUPPER);
73b437c8
JH
5188 else {
5189 for (value = 0; value < 256; value++)
5190 if (!isUPPER(value))
936ed897 5191 ANYOF_BITMAP_SET(ret, value);
73b437c8 5192 }
c49a72a9
NC
5193 yesno = '!';
5194 what = "Upper";
73b437c8
JH
5195 break;
5196 case ANYOF_XDIGIT:
5197 if (LOC)
936ed897 5198 ANYOF_CLASS_SET(ret, ANYOF_XDIGIT);
73b437c8
JH
5199 else {
5200 for (value = 0; value < 256; value++)
5201 if (isXDIGIT(value))
936ed897 5202 ANYOF_BITMAP_SET(ret, value);
73b437c8 5203 }
c49a72a9
NC
5204 yesno = '+';
5205 what = "XDigit";
73b437c8
JH
5206 break;
5207 case ANYOF_NXDIGIT:
5208 if (LOC)
936ed897 5209 ANYOF_CLASS_SET(ret, ANYOF_NXDIGIT);
73b437c8
JH
5210 else {
5211 for (value = 0; value < 256; value++)
5212 if (!isXDIGIT(value))
936ed897 5213 ANYOF_BITMAP_SET(ret, value);
73b437c8 5214 }
c49a72a9
NC
5215 yesno = '!';
5216 what = "XDigit";
73b437c8 5217 break;
f81125e2
JP
5218 case ANYOF_MAX:
5219 /* this is to handle \p and \P */
5220 break;
73b437c8 5221 default:
b45f050a 5222 vFAIL("Invalid [::] class");
73b437c8 5223 break;
b8c5462f 5224 }
c49a72a9
NC
5225 if (what) {
5226 /* Strings such as "+utf8::isWord\n" */
5227 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::Is%s\n", yesno, what);
5228 }
b8c5462f 5229 if (LOC)
936ed897 5230 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
73b437c8 5231 continue;
a0d0e21e 5232 }
ffc61ed2
JH
5233 } /* end of namedclass \blah */
5234
a0d0e21e 5235 if (range) {
eb160463 5236 if (prevvalue > (IV)value) /* b-a */ {
b45f050a 5237 Simple_vFAIL4("Invalid [] range \"%*.*s\"",
830247a4
IZ
5238 RExC_parse - rangebegin,
5239 RExC_parse - rangebegin,
b45f050a 5240 rangebegin);
3568d838 5241 range = 0; /* not a valid range */
73b437c8 5242 }
a0d0e21e
LW
5243 }
5244 else {
3568d838 5245 prevvalue = value; /* save the beginning of the range */
830247a4
IZ
5246 if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
5247 RExC_parse[1] != ']') {
5248 RExC_parse++;
ffc61ed2
JH
5249
5250 /* a bad range like \w-, [:word:]- ? */
5251 if (namedclass > OOB_NAMEDCLASS) {
e476b1b5 5252 if (ckWARN(WARN_REGEXP))
830247a4 5253 vWARN4(RExC_parse,
b45f050a 5254 "False [] range \"%*.*s\"",
830247a4
IZ
5255 RExC_parse - rangebegin,
5256 RExC_parse - rangebegin,
b45f050a 5257 rangebegin);
73b437c8 5258 if (!SIZE_ONLY)
936ed897 5259 ANYOF_BITMAP_SET(ret, '-');
73b437c8 5260 } else
ffc61ed2
JH
5261 range = 1; /* yeah, it's a range! */
5262 continue; /* but do it the next time */
a0d0e21e 5263 }
a687059c 5264 }
ffc61ed2 5265
93733859 5266 /* now is the next time */
ae5c130c 5267 if (!SIZE_ONLY) {
3568d838
JH
5268 IV i;
5269
5270 if (prevvalue < 256) {
1df70142 5271 const IV ceilvalue = value < 256 ? value : 255;
3568d838
JH
5272
5273#ifdef EBCDIC
1b2d223b
JH
5274 /* In EBCDIC [\x89-\x91] should include
5275 * the \x8e but [i-j] should not. */
5276 if (literal_endpoint == 2 &&
5277 ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
5278 (isUPPER(prevvalue) && isUPPER(ceilvalue))))
ffc61ed2 5279 {
3568d838
JH
5280 if (isLOWER(prevvalue)) {
5281 for (i = prevvalue; i <= ceilvalue; i++)
ffc61ed2
JH
5282 if (isLOWER(i))
5283 ANYOF_BITMAP_SET(ret, i);
5284 } else {
3568d838 5285 for (i = prevvalue; i <= ceilvalue; i++)
ffc61ed2
JH
5286 if (isUPPER(i))
5287 ANYOF_BITMAP_SET(ret, i);
5288 }
8ada0baa 5289 }
ffc61ed2 5290 else
8ada0baa 5291#endif
a5961de5
JH
5292 for (i = prevvalue; i <= ceilvalue; i++)
5293 ANYOF_BITMAP_SET(ret, i);
3568d838 5294 }
a5961de5 5295 if (value > 255 || UTF) {
1df70142
AL
5296 const UV prevnatvalue = NATIVE_TO_UNI(prevvalue);
5297 const UV natvalue = NATIVE_TO_UNI(value);
b08decb7 5298
ffc61ed2 5299 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
b08decb7 5300 if (prevnatvalue < natvalue) { /* what about > ? */
ffc61ed2 5301 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
b08decb7
JH
5302 prevnatvalue, natvalue);
5303 }
5304 else if (prevnatvalue == natvalue) {
5305 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue);
09091399 5306 if (FOLD) {
89ebb4a3 5307 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
254ba52a 5308 STRLEN foldlen;
1df70142 5309 const UV f = to_uni_fold(natvalue, foldbuf, &foldlen);
254ba52a 5310
c840d2a2
JH
5311 /* If folding and foldable and a single
5312 * character, insert also the folded version
5313 * to the charclass. */
9e55ce06 5314 if (f != value) {
eb160463 5315 if (foldlen == (STRLEN)UNISKIP(f))
9e55ce06
JH
5316 Perl_sv_catpvf(aTHX_ listsv,
5317 "%04"UVxf"\n", f);
5318 else {
5319 /* Any multicharacter foldings
5320 * require the following transform:
5321 * [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst)
5322 * where E folds into "pq" and F folds
5323 * into "rst", all other characters
5324 * fold to single characters. We save
5325 * away these multicharacter foldings,
5326 * to be later saved as part of the
5327 * additional "s" data. */
5328 SV *sv;
5329
5330 if (!unicode_alternate)
5331 unicode_alternate = newAV();
5332 sv = newSVpvn((char*)foldbuf, foldlen);
5333 SvUTF8_on(sv);
5334 av_push(unicode_alternate, sv);
5335 }
5336 }
254ba52a 5337
60a8b682
JH
5338 /* If folding and the value is one of the Greek
5339 * sigmas insert a few more sigmas to make the
5340 * folding rules of the sigmas to work right.
5341 * Note that not all the possible combinations
5342 * are handled here: some of them are handled
9e55ce06
JH
5343 * by the standard folding rules, and some of
5344 * them (literal or EXACTF cases) are handled
5345 * during runtime in regexec.c:S_find_byclass(). */
09091399
JH
5346 if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) {
5347 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 5348 (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA);
09091399 5349 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 5350 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
09091399
JH
5351 }
5352 else if (value == UNICODE_GREEK_CAPITAL_LETTER_SIGMA)
5353 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 5354 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
09091399
JH
5355 }
5356 }
ffc61ed2 5357 }
1b2d223b
JH
5358#ifdef EBCDIC
5359 literal_endpoint = 0;
5360#endif
8ada0baa 5361 }
ffc61ed2
JH
5362
5363 range = 0; /* this range (if it was one) is done now */
a0d0e21e 5364 }
ffc61ed2 5365
936ed897 5366 if (need_class) {
4f66b38d 5367 ANYOF_FLAGS(ret) |= ANYOF_LARGE;
936ed897 5368 if (SIZE_ONLY)
830247a4 5369 RExC_size += ANYOF_CLASS_ADD_SKIP;
936ed897 5370 else
830247a4 5371 RExC_emit += ANYOF_CLASS_ADD_SKIP;
936ed897 5372 }
ffc61ed2 5373
ae5c130c 5374 /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
b8c5462f 5375 if (!SIZE_ONLY &&
ffc61ed2 5376 /* If the only flag is folding (plus possibly inversion). */
516a5887
JH
5377 ((ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD)
5378 ) {
a0ed51b3 5379 for (value = 0; value < 256; ++value) {
936ed897 5380 if (ANYOF_BITMAP_TEST(ret, value)) {
eb160463 5381 UV fold = PL_fold[value];
ffc61ed2
JH
5382
5383 if (fold != value)
5384 ANYOF_BITMAP_SET(ret, fold);
ae5c130c
GS
5385 }
5386 }
936ed897 5387 ANYOF_FLAGS(ret) &= ~ANYOF_FOLD;
ae5c130c 5388 }
ffc61ed2 5389
ae5c130c 5390 /* optimize inverted simple patterns (e.g. [^a-z]) */
3568d838 5391 if (!SIZE_ONLY && optimize_invert &&
ffc61ed2
JH
5392 /* If the only flag is inversion. */
5393 (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
b8c5462f 5394 for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
936ed897 5395 ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL;
1aa99e6b 5396 ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
ae5c130c 5397 }
a0d0e21e 5398
b81d288d 5399 if (!SIZE_ONLY) {
fde631ed 5400 AV *av = newAV();
ffc61ed2
JH
5401 SV *rv;
5402
9e55ce06 5403 /* The 0th element stores the character class description
6a0407ee 5404 * in its textual form: used later (regexec.c:Perl_regclass_swash())
9e55ce06
JH
5405 * to initialize the appropriate swash (which gets stored in
5406 * the 1st element), and also useful for dumping the regnode.
5407 * The 2nd element stores the multicharacter foldings,
6a0407ee 5408 * used later (regexec.c:S_reginclass()). */
ffc61ed2
JH
5409 av_store(av, 0, listsv);
5410 av_store(av, 1, NULL);
9e55ce06 5411 av_store(av, 2, (SV*)unicode_alternate);
ffc61ed2 5412 rv = newRV_noinc((SV*)av);
19860706 5413 n = add_data(pRExC_state, 1, "s");
830247a4 5414 RExC_rx->data->data[n] = (void*)rv;
ffc61ed2 5415 ARG_SET(ret, n);
a0ed51b3
LW
5416 }
5417
5418 return ret;
5419}
5420
76e3520e 5421STATIC char*
830247a4 5422S_nextchar(pTHX_ RExC_state_t *pRExC_state)
a0d0e21e 5423{
830247a4 5424 char* retval = RExC_parse++;
a0d0e21e 5425
4633a7c4 5426 for (;;) {
830247a4
IZ
5427 if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
5428 RExC_parse[2] == '#') {
e994fd66
AE
5429 while (*RExC_parse != ')') {
5430 if (RExC_parse == RExC_end)
5431 FAIL("Sequence (?#... not terminated");
830247a4 5432 RExC_parse++;
e994fd66 5433 }
830247a4 5434 RExC_parse++;
4633a7c4
LW
5435 continue;
5436 }
e2509266 5437 if (RExC_flags & PMf_EXTENDED) {
830247a4
IZ
5438 if (isSPACE(*RExC_parse)) {
5439 RExC_parse++;
748a9306
LW
5440 continue;
5441 }
830247a4 5442 else if (*RExC_parse == '#') {
e994fd66
AE
5443 while (RExC_parse < RExC_end)
5444 if (*RExC_parse++ == '\n') break;
748a9306
LW
5445 continue;
5446 }
748a9306 5447 }
4633a7c4 5448 return retval;
a0d0e21e 5449 }
a687059c
LW
5450}
5451
5452/*
c277df42 5453- reg_node - emit a node
a0d0e21e 5454*/
76e3520e 5455STATIC regnode * /* Location. */
830247a4 5456S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
a687059c 5457{
c277df42 5458 register regnode *ptr;
504618e9 5459 regnode * const ret = RExC_emit;
a687059c 5460
c277df42 5461 if (SIZE_ONLY) {
830247a4
IZ
5462 SIZE_ALIGN(RExC_size);
5463 RExC_size += 1;
a0d0e21e
LW
5464 return(ret);
5465 }
a687059c 5466
c277df42 5467 NODE_ALIGN_FILL(ret);
a0d0e21e 5468 ptr = ret;
c277df42 5469 FILL_ADVANCE_NODE(ptr, op);
fac92740 5470 if (RExC_offsets) { /* MJD */
ccb2c380 5471 MJD_OFFSET_DEBUG(("%s:%u: (op %s) %s %u <- %u (len %u) (max %u).\n",
fac92740
MJD
5472 "reg_node", __LINE__,
5473 reg_name[op],
5474 RExC_emit - RExC_emit_start > RExC_offsets[0]
5475 ? "Overwriting end of array!\n" : "OK",
5476 RExC_emit - RExC_emit_start,
5477 RExC_parse - RExC_start,
5478 RExC_offsets[0]));
ccb2c380 5479 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
fac92740
MJD
5480 }
5481
830247a4 5482 RExC_emit = ptr;
a687059c 5483
a0d0e21e 5484 return(ret);
a687059c
LW
5485}
5486
5487/*
a0d0e21e
LW
5488- reganode - emit a node with an argument
5489*/
76e3520e 5490STATIC regnode * /* Location. */
830247a4 5491S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
fe14fcc3 5492{
c277df42 5493 register regnode *ptr;
504618e9 5494 regnode * const ret = RExC_emit;
fe14fcc3 5495
c277df42 5496 if (SIZE_ONLY) {
830247a4
IZ
5497 SIZE_ALIGN(RExC_size);
5498 RExC_size += 2;
a0d0e21e
LW
5499 return(ret);
5500 }
fe14fcc3 5501
c277df42 5502 NODE_ALIGN_FILL(ret);
a0d0e21e 5503 ptr = ret;
c277df42 5504 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
fac92740 5505 if (RExC_offsets) { /* MJD */
ccb2c380 5506 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %u <- %u (max %u).\n",
fac92740 5507 "reganode",
ccb2c380
MP
5508 __LINE__,
5509 reg_name[op],
fac92740
MJD
5510 RExC_emit - RExC_emit_start > RExC_offsets[0] ?
5511 "Overwriting end of array!\n" : "OK",
5512 RExC_emit - RExC_emit_start,
5513 RExC_parse - RExC_start,
5514 RExC_offsets[0]));
ccb2c380 5515 Set_Cur_Node_Offset;
fac92740
MJD
5516 }
5517
830247a4 5518 RExC_emit = ptr;
fe14fcc3 5519
a0d0e21e 5520 return(ret);
fe14fcc3
LW
5521}
5522
5523/*
cd439c50 5524- reguni - emit (if appropriate) a Unicode character
a0ed51b3
LW
5525*/
5526STATIC void
a28509cc 5527S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s, STRLEN* lenp)
a0ed51b3 5528{
5e12f4fb 5529 *lenp = SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
a0ed51b3
LW
5530}
5531
5532/*
a0d0e21e
LW
5533- reginsert - insert an operator in front of already-emitted operand
5534*
5535* Means relocating the operand.
5536*/
76e3520e 5537STATIC void
830247a4 5538S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd)
a687059c 5539{
c277df42
IZ
5540 register regnode *src;
5541 register regnode *dst;
5542 register regnode *place;
504618e9 5543 const int offset = regarglen[(U8)op];
b81d288d 5544
22c35a8c 5545/* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
c277df42
IZ
5546
5547 if (SIZE_ONLY) {
830247a4 5548 RExC_size += NODE_STEP_REGNODE + offset;
a0d0e21e
LW
5549 return;
5550 }
a687059c 5551
830247a4
IZ
5552 src = RExC_emit;
5553 RExC_emit += NODE_STEP_REGNODE + offset;
5554 dst = RExC_emit;
fac92740 5555 while (src > opnd) {
c277df42 5556 StructCopy(--src, --dst, regnode);
fac92740 5557 if (RExC_offsets) { /* MJD 20010112 */
ccb2c380 5558 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %u -> %u (max %u).\n",
fac92740 5559 "reg_insert",
ccb2c380
MP
5560 __LINE__,
5561 reg_name[op],
fac92740
MJD
5562 dst - RExC_emit_start > RExC_offsets[0]
5563 ? "Overwriting end of array!\n" : "OK",
5564 src - RExC_emit_start,
5565 dst - RExC_emit_start,
5566 RExC_offsets[0]));
ccb2c380
MP
5567 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
5568 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
fac92740
MJD
5569 }
5570 }
5571
a0d0e21e
LW
5572
5573 place = opnd; /* Op node, where operand used to be. */
fac92740 5574 if (RExC_offsets) { /* MJD */
ccb2c380 5575 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %u <- %u (max %u).\n",
fac92740 5576 "reginsert",
ccb2c380
MP
5577 __LINE__,
5578 reg_name[op],
fac92740
MJD
5579 place - RExC_emit_start > RExC_offsets[0]
5580 ? "Overwriting end of array!\n" : "OK",
5581 place - RExC_emit_start,
5582 RExC_parse - RExC_start,
5583 RExC_offsets[0]));
ccb2c380 5584 Set_Node_Offset(place, RExC_parse);
45948336 5585 Set_Node_Length(place, 1);
fac92740 5586 }
c277df42
IZ
5587 src = NEXTOPER(place);
5588 FILL_ADVANCE_NODE(place, op);
5589 Zero(src, offset, regnode);
a687059c
LW
5590}
5591
5592/*
c277df42 5593- regtail - set the next-pointer at the end of a node chain of p to val.
a0d0e21e 5594*/
76e3520e 5595STATIC void
830247a4 5596S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, regnode *val)
a687059c 5597{
c277df42 5598 register regnode *scan;
a0d0e21e 5599
c277df42 5600 if (SIZE_ONLY)
a0d0e21e
LW
5601 return;
5602
5603 /* Find last node. */
5604 scan = p;
5605 for (;;) {
504618e9 5606 regnode * const temp = regnext(scan);
a0d0e21e
LW
5607 if (temp == NULL)
5608 break;
5609 scan = temp;
5610 }
a687059c 5611
c277df42
IZ
5612 if (reg_off_by_arg[OP(scan)]) {
5613 ARG_SET(scan, val - scan);
a0ed51b3
LW
5614 }
5615 else {
c277df42
IZ
5616 NEXT_OFF(scan) = val - scan;
5617 }
a687059c
LW
5618}
5619
5620/*
a0d0e21e
LW
5621- regoptail - regtail on operand of first argument; nop if operandless
5622*/
76e3520e 5623STATIC void
830247a4 5624S_regoptail(pTHX_ RExC_state_t *pRExC_state, regnode *p, regnode *val)
a687059c 5625{
a0d0e21e 5626 /* "Operandless" and "op != BRANCH" are synonymous in practice. */
c277df42
IZ
5627 if (p == NULL || SIZE_ONLY)
5628 return;
22c35a8c 5629 if (PL_regkind[(U8)OP(p)] == BRANCH) {
830247a4 5630 regtail(pRExC_state, NEXTOPER(p), val);
a0ed51b3 5631 }
22c35a8c 5632 else if ( PL_regkind[(U8)OP(p)] == BRANCHJ) {
830247a4 5633 regtail(pRExC_state, NEXTOPER(NEXTOPER(p)), val);
a0ed51b3
LW
5634 }
5635 else
a0d0e21e 5636 return;
a687059c
LW
5637}
5638
5639/*
5640 - regcurly - a little FSA that accepts {\d+,?\d*}
5641 */
79072805 5642STATIC I32
a3b680e6 5643S_regcurly(pTHX_ register const char *s)
a687059c
LW
5644{
5645 if (*s++ != '{')
5646 return FALSE;
f0fcb552 5647 if (!isDIGIT(*s))
a687059c 5648 return FALSE;
f0fcb552 5649 while (isDIGIT(*s))
a687059c
LW
5650 s++;
5651 if (*s == ',')
5652 s++;
f0fcb552 5653 while (isDIGIT(*s))
a687059c
LW
5654 s++;
5655 if (*s != '}')
5656 return FALSE;
5657 return TRUE;
5658}
5659
a687059c
LW
5660
5661/*
fd181c75 5662 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
a687059c
LW
5663 */
5664void
864dbfa3 5665Perl_regdump(pTHX_ regexp *r)
a687059c 5666{
35ff7856 5667#ifdef DEBUGGING
46fc3d4c 5668 SV *sv = sv_newmortal();
a687059c 5669
c277df42 5670 (void)dumpuntil(r->program, r->program + 1, NULL, sv, 0);
a0d0e21e
LW
5671
5672 /* Header fields of interest. */
c277df42 5673 if (r->anchored_substr)
7b0972df 5674 PerlIO_printf(Perl_debug_log,
a0288114 5675 "anchored \"%s%.*s%s\"%s at %"IVdf" ",
3280af22 5676 PL_colors[0],
7b0972df 5677 (int)(SvCUR(r->anchored_substr) - (SvTAIL(r->anchored_substr)!=0)),
3f7c398e 5678 SvPVX_const(r->anchored_substr),
3280af22 5679 PL_colors[1],
c277df42 5680 SvTAIL(r->anchored_substr) ? "$" : "",
7b0972df 5681 (IV)r->anchored_offset);
33b8afdf
JH
5682 else if (r->anchored_utf8)
5683 PerlIO_printf(Perl_debug_log,
a0288114 5684 "anchored utf8 \"%s%.*s%s\"%s at %"IVdf" ",
33b8afdf
JH
5685 PL_colors[0],
5686 (int)(SvCUR(r->anchored_utf8) - (SvTAIL(r->anchored_utf8)!=0)),
3f7c398e 5687 SvPVX_const(r->anchored_utf8),
33b8afdf
JH
5688 PL_colors[1],
5689 SvTAIL(r->anchored_utf8) ? "$" : "",
5690 (IV)r->anchored_offset);
c277df42 5691 if (r->float_substr)
7b0972df 5692 PerlIO_printf(Perl_debug_log,
a0288114 5693 "floating \"%s%.*s%s\"%s at %"IVdf"..%"UVuf" ",
3280af22 5694 PL_colors[0],
b81d288d 5695 (int)(SvCUR(r->float_substr) - (SvTAIL(r->float_substr)!=0)),
3f7c398e 5696 SvPVX_const(r->float_substr),
3280af22 5697 PL_colors[1],
c277df42 5698 SvTAIL(r->float_substr) ? "$" : "",
7b0972df 5699 (IV)r->float_min_offset, (UV)r->float_max_offset);
33b8afdf
JH
5700 else if (r->float_utf8)
5701 PerlIO_printf(Perl_debug_log,
a0288114 5702 "floating utf8 \"%s%.*s%s\"%s at %"IVdf"..%"UVuf" ",
33b8afdf
JH
5703 PL_colors[0],
5704 (int)(SvCUR(r->float_utf8) - (SvTAIL(r->float_utf8)!=0)),
3f7c398e 5705 SvPVX_const(r->float_utf8),
33b8afdf
JH
5706 PL_colors[1],
5707 SvTAIL(r->float_utf8) ? "$" : "",
5708 (IV)r->float_min_offset, (UV)r->float_max_offset);
5709 if (r->check_substr || r->check_utf8)
b81d288d
AB
5710 PerlIO_printf(Perl_debug_log,
5711 r->check_substr == r->float_substr
33b8afdf 5712 && r->check_utf8 == r->float_utf8
c277df42
IZ
5713 ? "(checking floating" : "(checking anchored");
5714 if (r->reganch & ROPT_NOSCAN)
5715 PerlIO_printf(Perl_debug_log, " noscan");
5716 if (r->reganch & ROPT_CHECK_ALL)
5717 PerlIO_printf(Perl_debug_log, " isall");
33b8afdf 5718 if (r->check_substr || r->check_utf8)
c277df42
IZ
5719 PerlIO_printf(Perl_debug_log, ") ");
5720
46fc3d4c 5721 if (r->regstclass) {
5722 regprop(sv, r->regstclass);
3f7c398e 5723 PerlIO_printf(Perl_debug_log, "stclass \"%s\" ", SvPVX_const(sv));
46fc3d4c 5724 }
774d564b 5725 if (r->reganch & ROPT_ANCH) {
5726 PerlIO_printf(Perl_debug_log, "anchored");
5727 if (r->reganch & ROPT_ANCH_BOL)
5728 PerlIO_printf(Perl_debug_log, "(BOL)");
c277df42
IZ
5729 if (r->reganch & ROPT_ANCH_MBOL)
5730 PerlIO_printf(Perl_debug_log, "(MBOL)");
cad2e5aa
JH
5731 if (r->reganch & ROPT_ANCH_SBOL)
5732 PerlIO_printf(Perl_debug_log, "(SBOL)");
774d564b 5733 if (r->reganch & ROPT_ANCH_GPOS)
5734 PerlIO_printf(Perl_debug_log, "(GPOS)");
5735 PerlIO_putc(Perl_debug_log, ' ');
5736 }
c277df42
IZ
5737 if (r->reganch & ROPT_GPOS_SEEN)
5738 PerlIO_printf(Perl_debug_log, "GPOS ");
a0d0e21e 5739 if (r->reganch & ROPT_SKIP)
760ac839 5740 PerlIO_printf(Perl_debug_log, "plus ");
a0d0e21e 5741 if (r->reganch & ROPT_IMPLICIT)
760ac839 5742 PerlIO_printf(Perl_debug_log, "implicit ");
760ac839 5743 PerlIO_printf(Perl_debug_log, "minlen %ld ", (long) r->minlen);
ce862d02
IZ
5744 if (r->reganch & ROPT_EVAL_SEEN)
5745 PerlIO_printf(Perl_debug_log, "with eval ");
760ac839 5746 PerlIO_printf(Perl_debug_log, "\n");
fac92740 5747 if (r->offsets) {
e4584336 5748 const U32 len = r->offsets[0];
a3621e74
YO
5749 GET_RE_DEBUG_FLAGS_DECL;
5750 DEBUG_OFFSETS_r({
1df70142 5751 U32 i;
e4584336
RB
5752 PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)r->offsets[0]);
5753 for (i = 1; i <= len; i++)
5754 PerlIO_printf(Perl_debug_log, "%"UVuf"[%"UVuf"] ",
5755 (UV)r->offsets[i*2-1], (UV)r->offsets[i*2]);
5756 PerlIO_printf(Perl_debug_log, "\n");
a3621e74 5757 });
fac92740 5758 }
17c3b450 5759#endif /* DEBUGGING */
a687059c
LW
5760}
5761
5762/*
a0d0e21e
LW
5763- regprop - printable representation of opcode
5764*/
46fc3d4c 5765void
a3b680e6 5766Perl_regprop(pTHX_ SV *sv, const regnode *o)
a687059c 5767{
35ff7856 5768#ifdef DEBUGGING
9b155405 5769 register int k;
a0d0e21e 5770
54dc92de 5771 sv_setpvn(sv, "", 0);
9b155405 5772 if (OP(o) >= reg_num) /* regnode.type is unsigned */
830247a4
IZ
5773 /* It would be nice to FAIL() here, but this may be called from
5774 regexec.c, and it would be hard to supply pRExC_state. */
5775 Perl_croak(aTHX_ "Corrupted regexp opcode");
bfed75c6 5776 sv_catpv(sv, reg_name[OP(o)]); /* Take off const! */
9b155405
IZ
5777
5778 k = PL_regkind[(U8)OP(o)];
5779
2a782b5b
JH
5780 if (k == EXACT) {
5781 SV *dsv = sv_2mortal(newSVpvn("", 0));
c728cb41
JH
5782 /* Using is_utf8_string() is a crude hack but it may
5783 * be the best for now since we have no flag "this EXACTish
5784 * node was UTF-8" --jhi */
1df70142
AL
5785 const bool do_utf8 = is_utf8_string((U8*)STRING(o), STR_LEN(o));
5786 const char *s = do_utf8 ?
c728cb41
JH
5787 pv_uni_display(dsv, (U8*)STRING(o), STR_LEN(o), 60,
5788 UNI_DISPLAY_REGEX) :
2a782b5b 5789 STRING(o);
e1ec3a88 5790 const int len = do_utf8 ?
2a782b5b
JH
5791 strlen(s) :
5792 STR_LEN(o);
5793 Perl_sv_catpvf(aTHX_ sv, " <%s%.*s%s>",
5794 PL_colors[0],
5795 len, s,
5796 PL_colors[1]);
a3621e74
YO
5797 } else if (k == TRIE) {/*
5798 this isn't always safe, as Pl_regdata may not be for this regex yet
5799 (depending on where its called from) so its being moved to dumpuntil
5800 I32 n = ARG(o);
5801 reg_trie_data *trie=(reg_trie_data*)PL_regdata->data[n];
5802 Perl_sv_catpvf(aTHX_ sv, " (W:%d L:%d C:%d S:%d)",
5803 trie->wordcount,
5804 trie->charcount,
5805 trie->uniquecharcount,
5806 trie->laststate);
5807 */
5808 } else if (k == CURLY) {
cb434fcc 5809 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
cea2e8a9
GS
5810 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
5811 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
a0d0e21e 5812 }
2c2d71f5
JH
5813 else if (k == WHILEM && o->flags) /* Ordinal/of */
5814 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
9b155405 5815 else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP )
894356b3 5816 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
9b155405 5817 else if (k == LOGICAL)
04ebc1ab 5818 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
653099ff
GS
5819 else if (k == ANYOF) {
5820 int i, rangestart = -1;
ffc61ed2 5821 U8 flags = ANYOF_FLAGS(o);
a6d05634 5822 const char * const anyofs[] = { /* Should be synchronized with
19860706 5823 * ANYOF_ #xdefines in regcomp.h */
653099ff
GS
5824 "\\w",
5825 "\\W",
5826 "\\s",
5827 "\\S",
5828 "\\d",
5829 "\\D",
5830 "[:alnum:]",
5831 "[:^alnum:]",
5832 "[:alpha:]",
5833 "[:^alpha:]",
5834 "[:ascii:]",
5835 "[:^ascii:]",
5836 "[:ctrl:]",
5837 "[:^ctrl:]",
5838 "[:graph:]",
5839 "[:^graph:]",
5840 "[:lower:]",
5841 "[:^lower:]",
5842 "[:print:]",
5843 "[:^print:]",
5844 "[:punct:]",
5845 "[:^punct:]",
5846 "[:upper:]",
aaa51d5e 5847 "[:^upper:]",
653099ff 5848 "[:xdigit:]",
aaa51d5e
JF
5849 "[:^xdigit:]",
5850 "[:space:]",
5851 "[:^space:]",
5852 "[:blank:]",
5853 "[:^blank:]"
653099ff
GS
5854 };
5855
19860706 5856 if (flags & ANYOF_LOCALE)
653099ff 5857 sv_catpv(sv, "{loc}");
19860706 5858 if (flags & ANYOF_FOLD)
653099ff
GS
5859 sv_catpv(sv, "{i}");
5860 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
19860706 5861 if (flags & ANYOF_INVERT)
653099ff 5862 sv_catpv(sv, "^");
ffc61ed2
JH
5863 for (i = 0; i <= 256; i++) {
5864 if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
5865 if (rangestart == -1)
5866 rangestart = i;
5867 } else if (rangestart != -1) {
5868 if (i <= rangestart + 3)
5869 for (; rangestart < i; rangestart++)
653099ff 5870 put_byte(sv, rangestart);
ffc61ed2
JH
5871 else {
5872 put_byte(sv, rangestart);
5873 sv_catpv(sv, "-");
5874 put_byte(sv, i - 1);
653099ff 5875 }
ffc61ed2 5876 rangestart = -1;
653099ff 5877 }
847a199f 5878 }
ffc61ed2
JH
5879
5880 if (o->flags & ANYOF_CLASS)
5881 for (i = 0; i < sizeof(anyofs)/sizeof(char*); i++)
5882 if (ANYOF_CLASS_TEST(o,i))
5883 sv_catpv(sv, anyofs[i]);
5884
5885 if (flags & ANYOF_UNICODE)
5886 sv_catpv(sv, "{unicode}");
1aa99e6b 5887 else if (flags & ANYOF_UNICODE_ALL)
2a782b5b 5888 sv_catpv(sv, "{unicode_all}");
ffc61ed2
JH
5889
5890 {
5891 SV *lv;
9e55ce06 5892 SV *sw = regclass_swash(o, FALSE, &lv, 0);
b81d288d 5893
ffc61ed2
JH
5894 if (lv) {
5895 if (sw) {
89ebb4a3 5896 U8 s[UTF8_MAXBYTES_CASE+1];
b81d288d 5897
ffc61ed2 5898 for (i = 0; i <= 256; i++) { /* just the first 256 */
1df70142 5899 uvchr_to_utf8(s, i);
ffc61ed2 5900
3568d838 5901 if (i < 256 && swash_fetch(sw, s, TRUE)) {
ffc61ed2
JH
5902 if (rangestart == -1)
5903 rangestart = i;
5904 } else if (rangestart != -1) {
5905 U8 *p;
b81d288d 5906
ffc61ed2
JH
5907 if (i <= rangestart + 3)
5908 for (; rangestart < i; rangestart++) {
1df70142 5909 U8 *e;
2b9d42f0 5910 for(e = uvchr_to_utf8(s, rangestart), p = s; p < e; p++)
ffc61ed2
JH
5911 put_byte(sv, *p);
5912 }
5913 else {
1df70142 5914 U8 *e;
2b9d42f0 5915 for (e = uvchr_to_utf8(s, rangestart), p = s; p < e; p++)
ffc61ed2
JH
5916 put_byte(sv, *p);
5917 sv_catpv(sv, "-");
1df70142
AL
5918 for (e = uvchr_to_utf8(s, i - 1), p = s; p < e; p++)
5919 put_byte(sv, *p);
ffc61ed2
JH
5920 }
5921 rangestart = -1;
5922 }
19860706 5923 }
ffc61ed2
JH
5924
5925 sv_catpv(sv, "..."); /* et cetera */
19860706 5926 }
fde631ed 5927
ffc61ed2 5928 {
2e0de35c 5929 char *s = savesvpv(lv);
ffc61ed2 5930 char *origs = s;
b81d288d 5931
ffc61ed2 5932 while(*s && *s != '\n') s++;
b81d288d 5933
ffc61ed2 5934 if (*s == '\n') {
1df70142 5935 const char *t = ++s;
ffc61ed2
JH
5936
5937 while (*s) {
5938 if (*s == '\n')
5939 *s = ' ';
5940 s++;
5941 }
5942 if (s[-1] == ' ')
5943 s[-1] = 0;
5944
5945 sv_catpv(sv, t);
fde631ed 5946 }
b81d288d 5947
ffc61ed2 5948 Safefree(origs);
fde631ed
JH
5949 }
5950 }
653099ff 5951 }
ffc61ed2 5952
653099ff
GS
5953 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
5954 }
9b155405 5955 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
cea2e8a9 5956 Perl_sv_catpvf(aTHX_ sv, "[-%d]", o->flags);
17c3b450 5957#endif /* DEBUGGING */
35ff7856 5958}
a687059c 5959
cad2e5aa
JH
5960SV *
5961Perl_re_intuit_string(pTHX_ regexp *prog)
5962{ /* Assume that RE_INTUIT is set */
a3621e74
YO
5963 GET_RE_DEBUG_FLAGS_DECL;
5964 DEBUG_COMPILE_r(
cfd0369c
NC
5965 {
5966 const char *s = SvPV_nolen_const(prog->check_substr
5967 ? prog->check_substr : prog->check_utf8);
cad2e5aa
JH
5968
5969 if (!PL_colorset) reginitcolors();
5970 PerlIO_printf(Perl_debug_log,
a0288114 5971 "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
33b8afdf
JH
5972 PL_colors[4],
5973 prog->check_substr ? "" : "utf8 ",
5974 PL_colors[5],PL_colors[0],
cad2e5aa
JH
5975 s,
5976 PL_colors[1],
5977 (strlen(s) > 60 ? "..." : ""));
5978 } );
5979
33b8afdf 5980 return prog->check_substr ? prog->check_substr : prog->check_utf8;
cad2e5aa
JH
5981}
5982
2b69d0c2 5983void
864dbfa3 5984Perl_pregfree(pTHX_ struct regexp *r)
a687059c 5985{
27da23d5 5986 dVAR;
9e55ce06
JH
5987#ifdef DEBUGGING
5988 SV *dsv = PERL_DEBUG_PAD_ZERO(0);
a3621e74 5989 SV *re_debug_flags=get_sv(RE_DEBUG_FLAGS,0);
9e55ce06 5990#endif
7821416a 5991
a3621e74 5992
7821416a
IZ
5993 if (!r || (--r->refcnt > 0))
5994 return;
a3621e74 5995 DEBUG_r(if (re_debug_flags && (SvIV(re_debug_flags) & RE_DEBUG_COMPILE)) {
e1ec3a88
AL
5996 const char *s = (r->reganch & ROPT_UTF8)
5997 ? pv_uni_display(dsv, (U8*)r->precomp, r->prelen, 60, UNI_DISPLAY_REGEX)
9f369894 5998 : pv_display(dsv, r->precomp, r->prelen, 0, 60);
e1ec3a88 5999 const int len = SvCUR(dsv);
9e55ce06
JH
6000 if (!PL_colorset)
6001 reginitcolors();
6002 PerlIO_printf(Perl_debug_log,
a3621e74 6003 "%sFreeing REx:%s %s%*.*s%s%s\n",
9e55ce06
JH
6004 PL_colors[4],PL_colors[5],PL_colors[0],
6005 len, len, s,
6006 PL_colors[1],
6007 len > 60 ? "..." : "");
6008 });
cad2e5aa 6009
c277df42 6010 if (r->precomp)
a0d0e21e 6011 Safefree(r->precomp);
fac92740
MJD
6012 if (r->offsets) /* 20010421 MJD */
6013 Safefree(r->offsets);
ed252734 6014 RX_MATCH_COPY_FREE(r);
f8c7b90f 6015#ifdef PERL_OLD_COPY_ON_WRITE
ed252734
NC
6016 if (r->saved_copy)
6017 SvREFCNT_dec(r->saved_copy);
6018#endif
a193d654
GS
6019 if (r->substrs) {
6020 if (r->anchored_substr)
6021 SvREFCNT_dec(r->anchored_substr);
33b8afdf
JH
6022 if (r->anchored_utf8)
6023 SvREFCNT_dec(r->anchored_utf8);
a193d654
GS
6024 if (r->float_substr)
6025 SvREFCNT_dec(r->float_substr);
33b8afdf
JH
6026 if (r->float_utf8)
6027 SvREFCNT_dec(r->float_utf8);
2779dcf1 6028 Safefree(r->substrs);
a193d654 6029 }
c277df42
IZ
6030 if (r->data) {
6031 int n = r->data->count;
f3548bdc
DM
6032 PAD* new_comppad = NULL;
6033 PAD* old_comppad;
4026c95a 6034 PADOFFSET refcnt;
dfad63ad 6035
c277df42 6036 while (--n >= 0) {
261faec3 6037 /* If you add a ->what type here, update the comment in regcomp.h */
c277df42
IZ
6038 switch (r->data->what[n]) {
6039 case 's':
6040 SvREFCNT_dec((SV*)r->data->data[n]);
6041 break;
653099ff
GS
6042 case 'f':
6043 Safefree(r->data->data[n]);
6044 break;
dfad63ad
HS
6045 case 'p':
6046 new_comppad = (AV*)r->data->data[n];
6047 break;
c277df42 6048 case 'o':
dfad63ad 6049 if (new_comppad == NULL)
cea2e8a9 6050 Perl_croak(aTHX_ "panic: pregfree comppad");
f3548bdc
DM
6051 PAD_SAVE_LOCAL(old_comppad,
6052 /* Watch out for global destruction's random ordering. */
6053 (SvTYPE(new_comppad) == SVt_PVAV) ?
6054 new_comppad : Null(PAD *)
6055 );
b34c0dd4 6056 OP_REFCNT_LOCK;
4026c95a
SH
6057 refcnt = OpREFCNT_dec((OP_4tree*)r->data->data[n]);
6058 OP_REFCNT_UNLOCK;
6059 if (!refcnt)
9b978d73 6060 op_free((OP_4tree*)r->data->data[n]);
9b978d73 6061
f3548bdc 6062 PAD_RESTORE_LOCAL(old_comppad);
dfad63ad
HS
6063 SvREFCNT_dec((SV*)new_comppad);
6064 new_comppad = NULL;
c277df42
IZ
6065 break;
6066 case 'n':
9e55ce06 6067 break;
a3621e74
YO
6068 case 't':
6069 {
6070 reg_trie_data *trie=(reg_trie_data*)r->data->data[n];
6071 U32 refcount;
6072 OP_REFCNT_LOCK;
6073 refcount = trie->refcount--;
6074 OP_REFCNT_UNLOCK;
6075 if ( !refcount ) {
6076 if (trie->charmap)
6077 Safefree(trie->charmap);
6078 if (trie->widecharmap)
6079 SvREFCNT_dec((SV*)trie->widecharmap);
6080 if (trie->states)
6081 Safefree(trie->states);
6082 if (trie->trans)
6083 Safefree(trie->trans);
6084#ifdef DEBUGGING
6085 if (trie->words)
6086 SvREFCNT_dec((SV*)trie->words);
6087 if (trie->revcharmap)
6088 SvREFCNT_dec((SV*)trie->revcharmap);
6089#endif
6090 Safefree(r->data->data[n]); /* do this last!!!! */
6091 }
6092 break;
6093 }
c277df42 6094 default:
830247a4 6095 Perl_croak(aTHX_ "panic: regfree data code '%c'", r->data->what[n]);
c277df42
IZ
6096 }
6097 }
6098 Safefree(r->data->what);
6099 Safefree(r->data);
a0d0e21e
LW
6100 }
6101 Safefree(r->startp);
6102 Safefree(r->endp);
6103 Safefree(r);
a687059c 6104}
c277df42
IZ
6105
6106/*
6107 - regnext - dig the "next" pointer out of a node
c277df42
IZ
6108 */
6109regnode *
864dbfa3 6110Perl_regnext(pTHX_ register regnode *p)
c277df42
IZ
6111{
6112 register I32 offset;
6113
3280af22 6114 if (p == &PL_regdummy)
c277df42
IZ
6115 return(NULL);
6116
6117 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
6118 if (offset == 0)
6119 return(NULL);
6120
c277df42 6121 return(p+offset);
c277df42
IZ
6122}
6123
01f988be 6124STATIC void
cea2e8a9 6125S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
c277df42
IZ
6126{
6127 va_list args;
6128 STRLEN l1 = strlen(pat1);
6129 STRLEN l2 = strlen(pat2);
6130 char buf[512];
06bf62c7 6131 SV *msv;
73d840c0 6132 const char *message;
c277df42
IZ
6133
6134 if (l1 > 510)
6135 l1 = 510;
6136 if (l1 + l2 > 510)
6137 l2 = 510 - l1;
6138 Copy(pat1, buf, l1 , char);
6139 Copy(pat2, buf + l1, l2 , char);
3b818b81
GS
6140 buf[l1 + l2] = '\n';
6141 buf[l1 + l2 + 1] = '\0';
8736538c
AS
6142#ifdef I_STDARG
6143 /* ANSI variant takes additional second argument */
c277df42 6144 va_start(args, pat2);
8736538c
AS
6145#else
6146 va_start(args);
6147#endif
5a844595 6148 msv = vmess(buf, &args);
c277df42 6149 va_end(args);
cfd0369c 6150 message = SvPV_const(msv,l1);
c277df42
IZ
6151 if (l1 > 512)
6152 l1 = 512;
6153 Copy(message, buf, l1 , char);
197cf9b9 6154 buf[l1-1] = '\0'; /* Overwrite \n */
cea2e8a9 6155 Perl_croak(aTHX_ "%s", buf);
c277df42 6156}
a0ed51b3
LW
6157
6158/* XXX Here's a total kludge. But we need to re-enter for swash routines. */
6159
6160void
864dbfa3 6161Perl_save_re_context(pTHX)
b81d288d 6162{
830247a4 6163 SAVEI32(PL_reg_flags); /* from regexec.c */
a0ed51b3 6164 SAVEPPTR(PL_bostr);
a0ed51b3
LW
6165 SAVEPPTR(PL_reginput); /* String-input pointer. */
6166 SAVEPPTR(PL_regbol); /* Beginning of input, for ^ check. */
6167 SAVEPPTR(PL_regeol); /* End of input, for $ check. */
7766f137
GS
6168 SAVEVPTR(PL_regstartp); /* Pointer to startp array. */
6169 SAVEVPTR(PL_regendp); /* Ditto for endp. */
6170 SAVEVPTR(PL_reglastparen); /* Similarly for lastparen. */
a5db57d6 6171 SAVEVPTR(PL_reglastcloseparen); /* Similarly for lastcloseparen. */
a0ed51b3 6172 SAVEPPTR(PL_regtill); /* How far we are required to go. */
b81d288d 6173 SAVEGENERICPV(PL_reg_start_tmp); /* from regexec.c */
a0ed51b3 6174 PL_reg_start_tmp = 0;
a0ed51b3
LW
6175 SAVEI32(PL_reg_start_tmpl); /* from regexec.c */
6176 PL_reg_start_tmpl = 0;
7766f137 6177 SAVEVPTR(PL_regdata);
a0ed51b3
LW
6178 SAVEI32(PL_reg_eval_set); /* from regexec.c */
6179 SAVEI32(PL_regnarrate); /* from regexec.c */
7766f137 6180 SAVEVPTR(PL_regprogram); /* from regexec.c */
a0ed51b3 6181 SAVEINT(PL_regindent); /* from regexec.c */
7766f137
GS
6182 SAVEVPTR(PL_regcc); /* from regexec.c */
6183 SAVEVPTR(PL_curcop);
7766f137
GS
6184 SAVEVPTR(PL_reg_call_cc); /* from regexec.c */
6185 SAVEVPTR(PL_reg_re); /* from regexec.c */
54b6e2fa
IZ
6186 SAVEPPTR(PL_reg_ganch); /* from regexec.c */
6187 SAVESPTR(PL_reg_sv); /* from regexec.c */
9febdf04 6188 SAVEBOOL(PL_reg_match_utf8); /* from regexec.c */
7766f137 6189 SAVEVPTR(PL_reg_magic); /* from regexec.c */
54b6e2fa 6190 SAVEI32(PL_reg_oldpos); /* from regexec.c */
7766f137
GS
6191 SAVEVPTR(PL_reg_oldcurpm); /* from regexec.c */
6192 SAVEVPTR(PL_reg_curpm); /* from regexec.c */
a5db57d6
GS
6193 SAVEPPTR(PL_reg_oldsaved); /* old saved substr during match */
6194 PL_reg_oldsaved = Nullch;
6195 SAVEI32(PL_reg_oldsavedlen); /* old length of saved substr during match */
6196 PL_reg_oldsavedlen = 0;
f8c7b90f 6197#ifdef PERL_OLD_COPY_ON_WRITE
ed252734
NC
6198 SAVESPTR(PL_nrs);
6199 PL_nrs = Nullsv;
6200#endif
a5db57d6
GS
6201 SAVEI32(PL_reg_maxiter); /* max wait until caching pos */
6202 PL_reg_maxiter = 0;
6203 SAVEI32(PL_reg_leftiter); /* wait until caching pos */
6204 PL_reg_leftiter = 0;
6205 SAVEGENERICPV(PL_reg_poscache); /* cache of pos of WHILEM */
6206 PL_reg_poscache = Nullch;
6207 SAVEI32(PL_reg_poscache_size); /* size of pos cache of WHILEM */
6208 PL_reg_poscache_size = 0;
6209 SAVEPPTR(PL_regprecomp); /* uncompiled string. */
5fb7366e 6210 SAVEI32(PL_regnpar); /* () count. */
e49a9654 6211 SAVEI32(PL_regsize); /* from regexec.c */
ada6e8a9
AMS
6212
6213 {
6214 /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
ada6e8a9 6215 REGEXP *rx;
ada6e8a9
AMS
6216
6217 if (PL_curpm && (rx = PM_GETRE(PL_curpm))) {
1df70142 6218 U32 i;
ada6e8a9 6219 for (i = 1; i <= rx->nparens; i++) {
1df70142
AL
6220 GV *mgv;
6221 char digits[TYPE_CHARS(long)];
d994d9a1 6222 sprintf(digits, "%lu", (long)i);
ada6e8a9
AMS
6223 if ((mgv = gv_fetchpv(digits, FALSE, SVt_PV)))
6224 save_scalar(mgv);
6225 }
6226 }
6227 }
6228
54b6e2fa 6229#ifdef DEBUGGING
b81d288d 6230 SAVEPPTR(PL_reg_starttry); /* from regexec.c */
54b6e2fa 6231#endif
a0ed51b3 6232}
51371543 6233
51371543 6234static void
acfe0abc 6235clear_re(pTHX_ void *r)
51371543
GS
6236{
6237 ReREFCNT_dec((regexp *)r);
6238}
ffbc6a93 6239
a28509cc
AL
6240#ifdef DEBUGGING
6241
6242STATIC void
6243S_put_byte(pTHX_ SV *sv, int c)
6244{
6245 if (isCNTRL(c) || c == 255 || !isPRINT(c))
6246 Perl_sv_catpvf(aTHX_ sv, "\\%o", c);
6247 else if (c == '-' || c == ']' || c == '\\' || c == '^')
6248 Perl_sv_catpvf(aTHX_ sv, "\\%c", c);
6249 else
6250 Perl_sv_catpvf(aTHX_ sv, "%c", c);
6251}
6252
6253
6254STATIC regnode *
6255S_dumpuntil(pTHX_ regnode *start, regnode *node, regnode *last, SV* sv, I32 l)
6256{
6257 register U8 op = EXACT; /* Arbitrary non-END op. */
6258 register regnode *next;
6259
6260 while (op != END && (!last || node < last)) {
6261 /* While that wasn't END last time... */
6262
6263 NODE_ALIGN(node);
6264 op = OP(node);
6265 if (op == CLOSE)
6266 l--;
6267 next = regnext(node);
6268 /* Where, what. */
6269 if (OP(node) == OPTIMIZED)
6270 goto after_print;
6271 regprop(sv, node);
6272 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
6273 (int)(2*l + 1), "", SvPVX_const(sv));
6274 if (next == NULL) /* Next ptr. */
6275 PerlIO_printf(Perl_debug_log, "(0)");
6276 else
6277 PerlIO_printf(Perl_debug_log, "(%"IVdf")", (IV)(next - start));
6278 (void)PerlIO_putc(Perl_debug_log, '\n');
6279 after_print:
6280 if (PL_regkind[(U8)op] == BRANCHJ) {
6281 register regnode *nnode = (OP(next) == LONGJMP
6282 ? regnext(next)
6283 : next);
6284 if (last && nnode > last)
6285 nnode = last;
6286 node = dumpuntil(start, NEXTOPER(NEXTOPER(node)), nnode, sv, l + 1);
6287 }
6288 else if (PL_regkind[(U8)op] == BRANCH) {
6289 node = dumpuntil(start, NEXTOPER(node), next, sv, l + 1);
6290 }
6291 else if ( PL_regkind[(U8)op] == TRIE ) {
6292 const I32 n = ARG(node);
6293 const reg_trie_data * const trie = (reg_trie_data*)PL_regdata->data[n];
6294 const I32 arry_len = av_len(trie->words)+1;
6295 I32 word_idx;
6296 PerlIO_printf(Perl_debug_log,
6297 "%*s[Words:%d Chars Stored:%d Unique Chars:%d States:%"IVdf"%s]\n",
6298 (int)(2*(l+3)),
6299 "",
6300 trie->wordcount,
6301 (int)trie->charcount,
6302 trie->uniquecharcount,
6303 (IV)trie->laststate-1,
6304 node->flags ? " EVAL mode" : "");
6305
6306 for (word_idx=0; word_idx < arry_len; word_idx++) {
6307 SV **elem_ptr=av_fetch(trie->words,word_idx,0);
6308 if (elem_ptr) {
6309 PerlIO_printf(Perl_debug_log, "%*s<%s%s%s>\n",
6310 (int)(2*(l+4)), "",
6311 PL_colors[0],
cfd0369c 6312 SvPV_nolen_const(*elem_ptr),
a28509cc
AL
6313 PL_colors[1]
6314 );
6315 /*
6316 if (next == NULL)
6317 PerlIO_printf(Perl_debug_log, "(0)\n");
6318 else
6319 PerlIO_printf(Perl_debug_log, "(%"IVdf")\n", (IV)(next - start));
6320 */
6321 }
6322
6323 }
6324
6325 node = NEXTOPER(node);
6326 node += regarglen[(U8)op];
6327
6328 }
6329 else if ( op == CURLY) { /* "next" might be very big: optimizer */
6330 node = dumpuntil(start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
6331 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1, sv, l + 1);
6332 }
6333 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
6334 node = dumpuntil(start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
6335 next, sv, l + 1);
6336 }
6337 else if ( op == PLUS || op == STAR) {
6338 node = dumpuntil(start, NEXTOPER(node), NEXTOPER(node) + 1, sv, l + 1);
6339 }
6340 else if (op == ANYOF) {
6341 /* arglen 1 + class block */
6342 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_LARGE)
6343 ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
6344 node = NEXTOPER(node);
6345 }
6346 else if (PL_regkind[(U8)op] == EXACT) {
6347 /* Literal string, where present. */
6348 node += NODE_SZ_STR(node) - 1;
6349 node = NEXTOPER(node);
6350 }
6351 else {
6352 node = NEXTOPER(node);
6353 node += regarglen[(U8)op];
6354 }
6355 if (op == CURLYX || op == OPEN)
6356 l++;
6357 else if (op == WHILEM)
6358 l--;
6359 }
6360 return node;
6361}
6362
6363#endif /* DEBUGGING */
6364
241d1a3b
NC
6365/*
6366 * Local variables:
6367 * c-indentation-style: bsd
6368 * c-basic-offset: 4
6369 * indent-tabs-mode: t
6370 * End:
6371 *
37442d52
RGS
6372 * ex: set ts=8 sts=4 sw=4 noet:
6373 */