This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Add -Wdeclaration-after-statement to default flags for gcc 3+
[perl5.git] / regcomp.c
CommitLineData
a0d0e21e
LW
1/* regcomp.c
2 */
3
4/*
5 * "A fair jaw-cracker dwarf-language must be." --Samwise Gamgee
6 */
7
61296642
DM
8/* This file contains functions for compiling a regular expression. See
9 * also regexec.c which funnily enough, contains functions for executing
166f8a29 10 * a regular expression.
e4a054ea
DM
11 *
12 * This file is also copied at build time to ext/re/re_comp.c, where
13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14 * This causes the main functions to be compiled under new names and with
15 * debugging support added, which makes "use re 'debug'" work.
166f8a29
DM
16 */
17
a687059c
LW
18/* NOTE: this is derived from Henry Spencer's regexp code, and should not
19 * confused with the original package (see point 3 below). Thanks, Henry!
20 */
21
22/* Additional note: this code is very heavily munged from Henry's version
23 * in places. In some spots I've traded clarity for efficiency, so don't
24 * blame Henry for some of the lack of readability.
25 */
26
e50aee73
AD
27/* The names of the functions have been changed from regcomp and
28 * regexec to pregcomp and pregexec in order to avoid conflicts
29 * with the POSIX routines of the same names.
30*/
31
b9d5759e
AD
32#ifdef PERL_EXT_RE_BUILD
33/* need to replace pregcomp et al, so enable that */
34# ifndef PERL_IN_XSUB_RE
35# define PERL_IN_XSUB_RE
36# endif
37/* need access to debugger hooks */
cad2e5aa 38# if defined(PERL_EXT_RE_DEBUG) && !defined(DEBUGGING)
b9d5759e
AD
39# define DEBUGGING
40# endif
41#endif
42
43#ifdef PERL_IN_XSUB_RE
d06ea78c 44/* We *really* need to overwrite these symbols: */
56953603
IZ
45# define Perl_pregcomp my_regcomp
46# define Perl_regdump my_regdump
47# define Perl_regprop my_regprop
d06ea78c 48# define Perl_pregfree my_regfree
cad2e5aa
JH
49# define Perl_re_intuit_string my_re_intuit_string
50/* *These* symbols are masked to allow static link. */
d06ea78c 51# define Perl_regnext my_regnext
f0b8d043 52# define Perl_save_re_context my_save_re_context
b81d288d 53# define Perl_reginitcolors my_reginitcolors
c5be433b
GS
54
55# define PERL_NO_GET_CONTEXT
b81d288d 56#endif
56953603 57
f0fcb552 58/*SUPPRESS 112*/
a687059c 59/*
e50aee73 60 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
61 *
62 * Copyright (c) 1986 by University of Toronto.
63 * Written by Henry Spencer. Not derived from licensed software.
64 *
65 * Permission is granted to anyone to use this software for any
66 * purpose on any computer system, and to redistribute it freely,
67 * subject to the following restrictions:
68 *
69 * 1. The author is not responsible for the consequences of use of
70 * this software, no matter how awful, even if they arise
71 * from defects in it.
72 *
73 * 2. The origin of this software must not be misrepresented, either
74 * by explicit claim or by omission.
75 *
76 * 3. Altered versions must be plainly marked as such, and must not
77 * be misrepresented as being the original software.
78 *
79 *
80 **** Alterations to Henry's code are...
81 ****
4bb101f2 82 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
241d1a3b 83 **** 2000, 2001, 2002, 2003, 2004, 2005, by Larry Wall and others
a687059c 84 ****
9ef589d8
LW
85 **** You may distribute under the terms of either the GNU General Public
86 **** License or the Artistic License, as specified in the README file.
87
a687059c
LW
88 *
89 * Beware that some of this code is subtly aware of the way operator
90 * precedence is structured in regular expressions. Serious changes in
91 * regular-expression syntax might require a total rethink.
92 */
93#include "EXTERN.h"
864dbfa3 94#define PERL_IN_REGCOMP_C
a687059c 95#include "perl.h"
d06ea78c 96
acfe0abc 97#ifndef PERL_IN_XSUB_RE
d06ea78c
GS
98# include "INTERN.h"
99#endif
c277df42
IZ
100
101#define REG_COMP_C
a687059c
LW
102#include "regcomp.h"
103
d4cce5f1 104#ifdef op
11343788 105#undef op
d4cce5f1 106#endif /* op */
11343788 107
fe14fcc3 108#ifdef MSDOS
7e4e8c89 109# if defined(BUGGY_MSC6)
fe14fcc3 110 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
7e4e8c89 111# pragma optimize("a",off)
fe14fcc3 112 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
7e4e8c89
NC
113# pragma optimize("w",on )
114# endif /* BUGGY_MSC6 */
fe14fcc3
LW
115#endif /* MSDOS */
116
a687059c
LW
117#ifndef STATIC
118#define STATIC static
119#endif
120
830247a4 121typedef struct RExC_state_t {
e2509266 122 U32 flags; /* are we folding, multilining? */
830247a4
IZ
123 char *precomp; /* uncompiled string. */
124 regexp *rx;
fac92740 125 char *start; /* Start of input for compile */
830247a4
IZ
126 char *end; /* End of input for compile */
127 char *parse; /* Input-scan pointer. */
128 I32 whilem_seen; /* number of WHILEM in this expr */
fac92740 129 regnode *emit_start; /* Start of emitted-code area */
ffc61ed2 130 regnode *emit; /* Code-emit pointer; &regdummy = don't = compiling */
830247a4
IZ
131 I32 naughty; /* How bad is this pattern? */
132 I32 sawback; /* Did we see \1, ...? */
133 U32 seen;
134 I32 size; /* Code size. */
135 I32 npar; /* () count. */
136 I32 extralen;
137 I32 seen_zerolen;
138 I32 seen_evals;
1aa99e6b 139 I32 utf8;
830247a4
IZ
140#if ADD_TO_REGEXEC
141 char *starttry; /* -Dr: where regtry was called. */
142#define RExC_starttry (pRExC_state->starttry)
143#endif
144} RExC_state_t;
145
e2509266 146#define RExC_flags (pRExC_state->flags)
830247a4
IZ
147#define RExC_precomp (pRExC_state->precomp)
148#define RExC_rx (pRExC_state->rx)
fac92740 149#define RExC_start (pRExC_state->start)
830247a4
IZ
150#define RExC_end (pRExC_state->end)
151#define RExC_parse (pRExC_state->parse)
152#define RExC_whilem_seen (pRExC_state->whilem_seen)
fac92740 153#define RExC_offsets (pRExC_state->rx->offsets) /* I am not like the others */
830247a4 154#define RExC_emit (pRExC_state->emit)
fac92740 155#define RExC_emit_start (pRExC_state->emit_start)
830247a4
IZ
156#define RExC_naughty (pRExC_state->naughty)
157#define RExC_sawback (pRExC_state->sawback)
158#define RExC_seen (pRExC_state->seen)
159#define RExC_size (pRExC_state->size)
160#define RExC_npar (pRExC_state->npar)
161#define RExC_extralen (pRExC_state->extralen)
162#define RExC_seen_zerolen (pRExC_state->seen_zerolen)
163#define RExC_seen_evals (pRExC_state->seen_evals)
1aa99e6b 164#define RExC_utf8 (pRExC_state->utf8)
830247a4 165
a687059c
LW
166#define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
167#define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
168 ((*s) == '{' && regcurly(s)))
a687059c 169
35c8bce7
LW
170#ifdef SPSTART
171#undef SPSTART /* dratted cpp namespace... */
172#endif
a687059c
LW
173/*
174 * Flags to be passed up and down.
175 */
a687059c 176#define WORST 0 /* Worst case. */
821b33a5 177#define HASWIDTH 0x1 /* Known to match non-null strings. */
a0d0e21e
LW
178#define SIMPLE 0x2 /* Simple enough to be STAR/PLUS operand. */
179#define SPSTART 0x4 /* Starts with * or +. */
180#define TRYAGAIN 0x8 /* Weeded out a declaration. */
a687059c 181
2c2d71f5
JH
182/* Length of a variant. */
183
184typedef struct scan_data_t {
185 I32 len_min;
186 I32 len_delta;
187 I32 pos_min;
188 I32 pos_delta;
189 SV *last_found;
190 I32 last_end; /* min value, <0 unless valid. */
191 I32 last_start_min;
192 I32 last_start_max;
193 SV **longest; /* Either &l_fixed, or &l_float. */
194 SV *longest_fixed;
195 I32 offset_fixed;
196 SV *longest_float;
197 I32 offset_float_min;
198 I32 offset_float_max;
199 I32 flags;
200 I32 whilem_c;
cb434fcc 201 I32 *last_closep;
653099ff 202 struct regnode_charclass_class *start_class;
2c2d71f5
JH
203} scan_data_t;
204
a687059c 205/*
e50aee73 206 * Forward declarations for pregcomp()'s friends.
a687059c 207 */
a0d0e21e 208
27da23d5
JH
209static const scan_data_t zero_scan_data =
210 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
c277df42
IZ
211
212#define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
213#define SF_BEFORE_SEOL 0x1
214#define SF_BEFORE_MEOL 0x2
215#define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
216#define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
217
09b7f37c
CB
218#ifdef NO_UNARY_PLUS
219# define SF_FIX_SHIFT_EOL (0+2)
220# define SF_FL_SHIFT_EOL (0+4)
221#else
222# define SF_FIX_SHIFT_EOL (+2)
223# define SF_FL_SHIFT_EOL (+4)
224#endif
c277df42
IZ
225
226#define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
227#define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
228
229#define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
230#define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
231#define SF_IS_INF 0x40
232#define SF_HAS_PAR 0x80
233#define SF_IN_PAR 0x100
234#define SF_HAS_EVAL 0x200
4bfe0158 235#define SCF_DO_SUBSTR 0x400
653099ff
GS
236#define SCF_DO_STCLASS_AND 0x0800
237#define SCF_DO_STCLASS_OR 0x1000
238#define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
e1901655 239#define SCF_WHILEM_VISITED_POS 0x2000
c277df42 240
eb160463 241#define UTF (RExC_utf8 != 0)
e2509266
JH
242#define LOC ((RExC_flags & PMf_LOCALE) != 0)
243#define FOLD ((RExC_flags & PMf_FOLD) != 0)
a0ed51b3 244
ffc61ed2 245#define OOB_UNICODE 12345678
93733859 246#define OOB_NAMEDCLASS -1
b8c5462f 247
a0ed51b3
LW
248#define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
249#define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
250
8615cb43 251
b45f050a
JF
252/* length of regex to show in messages that don't mark a position within */
253#define RegexLengthToShowInErrorMessages 127
254
255/*
256 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
257 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
258 * op/pragma/warn/regcomp.
259 */
7253e4e3
RK
260#define MARKER1 "<-- HERE" /* marker as it appears in the description */
261#define MARKER2 " <-- HERE " /* marker as it appears within the regex */
b81d288d 262
7253e4e3 263#define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
b45f050a
JF
264
265/*
266 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
267 * arg. Show regex, up to a maximum length. If it's too long, chop and add
268 * "...".
269 */
ccb2c380 270#define FAIL(msg) STMT_START { \
bfed75c6 271 const char *ellipses = ""; \
ccb2c380
MP
272 IV len = RExC_end - RExC_precomp; \
273 \
274 if (!SIZE_ONLY) \
275 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
276 if (len > RegexLengthToShowInErrorMessages) { \
277 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
278 len = RegexLengthToShowInErrorMessages - 10; \
279 ellipses = "..."; \
280 } \
281 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
282 msg, (int)len, RExC_precomp, ellipses); \
283} STMT_END
8615cb43 284
b45f050a
JF
285/*
286 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
287 * args. Show regex, up to a maximum length. If it's too long, chop and add
288 * "...".
289 */
ccb2c380 290#define FAIL2(pat,msg) STMT_START { \
bfed75c6 291 const char *ellipses = ""; \
ccb2c380
MP
292 IV len = RExC_end - RExC_precomp; \
293 \
294 if (!SIZE_ONLY) \
295 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
296 if (len > RegexLengthToShowInErrorMessages) { \
297 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
298 len = RegexLengthToShowInErrorMessages - 10; \
299 ellipses = "..."; \
300 } \
301 S_re_croak2(aTHX_ pat, " in regex m/%.*s%s/", \
302 msg, (int)len, RExC_precomp, ellipses); \
303} STMT_END
b45f050a
JF
304
305
306/*
307 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
308 */
ccb2c380
MP
309#define Simple_vFAIL(m) STMT_START { \
310 IV offset = RExC_parse - RExC_precomp; \
311 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
312 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
313} STMT_END
b45f050a
JF
314
315/*
316 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
317 */
ccb2c380
MP
318#define vFAIL(m) STMT_START { \
319 if (!SIZE_ONLY) \
320 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
321 Simple_vFAIL(m); \
322} STMT_END
b45f050a
JF
323
324/*
325 * Like Simple_vFAIL(), but accepts two arguments.
326 */
ccb2c380
MP
327#define Simple_vFAIL2(m,a1) STMT_START { \
328 IV offset = RExC_parse - RExC_precomp; \
329 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
330 (int)offset, RExC_precomp, RExC_precomp + offset); \
331} STMT_END
b45f050a
JF
332
333/*
334 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
335 */
ccb2c380
MP
336#define vFAIL2(m,a1) STMT_START { \
337 if (!SIZE_ONLY) \
338 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
339 Simple_vFAIL2(m, a1); \
340} STMT_END
b45f050a
JF
341
342
343/*
344 * Like Simple_vFAIL(), but accepts three arguments.
345 */
ccb2c380
MP
346#define Simple_vFAIL3(m, a1, a2) STMT_START { \
347 IV offset = RExC_parse - RExC_precomp; \
348 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
349 (int)offset, RExC_precomp, RExC_precomp + offset); \
350} STMT_END
b45f050a
JF
351
352/*
353 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
354 */
ccb2c380
MP
355#define vFAIL3(m,a1,a2) STMT_START { \
356 if (!SIZE_ONLY) \
357 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
358 Simple_vFAIL3(m, a1, a2); \
359} STMT_END
b45f050a
JF
360
361/*
362 * Like Simple_vFAIL(), but accepts four arguments.
363 */
ccb2c380
MP
364#define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
365 IV offset = RExC_parse - RExC_precomp; \
366 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
367 (int)offset, RExC_precomp, RExC_precomp + offset); \
368} STMT_END
b45f050a
JF
369
370/*
371 * Like Simple_vFAIL(), but accepts five arguments.
372 */
ccb2c380
MP
373#define Simple_vFAIL5(m, a1, a2, a3, a4) STMT_START { \
374 IV offset = RExC_parse - RExC_precomp; \
375 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, a4, \
376 (int)offset, RExC_precomp, RExC_precomp + offset); \
377} STMT_END
378
379
380#define vWARN(loc,m) STMT_START { \
381 IV offset = loc - RExC_precomp; \
382 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \
383 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
384} STMT_END
385
386#define vWARNdep(loc,m) STMT_START { \
387 IV offset = loc - RExC_precomp; \
388 Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
389 "%s" REPORT_LOCATION, \
390 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
391} STMT_END
392
393
394#define vWARN2(loc, m, a1) STMT_START { \
395 IV offset = loc - RExC_precomp; \
396 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
397 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
398} STMT_END
399
400#define vWARN3(loc, m, a1, a2) STMT_START { \
401 IV offset = loc - RExC_precomp; \
402 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
403 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
404} STMT_END
405
406#define vWARN4(loc, m, a1, a2, a3) STMT_START { \
407 IV offset = loc - RExC_precomp; \
408 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
409 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
410} STMT_END
411
412#define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
413 IV offset = loc - RExC_precomp; \
414 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
415 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
416} STMT_END
9d1d55b5 417
8615cb43 418
cd439c50 419/* Allow for side effects in s */
ccb2c380
MP
420#define REGC(c,s) STMT_START { \
421 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
422} STMT_END
cd439c50 423
fac92740
MJD
424/* Macros for recording node offsets. 20001227 mjd@plover.com
425 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
426 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
427 * Element 0 holds the number n.
428 */
429
430#define MJD_OFFSET_DEBUG(x)
a3621e74 431/* #define MJD_OFFSET_DEBUG(x) DEBUG_r(Perl_warn_nocontext x) */
ccb2c380
MP
432
433
434#define Set_Node_Offset_To_R(node,byte) STMT_START { \
435 if (! SIZE_ONLY) { \
436 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
437 __LINE__, (node), (byte))); \
438 if((node) < 0) { \
439 Perl_croak(aTHX_ "value of node is %d in Offset macro", node); \
440 } else { \
441 RExC_offsets[2*(node)-1] = (byte); \
442 } \
443 } \
444} STMT_END
445
446#define Set_Node_Offset(node,byte) \
447 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
448#define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
449
450#define Set_Node_Length_To_R(node,len) STMT_START { \
451 if (! SIZE_ONLY) { \
452 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
453 __LINE__, (node), (len))); \
454 if((node) < 0) { \
455 Perl_croak(aTHX_ "value of node is %d in Length macro", node); \
456 } else { \
457 RExC_offsets[2*(node)] = (len); \
458 } \
459 } \
460} STMT_END
461
462#define Set_Node_Length(node,len) \
463 Set_Node_Length_To_R((node)-RExC_emit_start, len)
464#define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
465#define Set_Node_Cur_Length(node) \
466 Set_Node_Length(node, RExC_parse - parse_start)
fac92740
MJD
467
468/* Get offsets and lengths */
469#define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
470#define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
471
acfe0abc 472static void clear_re(pTHX_ void *r);
4327152a 473
653099ff
GS
474/* Mark that we cannot extend a found fixed substring at this point.
475 Updata the longest found anchored substring and the longest found
476 floating substrings if needed. */
477
4327152a 478STATIC void
830247a4 479S_scan_commit(pTHX_ RExC_state_t *pRExC_state, scan_data_t *data)
c277df42 480{
e1ec3a88
AL
481 const STRLEN l = CHR_SVLEN(data->last_found);
482 const STRLEN old_l = CHR_SVLEN(*data->longest);
b81d288d 483
c277df42 484 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
6b43b216 485 SvSetMagicSV(*data->longest, data->last_found);
c277df42
IZ
486 if (*data->longest == data->longest_fixed) {
487 data->offset_fixed = l ? data->last_start_min : data->pos_min;
488 if (data->flags & SF_BEFORE_EOL)
b81d288d 489 data->flags
c277df42
IZ
490 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
491 else
492 data->flags &= ~SF_FIX_BEFORE_EOL;
a0ed51b3
LW
493 }
494 else {
c277df42 495 data->offset_float_min = l ? data->last_start_min : data->pos_min;
b81d288d
AB
496 data->offset_float_max = (l
497 ? data->last_start_max
c277df42 498 : data->pos_min + data->pos_delta);
9051bda5
HS
499 if ((U32)data->offset_float_max > (U32)I32_MAX)
500 data->offset_float_max = I32_MAX;
c277df42 501 if (data->flags & SF_BEFORE_EOL)
b81d288d 502 data->flags
c277df42
IZ
503 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
504 else
505 data->flags &= ~SF_FL_BEFORE_EOL;
506 }
507 }
508 SvCUR_set(data->last_found, 0);
0eda9292
JH
509 {
510 SV * sv = data->last_found;
511 MAGIC *mg =
512 SvUTF8(sv) && SvMAGICAL(sv) ? mg_find(sv, PERL_MAGIC_utf8) : NULL;
513 if (mg && mg->mg_len > 0)
514 mg->mg_len = 0;
515 }
c277df42
IZ
516 data->last_end = -1;
517 data->flags &= ~SF_BEFORE_EOL;
518}
519
653099ff
GS
520/* Can match anything (initialization) */
521STATIC void
830247a4 522S_cl_anything(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 523{
653099ff 524 ANYOF_CLASS_ZERO(cl);
f8bef550 525 ANYOF_BITMAP_SETALL(cl);
1aa99e6b 526 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
653099ff
GS
527 if (LOC)
528 cl->flags |= ANYOF_LOCALE;
529}
530
531/* Can match anything (initialization) */
532STATIC int
533S_cl_is_anything(pTHX_ struct regnode_charclass_class *cl)
534{
535 int value;
536
aaa51d5e 537 for (value = 0; value <= ANYOF_MAX; value += 2)
653099ff
GS
538 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
539 return 1;
1aa99e6b
IH
540 if (!(cl->flags & ANYOF_UNICODE_ALL))
541 return 0;
f8bef550
NC
542 if (!ANYOF_BITMAP_TESTALLSET(cl))
543 return 0;
653099ff
GS
544 return 1;
545}
546
547/* Can match anything (initialization) */
548STATIC void
830247a4 549S_cl_init(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 550{
8ecf7187 551 Zero(cl, 1, struct regnode_charclass_class);
653099ff 552 cl->type = ANYOF;
830247a4 553 cl_anything(pRExC_state, cl);
653099ff
GS
554}
555
556STATIC void
830247a4 557S_cl_init_zero(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 558{
8ecf7187 559 Zero(cl, 1, struct regnode_charclass_class);
653099ff 560 cl->type = ANYOF;
830247a4 561 cl_anything(pRExC_state, cl);
653099ff
GS
562 if (LOC)
563 cl->flags |= ANYOF_LOCALE;
564}
565
566/* 'And' a given class with another one. Can create false positives */
567/* We assume that cl is not inverted */
568STATIC void
569S_cl_and(pTHX_ struct regnode_charclass_class *cl,
570 struct regnode_charclass_class *and_with)
571{
653099ff
GS
572 if (!(and_with->flags & ANYOF_CLASS)
573 && !(cl->flags & ANYOF_CLASS)
574 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
575 && !(and_with->flags & ANYOF_FOLD)
576 && !(cl->flags & ANYOF_FOLD)) {
577 int i;
578
579 if (and_with->flags & ANYOF_INVERT)
580 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
581 cl->bitmap[i] &= ~and_with->bitmap[i];
582 else
583 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
584 cl->bitmap[i] &= and_with->bitmap[i];
585 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
586 if (!(and_with->flags & ANYOF_EOS))
587 cl->flags &= ~ANYOF_EOS;
1aa99e6b 588
14ebb1a2
JH
589 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE &&
590 !(and_with->flags & ANYOF_INVERT)) {
1aa99e6b
IH
591 cl->flags &= ~ANYOF_UNICODE_ALL;
592 cl->flags |= ANYOF_UNICODE;
593 ARG_SET(cl, ARG(and_with));
594 }
14ebb1a2
JH
595 if (!(and_with->flags & ANYOF_UNICODE_ALL) &&
596 !(and_with->flags & ANYOF_INVERT))
1aa99e6b 597 cl->flags &= ~ANYOF_UNICODE_ALL;
14ebb1a2
JH
598 if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)) &&
599 !(and_with->flags & ANYOF_INVERT))
1aa99e6b 600 cl->flags &= ~ANYOF_UNICODE;
653099ff
GS
601}
602
603/* 'OR' a given class with another one. Can create false positives */
604/* We assume that cl is not inverted */
605STATIC void
830247a4 606S_cl_or(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, struct regnode_charclass_class *or_with)
653099ff 607{
653099ff
GS
608 if (or_with->flags & ANYOF_INVERT) {
609 /* We do not use
610 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
611 * <= (B1 | !B2) | (CL1 | !CL2)
612 * which is wasteful if CL2 is small, but we ignore CL2:
613 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
614 * XXXX Can we handle case-fold? Unclear:
615 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
616 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
617 */
618 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
619 && !(or_with->flags & ANYOF_FOLD)
620 && !(cl->flags & ANYOF_FOLD) ) {
621 int i;
622
623 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
624 cl->bitmap[i] |= ~or_with->bitmap[i];
625 } /* XXXX: logic is complicated otherwise */
626 else {
830247a4 627 cl_anything(pRExC_state, cl);
653099ff
GS
628 }
629 } else {
630 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
631 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
b81d288d 632 && (!(or_with->flags & ANYOF_FOLD)
653099ff
GS
633 || (cl->flags & ANYOF_FOLD)) ) {
634 int i;
635
636 /* OR char bitmap and class bitmap separately */
637 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
638 cl->bitmap[i] |= or_with->bitmap[i];
639 if (or_with->flags & ANYOF_CLASS) {
640 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
641 cl->classflags[i] |= or_with->classflags[i];
642 cl->flags |= ANYOF_CLASS;
643 }
644 }
645 else { /* XXXX: logic is complicated, leave it along for a moment. */
830247a4 646 cl_anything(pRExC_state, cl);
653099ff
GS
647 }
648 }
649 if (or_with->flags & ANYOF_EOS)
650 cl->flags |= ANYOF_EOS;
1aa99e6b
IH
651
652 if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
653 ARG(cl) != ARG(or_with)) {
654 cl->flags |= ANYOF_UNICODE_ALL;
655 cl->flags &= ~ANYOF_UNICODE;
656 }
657 if (or_with->flags & ANYOF_UNICODE_ALL) {
658 cl->flags |= ANYOF_UNICODE_ALL;
659 cl->flags &= ~ANYOF_UNICODE;
660 }
653099ff
GS
661}
662
5d1c421c 663/*
a3621e74
YO
664
665 make_trie(startbranch,first,last,tail,flags)
666 startbranch: the first branch in the whole branch sequence
667 first : start branch of sequence of branch-exact nodes.
668 May be the same as startbranch
669 last : Thing following the last branch.
670 May be the same as tail.
671 tail : item following the branch sequence
672 flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/
673
674Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
675
676A trie is an N'ary tree where the branches are determined by digital
677decomposition of the key. IE, at the root node you look up the 1st character and
678follow that branch repeat until you find the end of the branches. Nodes can be
679marked as "accepting" meaning they represent a complete word. Eg:
680
681 /he|she|his|hers/
682
683would convert into the following structure. Numbers represent states, letters
684following numbers represent valid transitions on the letter from that state, if
685the number is in square brackets it represents an accepting state, otherwise it
686will be in parenthesis.
687
688 +-h->+-e->[3]-+-r->(8)-+-s->[9]
689 | |
690 | (2)
691 | |
692 (1) +-i->(6)-+-s->[7]
693 |
694 +-s->(3)-+-h->(4)-+-e->[5]
695
696 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
697
698This shows that when matching against the string 'hers' we will begin at state 1
699read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
700then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
701is also accepting. Thus we know that we can match both 'he' and 'hers' with a
702single traverse. We store a mapping from accepting to state to which word was
703matched, and then when we have multiple possibilities we try to complete the
704rest of the regex in the order in which they occured in the alternation.
705
706The only prior NFA like behaviour that would be changed by the TRIE support is
707the silent ignoring of duplicate alternations which are of the form:
708
709 / (DUPE|DUPE) X? (?{ ... }) Y /x
710
711Thus EVAL blocks follwing a trie may be called a different number of times with
712and without the optimisation. With the optimisations dupes will be silently
713ignored. This inconsistant behaviour of EVAL type nodes is well established as
714the following demonstrates:
715
716 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
717
718which prints out 'word' three times, but
719
720 'words'=~/(word|word|word)(?{ print $1 })S/
721
722which doesnt print it out at all. This is due to other optimisations kicking in.
723
724Example of what happens on a structural level:
725
726The regexp /(ac|ad|ab)+/ will produce the folowing debug output:
727
728 1: CURLYM[1] {1,32767}(18)
729 5: BRANCH(8)
730 6: EXACT <ac>(16)
731 8: BRANCH(11)
732 9: EXACT <ad>(16)
733 11: BRANCH(14)
734 12: EXACT <ab>(16)
735 16: SUCCEED(0)
736 17: NOTHING(18)
737 18: END(0)
738
739This would be optimizable with startbranch=5, first=5, last=16, tail=16
740and should turn into:
741
742 1: CURLYM[1] {1,32767}(18)
743 5: TRIE(16)
744 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
745 <ac>
746 <ad>
747 <ab>
748 16: SUCCEED(0)
749 17: NOTHING(18)
750 18: END(0)
751
752Cases where tail != last would be like /(?foo|bar)baz/:
753
754 1: BRANCH(4)
755 2: EXACT <foo>(8)
756 4: BRANCH(7)
757 5: EXACT <bar>(8)
758 7: TAIL(8)
759 8: EXACT <baz>(10)
760 10: END(0)
761
762which would be optimizable with startbranch=1, first=1, last=7, tail=8
763and would end up looking like:
764
765 1: TRIE(8)
766 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
767 <foo>
768 <bar>
769 7: TAIL(8)
770 8: EXACT <baz>(10)
771 10: END(0)
772
773*/
774
775#define TRIE_DEBUG_CHAR \
776 DEBUG_TRIE_COMPILE_r({ \
777 SV *tmp; \
778 if ( UTF ) { \
779 tmp = newSVpv( "", 0 ); \
780 pv_uni_display( tmp, uc, len, 60, UNI_DISPLAY_REGEX ); \
781 } else { \
e4584336 782 tmp = Perl_newSVpvf_nocontext( "%c", (int)uvc ); \
a3621e74
YO
783 } \
784 av_push( trie->revcharmap, tmp ); \
785 })
786
787#define TRIE_READ_CHAR STMT_START { \
788 if ( UTF ) { \
789 if ( folder ) { \
790 if ( foldlen > 0 ) { \
791 uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags ); \
792 foldlen -= len; \
793 scan += len; \
794 len = 0; \
795 } else { \
e1ec3a88 796 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
a3621e74
YO
797 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
798 foldlen -= UNISKIP( uvc ); \
799 scan = foldbuf + UNISKIP( uvc ); \
800 } \
801 } else { \
e1ec3a88 802 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
a3621e74
YO
803 } \
804 } else { \
805 uvc = (U32)*uc; \
806 len = 1; \
807 } \
808} STMT_END
809
810
811#define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
812#define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
813#define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
814#define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
815
816#define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
817 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
818 TRIE_LIST_LEN( state ) *= 2; \
819 Renew( trie->states[ state ].trans.list, \
820 TRIE_LIST_LEN( state ), reg_trie_trans_le ); \
821 } \
822 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
823 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
824 TRIE_LIST_CUR( state )++; \
825} STMT_END
826
827#define TRIE_LIST_NEW(state) STMT_START { \
828 Newz( 1023, trie->states[ state ].trans.list, \
829 4, reg_trie_trans_le ); \
830 TRIE_LIST_CUR( state ) = 1; \
831 TRIE_LIST_LEN( state ) = 4; \
832} STMT_END
833
834STATIC I32
835S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 flags)
836{
27da23d5 837 dVAR;
a3621e74
YO
838 /* first pass, loop through and scan words */
839 reg_trie_data *trie;
840 regnode *cur;
e1ec3a88 841 const U32 uniflags = ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY;
a3621e74
YO
842 STRLEN len = 0;
843 UV uvc = 0;
844 U16 curword = 0;
845 U32 next_alloc = 0;
846 /* we just use folder as a flag in utf8 */
e1ec3a88 847 const U8 * const folder = ( flags == EXACTF
a3621e74
YO
848 ? PL_fold
849 : ( flags == EXACTFL
850 ? PL_fold_locale
851 : NULL
852 )
853 );
854
e1ec3a88 855 const U32 data_slot = add_data( pRExC_state, 1, "t" );
a3621e74
YO
856 SV *re_trie_maxbuff;
857
858 GET_RE_DEBUG_FLAGS_DECL;
859
860 Newz( 848200, trie, 1, reg_trie_data );
861 trie->refcount = 1;
862 RExC_rx->data->data[ data_slot ] = (void*)trie;
863 Newz( 848201, trie->charmap, 256, U16 );
864 DEBUG_r({
865 trie->words = newAV();
866 trie->revcharmap = newAV();
867 });
868
869
0111c4fd 870 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
a3621e74 871 if (!SvIOK(re_trie_maxbuff)) {
0111c4fd 872 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
a3621e74
YO
873 }
874
875 /* -- First loop and Setup --
876
877 We first traverse the branches and scan each word to determine if it
878 contains widechars, and how many unique chars there are, this is
879 important as we have to build a table with at least as many columns as we
880 have unique chars.
881
882 We use an array of integers to represent the character codes 0..255
883 (trie->charmap) and we use a an HV* to store unicode characters. We use the
884 native representation of the character value as the key and IV's for the
885 coded index.
886
887 *TODO* If we keep track of how many times each character is used we can
888 remap the columns so that the table compression later on is more
889 efficient in terms of memory by ensuring most common value is in the
890 middle and the least common are on the outside. IMO this would be better
891 than a most to least common mapping as theres a decent chance the most
892 common letter will share a node with the least common, meaning the node
893 will not be compressable. With a middle is most common approach the worst
894 case is when we have the least common nodes twice.
895
896 */
897
898
899 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
900 regnode *noper = NEXTOPER( cur );
e1ec3a88
AL
901 const U8 *uc = (U8*)STRING( noper );
902 const U8 *e = uc + STR_LEN( noper );
a3621e74
YO
903 STRLEN foldlen = 0;
904 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
e1ec3a88 905 const U8 *scan;
a3621e74
YO
906
907 for ( ; uc < e ; uc += len ) {
908 trie->charcount++;
909 TRIE_READ_CHAR;
910 if ( uvc < 256 ) {
911 if ( !trie->charmap[ uvc ] ) {
912 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
913 if ( folder )
914 trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
915 TRIE_DEBUG_CHAR;
916 }
917 } else {
918 SV** svpp;
919 if ( !trie->widecharmap )
920 trie->widecharmap = newHV();
921
922 svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 1 );
923
924 if ( !svpp )
e4584336 925 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
a3621e74
YO
926
927 if ( !SvTRUE( *svpp ) ) {
928 sv_setiv( *svpp, ++trie->uniquecharcount );
929 TRIE_DEBUG_CHAR;
930 }
931 }
932 }
933 trie->wordcount++;
934 } /* end first pass */
935 DEBUG_TRIE_COMPILE_r(
936 PerlIO_printf( Perl_debug_log, "TRIE(%s): W:%d C:%d Uq:%d \n",
937 ( trie->widecharmap ? "UTF8" : "NATIVE" ), trie->wordcount,
938 trie->charcount, trie->uniquecharcount )
939 );
940
941
942 /*
943 We now know what we are dealing with in terms of unique chars and
944 string sizes so we can calculate how much memory a naive
0111c4fd
RGS
945 representation using a flat table will take. If it's over a reasonable
946 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
a3621e74
YO
947 conservative but potentially much slower representation using an array
948 of lists.
949
950 At the end we convert both representations into the same compressed
951 form that will be used in regexec.c for matching with. The latter
952 is a form that cannot be used to construct with but has memory
953 properties similar to the list form and access properties similar
954 to the table form making it both suitable for fast searches and
955 small enough that its feasable to store for the duration of a program.
956
957 See the comment in the code where the compressed table is produced
958 inplace from the flat tabe representation for an explanation of how
959 the compression works.
960
961 */
962
963
964 if ( (IV)( ( trie->charcount + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
965 /*
966 Second Pass -- Array Of Lists Representation
967
968 Each state will be represented by a list of charid:state records
969 (reg_trie_trans_le) the first such element holds the CUR and LEN
970 points of the allocated array. (See defines above).
971
972 We build the initial structure using the lists, and then convert
973 it into the compressed table form which allows faster lookups
974 (but cant be modified once converted).
975
976
977 */
978
979
980 STRLEN transcount = 1;
981
982 Newz( 848204, trie->states, trie->charcount + 2, reg_trie_state );
983 TRIE_LIST_NEW(1);
984 next_alloc = 2;
985
986 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
987
988 regnode *noper = NEXTOPER( cur );
989 U8 *uc = (U8*)STRING( noper );
990 U8 *e = uc + STR_LEN( noper );
991 U32 state = 1; /* required init */
992 U16 charid = 0; /* sanity init */
993 U8 *scan = (U8*)NULL; /* sanity init */
994 STRLEN foldlen = 0; /* required init */
995 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
996
997
998 for ( ; uc < e ; uc += len ) {
999
1000 TRIE_READ_CHAR;
1001
1002 if ( uvc < 256 ) {
1003 charid = trie->charmap[ uvc ];
1004 } else {
1005 SV** svpp=(SV**)NULL;
1006 svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 0);
1007 if ( !svpp ) {
1008 charid = 0;
1009 } else {
1010 charid=(U16)SvIV( *svpp );
1011 }
1012 }
1013 if ( charid ) {
1014
1015 U16 check;
1016 U32 newstate = 0;
1017
1018 charid--;
1019 if ( !trie->states[ state ].trans.list ) {
1020 TRIE_LIST_NEW( state );
1021 }
1022 for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1023 if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1024 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1025 break;
1026 }
1027 }
1028 if ( ! newstate ) {
1029 newstate = next_alloc++;
1030 TRIE_LIST_PUSH( state, charid, newstate );
1031 transcount++;
1032 }
1033 state = newstate;
1034
1035 } else {
e4584336 1036 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
a3621e74
YO
1037 }
1038 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1039 }
1040
1041 if ( !trie->states[ state ].wordnum ) {
1042 /* we havent inserted this word into the structure yet. */
1043 trie->states[ state ].wordnum = ++curword;
1044
1045 DEBUG_r({
1046 /* store the word for dumping */
1047 SV* tmp = newSVpvn( STRING( noper ), STR_LEN( noper ) );
1048 if ( UTF ) SvUTF8_on( tmp );
1049 av_push( trie->words, tmp );
1050 });
1051
1052 } else {
1053 /* Its a dupe. So ignore it. */
1054 }
1055
1056 } /* end second pass */
1057
1058 trie->laststate = next_alloc;
1059 Renew( trie->states, next_alloc, reg_trie_state );
1060
1061 DEBUG_TRIE_COMPILE_MORE_r({
1062 U32 state;
1063 U16 charid;
1064
1065 /*
1066 print out the table precompression.
1067 */
1068
1069 PerlIO_printf( Perl_debug_log, "\nState :Word | Transition Data\n" );
1070 PerlIO_printf( Perl_debug_log, "------:-----+-----------------" );
1071
1072 for( state=1 ; state < next_alloc ; state ++ ) {
1073
e4584336 1074 PerlIO_printf( Perl_debug_log, "\n %04"UVXf" :", (UV)state );
a3621e74
YO
1075 if ( ! trie->states[ state ].wordnum ) {
1076 PerlIO_printf( Perl_debug_log, "%5s| ","");
1077 } else {
e4584336 1078 PerlIO_printf( Perl_debug_log, "W%04x| ",
a3621e74
YO
1079 trie->states[ state ].wordnum
1080 );
1081 }
1082 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
1083 SV **tmp = av_fetch( trie->revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
e4584336 1084 PerlIO_printf( Perl_debug_log, "%s:%3X=%04"UVXf" | ",
a3621e74
YO
1085 SvPV_nolen( *tmp ),
1086 TRIE_LIST_ITEM(state,charid).forid,
e4584336 1087 (UV)TRIE_LIST_ITEM(state,charid).newstate
a3621e74
YO
1088 );
1089 }
1090
1091 }
1092 PerlIO_printf( Perl_debug_log, "\n\n" );
1093 });
1094
1095 Newz( 848203, trie->trans, transcount ,reg_trie_trans );
1096 {
1097 U32 state;
1098 U16 idx;
1099 U32 tp = 0;
1100 U32 zp = 0;
1101
1102
1103 for( state=1 ; state < next_alloc ; state ++ ) {
1104 U32 base=0;
1105
1106 /*
1107 DEBUG_TRIE_COMPILE_MORE_r(
1108 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1109 );
1110 */
1111
1112 if (trie->states[state].trans.list) {
1113 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1114 U16 maxid=minid;
1115
1116
1117 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1118 if ( TRIE_LIST_ITEM( state, idx).forid < minid ) {
1119 minid=TRIE_LIST_ITEM( state, idx).forid;
1120 } else if ( TRIE_LIST_ITEM( state, idx).forid > maxid ) {
1121 maxid=TRIE_LIST_ITEM( state, idx).forid;
1122 }
1123 }
1124 if ( transcount < tp + maxid - minid + 1) {
1125 transcount *= 2;
1126 Renew( trie->trans, transcount, reg_trie_trans );
1127 Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1128 }
1129 base = trie->uniquecharcount + tp - minid;
1130 if ( maxid == minid ) {
1131 U32 set = 0;
1132 for ( ; zp < tp ; zp++ ) {
1133 if ( ! trie->trans[ zp ].next ) {
1134 base = trie->uniquecharcount + zp - minid;
1135 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1136 trie->trans[ zp ].check = state;
1137 set = 1;
1138 break;
1139 }
1140 }
1141 if ( !set ) {
1142 trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1143 trie->trans[ tp ].check = state;
1144 tp++;
1145 zp = tp;
1146 }
1147 } else {
1148 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1149 U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
1150 trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1151 trie->trans[ tid ].check = state;
1152 }
1153 tp += ( maxid - minid + 1 );
1154 }
1155 Safefree(trie->states[ state ].trans.list);
1156 }
1157 /*
1158 DEBUG_TRIE_COMPILE_MORE_r(
1159 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1160 );
1161 */
1162 trie->states[ state ].trans.base=base;
1163 }
cc601c31 1164 trie->lasttrans = tp + 1;
a3621e74
YO
1165 }
1166 } else {
1167 /*
1168 Second Pass -- Flat Table Representation.
1169
1170 we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1171 We know that we will need Charcount+1 trans at most to store the data
1172 (one row per char at worst case) So we preallocate both structures
1173 assuming worst case.
1174
1175 We then construct the trie using only the .next slots of the entry
1176 structs.
1177
1178 We use the .check field of the first entry of the node temporarily to
1179 make compression both faster and easier by keeping track of how many non
1180 zero fields are in the node.
1181
1182 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1183 transition.
1184
1185 There are two terms at use here: state as a TRIE_NODEIDX() which is a
1186 number representing the first entry of the node, and state as a
1187 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1188 TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1189 are 2 entrys per node. eg:
1190
1191 A B A B
1192 1. 2 4 1. 3 7
1193 2. 0 3 3. 0 5
1194 3. 0 0 5. 0 0
1195 4. 0 0 7. 0 0
1196
1197 The table is internally in the right hand, idx form. However as we also
1198 have to deal with the states array which is indexed by nodenum we have to
1199 use TRIE_NODENUM() to convert.
1200
1201 */
1202
1203 Newz( 848203, trie->trans, ( trie->charcount + 1 ) * trie->uniquecharcount + 1,
1204 reg_trie_trans );
1205 Newz( 848204, trie->states, trie->charcount + 2, reg_trie_state );
1206 next_alloc = trie->uniquecharcount + 1;
1207
1208 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1209
1210 regnode *noper = NEXTOPER( cur );
1211 U8 *uc = (U8*)STRING( noper );
1212 U8 *e = uc + STR_LEN( noper );
1213
1214 U32 state = 1; /* required init */
1215
1216 U16 charid = 0; /* sanity init */
1217 U32 accept_state = 0; /* sanity init */
1218 U8 *scan = (U8*)NULL; /* sanity init */
1219
1220 STRLEN foldlen = 0; /* required init */
1221 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1222
1223
1224 for ( ; uc < e ; uc += len ) {
1225
1226 TRIE_READ_CHAR;
1227
1228 if ( uvc < 256 ) {
1229 charid = trie->charmap[ uvc ];
1230 } else {
1231 SV** svpp=(SV**)NULL;
1232 svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 0);
1233 if ( !svpp ) {
1234 charid = 0;
1235 } else {
1236 charid=(U16)SvIV( *svpp );
1237 }
1238 }
1239 if ( charid ) {
1240 charid--;
1241 if ( !trie->trans[ state + charid ].next ) {
1242 trie->trans[ state + charid ].next = next_alloc;
1243 trie->trans[ state ].check++;
1244 next_alloc += trie->uniquecharcount;
1245 }
1246 state = trie->trans[ state + charid ].next;
1247 } else {
e4584336 1248 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
a3621e74
YO
1249 }
1250 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1251 }
1252
1253 accept_state = TRIE_NODENUM( state );
1254 if ( !trie->states[ accept_state ].wordnum ) {
1255 /* we havent inserted this word into the structure yet. */
1256 trie->states[ accept_state ].wordnum = ++curword;
1257
1258 DEBUG_r({
1259 /* store the word for dumping */
1260 SV* tmp = newSVpvn( STRING( noper ), STR_LEN( noper ) );
1261 if ( UTF ) SvUTF8_on( tmp );
1262 av_push( trie->words, tmp );
1263 });
1264
1265 } else {
1266 /* Its a dupe. So ignore it. */
1267 }
1268
1269 } /* end second pass */
1270
1271 DEBUG_TRIE_COMPILE_MORE_r({
1272 /*
1273 print out the table precompression so that we can do a visual check
1274 that they are identical.
1275 */
1276 U32 state;
1277 U16 charid;
1278 PerlIO_printf( Perl_debug_log, "\nChar : " );
1279
1280 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1281 SV **tmp = av_fetch( trie->revcharmap, charid, 0);
1282 if ( tmp ) {
1283 PerlIO_printf( Perl_debug_log, "%4.4s ", SvPV_nolen( *tmp ) );
1284 }
1285 }
1286
1287 PerlIO_printf( Perl_debug_log, "\nState+-" );
1288
1289 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
1290 PerlIO_printf( Perl_debug_log, "%4s-", "----" );
1291 }
1292
1293 PerlIO_printf( Perl_debug_log, "\n" );
1294
1295 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1296
e4584336 1297 PerlIO_printf( Perl_debug_log, "%04"UVXf" : ", (UV)TRIE_NODENUM( state ) );
a3621e74
YO
1298
1299 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
e4584336
RB
1300 PerlIO_printf( Perl_debug_log, "%04"UVXf" ",
1301 (UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next ) );
a3621e74
YO
1302 }
1303 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
e4584336 1304 PerlIO_printf( Perl_debug_log, " (%04"UVXf")\n", (UV)trie->trans[ state ].check );
a3621e74 1305 } else {
e4584336 1306 PerlIO_printf( Perl_debug_log, " (%04"UVXf") W%04X\n", (UV)trie->trans[ state ].check,
a3621e74
YO
1307 trie->states[ TRIE_NODENUM( state ) ].wordnum );
1308 }
1309 }
1310 PerlIO_printf( Perl_debug_log, "\n\n" );
1311 });
1312 {
1313 /*
1314 * Inplace compress the table.*
1315
1316 For sparse data sets the table constructed by the trie algorithm will
1317 be mostly 0/FAIL transitions or to put it another way mostly empty.
1318 (Note that leaf nodes will not contain any transitions.)
1319
1320 This algorithm compresses the tables by eliminating most such
1321 transitions, at the cost of a modest bit of extra work during lookup:
1322
1323 - Each states[] entry contains a .base field which indicates the
1324 index in the state[] array wheres its transition data is stored.
1325
1326 - If .base is 0 there are no valid transitions from that node.
1327
1328 - If .base is nonzero then charid is added to it to find an entry in
1329 the trans array.
1330
1331 -If trans[states[state].base+charid].check!=state then the
1332 transition is taken to be a 0/Fail transition. Thus if there are fail
1333 transitions at the front of the node then the .base offset will point
1334 somewhere inside the previous nodes data (or maybe even into a node
1335 even earlier), but the .check field determines if the transition is
1336 valid.
1337
1338 The following process inplace converts the table to the compressed
1339 table: We first do not compress the root node 1,and mark its all its
1340 .check pointers as 1 and set its .base pointer as 1 as well. This
1341 allows to do a DFA construction from the compressed table later, and
1342 ensures that any .base pointers we calculate later are greater than
1343 0.
1344
1345 - We set 'pos' to indicate the first entry of the second node.
1346
1347 - We then iterate over the columns of the node, finding the first and
1348 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
1349 and set the .check pointers accordingly, and advance pos
1350 appropriately and repreat for the next node. Note that when we copy
1351 the next pointers we have to convert them from the original
1352 NODEIDX form to NODENUM form as the former is not valid post
1353 compression.
1354
1355 - If a node has no transitions used we mark its base as 0 and do not
1356 advance the pos pointer.
1357
1358 - If a node only has one transition we use a second pointer into the
1359 structure to fill in allocated fail transitions from other states.
1360 This pointer is independent of the main pointer and scans forward
1361 looking for null transitions that are allocated to a state. When it
1362 finds one it writes the single transition into the "hole". If the
1363 pointer doesnt find one the single transition is appeneded as normal.
1364
1365 - Once compressed we can Renew/realloc the structures to release the
1366 excess space.
1367
1368 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
1369 specifically Fig 3.47 and the associated pseudocode.
1370
1371 demq
1372 */
1373 U32 laststate = TRIE_NODENUM( next_alloc );
1374 U32 used , state, charid;
1375 U32 pos = 0, zp=0;
1376 trie->laststate = laststate;
1377
1378 for ( state = 1 ; state < laststate ; state++ ) {
1379 U8 flag = 0;
1380 U32 stateidx = TRIE_NODEIDX( state );
1381 U32 o_used=trie->trans[ stateidx ].check;
1382 used = trie->trans[ stateidx ].check;
1383 trie->trans[ stateidx ].check = 0;
1384
1385 for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
1386 if ( flag || trie->trans[ stateidx + charid ].next ) {
1387 if ( trie->trans[ stateidx + charid ].next ) {
1388 if (o_used == 1) {
1389 for ( ; zp < pos ; zp++ ) {
1390 if ( ! trie->trans[ zp ].next ) {
1391 break;
1392 }
1393 }
1394 trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
1395 trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1396 trie->trans[ zp ].check = state;
1397 if ( ++zp > pos ) pos = zp;
1398 break;
1399 }
1400 used--;
1401 }
1402 if ( !flag ) {
1403 flag = 1;
1404 trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
1405 }
1406 trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1407 trie->trans[ pos ].check = state;
1408 pos++;
1409 }
1410 }
1411 }
cc601c31 1412 trie->lasttrans = pos + 1;
a3621e74
YO
1413 Renew( trie->states, laststate + 1, reg_trie_state);
1414 DEBUG_TRIE_COMPILE_MORE_r(
e4584336
RB
1415 PerlIO_printf( Perl_debug_log,
1416 " Alloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
1417 ( ( trie->charcount + 1 ) * trie->uniquecharcount + 1 ), (IV)next_alloc, (IV)pos,
a3621e74
YO
1418 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
1419 );
1420
1421 } /* end table compress */
1422 }
cc601c31
YO
1423 /* resize the trans array to remove unused space */
1424 Renew( trie->trans, trie->lasttrans, reg_trie_trans);
a3621e74
YO
1425
1426 DEBUG_TRIE_COMPILE_r({
1427 U32 state;
1428 /*
1429 Now we print it out again, in a slightly different form as there is additional
1430 info we want to be able to see when its compressed. They are close enough for
1431 visual comparison though.
1432 */
1433 PerlIO_printf( Perl_debug_log, "\nChar : %-6s%-6s%-4s ","Match","Base","Ofs" );
1434
1435 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
1436 SV **tmp = av_fetch( trie->revcharmap, state, 0);
1437 if ( tmp ) {
1438 PerlIO_printf( Perl_debug_log, "%4.4s ", SvPV_nolen( *tmp ) );
1439 }
1440 }
1441 PerlIO_printf( Perl_debug_log, "\n-----:-----------------------");
cc601c31 1442
a3621e74
YO
1443 for( state = 0 ; state < trie->uniquecharcount ; state++ )
1444 PerlIO_printf( Perl_debug_log, "-----");
1445 PerlIO_printf( Perl_debug_log, "\n");
cc601c31 1446
a3621e74
YO
1447 for( state = 1 ; state < trie->laststate ; state++ ) {
1448 U32 base = trie->states[ state ].trans.base;
1449
e4584336 1450 PerlIO_printf( Perl_debug_log, "#%04"UVXf" ", (UV)state);
a3621e74
YO
1451
1452 if ( trie->states[ state ].wordnum ) {
1453 PerlIO_printf( Perl_debug_log, " W%04X", trie->states[ state ].wordnum );
1454 } else {
1455 PerlIO_printf( Perl_debug_log, "%6s", "" );
1456 }
1457
e4584336 1458 PerlIO_printf( Perl_debug_log, " @%04"UVXf" ", (UV)base );
a3621e74
YO
1459
1460 if ( base ) {
1461 U32 ofs = 0;
1462
cc601c31
YO
1463 while( ( base + ofs < trie->uniquecharcount ) ||
1464 ( base + ofs - trie->uniquecharcount < trie->lasttrans
1465 && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
a3621e74
YO
1466 ofs++;
1467
e4584336 1468 PerlIO_printf( Perl_debug_log, "+%02"UVXf"[ ", (UV)ofs);
a3621e74
YO
1469
1470 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
cc601c31
YO
1471 if ( ( base + ofs >= trie->uniquecharcount ) &&
1472 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
a3621e74
YO
1473 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1474 {
e4584336
RB
1475 PerlIO_printf( Perl_debug_log, "%04"UVXf" ",
1476 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
a3621e74
YO
1477 } else {
1478 PerlIO_printf( Perl_debug_log, "%4s "," 0" );
1479 }
1480 }
1481
e4584336 1482 PerlIO_printf( Perl_debug_log, "]");
a3621e74
YO
1483
1484 }
1485 PerlIO_printf( Perl_debug_log, "\n" );
1486 }
1487 });
1488
1489 {
1490 /* now finally we "stitch in" the new TRIE node
1491 This means we convert either the first branch or the first Exact,
1492 depending on whether the thing following (in 'last') is a branch
1493 or not and whther first is the startbranch (ie is it a sub part of
1494 the alternation or is it the whole thing.)
1495 Assuming its a sub part we conver the EXACT otherwise we convert
1496 the whole branch sequence, including the first.
1497 */
1498 regnode *convert;
1499
1500
1501
1502
1503 if ( first == startbranch && OP( last ) != BRANCH ) {
1504 convert = first;
1505 } else {
1506 convert = NEXTOPER( first );
1507 NEXT_OFF( first ) = (U16)(last - first);
1508 }
1509
1510 OP( convert ) = TRIE + (U8)( flags - EXACT );
1511 NEXT_OFF( convert ) = (U16)(tail - convert);
1512 ARG_SET( convert, data_slot );
1513
1514 /* tells us if we need to handle accept buffers specially */
1515 convert->flags = ( RExC_seen_evals ? 1 : 0 );
1516
1517
1518 /* needed for dumping*/
1519 DEBUG_r({
1520 regnode *optimize = convert + NODE_STEP_REGNODE + regarglen[ TRIE ];
1521 /* We now need to mark all of the space originally used by the
1522 branches as optimized away. This keeps the dumpuntil from
1523 throwing a wobbly as it doesnt use regnext() to traverse the
1524 opcodes.
1525 */
1526 while( optimize < last ) {
1527 OP( optimize ) = OPTIMIZED;
1528 optimize++;
1529 }
1530 });
1531 } /* end node insert */
1532 return 1;
1533}
1534
1535
1536
1537/*
5d1c421c
JH
1538 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
1539 * These need to be revisited when a newer toolchain becomes available.
1540 */
1541#if defined(__sparc64__) && defined(__GNUC__)
1542# if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
1543# undef SPARC64_GCC_WORKAROUND
1544# define SPARC64_GCC_WORKAROUND 1
1545# endif
1546#endif
1547
653099ff
GS
1548/* REx optimizer. Converts nodes into quickier variants "in place".
1549 Finds fixed substrings. */
1550
c277df42
IZ
1551/* Stops at toplevel WHILEM as well as at `last'. At end *scanp is set
1552 to the position after last scanned or to NULL. */
1553
a3621e74 1554
76e3520e 1555STATIC I32
a3621e74 1556S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, regnode *last, scan_data_t *data, U32 flags, U32 depth)
c277df42
IZ
1557 /* scanp: Start here (read-write). */
1558 /* deltap: Write maxlen-minlen here. */
1559 /* last: Stop before this one. */
1560{
1561 I32 min = 0, pars = 0, code;
1562 regnode *scan = *scanp, *next;
1563 I32 delta = 0;
1564 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
aca2d497 1565 int is_inf_internal = 0; /* The studied chunk is infinite */
c277df42
IZ
1566 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
1567 scan_data_t data_fake;
653099ff 1568 struct regnode_charclass_class and_with; /* Valid if flags & SCF_DO_STCLASS_OR */
a3621e74
YO
1569 SV *re_trie_maxbuff = NULL;
1570
1571 GET_RE_DEBUG_FLAGS_DECL;
b81d288d 1572
c277df42
IZ
1573 while (scan && OP(scan) != END && scan < last) {
1574 /* Peephole optimizer: */
a3621e74
YO
1575 DEBUG_OPTIMISE_r({
1576 SV *mysv=sv_newmortal();
1577 regprop( mysv, scan);
e4584336
RB
1578 PerlIO_printf(Perl_debug_log, "%*speep: %s (0x%08"UVXf")\n",
1579 (int)depth*2, "", SvPV_nolen(mysv), PTR2UV(scan));
a3621e74 1580 });
c277df42 1581
22c35a8c 1582 if (PL_regkind[(U8)OP(scan)] == EXACT) {
653099ff 1583 /* Merge several consecutive EXACTish nodes into one. */
c277df42
IZ
1584 regnode *n = regnext(scan);
1585 U32 stringok = 1;
1586#ifdef DEBUGGING
1587 regnode *stop = scan;
b81d288d 1588#endif
c277df42 1589
cd439c50 1590 next = scan + NODE_SZ_STR(scan);
c277df42
IZ
1591 /* Skip NOTHING, merge EXACT*. */
1592 while (n &&
b81d288d 1593 ( PL_regkind[(U8)OP(n)] == NOTHING ||
c277df42
IZ
1594 (stringok && (OP(n) == OP(scan))))
1595 && NEXT_OFF(n)
1596 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
1597 if (OP(n) == TAIL || n > next)
1598 stringok = 0;
22c35a8c 1599 if (PL_regkind[(U8)OP(n)] == NOTHING) {
c277df42
IZ
1600 NEXT_OFF(scan) += NEXT_OFF(n);
1601 next = n + NODE_STEP_REGNODE;
1602#ifdef DEBUGGING
1603 if (stringok)
1604 stop = n;
b81d288d 1605#endif
c277df42 1606 n = regnext(n);
a0ed51b3 1607 }
f49d4d0f 1608 else if (stringok) {
cd439c50 1609 int oldl = STR_LEN(scan);
c277df42 1610 regnode *nnext = regnext(n);
f49d4d0f 1611
b81d288d 1612 if (oldl + STR_LEN(n) > U8_MAX)
c277df42
IZ
1613 break;
1614 NEXT_OFF(scan) += NEXT_OFF(n);
cd439c50
IZ
1615 STR_LEN(scan) += STR_LEN(n);
1616 next = n + NODE_SZ_STR(n);
c277df42 1617 /* Now we can overwrite *n : */
f49d4d0f 1618 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
c277df42 1619#ifdef DEBUGGING
f49d4d0f 1620 stop = next - 1;
b81d288d 1621#endif
c277df42
IZ
1622 n = nnext;
1623 }
1624 }
61a36c01 1625
a3621e74 1626 if (UTF && ( OP(scan) == EXACTF ) && ( STR_LEN(scan) >= 6 ) ) {
61a36c01
JH
1627/*
1628 Two problematic code points in Unicode casefolding of EXACT nodes:
1629
1630 U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
1631 U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
1632
1633 which casefold to
1634
1635 Unicode UTF-8
1636
1637 U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
1638 U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
1639
1640 This means that in case-insensitive matching (or "loose matching",
1641 as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
1642 length of the above casefolded versions) can match a target string
1643 of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
1644 This would rather mess up the minimum length computation.
1645
1646 What we'll do is to look for the tail four bytes, and then peek
1647 at the preceding two bytes to see whether we need to decrease
1648 the minimum length by four (six minus two).
1649
1650 Thanks to the design of UTF-8, there cannot be false matches:
1651 A sequence of valid UTF-8 bytes cannot be a subsequence of
1652 another valid sequence of UTF-8 bytes.
1653
1654*/
1655 char *s0 = STRING(scan), *s, *t;
1656 char *s1 = s0 + STR_LEN(scan) - 1, *s2 = s1 - 4;
bfed75c6
AL
1657 const char *t0 = "\xcc\x88\xcc\x81";
1658 const char *t1 = t0 + 3;
61a36c01
JH
1659
1660 for (s = s0 + 2;
1661 s < s2 && (t = ninstr(s, s1, t0, t1));
1662 s = t + 4) {
1663 if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
1664 ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
1665 min -= 4;
1666 }
1667 }
1668
c277df42
IZ
1669#ifdef DEBUGGING
1670 /* Allow dumping */
cd439c50 1671 n = scan + NODE_SZ_STR(scan);
c277df42 1672 while (n <= stop) {
22c35a8c 1673 if (PL_regkind[(U8)OP(n)] != NOTHING || OP(n) == NOTHING) {
c277df42
IZ
1674 OP(n) = OPTIMIZED;
1675 NEXT_OFF(n) = 0;
1676 }
1677 n++;
1678 }
653099ff 1679#endif
c277df42 1680 }
a3621e74
YO
1681
1682
1683
653099ff
GS
1684 /* Follow the next-chain of the current node and optimize
1685 away all the NOTHINGs from it. */
c277df42 1686 if (OP(scan) != CURLYX) {
048cfca1
GS
1687 int max = (reg_off_by_arg[OP(scan)]
1688 ? I32_MAX
1689 /* I32 may be smaller than U16 on CRAYs! */
1690 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
c277df42
IZ
1691 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
1692 int noff;
1693 regnode *n = scan;
b81d288d 1694
c277df42
IZ
1695 /* Skip NOTHING and LONGJMP. */
1696 while ((n = regnext(n))
22c35a8c 1697 && ((PL_regkind[(U8)OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
c277df42
IZ
1698 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
1699 && off + noff < max)
1700 off += noff;
1701 if (reg_off_by_arg[OP(scan)])
1702 ARG(scan) = off;
b81d288d 1703 else
c277df42
IZ
1704 NEXT_OFF(scan) = off;
1705 }
a3621e74 1706
653099ff
GS
1707 /* The principal pseudo-switch. Cannot be a switch, since we
1708 look into several different things. */
b81d288d 1709 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
c277df42
IZ
1710 || OP(scan) == IFTHEN || OP(scan) == SUSPEND) {
1711 next = regnext(scan);
1712 code = OP(scan);
a3621e74 1713 /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
b81d288d
AB
1714
1715 if (OP(next) == code || code == IFTHEN || code == SUSPEND) {
c277df42 1716 I32 max1 = 0, min1 = I32_MAX, num = 0;
653099ff 1717 struct regnode_charclass_class accum;
a3621e74 1718 regnode *startbranch=scan;
c277df42 1719
653099ff 1720 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
830247a4 1721 scan_commit(pRExC_state, data); /* Cannot merge strings after this. */
653099ff 1722 if (flags & SCF_DO_STCLASS)
830247a4 1723 cl_init_zero(pRExC_state, &accum);
a3621e74 1724
c277df42 1725 while (OP(scan) == code) {
830247a4 1726 I32 deltanext, minnext, f = 0, fake;
653099ff 1727 struct regnode_charclass_class this_class;
c277df42
IZ
1728
1729 num++;
1730 data_fake.flags = 0;
b81d288d 1731 if (data) {
2c2d71f5 1732 data_fake.whilem_c = data->whilem_c;
cb434fcc
IZ
1733 data_fake.last_closep = data->last_closep;
1734 }
1735 else
1736 data_fake.last_closep = &fake;
c277df42
IZ
1737 next = regnext(scan);
1738 scan = NEXTOPER(scan);
1739 if (code != BRANCH)
1740 scan = NEXTOPER(scan);
653099ff 1741 if (flags & SCF_DO_STCLASS) {
830247a4 1742 cl_init(pRExC_state, &this_class);
653099ff
GS
1743 data_fake.start_class = &this_class;
1744 f = SCF_DO_STCLASS_AND;
b81d288d 1745 }
e1901655
IZ
1746 if (flags & SCF_WHILEM_VISITED_POS)
1747 f |= SCF_WHILEM_VISITED_POS;
a3621e74 1748
653099ff 1749 /* we suppose the run is continuous, last=next...*/
830247a4 1750 minnext = study_chunk(pRExC_state, &scan, &deltanext,
a3621e74 1751 next, &data_fake, f,depth+1);
b81d288d 1752 if (min1 > minnext)
c277df42
IZ
1753 min1 = minnext;
1754 if (max1 < minnext + deltanext)
1755 max1 = minnext + deltanext;
1756 if (deltanext == I32_MAX)
aca2d497 1757 is_inf = is_inf_internal = 1;
c277df42
IZ
1758 scan = next;
1759 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
1760 pars++;
405ff068 1761 if (data && (data_fake.flags & SF_HAS_EVAL))
c277df42 1762 data->flags |= SF_HAS_EVAL;
2c2d71f5
JH
1763 if (data)
1764 data->whilem_c = data_fake.whilem_c;
653099ff 1765 if (flags & SCF_DO_STCLASS)
830247a4 1766 cl_or(pRExC_state, &accum, &this_class);
b81d288d 1767 if (code == SUSPEND)
c277df42
IZ
1768 break;
1769 }
1770 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
1771 min1 = 0;
1772 if (flags & SCF_DO_SUBSTR) {
1773 data->pos_min += min1;
1774 data->pos_delta += max1 - min1;
1775 if (max1 != min1 || is_inf)
1776 data->longest = &(data->longest_float);
1777 }
1778 min += min1;
1779 delta += max1 - min1;
653099ff 1780 if (flags & SCF_DO_STCLASS_OR) {
830247a4 1781 cl_or(pRExC_state, data->start_class, &accum);
653099ff
GS
1782 if (min1) {
1783 cl_and(data->start_class, &and_with);
1784 flags &= ~SCF_DO_STCLASS;
1785 }
1786 }
1787 else if (flags & SCF_DO_STCLASS_AND) {
de0c8cb8
GS
1788 if (min1) {
1789 cl_and(data->start_class, &accum);
653099ff 1790 flags &= ~SCF_DO_STCLASS;
de0c8cb8
GS
1791 }
1792 else {
b81d288d 1793 /* Switch to OR mode: cache the old value of
de0c8cb8
GS
1794 * data->start_class */
1795 StructCopy(data->start_class, &and_with,
1796 struct regnode_charclass_class);
1797 flags &= ~SCF_DO_STCLASS_AND;
1798 StructCopy(&accum, data->start_class,
1799 struct regnode_charclass_class);
1800 flags |= SCF_DO_STCLASS_OR;
1801 data->start_class->flags |= ANYOF_EOS;
1802 }
653099ff 1803 }
a3621e74
YO
1804
1805 /* demq.
1806
1807 Assuming this was/is a branch we are dealing with: 'scan' now
1808 points at the item that follows the branch sequence, whatever
1809 it is. We now start at the beginning of the sequence and look
1810 for subsequences of
1811
1812 BRANCH->EXACT=>X
1813 BRANCH->EXACT=>X
1814
1815 which would be constructed from a pattern like /A|LIST|OF|WORDS/
1816
1817 If we can find such a subseqence we need to turn the first
1818 element into a trie and then add the subsequent branch exact
1819 strings to the trie.
1820
1821 We have two cases
1822
1823 1. patterns where the whole set of branch can be converted to a trie,
1824
1825 2. patterns where only a subset of the alternations can be
1826 converted to a trie.
1827
1828 In case 1 we can replace the whole set with a single regop
1829 for the trie. In case 2 we need to keep the start and end
1830 branchs so
1831
1832 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
1833 becomes BRANCH TRIE; BRANCH X;
1834
1835 Hypthetically when we know the regex isnt anchored we can
1836 turn a case 1 into a DFA and let it rip... Every time it finds a match
1837 it would just call its tail, no WHILEM/CURLY needed.
1838
1839 */
0111c4fd
RGS
1840 if (DO_TRIE) {
1841 if (!re_trie_maxbuff) {
1842 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
1843 if (!SvIOK(re_trie_maxbuff))
1844 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
1845 }
a3621e74
YO
1846 if ( SvIV(re_trie_maxbuff)>=0 && OP( startbranch )==BRANCH ) {
1847 regnode *cur;
1848 regnode *first = (regnode *)NULL;
1849 regnode *last = (regnode *)NULL;
1850 regnode *tail = scan;
1851 U8 optype = 0;
1852 U32 count=0;
1853
1854#ifdef DEBUGGING
1855 SV *mysv = sv_newmortal(); /* for dumping */
1856#endif
1857 /* var tail is used because there may be a TAIL
1858 regop in the way. Ie, the exacts will point to the
1859 thing following the TAIL, but the last branch will
1860 point at the TAIL. So we advance tail. If we
1861 have nested (?:) we may have to move through several
1862 tails.
1863 */
1864
1865 while ( OP( tail ) == TAIL ) {
1866 /* this is the TAIL generated by (?:) */
1867 tail = regnext( tail );
1868 }
1869
1870 DEBUG_OPTIMISE_r({
1871 regprop( mysv, tail );
1872 PerlIO_printf( Perl_debug_log, "%*s%s%s%s\n",
e4584336 1873 (int)depth * 2 + 2, "", "Tail node is:", SvPV_nolen( mysv ),
a3621e74
YO
1874 (RExC_seen_evals) ? "[EVAL]" : ""
1875 );
1876 });
1877 /*
1878
1879 step through the branches, cur represents each
1880 branch, noper is the first thing to be matched
1881 as part of that branch and noper_next is the
1882 regnext() of that node. if noper is an EXACT
1883 and noper_next is the same as scan (our current
1884 position in the regex) then the EXACT branch is
1885 a possible optimization target. Once we have
1886 two or more consequetive such branches we can
1887 create a trie of the EXACT's contents and stich
1888 it in place. If the sequence represents all of
1889 the branches we eliminate the whole thing and
1890 replace it with a single TRIE. If it is a
1891 subsequence then we need to stitch it in. This
1892 means the first branch has to remain, and needs
1893 to be repointed at the item on the branch chain
1894 following the last branch optimized. This could
1895 be either a BRANCH, in which case the
1896 subsequence is internal, or it could be the
1897 item following the branch sequence in which
1898 case the subsequence is at the end.
1899
1900 */
1901
1902 /* dont use tail as the end marker for this traverse */
1903 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
1904 regnode *noper = NEXTOPER( cur );
1905 regnode *noper_next = regnext( noper );
1906
a3621e74
YO
1907 DEBUG_OPTIMISE_r({
1908 regprop( mysv, cur);
1909 PerlIO_printf( Perl_debug_log, "%*s%s",
e4584336 1910 (int)depth * 2 + 2," ", SvPV_nolen( mysv ) );
a3621e74
YO
1911
1912 regprop( mysv, noper);
1913 PerlIO_printf( Perl_debug_log, " -> %s",
1914 SvPV_nolen(mysv));
1915
1916 if ( noper_next ) {
1917 regprop( mysv, noper_next );
1918 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
1919 SvPV_nolen(mysv));
1920 }
1921 PerlIO_printf( Perl_debug_log, "0x%p,0x%p,0x%p)\n",
1922 first, last, cur );
1923 });
1924 if ( ( first ? OP( noper ) == optype
1925 : PL_regkind[ (U8)OP( noper ) ] == EXACT )
1926 && noper_next == tail && count<U16_MAX)
1927 {
1928 count++;
1929 if ( !first ) {
1930 first = cur;
1931 optype = OP( noper );
1932 } else {
1933 DEBUG_OPTIMISE_r(
1934 if (!last ) {
1935 regprop( mysv, first);
1936 PerlIO_printf( Perl_debug_log, "%*s%s",
e4584336 1937 (int)depth * 2 + 2, "F:", SvPV_nolen( mysv ) );
a3621e74
YO
1938 regprop( mysv, NEXTOPER(first) );
1939 PerlIO_printf( Perl_debug_log, " -> %s\n",
1940 SvPV_nolen( mysv ) );
1941 }
1942 );
1943 last = cur;
1944 DEBUG_OPTIMISE_r({
1945 regprop( mysv, cur);
1946 PerlIO_printf( Perl_debug_log, "%*s%s",
e4584336 1947 (int)depth * 2 + 2, "N:", SvPV_nolen( mysv ) );
a3621e74
YO
1948 regprop( mysv, noper );
1949 PerlIO_printf( Perl_debug_log, " -> %s\n",
1950 SvPV_nolen( mysv ) );
1951 });
1952 }
1953 } else {
1954 if ( last ) {
1955 DEBUG_OPTIMISE_r(
1956 PerlIO_printf( Perl_debug_log, "%*s%s\n",
e4584336 1957 (int)depth * 2 + 2, "E:", "**END**" );
a3621e74
YO
1958 );
1959 make_trie( pRExC_state, startbranch, first, cur, tail, optype );
1960 }
1961 if ( PL_regkind[ (U8)OP( noper ) ] == EXACT
1962 && noper_next == tail )
1963 {
1964 count = 1;
1965 first = cur;
1966 optype = OP( noper );
1967 } else {
1968 count = 0;
1969 first = NULL;
1970 optype = 0;
1971 }
1972 last = NULL;
1973 }
1974 }
1975 DEBUG_OPTIMISE_r({
1976 regprop( mysv, cur);
1977 PerlIO_printf( Perl_debug_log,
e4584336 1978 "%*s%s\t(0x%p,0x%p,0x%p)\n", (int)depth * 2 + 2,
a3621e74
YO
1979 " ", SvPV_nolen( mysv ), first, last, cur);
1980
1981 });
1982 if ( last ) {
1983 DEBUG_OPTIMISE_r(
1984 PerlIO_printf( Perl_debug_log, "%*s%s\n",
e4584336 1985 (int)depth * 2 + 2, "E:", "==END==" );
a3621e74
YO
1986 );
1987 make_trie( pRExC_state, startbranch, first, scan, tail, optype );
1988 }
1989 }
1990 }
a0ed51b3 1991 }
a3621e74 1992 else if ( code == BRANCHJ ) { /* single branch is optimized. */
c277df42 1993 scan = NEXTOPER(NEXTOPER(scan));
a3621e74 1994 } else /* single branch is optimized. */
c277df42
IZ
1995 scan = NEXTOPER(scan);
1996 continue;
a0ed51b3
LW
1997 }
1998 else if (OP(scan) == EXACT) {
cd439c50 1999 I32 l = STR_LEN(scan);
1aa99e6b 2000 UV uc = *((U8*)STRING(scan));
a0ed51b3 2001 if (UTF) {
1aa99e6b
IH
2002 U8 *s = (U8*)STRING(scan);
2003 l = utf8_length(s, s + l);
9041c2e3 2004 uc = utf8_to_uvchr(s, NULL);
a0ed51b3
LW
2005 }
2006 min += l;
c277df42 2007 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
c277df42
IZ
2008 /* The code below prefers earlier match for fixed
2009 offset, later match for variable offset. */
2010 if (data->last_end == -1) { /* Update the start info. */
2011 data->last_start_min = data->pos_min;
2012 data->last_start_max = is_inf
b81d288d 2013 ? I32_MAX : data->pos_min + data->pos_delta;
c277df42 2014 }
cd439c50 2015 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
0eda9292
JH
2016 {
2017 SV * sv = data->last_found;
2018 MAGIC *mg = SvUTF8(sv) && SvMAGICAL(sv) ?
2019 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2020 if (mg && mg->mg_len >= 0)
5e43f467
JH
2021 mg->mg_len += utf8_length((U8*)STRING(scan),
2022 (U8*)STRING(scan)+STR_LEN(scan));
0eda9292 2023 }
33b8afdf
JH
2024 if (UTF)
2025 SvUTF8_on(data->last_found);
c277df42
IZ
2026 data->last_end = data->pos_min + l;
2027 data->pos_min += l; /* As in the first entry. */
2028 data->flags &= ~SF_BEFORE_EOL;
2029 }
653099ff
GS
2030 if (flags & SCF_DO_STCLASS_AND) {
2031 /* Check whether it is compatible with what we know already! */
2032 int compat = 1;
2033
1aa99e6b 2034 if (uc >= 0x100 ||
516a5887 2035 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
1aa99e6b 2036 && !ANYOF_BITMAP_TEST(data->start_class, uc)
653099ff 2037 && (!(data->start_class->flags & ANYOF_FOLD)
1aa99e6b 2038 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
516a5887 2039 )
653099ff
GS
2040 compat = 0;
2041 ANYOF_CLASS_ZERO(data->start_class);
2042 ANYOF_BITMAP_ZERO(data->start_class);
2043 if (compat)
1aa99e6b 2044 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff 2045 data->start_class->flags &= ~ANYOF_EOS;
9b877dbb
IH
2046 if (uc < 0x100)
2047 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
653099ff
GS
2048 }
2049 else if (flags & SCF_DO_STCLASS_OR) {
2050 /* false positive possible if the class is case-folded */
1aa99e6b 2051 if (uc < 0x100)
9b877dbb
IH
2052 ANYOF_BITMAP_SET(data->start_class, uc);
2053 else
2054 data->start_class->flags |= ANYOF_UNICODE_ALL;
653099ff
GS
2055 data->start_class->flags &= ~ANYOF_EOS;
2056 cl_and(data->start_class, &and_with);
2057 }
2058 flags &= ~SCF_DO_STCLASS;
a0ed51b3 2059 }
653099ff 2060 else if (PL_regkind[(U8)OP(scan)] == EXACT) { /* But OP != EXACT! */
cd439c50 2061 I32 l = STR_LEN(scan);
1aa99e6b 2062 UV uc = *((U8*)STRING(scan));
653099ff
GS
2063
2064 /* Search for fixed substrings supports EXACT only. */
b81d288d 2065 if (flags & SCF_DO_SUBSTR)
830247a4 2066 scan_commit(pRExC_state, data);
a0ed51b3 2067 if (UTF) {
1aa99e6b
IH
2068 U8 *s = (U8 *)STRING(scan);
2069 l = utf8_length(s, s + l);
9041c2e3 2070 uc = utf8_to_uvchr(s, NULL);
a0ed51b3
LW
2071 }
2072 min += l;
c277df42 2073 if (data && (flags & SCF_DO_SUBSTR))
a0ed51b3 2074 data->pos_min += l;
653099ff
GS
2075 if (flags & SCF_DO_STCLASS_AND) {
2076 /* Check whether it is compatible with what we know already! */
2077 int compat = 1;
2078
1aa99e6b 2079 if (uc >= 0x100 ||
516a5887 2080 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
1aa99e6b 2081 && !ANYOF_BITMAP_TEST(data->start_class, uc)
516a5887 2082 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
653099ff
GS
2083 compat = 0;
2084 ANYOF_CLASS_ZERO(data->start_class);
2085 ANYOF_BITMAP_ZERO(data->start_class);
2086 if (compat) {
1aa99e6b 2087 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff
GS
2088 data->start_class->flags &= ~ANYOF_EOS;
2089 data->start_class->flags |= ANYOF_FOLD;
2090 if (OP(scan) == EXACTFL)
2091 data->start_class->flags |= ANYOF_LOCALE;
2092 }
2093 }
2094 else if (flags & SCF_DO_STCLASS_OR) {
2095 if (data->start_class->flags & ANYOF_FOLD) {
2096 /* false positive possible if the class is case-folded.
2097 Assume that the locale settings are the same... */
1aa99e6b
IH
2098 if (uc < 0x100)
2099 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff
GS
2100 data->start_class->flags &= ~ANYOF_EOS;
2101 }
2102 cl_and(data->start_class, &and_with);
2103 }
2104 flags &= ~SCF_DO_STCLASS;
a0ed51b3 2105 }
bfed75c6 2106 else if (strchr((const char*)PL_varies,OP(scan))) {
9c5ffd7c 2107 I32 mincount, maxcount, minnext, deltanext, fl = 0;
aa7a4b56 2108 I32 f = flags, pos_before = 0;
c277df42 2109 regnode *oscan = scan;
653099ff
GS
2110 struct regnode_charclass_class this_class;
2111 struct regnode_charclass_class *oclass = NULL;
727f22e3 2112 I32 next_is_eval = 0;
653099ff 2113
22c35a8c 2114 switch (PL_regkind[(U8)OP(scan)]) {
653099ff 2115 case WHILEM: /* End of (?:...)* . */
c277df42
IZ
2116 scan = NEXTOPER(scan);
2117 goto finish;
2118 case PLUS:
653099ff 2119 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
c277df42 2120 next = NEXTOPER(scan);
653099ff 2121 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
b81d288d
AB
2122 mincount = 1;
2123 maxcount = REG_INFTY;
c277df42
IZ
2124 next = regnext(scan);
2125 scan = NEXTOPER(scan);
2126 goto do_curly;
2127 }
2128 }
2129 if (flags & SCF_DO_SUBSTR)
2130 data->pos_min++;
2131 min++;
2132 /* Fall through. */
2133 case STAR:
653099ff
GS
2134 if (flags & SCF_DO_STCLASS) {
2135 mincount = 0;
b81d288d 2136 maxcount = REG_INFTY;
653099ff
GS
2137 next = regnext(scan);
2138 scan = NEXTOPER(scan);
2139 goto do_curly;
2140 }
b81d288d 2141 is_inf = is_inf_internal = 1;
c277df42
IZ
2142 scan = regnext(scan);
2143 if (flags & SCF_DO_SUBSTR) {
830247a4 2144 scan_commit(pRExC_state, data); /* Cannot extend fixed substrings */
c277df42
IZ
2145 data->longest = &(data->longest_float);
2146 }
2147 goto optimize_curly_tail;
2148 case CURLY:
b81d288d 2149 mincount = ARG1(scan);
c277df42
IZ
2150 maxcount = ARG2(scan);
2151 next = regnext(scan);
cb434fcc
IZ
2152 if (OP(scan) == CURLYX) {
2153 I32 lp = (data ? *(data->last_closep) : 0);
a3621e74 2154 scan->flags = ((lp <= U8_MAX) ? (U8)lp : U8_MAX);
cb434fcc 2155 }
c277df42 2156 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
727f22e3 2157 next_is_eval = (OP(scan) == EVAL);
c277df42
IZ
2158 do_curly:
2159 if (flags & SCF_DO_SUBSTR) {
830247a4 2160 if (mincount == 0) scan_commit(pRExC_state,data); /* Cannot extend fixed substrings */
c277df42
IZ
2161 pos_before = data->pos_min;
2162 }
2163 if (data) {
2164 fl = data->flags;
2165 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
2166 if (is_inf)
2167 data->flags |= SF_IS_INF;
2168 }
653099ff 2169 if (flags & SCF_DO_STCLASS) {
830247a4 2170 cl_init(pRExC_state, &this_class);
653099ff
GS
2171 oclass = data->start_class;
2172 data->start_class = &this_class;
2173 f |= SCF_DO_STCLASS_AND;
2174 f &= ~SCF_DO_STCLASS_OR;
2175 }
e1901655
IZ
2176 /* These are the cases when once a subexpression
2177 fails at a particular position, it cannot succeed
2178 even after backtracking at the enclosing scope.
b81d288d 2179
e1901655
IZ
2180 XXXX what if minimal match and we are at the
2181 initial run of {n,m}? */
2182 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
2183 f &= ~SCF_WHILEM_VISITED_POS;
653099ff 2184
c277df42 2185 /* This will finish on WHILEM, setting scan, or on NULL: */
b81d288d 2186 minnext = study_chunk(pRExC_state, &scan, &deltanext, last, data,
a3621e74
YO
2187 (mincount == 0
2188 ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
653099ff
GS
2189
2190 if (flags & SCF_DO_STCLASS)
2191 data->start_class = oclass;
2192 if (mincount == 0 || minnext == 0) {
2193 if (flags & SCF_DO_STCLASS_OR) {
830247a4 2194 cl_or(pRExC_state, data->start_class, &this_class);
653099ff
GS
2195 }
2196 else if (flags & SCF_DO_STCLASS_AND) {
b81d288d 2197 /* Switch to OR mode: cache the old value of
653099ff
GS
2198 * data->start_class */
2199 StructCopy(data->start_class, &and_with,
2200 struct regnode_charclass_class);
2201 flags &= ~SCF_DO_STCLASS_AND;
2202 StructCopy(&this_class, data->start_class,
2203 struct regnode_charclass_class);
2204 flags |= SCF_DO_STCLASS_OR;
2205 data->start_class->flags |= ANYOF_EOS;
2206 }
2207 } else { /* Non-zero len */
2208 if (flags & SCF_DO_STCLASS_OR) {
830247a4 2209 cl_or(pRExC_state, data->start_class, &this_class);
653099ff
GS
2210 cl_and(data->start_class, &and_with);
2211 }
2212 else if (flags & SCF_DO_STCLASS_AND)
2213 cl_and(data->start_class, &this_class);
2214 flags &= ~SCF_DO_STCLASS;
2215 }
c277df42
IZ
2216 if (!scan) /* It was not CURLYX, but CURLY. */
2217 scan = next;
84037bb0 2218 if (ckWARN(WARN_REGEXP)
727f22e3
JP
2219 /* ? quantifier ok, except for (?{ ... }) */
2220 && (next_is_eval || !(mincount == 0 && maxcount == 1))
84037bb0 2221 && (minnext == 0) && (deltanext == 0)
99799961 2222 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
17feb5d5 2223 && maxcount <= REG_INFTY/3) /* Complement check for big count */
b45f050a 2224 {
830247a4 2225 vWARN(RExC_parse,
b45f050a
JF
2226 "Quantifier unexpected on zero-length expression");
2227 }
2228
c277df42 2229 min += minnext * mincount;
b81d288d 2230 is_inf_internal |= ((maxcount == REG_INFTY
155aba94
GS
2231 && (minnext + deltanext) > 0)
2232 || deltanext == I32_MAX);
aca2d497 2233 is_inf |= is_inf_internal;
c277df42
IZ
2234 delta += (minnext + deltanext) * maxcount - minnext * mincount;
2235
2236 /* Try powerful optimization CURLYX => CURLYN. */
b81d288d 2237 if ( OP(oscan) == CURLYX && data
c277df42
IZ
2238 && data->flags & SF_IN_PAR
2239 && !(data->flags & SF_HAS_EVAL)
2240 && !deltanext && minnext == 1 ) {
2241 /* Try to optimize to CURLYN. */
2242 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
497b47a8
JH
2243 regnode *nxt1 = nxt;
2244#ifdef DEBUGGING
2245 regnode *nxt2;
2246#endif
c277df42
IZ
2247
2248 /* Skip open. */
2249 nxt = regnext(nxt);
bfed75c6 2250 if (!strchr((const char*)PL_simple,OP(nxt))
22c35a8c 2251 && !(PL_regkind[(U8)OP(nxt)] == EXACT
b81d288d 2252 && STR_LEN(nxt) == 1))
c277df42 2253 goto nogo;
497b47a8 2254#ifdef DEBUGGING
c277df42 2255 nxt2 = nxt;
497b47a8 2256#endif
c277df42 2257 nxt = regnext(nxt);
b81d288d 2258 if (OP(nxt) != CLOSE)
c277df42
IZ
2259 goto nogo;
2260 /* Now we know that nxt2 is the only contents: */
eb160463 2261 oscan->flags = (U8)ARG(nxt);
c277df42
IZ
2262 OP(oscan) = CURLYN;
2263 OP(nxt1) = NOTHING; /* was OPEN. */
2264#ifdef DEBUGGING
2265 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
2266 NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
2267 NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
2268 OP(nxt) = OPTIMIZED; /* was CLOSE. */
2269 OP(nxt + 1) = OPTIMIZED; /* was count. */
2270 NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
b81d288d 2271#endif
c277df42 2272 }
c277df42
IZ
2273 nogo:
2274
2275 /* Try optimization CURLYX => CURLYM. */
b81d288d 2276 if ( OP(oscan) == CURLYX && data
c277df42 2277 && !(data->flags & SF_HAS_PAR)
c277df42 2278 && !(data->flags & SF_HAS_EVAL)
0e788c72
HS
2279 && !deltanext /* atom is fixed width */
2280 && minnext != 0 /* CURLYM can't handle zero width */
2281 ) {
c277df42
IZ
2282 /* XXXX How to optimize if data == 0? */
2283 /* Optimize to a simpler form. */
2284 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
2285 regnode *nxt2;
2286
2287 OP(oscan) = CURLYM;
2288 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
b81d288d 2289 && (OP(nxt2) != WHILEM))
c277df42
IZ
2290 nxt = nxt2;
2291 OP(nxt2) = SUCCEED; /* Whas WHILEM */
c277df42
IZ
2292 /* Need to optimize away parenths. */
2293 if (data->flags & SF_IN_PAR) {
2294 /* Set the parenth number. */
2295 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
2296
b81d288d 2297 if (OP(nxt) != CLOSE)
b45f050a 2298 FAIL("Panic opt close");
eb160463 2299 oscan->flags = (U8)ARG(nxt);
c277df42
IZ
2300 OP(nxt1) = OPTIMIZED; /* was OPEN. */
2301 OP(nxt) = OPTIMIZED; /* was CLOSE. */
2302#ifdef DEBUGGING
2303 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
2304 OP(nxt + 1) = OPTIMIZED; /* was count. */
2305 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
2306 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
b81d288d 2307#endif
c277df42
IZ
2308#if 0
2309 while ( nxt1 && (OP(nxt1) != WHILEM)) {
2310 regnode *nnxt = regnext(nxt1);
b81d288d 2311
c277df42
IZ
2312 if (nnxt == nxt) {
2313 if (reg_off_by_arg[OP(nxt1)])
2314 ARG_SET(nxt1, nxt2 - nxt1);
2315 else if (nxt2 - nxt1 < U16_MAX)
2316 NEXT_OFF(nxt1) = nxt2 - nxt1;
2317 else
2318 OP(nxt) = NOTHING; /* Cannot beautify */
2319 }
2320 nxt1 = nnxt;
2321 }
2322#endif
2323 /* Optimize again: */
b81d288d 2324 study_chunk(pRExC_state, &nxt1, &deltanext, nxt,
a3621e74 2325 NULL, 0,depth+1);
a0ed51b3
LW
2326 }
2327 else
c277df42 2328 oscan->flags = 0;
c277df42 2329 }
e1901655
IZ
2330 else if ((OP(oscan) == CURLYX)
2331 && (flags & SCF_WHILEM_VISITED_POS)
2332 /* See the comment on a similar expression above.
2333 However, this time it not a subexpression
2334 we care about, but the expression itself. */
2335 && (maxcount == REG_INFTY)
2336 && data && ++data->whilem_c < 16) {
2337 /* This stays as CURLYX, we can put the count/of pair. */
2c2d71f5
JH
2338 /* Find WHILEM (as in regexec.c) */
2339 regnode *nxt = oscan + NEXT_OFF(oscan);
2340
2341 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
2342 nxt += ARG(nxt);
eb160463
GS
2343 PREVOPER(nxt)->flags = (U8)(data->whilem_c
2344 | (RExC_whilem_seen << 4)); /* On WHILEM */
2c2d71f5 2345 }
b81d288d 2346 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
c277df42
IZ
2347 pars++;
2348 if (flags & SCF_DO_SUBSTR) {
2349 SV *last_str = Nullsv;
2350 int counted = mincount != 0;
2351
2352 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
5d1c421c
JH
2353#if defined(SPARC64_GCC_WORKAROUND)
2354 I32 b = 0;
2355 STRLEN l = 0;
2356 char *s = NULL;
2357 I32 old = 0;
2358
2359 if (pos_before >= data->last_start_min)
2360 b = pos_before;
2361 else
2362 b = data->last_start_min;
2363
2364 l = 0;
2365 s = SvPV(data->last_found, l);
2366 old = b - data->last_start_min;
2367
2368#else
b81d288d 2369 I32 b = pos_before >= data->last_start_min
c277df42
IZ
2370 ? pos_before : data->last_start_min;
2371 STRLEN l;
2372 char *s = SvPV(data->last_found, l);
a0ed51b3 2373 I32 old = b - data->last_start_min;
5d1c421c 2374#endif
a0ed51b3
LW
2375
2376 if (UTF)
2377 old = utf8_hop((U8*)s, old) - (U8*)s;
c277df42 2378
a0ed51b3 2379 l -= old;
c277df42 2380 /* Get the added string: */
79cb57f6 2381 last_str = newSVpvn(s + old, l);
0e933229
IH
2382 if (UTF)
2383 SvUTF8_on(last_str);
c277df42
IZ
2384 if (deltanext == 0 && pos_before == b) {
2385 /* What was added is a constant string */
2386 if (mincount > 1) {
2387 SvGROW(last_str, (mincount * l) + 1);
b81d288d 2388 repeatcpy(SvPVX(last_str) + l,
c277df42 2389 SvPVX(last_str), l, mincount - 1);
b162af07 2390 SvCUR_set(last_str, SvCUR(last_str) * mincount);
c277df42 2391 /* Add additional parts. */
b81d288d 2392 SvCUR_set(data->last_found,
c277df42
IZ
2393 SvCUR(data->last_found) - l);
2394 sv_catsv(data->last_found, last_str);
0eda9292
JH
2395 {
2396 SV * sv = data->last_found;
2397 MAGIC *mg =
2398 SvUTF8(sv) && SvMAGICAL(sv) ?
2399 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2400 if (mg && mg->mg_len >= 0)
2401 mg->mg_len += CHR_SVLEN(last_str);
2402 }
c277df42
IZ
2403 data->last_end += l * (mincount - 1);
2404 }
2a8d9689
HS
2405 } else {
2406 /* start offset must point into the last copy */
2407 data->last_start_min += minnext * (mincount - 1);
c152dc43
HS
2408 data->last_start_max += is_inf ? I32_MAX
2409 : (maxcount - 1) * (minnext + data->pos_delta);
c277df42
IZ
2410 }
2411 }
2412 /* It is counted once already... */
2413 data->pos_min += minnext * (mincount - counted);
2414 data->pos_delta += - counted * deltanext +
2415 (minnext + deltanext) * maxcount - minnext * mincount;
2416 if (mincount != maxcount) {
653099ff
GS
2417 /* Cannot extend fixed substrings found inside
2418 the group. */
830247a4 2419 scan_commit(pRExC_state,data);
c277df42
IZ
2420 if (mincount && last_str) {
2421 sv_setsv(data->last_found, last_str);
2422 data->last_end = data->pos_min;
b81d288d 2423 data->last_start_min =
a0ed51b3 2424 data->pos_min - CHR_SVLEN(last_str);
b81d288d
AB
2425 data->last_start_max = is_inf
2426 ? I32_MAX
c277df42 2427 : data->pos_min + data->pos_delta
a0ed51b3 2428 - CHR_SVLEN(last_str);
c277df42
IZ
2429 }
2430 data->longest = &(data->longest_float);
2431 }
aca2d497 2432 SvREFCNT_dec(last_str);
c277df42 2433 }
405ff068 2434 if (data && (fl & SF_HAS_EVAL))
c277df42
IZ
2435 data->flags |= SF_HAS_EVAL;
2436 optimize_curly_tail:
c277df42 2437 if (OP(oscan) != CURLYX) {
22c35a8c 2438 while (PL_regkind[(U8)OP(next = regnext(oscan))] == NOTHING
c277df42
IZ
2439 && NEXT_OFF(next))
2440 NEXT_OFF(oscan) += NEXT_OFF(next);
2441 }
c277df42 2442 continue;
653099ff 2443 default: /* REF and CLUMP only? */
c277df42 2444 if (flags & SCF_DO_SUBSTR) {
830247a4 2445 scan_commit(pRExC_state,data); /* Cannot expect anything... */
c277df42
IZ
2446 data->longest = &(data->longest_float);
2447 }
aca2d497 2448 is_inf = is_inf_internal = 1;
653099ff 2449 if (flags & SCF_DO_STCLASS_OR)
830247a4 2450 cl_anything(pRExC_state, data->start_class);
653099ff 2451 flags &= ~SCF_DO_STCLASS;
c277df42
IZ
2452 break;
2453 }
a0ed51b3 2454 }
bfed75c6 2455 else if (strchr((const char*)PL_simple,OP(scan))) {
9c5ffd7c 2456 int value = 0;
653099ff 2457
c277df42 2458 if (flags & SCF_DO_SUBSTR) {
830247a4 2459 scan_commit(pRExC_state,data);
c277df42
IZ
2460 data->pos_min++;
2461 }
2462 min++;
653099ff
GS
2463 if (flags & SCF_DO_STCLASS) {
2464 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
2465
2466 /* Some of the logic below assumes that switching
2467 locale on will only add false positives. */
2468 switch (PL_regkind[(U8)OP(scan)]) {
653099ff 2469 case SANY:
653099ff
GS
2470 default:
2471 do_default:
2472 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
2473 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
830247a4 2474 cl_anything(pRExC_state, data->start_class);
653099ff
GS
2475 break;
2476 case REG_ANY:
2477 if (OP(scan) == SANY)
2478 goto do_default;
2479 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
2480 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
2481 || (data->start_class->flags & ANYOF_CLASS));
830247a4 2482 cl_anything(pRExC_state, data->start_class);
653099ff
GS
2483 }
2484 if (flags & SCF_DO_STCLASS_AND || !value)
2485 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
2486 break;
2487 case ANYOF:
2488 if (flags & SCF_DO_STCLASS_AND)
2489 cl_and(data->start_class,
2490 (struct regnode_charclass_class*)scan);
2491 else
830247a4 2492 cl_or(pRExC_state, data->start_class,
653099ff
GS
2493 (struct regnode_charclass_class*)scan);
2494 break;
2495 case ALNUM:
2496 if (flags & SCF_DO_STCLASS_AND) {
2497 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2498 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
2499 for (value = 0; value < 256; value++)
2500 if (!isALNUM(value))
2501 ANYOF_BITMAP_CLEAR(data->start_class, value);
2502 }
2503 }
2504 else {
2505 if (data->start_class->flags & ANYOF_LOCALE)
2506 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
2507 else {
2508 for (value = 0; value < 256; value++)
2509 if (isALNUM(value))
b81d288d 2510 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
2511 }
2512 }
2513 break;
2514 case ALNUML:
2515 if (flags & SCF_DO_STCLASS_AND) {
2516 if (data->start_class->flags & ANYOF_LOCALE)
2517 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
2518 }
2519 else {
2520 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
2521 data->start_class->flags |= ANYOF_LOCALE;
2522 }
2523 break;
2524 case NALNUM:
2525 if (flags & SCF_DO_STCLASS_AND) {
2526 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2527 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
2528 for (value = 0; value < 256; value++)
2529 if (isALNUM(value))
2530 ANYOF_BITMAP_CLEAR(data->start_class, value);
2531 }
2532 }
2533 else {
2534 if (data->start_class->flags & ANYOF_LOCALE)
2535 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
2536 else {
2537 for (value = 0; value < 256; value++)
2538 if (!isALNUM(value))
b81d288d 2539 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
2540 }
2541 }
2542 break;
2543 case NALNUML:
2544 if (flags & SCF_DO_STCLASS_AND) {
2545 if (data->start_class->flags & ANYOF_LOCALE)
2546 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
2547 }
2548 else {
2549 data->start_class->flags |= ANYOF_LOCALE;
2550 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
2551 }
2552 break;
2553 case SPACE:
2554 if (flags & SCF_DO_STCLASS_AND) {
2555 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2556 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
2557 for (value = 0; value < 256; value++)
2558 if (!isSPACE(value))
2559 ANYOF_BITMAP_CLEAR(data->start_class, value);
2560 }
2561 }
2562 else {
2563 if (data->start_class->flags & ANYOF_LOCALE)
2564 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
2565 else {
2566 for (value = 0; value < 256; value++)
2567 if (isSPACE(value))
b81d288d 2568 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
2569 }
2570 }
2571 break;
2572 case SPACEL:
2573 if (flags & SCF_DO_STCLASS_AND) {
2574 if (data->start_class->flags & ANYOF_LOCALE)
2575 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
2576 }
2577 else {
2578 data->start_class->flags |= ANYOF_LOCALE;
2579 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
2580 }
2581 break;
2582 case NSPACE:
2583 if (flags & SCF_DO_STCLASS_AND) {
2584 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2585 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
2586 for (value = 0; value < 256; value++)
2587 if (isSPACE(value))
2588 ANYOF_BITMAP_CLEAR(data->start_class, value);
2589 }
2590 }
2591 else {
2592 if (data->start_class->flags & ANYOF_LOCALE)
2593 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
2594 else {
2595 for (value = 0; value < 256; value++)
2596 if (!isSPACE(value))
b81d288d 2597 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
2598 }
2599 }
2600 break;
2601 case NSPACEL:
2602 if (flags & SCF_DO_STCLASS_AND) {
2603 if (data->start_class->flags & ANYOF_LOCALE) {
2604 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
2605 for (value = 0; value < 256; value++)
2606 if (!isSPACE(value))
2607 ANYOF_BITMAP_CLEAR(data->start_class, value);
2608 }
2609 }
2610 else {
2611 data->start_class->flags |= ANYOF_LOCALE;
2612 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
2613 }
2614 break;
2615 case DIGIT:
2616 if (flags & SCF_DO_STCLASS_AND) {
2617 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
2618 for (value = 0; value < 256; value++)
2619 if (!isDIGIT(value))
2620 ANYOF_BITMAP_CLEAR(data->start_class, value);
2621 }
2622 else {
2623 if (data->start_class->flags & ANYOF_LOCALE)
2624 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
2625 else {
2626 for (value = 0; value < 256; value++)
2627 if (isDIGIT(value))
b81d288d 2628 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
2629 }
2630 }
2631 break;
2632 case NDIGIT:
2633 if (flags & SCF_DO_STCLASS_AND) {
2634 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
2635 for (value = 0; value < 256; value++)
2636 if (isDIGIT(value))
2637 ANYOF_BITMAP_CLEAR(data->start_class, value);
2638 }
2639 else {
2640 if (data->start_class->flags & ANYOF_LOCALE)
2641 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
2642 else {
2643 for (value = 0; value < 256; value++)
2644 if (!isDIGIT(value))
b81d288d 2645 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
2646 }
2647 }
2648 break;
2649 }
2650 if (flags & SCF_DO_STCLASS_OR)
2651 cl_and(data->start_class, &and_with);
2652 flags &= ~SCF_DO_STCLASS;
2653 }
a0ed51b3 2654 }
22c35a8c 2655 else if (PL_regkind[(U8)OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
c277df42
IZ
2656 data->flags |= (OP(scan) == MEOL
2657 ? SF_BEFORE_MEOL
2658 : SF_BEFORE_SEOL);
a0ed51b3 2659 }
653099ff
GS
2660 else if ( PL_regkind[(U8)OP(scan)] == BRANCHJ
2661 /* Lookbehind, or need to calculate parens/evals/stclass: */
2662 && (scan->flags || data || (flags & SCF_DO_STCLASS))
c277df42 2663 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
653099ff 2664 /* Lookahead/lookbehind */
cb434fcc 2665 I32 deltanext, minnext, fake = 0;
c277df42 2666 regnode *nscan;
653099ff
GS
2667 struct regnode_charclass_class intrnl;
2668 int f = 0;
c277df42
IZ
2669
2670 data_fake.flags = 0;
b81d288d 2671 if (data) {
2c2d71f5 2672 data_fake.whilem_c = data->whilem_c;
cb434fcc
IZ
2673 data_fake.last_closep = data->last_closep;
2674 }
2675 else
2676 data_fake.last_closep = &fake;
653099ff
GS
2677 if ( flags & SCF_DO_STCLASS && !scan->flags
2678 && OP(scan) == IFMATCH ) { /* Lookahead */
830247a4 2679 cl_init(pRExC_state, &intrnl);
653099ff 2680 data_fake.start_class = &intrnl;
e1901655 2681 f |= SCF_DO_STCLASS_AND;
653099ff 2682 }
e1901655
IZ
2683 if (flags & SCF_WHILEM_VISITED_POS)
2684 f |= SCF_WHILEM_VISITED_POS;
c277df42
IZ
2685 next = regnext(scan);
2686 nscan = NEXTOPER(NEXTOPER(scan));
a3621e74 2687 minnext = study_chunk(pRExC_state, &nscan, &deltanext, last, &data_fake, f,depth+1);
c277df42
IZ
2688 if (scan->flags) {
2689 if (deltanext) {
9baa0206 2690 vFAIL("Variable length lookbehind not implemented");
a0ed51b3
LW
2691 }
2692 else if (minnext > U8_MAX) {
9baa0206 2693 vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
c277df42 2694 }
eb160463 2695 scan->flags = (U8)minnext;
c277df42
IZ
2696 }
2697 if (data && data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
2698 pars++;
405ff068 2699 if (data && (data_fake.flags & SF_HAS_EVAL))
c277df42 2700 data->flags |= SF_HAS_EVAL;
2c2d71f5
JH
2701 if (data)
2702 data->whilem_c = data_fake.whilem_c;
e1901655 2703 if (f & SCF_DO_STCLASS_AND) {
653099ff
GS
2704 int was = (data->start_class->flags & ANYOF_EOS);
2705
2706 cl_and(data->start_class, &intrnl);
2707 if (was)
2708 data->start_class->flags |= ANYOF_EOS;
2709 }
a0ed51b3
LW
2710 }
2711 else if (OP(scan) == OPEN) {
c277df42 2712 pars++;
a0ed51b3 2713 }
cb434fcc 2714 else if (OP(scan) == CLOSE) {
eb160463 2715 if ((I32)ARG(scan) == is_par) {
cb434fcc 2716 next = regnext(scan);
c277df42 2717
cb434fcc
IZ
2718 if ( next && (OP(next) != WHILEM) && next < last)
2719 is_par = 0; /* Disable optimization */
2720 }
2721 if (data)
2722 *(data->last_closep) = ARG(scan);
a0ed51b3
LW
2723 }
2724 else if (OP(scan) == EVAL) {
c277df42
IZ
2725 if (data)
2726 data->flags |= SF_HAS_EVAL;
2727 }
96776eda 2728 else if (OP(scan) == LOGICAL && scan->flags == 2) { /* Embedded follows */
0f5d15d6 2729 if (flags & SCF_DO_SUBSTR) {
830247a4 2730 scan_commit(pRExC_state,data);
0f5d15d6
IZ
2731 data->longest = &(data->longest_float);
2732 }
2733 is_inf = is_inf_internal = 1;
653099ff 2734 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
830247a4 2735 cl_anything(pRExC_state, data->start_class);
96776eda 2736 flags &= ~SCF_DO_STCLASS;
0f5d15d6 2737 }
c277df42
IZ
2738 /* Else: zero-length, ignore. */
2739 scan = regnext(scan);
2740 }
2741
2742 finish:
2743 *scanp = scan;
aca2d497 2744 *deltap = is_inf_internal ? I32_MAX : delta;
b81d288d 2745 if (flags & SCF_DO_SUBSTR && is_inf)
c277df42
IZ
2746 data->pos_delta = I32_MAX - data->pos_min;
2747 if (is_par > U8_MAX)
2748 is_par = 0;
2749 if (is_par && pars==1 && data) {
2750 data->flags |= SF_IN_PAR;
2751 data->flags &= ~SF_HAS_PAR;
a0ed51b3
LW
2752 }
2753 else if (pars && data) {
c277df42
IZ
2754 data->flags |= SF_HAS_PAR;
2755 data->flags &= ~SF_IN_PAR;
2756 }
653099ff
GS
2757 if (flags & SCF_DO_STCLASS_OR)
2758 cl_and(data->start_class, &and_with);
c277df42
IZ
2759 return min;
2760}
2761
76e3520e 2762STATIC I32
bfed75c6 2763S_add_data(pTHX_ RExC_state_t *pRExC_state, I32 n, const char *s)
c277df42 2764{
830247a4 2765 if (RExC_rx->data) {
b81d288d
AB
2766 Renewc(RExC_rx->data,
2767 sizeof(*RExC_rx->data) + sizeof(void*) * (RExC_rx->data->count + n - 1),
c277df42 2768 char, struct reg_data);
830247a4
IZ
2769 Renew(RExC_rx->data->what, RExC_rx->data->count + n, U8);
2770 RExC_rx->data->count += n;
a0ed51b3
LW
2771 }
2772 else {
830247a4 2773 Newc(1207, RExC_rx->data, sizeof(*RExC_rx->data) + sizeof(void*) * (n - 1),
c277df42 2774 char, struct reg_data);
830247a4
IZ
2775 New(1208, RExC_rx->data->what, n, U8);
2776 RExC_rx->data->count = n;
c277df42 2777 }
830247a4
IZ
2778 Copy(s, RExC_rx->data->what + RExC_rx->data->count - n, n, U8);
2779 return RExC_rx->data->count - n;
c277df42
IZ
2780}
2781
d88dccdf 2782void
864dbfa3 2783Perl_reginitcolors(pTHX)
d88dccdf 2784{
d88dccdf
IZ
2785 int i = 0;
2786 char *s = PerlEnv_getenv("PERL_RE_COLORS");
b81d288d 2787
d88dccdf
IZ
2788 if (s) {
2789 PL_colors[0] = s = savepv(s);
2790 while (++i < 6) {
2791 s = strchr(s, '\t');
2792 if (s) {
2793 *s = '\0';
2794 PL_colors[i] = ++s;
2795 }
2796 else
dd374669 2797 PL_colors[i] = s = (char *)"";
d88dccdf
IZ
2798 }
2799 } else {
b81d288d 2800 while (i < 6)
06b5626a 2801 PL_colors[i++] = (char *)"";
d88dccdf
IZ
2802 }
2803 PL_colorset = 1;
2804}
2805
8615cb43 2806
a687059c 2807/*
e50aee73 2808 - pregcomp - compile a regular expression into internal code
a687059c
LW
2809 *
2810 * We can't allocate space until we know how big the compiled form will be,
2811 * but we can't compile it (and thus know how big it is) until we've got a
2812 * place to put the code. So we cheat: we compile it twice, once with code
2813 * generation turned off and size counting turned on, and once "for real".
2814 * This also means that we don't allocate space until we are sure that the
2815 * thing really will compile successfully, and we never have to move the
2816 * code and thus invalidate pointers into it. (Note that it has to be in
2817 * one piece because free() must be able to free it all.) [NB: not true in perl]
2818 *
2819 * Beware that the optimization-preparation code in here knows about some
2820 * of the structure of the compiled regexp. [I'll say.]
2821 */
2822regexp *
864dbfa3 2823Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
a687059c 2824{
a0d0e21e 2825 register regexp *r;
c277df42 2826 regnode *scan;
c277df42 2827 regnode *first;
a0d0e21e 2828 I32 flags;
a0d0e21e
LW
2829 I32 minlen = 0;
2830 I32 sawplus = 0;
2831 I32 sawopen = 0;
2c2d71f5 2832 scan_data_t data;
830247a4
IZ
2833 RExC_state_t RExC_state;
2834 RExC_state_t *pRExC_state = &RExC_state;
a0d0e21e 2835
a3621e74
YO
2836 GET_RE_DEBUG_FLAGS_DECL;
2837
a0d0e21e 2838 if (exp == NULL)
c277df42 2839 FAIL("NULL regexp argument");
a0d0e21e 2840
a5961de5 2841 RExC_utf8 = pm->op_pmdynflags & PMdf_CMP_UTF8;
a0ed51b3 2842
5cfc7842 2843 RExC_precomp = exp;
a3621e74
YO
2844 DEBUG_r(if (!PL_colorset) reginitcolors());
2845 DEBUG_COMPILE_r({
2846 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s \"%s%*s%s\"\n",
a5961de5
JH
2847 PL_colors[4],PL_colors[5],PL_colors[0],
2848 (int)(xend - exp), RExC_precomp, PL_colors[1]);
2849 });
e2509266 2850 RExC_flags = pm->op_pmflags;
830247a4 2851 RExC_sawback = 0;
bbce6d69 2852
830247a4
IZ
2853 RExC_seen = 0;
2854 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
2855 RExC_seen_evals = 0;
2856 RExC_extralen = 0;
c277df42 2857
bbce6d69 2858 /* First pass: determine size, legality. */
830247a4 2859 RExC_parse = exp;
fac92740 2860 RExC_start = exp;
830247a4
IZ
2861 RExC_end = xend;
2862 RExC_naughty = 0;
2863 RExC_npar = 1;
2864 RExC_size = 0L;
2865 RExC_emit = &PL_regdummy;
2866 RExC_whilem_seen = 0;
85ddcde9
JH
2867#if 0 /* REGC() is (currently) a NOP at the first pass.
2868 * Clever compilers notice this and complain. --jhi */
830247a4 2869 REGC((U8)REG_MAGIC, (char*)RExC_emit);
85ddcde9 2870#endif
830247a4 2871 if (reg(pRExC_state, 0, &flags) == NULL) {
830247a4 2872 RExC_precomp = Nullch;
a0d0e21e
LW
2873 return(NULL);
2874 }
a3621e74 2875 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "size %"IVdf" ", (IV)RExC_size));
c277df42 2876
c277df42
IZ
2877 /* Small enough for pointer-storage convention?
2878 If extralen==0, this means that we will not need long jumps. */
830247a4
IZ
2879 if (RExC_size >= 0x10000L && RExC_extralen)
2880 RExC_size += RExC_extralen;
c277df42 2881 else
830247a4
IZ
2882 RExC_extralen = 0;
2883 if (RExC_whilem_seen > 15)
2884 RExC_whilem_seen = 15;
a0d0e21e 2885
bbce6d69 2886 /* Allocate space and initialize. */
830247a4 2887 Newc(1001, r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode),
c277df42 2888 char, regexp);
a0d0e21e 2889 if (r == NULL)
b45f050a
JF
2890 FAIL("Regexp out of space");
2891
0f79a09d
GS
2892#ifdef DEBUGGING
2893 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
830247a4 2894 Zero(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode), char);
0f79a09d 2895#endif
c277df42 2896 r->refcnt = 1;
bbce6d69 2897 r->prelen = xend - exp;
5cfc7842 2898 r->precomp = savepvn(RExC_precomp, r->prelen);
cf93c79d 2899 r->subbeg = NULL;
ed252734
NC
2900#ifdef PERL_COPY_ON_WRITE
2901 r->saved_copy = Nullsv;
2902#endif
cf93c79d 2903 r->reganch = pm->op_pmflags & PMf_COMPILETIME;
830247a4 2904 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
4327152a
IZ
2905
2906 r->substrs = 0; /* Useful during FAIL. */
2907 r->startp = 0; /* Useful during FAIL. */
2908 r->endp = 0; /* Useful during FAIL. */
2909
fac92740
MJD
2910 Newz(1304, r->offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
2911 if (r->offsets) {
2912 r->offsets[0] = RExC_size;
2913 }
a3621e74 2914 DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
392fbf5d 2915 "%s %"UVuf" bytes for offset annotations.\n",
fac92740 2916 r->offsets ? "Got" : "Couldn't get",
392fbf5d 2917 (UV)((2*RExC_size+1) * sizeof(U32))));
fac92740 2918
830247a4 2919 RExC_rx = r;
bbce6d69 2920
2921 /* Second pass: emit code. */
e2509266 2922 RExC_flags = pm->op_pmflags; /* don't let top level (?i) bleed */
830247a4
IZ
2923 RExC_parse = exp;
2924 RExC_end = xend;
2925 RExC_naughty = 0;
2926 RExC_npar = 1;
fac92740 2927 RExC_emit_start = r->program;
830247a4 2928 RExC_emit = r->program;
2cd61cdb 2929 /* Store the count of eval-groups for security checks: */
eb160463 2930 RExC_emit->next_off = (U16)((RExC_seen_evals > U16_MAX) ? U16_MAX : RExC_seen_evals);
830247a4 2931 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
c277df42 2932 r->data = 0;
830247a4 2933 if (reg(pRExC_state, 0, &flags) == NULL)
a0d0e21e
LW
2934 return(NULL);
2935
a3621e74 2936
a0d0e21e 2937 /* Dig out information for optimizations. */
cf93c79d 2938 r->reganch = pm->op_pmflags & PMf_COMPILETIME; /* Again? */
e2509266 2939 pm->op_pmflags = RExC_flags;
a0ed51b3 2940 if (UTF)
5ff6fc6d 2941 r->reganch |= ROPT_UTF8; /* Unicode in it? */
c277df42 2942 r->regstclass = NULL;
830247a4 2943 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
a0ed51b3 2944 r->reganch |= ROPT_NAUGHTY;
c277df42 2945 scan = r->program + 1; /* First BRANCH. */
2779dcf1
IZ
2946
2947 /* XXXX To minimize changes to RE engine we always allocate
2948 3-units-long substrs field. */
2949 Newz(1004, r->substrs, 1, struct reg_substr_data);
2950
2c2d71f5 2951 StructCopy(&zero_scan_data, &data, scan_data_t);
653099ff 2952 /* XXXX Should not we check for something else? Usually it is OPEN1... */
c277df42 2953 if (OP(scan) != BRANCH) { /* Only one top-level choice. */
c277df42 2954 I32 fake;
c5254dd6 2955 STRLEN longest_float_length, longest_fixed_length;
653099ff
GS
2956 struct regnode_charclass_class ch_class;
2957 int stclass_flag;
cb434fcc 2958 I32 last_close = 0;
a0d0e21e
LW
2959
2960 first = scan;
c277df42 2961 /* Skip introductions and multiplicators >= 1. */
a0d0e21e 2962 while ((OP(first) == OPEN && (sawopen = 1)) ||
653099ff 2963 /* An OR of *one* alternative - should not happen now. */
a0d0e21e
LW
2964 (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) ||
2965 (OP(first) == PLUS) ||
2966 (OP(first) == MINMOD) ||
653099ff 2967 /* An {n,m} with n>0 */
22c35a8c 2968 (PL_regkind[(U8)OP(first)] == CURLY && ARG1(first) > 0) ) {
a0d0e21e
LW
2969 if (OP(first) == PLUS)
2970 sawplus = 1;
2971 else
2972 first += regarglen[(U8)OP(first)];
2973 first = NEXTOPER(first);
a687059c
LW
2974 }
2975
a0d0e21e
LW
2976 /* Starting-point info. */
2977 again:
653099ff 2978 if (PL_regkind[(U8)OP(first)] == EXACT) {
1aa99e6b
IH
2979 if (OP(first) == EXACT)
2980 ; /* Empty, get anchored substr later. */
2981 else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
b3c9acc1
IZ
2982 r->regstclass = first;
2983 }
bfed75c6 2984 else if (strchr((const char*)PL_simple,OP(first)))
a0d0e21e 2985 r->regstclass = first;
22c35a8c
GS
2986 else if (PL_regkind[(U8)OP(first)] == BOUND ||
2987 PL_regkind[(U8)OP(first)] == NBOUND)
a0d0e21e 2988 r->regstclass = first;
22c35a8c 2989 else if (PL_regkind[(U8)OP(first)] == BOL) {
cad2e5aa
JH
2990 r->reganch |= (OP(first) == MBOL
2991 ? ROPT_ANCH_MBOL
2992 : (OP(first) == SBOL
2993 ? ROPT_ANCH_SBOL
2994 : ROPT_ANCH_BOL));
a0d0e21e 2995 first = NEXTOPER(first);
774d564b 2996 goto again;
2997 }
2998 else if (OP(first) == GPOS) {
2999 r->reganch |= ROPT_ANCH_GPOS;
3000 first = NEXTOPER(first);
3001 goto again;
a0d0e21e 3002 }
e09294f4 3003 else if (!sawopen && (OP(first) == STAR &&
22c35a8c 3004 PL_regkind[(U8)OP(NEXTOPER(first))] == REG_ANY) &&
a0d0e21e
LW
3005 !(r->reganch & ROPT_ANCH) )
3006 {
3007 /* turn .* into ^.* with an implied $*=1 */
cad2e5aa
JH
3008 int type = OP(NEXTOPER(first));
3009
ffc61ed2 3010 if (type == REG_ANY)
cad2e5aa
JH
3011 type = ROPT_ANCH_MBOL;
3012 else
3013 type = ROPT_ANCH_SBOL;
3014
3015 r->reganch |= type | ROPT_IMPLICIT;
a0d0e21e 3016 first = NEXTOPER(first);
774d564b 3017 goto again;
a0d0e21e 3018 }
b81d288d 3019 if (sawplus && (!sawopen || !RExC_sawback)
830247a4 3020 && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
cad2e5aa
JH
3021 /* x+ must match at the 1st pos of run of x's */
3022 r->reganch |= ROPT_SKIP;
a0d0e21e 3023
c277df42 3024 /* Scan is after the zeroth branch, first is atomic matcher. */
a3621e74 3025 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
b900a521 3026 (IV)(first - scan + 1)));
a0d0e21e
LW
3027 /*
3028 * If there's something expensive in the r.e., find the
3029 * longest literal string that must appear and make it the
3030 * regmust. Resolve ties in favor of later strings, since
3031 * the regstart check works with the beginning of the r.e.
3032 * and avoiding duplication strengthens checking. Not a
3033 * strong reason, but sufficient in the absence of others.
3034 * [Now we resolve ties in favor of the earlier string if
c277df42 3035 * it happens that c_offset_min has been invalidated, since the
a0d0e21e
LW
3036 * earlier string may buy us something the later one won't.]
3037 */
a0d0e21e 3038 minlen = 0;
a687059c 3039
79cb57f6
GS
3040 data.longest_fixed = newSVpvn("",0);
3041 data.longest_float = newSVpvn("",0);
3042 data.last_found = newSVpvn("",0);
c277df42
IZ
3043 data.longest = &(data.longest_fixed);
3044 first = scan;
653099ff 3045 if (!r->regstclass) {
830247a4 3046 cl_init(pRExC_state, &ch_class);
653099ff
GS
3047 data.start_class = &ch_class;
3048 stclass_flag = SCF_DO_STCLASS_AND;
3049 } else /* XXXX Check for BOUND? */
3050 stclass_flag = 0;
cb434fcc 3051 data.last_closep = &last_close;
653099ff 3052
830247a4 3053 minlen = study_chunk(pRExC_state, &first, &fake, scan + RExC_size, /* Up to end */
a3621e74 3054 &data, SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
830247a4 3055 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
b81d288d 3056 && data.last_start_min == 0 && data.last_end > 0
830247a4
IZ
3057 && !RExC_seen_zerolen
3058 && (!(RExC_seen & REG_SEEN_GPOS) || (r->reganch & ROPT_ANCH_GPOS)))
c277df42 3059 r->reganch |= ROPT_CHECK_ALL;
830247a4 3060 scan_commit(pRExC_state, &data);
c277df42
IZ
3061 SvREFCNT_dec(data.last_found);
3062
a0ed51b3 3063 longest_float_length = CHR_SVLEN(data.longest_float);
c5254dd6 3064 if (longest_float_length
c277df42
IZ
3065 || (data.flags & SF_FL_BEFORE_EOL
3066 && (!(data.flags & SF_FL_BEFORE_MEOL)
e2509266 3067 || (RExC_flags & PMf_MULTILINE)))) {
cf93c79d
IZ
3068 int t;
3069
a0ed51b3 3070 if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
aca2d497
IZ
3071 && data.offset_fixed == data.offset_float_min
3072 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
3073 goto remove_float; /* As in (a)+. */
3074
33b8afdf
JH
3075 if (SvUTF8(data.longest_float)) {
3076 r->float_utf8 = data.longest_float;
3077 r->float_substr = Nullsv;
3078 } else {
3079 r->float_substr = data.longest_float;
3080 r->float_utf8 = Nullsv;
3081 }
c277df42
IZ
3082 r->float_min_offset = data.offset_float_min;
3083 r->float_max_offset = data.offset_float_max;
cf93c79d
IZ
3084 t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
3085 && (!(data.flags & SF_FL_BEFORE_MEOL)
e2509266 3086 || (RExC_flags & PMf_MULTILINE)));
33b8afdf 3087 fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
a0ed51b3
LW
3088 }
3089 else {
aca2d497 3090 remove_float:
33b8afdf 3091 r->float_substr = r->float_utf8 = Nullsv;
c277df42 3092 SvREFCNT_dec(data.longest_float);
c5254dd6 3093 longest_float_length = 0;
a0d0e21e 3094 }
c277df42 3095
a0ed51b3 3096 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
c5254dd6 3097 if (longest_fixed_length
c277df42
IZ
3098 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
3099 && (!(data.flags & SF_FIX_BEFORE_MEOL)
e2509266 3100 || (RExC_flags & PMf_MULTILINE)))) {
cf93c79d
IZ
3101 int t;
3102
33b8afdf
JH
3103 if (SvUTF8(data.longest_fixed)) {
3104 r->anchored_utf8 = data.longest_fixed;
3105 r->anchored_substr = Nullsv;
3106 } else {
3107 r->anchored_substr = data.longest_fixed;
3108 r->anchored_utf8 = Nullsv;
3109 }
c277df42 3110 r->anchored_offset = data.offset_fixed;
cf93c79d
IZ
3111 t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
3112 && (!(data.flags & SF_FIX_BEFORE_MEOL)
e2509266 3113 || (RExC_flags & PMf_MULTILINE)));
33b8afdf 3114 fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
a0ed51b3
LW
3115 }
3116 else {
33b8afdf 3117 r->anchored_substr = r->anchored_utf8 = Nullsv;
c277df42 3118 SvREFCNT_dec(data.longest_fixed);
c5254dd6 3119 longest_fixed_length = 0;
a0d0e21e 3120 }
b81d288d 3121 if (r->regstclass
ffc61ed2 3122 && (OP(r->regstclass) == REG_ANY || OP(r->regstclass) == SANY))
653099ff 3123 r->regstclass = NULL;
33b8afdf
JH
3124 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
3125 && stclass_flag
653099ff 3126 && !(data.start_class->flags & ANYOF_EOS)
eb160463
GS
3127 && !cl_is_anything(data.start_class))
3128 {
830247a4 3129 I32 n = add_data(pRExC_state, 1, "f");
653099ff 3130
b81d288d 3131 New(1006, RExC_rx->data->data[n], 1,
653099ff
GS
3132 struct regnode_charclass_class);
3133 StructCopy(data.start_class,
830247a4 3134 (struct regnode_charclass_class*)RExC_rx->data->data[n],
653099ff 3135 struct regnode_charclass_class);
830247a4 3136 r->regstclass = (regnode*)RExC_rx->data->data[n];
653099ff 3137 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
ffc61ed2 3138 PL_regdata = r->data; /* for regprop() */
a3621e74 3139 DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
9c5ffd7c
JH
3140 regprop(sv, (regnode*)data.start_class);
3141 PerlIO_printf(Perl_debug_log,
3142 "synthetic stclass `%s'.\n",
3143 SvPVX(sv));});
653099ff 3144 }
c277df42
IZ
3145
3146 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
c5254dd6 3147 if (longest_fixed_length > longest_float_length) {
c277df42 3148 r->check_substr = r->anchored_substr;
33b8afdf 3149 r->check_utf8 = r->anchored_utf8;
c277df42
IZ
3150 r->check_offset_min = r->check_offset_max = r->anchored_offset;
3151 if (r->reganch & ROPT_ANCH_SINGLE)
3152 r->reganch |= ROPT_NOSCAN;
a0ed51b3
LW
3153 }
3154 else {
c277df42 3155 r->check_substr = r->float_substr;
33b8afdf 3156 r->check_utf8 = r->float_utf8;
c277df42
IZ
3157 r->check_offset_min = data.offset_float_min;
3158 r->check_offset_max = data.offset_float_max;
a0d0e21e 3159 }
30382c73
IZ
3160 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
3161 This should be changed ASAP! */
33b8afdf 3162 if ((r->check_substr || r->check_utf8) && !(r->reganch & ROPT_ANCH_GPOS)) {
cad2e5aa 3163 r->reganch |= RE_USE_INTUIT;
33b8afdf 3164 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
cad2e5aa
JH
3165 r->reganch |= RE_INTUIT_TAIL;
3166 }
a0ed51b3
LW
3167 }
3168 else {
c277df42
IZ
3169 /* Several toplevels. Best we can is to set minlen. */
3170 I32 fake;
653099ff 3171 struct regnode_charclass_class ch_class;
cb434fcc 3172 I32 last_close = 0;
c277df42 3173
a3621e74 3174 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "\n"));
c277df42 3175 scan = r->program + 1;
830247a4 3176 cl_init(pRExC_state, &ch_class);
653099ff 3177 data.start_class = &ch_class;
cb434fcc 3178 data.last_closep = &last_close;
a3621e74 3179 minlen = study_chunk(pRExC_state, &scan, &fake, scan + RExC_size, &data, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
33b8afdf
JH
3180 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
3181 = r->float_substr = r->float_utf8 = Nullsv;
653099ff 3182 if (!(data.start_class->flags & ANYOF_EOS)
eb160463
GS
3183 && !cl_is_anything(data.start_class))
3184 {
830247a4 3185 I32 n = add_data(pRExC_state, 1, "f");
653099ff 3186
b81d288d 3187 New(1006, RExC_rx->data->data[n], 1,
653099ff
GS
3188 struct regnode_charclass_class);
3189 StructCopy(data.start_class,
830247a4 3190 (struct regnode_charclass_class*)RExC_rx->data->data[n],
653099ff 3191 struct regnode_charclass_class);
830247a4 3192 r->regstclass = (regnode*)RExC_rx->data->data[n];
653099ff 3193 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
a3621e74 3194 DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
9c5ffd7c
JH
3195 regprop(sv, (regnode*)data.start_class);
3196 PerlIO_printf(Perl_debug_log,
3197 "synthetic stclass `%s'.\n",
3198 SvPVX(sv));});
653099ff 3199 }
a0d0e21e
LW
3200 }
3201
a0d0e21e 3202 r->minlen = minlen;
b81d288d 3203 if (RExC_seen & REG_SEEN_GPOS)
c277df42 3204 r->reganch |= ROPT_GPOS_SEEN;
830247a4 3205 if (RExC_seen & REG_SEEN_LOOKBEHIND)
c277df42 3206 r->reganch |= ROPT_LOOKBEHIND_SEEN;
830247a4 3207 if (RExC_seen & REG_SEEN_EVAL)
ce862d02 3208 r->reganch |= ROPT_EVAL_SEEN;
f33976b4
DB
3209 if (RExC_seen & REG_SEEN_CANY)
3210 r->reganch |= ROPT_CANY_SEEN;
830247a4
IZ
3211 Newz(1002, r->startp, RExC_npar, I32);
3212 Newz(1002, r->endp, RExC_npar, I32);
ffc61ed2 3213 PL_regdata = r->data; /* for regprop() */
a3621e74 3214 DEBUG_COMPILE_r(regdump(r));
a0d0e21e 3215 return(r);
a687059c
LW
3216}
3217
3218/*
3219 - reg - regular expression, i.e. main body or parenthesized thing
3220 *
3221 * Caller must absorb opening parenthesis.
3222 *
3223 * Combining parenthesis handling with the base level of regular expression
3224 * is a trifle forced, but the need to tie the tails of the branches to what
3225 * follows makes it hard to avoid.
3226 */
76e3520e 3227STATIC regnode *
830247a4 3228S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp)
c277df42 3229 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
a687059c 3230{
27da23d5 3231 dVAR;
c277df42
IZ
3232 register regnode *ret; /* Will be the head of the group. */
3233 register regnode *br;
3234 register regnode *lastbr;
3235 register regnode *ender = 0;
a0d0e21e 3236 register I32 parno = 0;
e2509266 3237 I32 flags, oregflags = RExC_flags, have_branch = 0, open = 0;
9d1d55b5
JP
3238
3239 /* for (?g), (?gc), and (?o) warnings; warning
3240 about (?c) will warn about (?g) -- japhy */
3241
3242 I32 wastedflags = 0x00,
3243 wasted_o = 0x01,
3244 wasted_g = 0x02,
3245 wasted_gc = 0x02 | 0x04,
3246 wasted_c = 0x04;
3247
fac92740 3248 char * parse_start = RExC_parse; /* MJD */
830247a4 3249 char *oregcomp_parse = RExC_parse;
c277df42 3250 char c;
a0d0e21e 3251
821b33a5 3252 *flagp = 0; /* Tentatively. */
a0d0e21e 3253
9d1d55b5 3254
a0d0e21e
LW
3255 /* Make an OPEN node, if parenthesized. */
3256 if (paren) {
fac92740 3257 if (*RExC_parse == '?') { /* (?...) */
2b36a5a0
JH
3258 U32 posflags = 0, negflags = 0;
3259 U32 *flagsp = &posflags;
0f5d15d6 3260 int logical = 0;
830247a4 3261 char *seqstart = RExC_parse;
ca9dfc88 3262
830247a4
IZ
3263 RExC_parse++;
3264 paren = *RExC_parse++;
c277df42 3265 ret = NULL; /* For look-ahead/behind. */
a0d0e21e 3266 switch (paren) {
fac92740 3267 case '<': /* (?<...) */
830247a4 3268 RExC_seen |= REG_SEEN_LOOKBEHIND;
b81d288d 3269 if (*RExC_parse == '!')
c277df42 3270 paren = ',';
b81d288d 3271 if (*RExC_parse != '=' && *RExC_parse != '!')
c277df42 3272 goto unknown;
830247a4 3273 RExC_parse++;
fac92740
MJD
3274 case '=': /* (?=...) */
3275 case '!': /* (?!...) */
830247a4 3276 RExC_seen_zerolen++;
fac92740
MJD
3277 case ':': /* (?:...) */
3278 case '>': /* (?>...) */
a0d0e21e 3279 break;
fac92740
MJD
3280 case '$': /* (?$...) */
3281 case '@': /* (?@...) */
8615cb43 3282 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
a0d0e21e 3283 break;
fac92740 3284 case '#': /* (?#...) */
830247a4
IZ
3285 while (*RExC_parse && *RExC_parse != ')')
3286 RExC_parse++;
3287 if (*RExC_parse != ')')
c277df42 3288 FAIL("Sequence (?#... not terminated");
830247a4 3289 nextchar(pRExC_state);
a0d0e21e
LW
3290 *flagp = TRYAGAIN;
3291 return NULL;
fac92740 3292 case 'p': /* (?p...) */
9014280d 3293 if (SIZE_ONLY && ckWARN2(WARN_DEPRECATED, WARN_REGEXP))
f9373011 3294 vWARNdep(RExC_parse, "(?p{}) is deprecated - use (??{})");
8c8ad484 3295 /* FALL THROUGH*/
fac92740 3296 case '?': /* (??...) */
0f5d15d6 3297 logical = 1;
438a3801
YST
3298 if (*RExC_parse != '{')
3299 goto unknown;
830247a4 3300 paren = *RExC_parse++;
0f5d15d6 3301 /* FALL THROUGH */
fac92740 3302 case '{': /* (?{...}) */
c277df42 3303 {
c277df42
IZ
3304 I32 count = 1, n = 0;
3305 char c;
830247a4 3306 char *s = RExC_parse;
c277df42
IZ
3307 SV *sv;
3308 OP_4tree *sop, *rop;
3309
830247a4
IZ
3310 RExC_seen_zerolen++;
3311 RExC_seen |= REG_SEEN_EVAL;
3312 while (count && (c = *RExC_parse)) {
3313 if (c == '\\' && RExC_parse[1])
3314 RExC_parse++;
b81d288d 3315 else if (c == '{')
c277df42 3316 count++;
b81d288d 3317 else if (c == '}')
c277df42 3318 count--;
830247a4 3319 RExC_parse++;
c277df42 3320 }
830247a4 3321 if (*RExC_parse != ')')
b45f050a 3322 {
b81d288d 3323 RExC_parse = s;
b45f050a
JF
3324 vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
3325 }
c277df42 3326 if (!SIZE_ONLY) {
f3548bdc 3327 PAD *pad;
b81d288d
AB
3328
3329 if (RExC_parse - 1 - s)
830247a4 3330 sv = newSVpvn(s, RExC_parse - 1 - s);
c277df42 3331 else
79cb57f6 3332 sv = newSVpvn("", 0);
c277df42 3333
569233ed
SB
3334 ENTER;
3335 Perl_save_re_context(aTHX);
f3548bdc 3336 rop = sv_compile_2op(sv, &sop, "re", &pad);
9b978d73
DM
3337 sop->op_private |= OPpREFCOUNTED;
3338 /* re_dup will OpREFCNT_inc */
3339 OpREFCNT_set(sop, 1);
569233ed 3340 LEAVE;
c277df42 3341
830247a4
IZ
3342 n = add_data(pRExC_state, 3, "nop");
3343 RExC_rx->data->data[n] = (void*)rop;
3344 RExC_rx->data->data[n+1] = (void*)sop;
f3548bdc 3345 RExC_rx->data->data[n+2] = (void*)pad;
c277df42 3346 SvREFCNT_dec(sv);
a0ed51b3 3347 }
e24b16f9 3348 else { /* First pass */
830247a4 3349 if (PL_reginterp_cnt < ++RExC_seen_evals
923e4eb5 3350 && IN_PERL_RUNTIME)
2cd61cdb
IZ
3351 /* No compiled RE interpolated, has runtime
3352 components ===> unsafe. */
3353 FAIL("Eval-group not allowed at runtime, use re 'eval'");
5b61d3f7 3354 if (PL_tainting && PL_tainted)
cc6b7395 3355 FAIL("Eval-group in insecure regular expression");
923e4eb5 3356 if (IN_PERL_COMPILETIME)
b5c19bd7 3357 PL_cv_has_eval = 1;
c277df42 3358 }
b5c19bd7 3359
830247a4 3360 nextchar(pRExC_state);
0f5d15d6 3361 if (logical) {
830247a4 3362 ret = reg_node(pRExC_state, LOGICAL);
0f5d15d6
IZ
3363 if (!SIZE_ONLY)
3364 ret->flags = 2;
830247a4 3365 regtail(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
fac92740 3366 /* deal with the length of this later - MJD */
0f5d15d6
IZ
3367 return ret;
3368 }
ccb2c380
MP
3369 ret = reganode(pRExC_state, EVAL, n);
3370 Set_Node_Length(ret, RExC_parse - parse_start + 1);
3371 Set_Node_Offset(ret, parse_start);
3372 return ret;
c277df42 3373 }
fac92740 3374 case '(': /* (?(?{...})...) and (?(?=...)...) */
c277df42 3375 {
fac92740 3376 if (RExC_parse[0] == '?') { /* (?(?...)) */
b81d288d
AB
3377 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
3378 || RExC_parse[1] == '<'
830247a4 3379 || RExC_parse[1] == '{') { /* Lookahead or eval. */
c277df42
IZ
3380 I32 flag;
3381
830247a4 3382 ret = reg_node(pRExC_state, LOGICAL);
0f5d15d6
IZ
3383 if (!SIZE_ONLY)
3384 ret->flags = 1;
830247a4 3385 regtail(pRExC_state, ret, reg(pRExC_state, 1, &flag));
c277df42 3386 goto insert_if;
b81d288d 3387 }
a0ed51b3 3388 }
830247a4 3389 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
fac92740 3390 /* (?(1)...) */
830247a4 3391 parno = atoi(RExC_parse++);
c277df42 3392
830247a4
IZ
3393 while (isDIGIT(*RExC_parse))
3394 RExC_parse++;
fac92740
MJD
3395 ret = reganode(pRExC_state, GROUPP, parno);
3396
830247a4 3397 if ((c = *nextchar(pRExC_state)) != ')')
b45f050a 3398 vFAIL("Switch condition not recognized");
c277df42 3399 insert_if:
830247a4
IZ
3400 regtail(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
3401 br = regbranch(pRExC_state, &flags, 1);
c277df42 3402 if (br == NULL)
830247a4 3403 br = reganode(pRExC_state, LONGJMP, 0);
c277df42 3404 else
830247a4
IZ
3405 regtail(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
3406 c = *nextchar(pRExC_state);
d1b80229
IZ
3407 if (flags&HASWIDTH)
3408 *flagp |= HASWIDTH;
c277df42 3409 if (c == '|') {
830247a4
IZ
3410 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
3411 regbranch(pRExC_state, &flags, 1);
3412 regtail(pRExC_state, ret, lastbr);
d1b80229
IZ
3413 if (flags&HASWIDTH)
3414 *flagp |= HASWIDTH;
830247a4 3415 c = *nextchar(pRExC_state);
a0ed51b3
LW
3416 }
3417 else
c277df42
IZ
3418 lastbr = NULL;
3419 if (c != ')')
8615cb43 3420 vFAIL("Switch (?(condition)... contains too many branches");
830247a4
IZ
3421 ender = reg_node(pRExC_state, TAIL);
3422 regtail(pRExC_state, br, ender);
c277df42 3423 if (lastbr) {
830247a4
IZ
3424 regtail(pRExC_state, lastbr, ender);
3425 regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
a0ed51b3
LW
3426 }
3427 else
830247a4 3428 regtail(pRExC_state, ret, ender);
c277df42 3429 return ret;
a0ed51b3
LW
3430 }
3431 else {
830247a4 3432 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
c277df42
IZ
3433 }
3434 }
1b1626e4 3435 case 0:
830247a4 3436 RExC_parse--; /* for vFAIL to print correctly */
8615cb43 3437 vFAIL("Sequence (? incomplete");
1b1626e4 3438 break;
a0d0e21e 3439 default:
830247a4 3440 --RExC_parse;
fac92740 3441 parse_flags: /* (?i) */
830247a4 3442 while (*RExC_parse && strchr("iogcmsx", *RExC_parse)) {
9d1d55b5
JP
3443 /* (?g), (?gc) and (?o) are useless here
3444 and must be globally applied -- japhy */
3445
3446 if (*RExC_parse == 'o' || *RExC_parse == 'g') {
3447 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
3448 I32 wflagbit = *RExC_parse == 'o' ? wasted_o : wasted_g;
3449 if (! (wastedflags & wflagbit) ) {
3450 wastedflags |= wflagbit;
3451 vWARN5(
3452 RExC_parse + 1,
3453 "Useless (%s%c) - %suse /%c modifier",
3454 flagsp == &negflags ? "?-" : "?",
3455 *RExC_parse,
3456 flagsp == &negflags ? "don't " : "",
3457 *RExC_parse
3458 );
3459 }
3460 }
3461 }
3462 else if (*RExC_parse == 'c') {
3463 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
3464 if (! (wastedflags & wasted_c) ) {
3465 wastedflags |= wasted_gc;
3466 vWARN3(
3467 RExC_parse + 1,
3468 "Useless (%sc) - %suse /gc modifier",
3469 flagsp == &negflags ? "?-" : "?",
3470 flagsp == &negflags ? "don't " : ""
3471 );
3472 }
3473 }
3474 }
3475 else { pmflag(flagsp, *RExC_parse); }
3476
830247a4 3477 ++RExC_parse;
ca9dfc88 3478 }
830247a4 3479 if (*RExC_parse == '-') {
ca9dfc88 3480 flagsp = &negflags;
9d1d55b5 3481 wastedflags = 0; /* reset so (?g-c) warns twice */
830247a4 3482 ++RExC_parse;
ca9dfc88 3483 goto parse_flags;
48c036b1 3484 }
e2509266
JH
3485 RExC_flags |= posflags;
3486 RExC_flags &= ~negflags;
830247a4
IZ
3487 if (*RExC_parse == ':') {
3488 RExC_parse++;
ca9dfc88
IZ
3489 paren = ':';
3490 break;
3491 }
c277df42 3492 unknown:
830247a4
IZ
3493 if (*RExC_parse != ')') {
3494 RExC_parse++;
3495 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
b45f050a 3496 }
830247a4 3497 nextchar(pRExC_state);
a0d0e21e
LW
3498 *flagp = TRYAGAIN;
3499 return NULL;
3500 }
3501 }
fac92740 3502 else { /* (...) */
830247a4
IZ
3503 parno = RExC_npar;
3504 RExC_npar++;
3505 ret = reganode(pRExC_state, OPEN, parno);
fac92740
MJD
3506 Set_Node_Length(ret, 1); /* MJD */
3507 Set_Node_Offset(ret, RExC_parse); /* MJD */
c277df42 3508 open = 1;
a0d0e21e 3509 }
a0ed51b3 3510 }
fac92740 3511 else /* ! paren */
a0d0e21e
LW
3512 ret = NULL;
3513
3514 /* Pick up the branches, linking them together. */
fac92740 3515 parse_start = RExC_parse; /* MJD */
830247a4 3516 br = regbranch(pRExC_state, &flags, 1);
fac92740
MJD
3517 /* branch_len = (paren != 0); */
3518
a0d0e21e
LW
3519 if (br == NULL)
3520 return(NULL);
830247a4
IZ
3521 if (*RExC_parse == '|') {
3522 if (!SIZE_ONLY && RExC_extralen) {
3523 reginsert(pRExC_state, BRANCHJ, br);
a0ed51b3 3524 }
fac92740 3525 else { /* MJD */
830247a4 3526 reginsert(pRExC_state, BRANCH, br);
fac92740
MJD
3527 Set_Node_Length(br, paren != 0);
3528 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
3529 }
c277df42
IZ
3530 have_branch = 1;
3531 if (SIZE_ONLY)
830247a4 3532 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
a0ed51b3
LW
3533 }
3534 else if (paren == ':') {
c277df42
IZ
3535 *flagp |= flags&SIMPLE;
3536 }
3537 if (open) { /* Starts with OPEN. */
830247a4 3538 regtail(pRExC_state, ret, br); /* OPEN -> first. */
a0ed51b3
LW
3539 }
3540 else if (paren != '?') /* Not Conditional */
a0d0e21e 3541 ret = br;
32a0ca98 3542 *flagp |= flags & (SPSTART | HASWIDTH);
c277df42 3543 lastbr = br;
830247a4
IZ
3544 while (*RExC_parse == '|') {
3545 if (!SIZE_ONLY && RExC_extralen) {
3546 ender = reganode(pRExC_state, LONGJMP,0);
3547 regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
c277df42
IZ
3548 }
3549 if (SIZE_ONLY)
830247a4
IZ
3550 RExC_extralen += 2; /* Account for LONGJMP. */
3551 nextchar(pRExC_state);
3552 br = regbranch(pRExC_state, &flags, 0);
fac92740 3553
a687059c 3554 if (br == NULL)
a0d0e21e 3555 return(NULL);
830247a4 3556 regtail(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
c277df42 3557 lastbr = br;
821b33a5
IZ
3558 if (flags&HASWIDTH)
3559 *flagp |= HASWIDTH;
a687059c 3560 *flagp |= flags&SPSTART;
a0d0e21e
LW
3561 }
3562
c277df42
IZ
3563 if (have_branch || paren != ':') {
3564 /* Make a closing node, and hook it on the end. */
3565 switch (paren) {
3566 case ':':
830247a4 3567 ender = reg_node(pRExC_state, TAIL);
c277df42
IZ
3568 break;
3569 case 1:
830247a4 3570 ender = reganode(pRExC_state, CLOSE, parno);
fac92740
MJD
3571 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
3572 Set_Node_Length(ender,1); /* MJD */
c277df42
IZ
3573 break;
3574 case '<':
c277df42
IZ
3575 case ',':
3576 case '=':
3577 case '!':
c277df42 3578 *flagp &= ~HASWIDTH;
821b33a5
IZ
3579 /* FALL THROUGH */
3580 case '>':
830247a4 3581 ender = reg_node(pRExC_state, SUCCEED);
c277df42
IZ
3582 break;
3583 case 0:
830247a4 3584 ender = reg_node(pRExC_state, END);
c277df42
IZ
3585 break;
3586 }
830247a4 3587 regtail(pRExC_state, lastbr, ender);
a0d0e21e 3588
c277df42
IZ
3589 if (have_branch) {
3590 /* Hook the tails of the branches to the closing node. */
3591 for (br = ret; br != NULL; br = regnext(br)) {
830247a4 3592 regoptail(pRExC_state, br, ender);
c277df42
IZ
3593 }
3594 }
a0d0e21e 3595 }
c277df42
IZ
3596
3597 {
e1ec3a88
AL
3598 const char *p;
3599 static const char parens[] = "=!<,>";
c277df42
IZ
3600
3601 if (paren && (p = strchr(parens, paren))) {
eb160463 3602 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
c277df42
IZ
3603 int flag = (p - parens) > 1;
3604
3605 if (paren == '>')
3606 node = SUSPEND, flag = 0;
830247a4 3607 reginsert(pRExC_state, node,ret);
45948336
EP
3608 Set_Node_Cur_Length(ret);
3609 Set_Node_Offset(ret, parse_start + 1);
c277df42 3610 ret->flags = flag;
830247a4 3611 regtail(pRExC_state, ret, reg_node(pRExC_state, TAIL));
c277df42 3612 }
a0d0e21e
LW
3613 }
3614
3615 /* Check for proper termination. */
ce3e6498 3616 if (paren) {
e2509266 3617 RExC_flags = oregflags;
830247a4
IZ
3618 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
3619 RExC_parse = oregcomp_parse;
380a0633 3620 vFAIL("Unmatched (");
ce3e6498 3621 }
a0ed51b3 3622 }
830247a4
IZ
3623 else if (!paren && RExC_parse < RExC_end) {
3624 if (*RExC_parse == ')') {
3625 RExC_parse++;
380a0633 3626 vFAIL("Unmatched )");
a0ed51b3
LW
3627 }
3628 else
b45f050a 3629 FAIL("Junk on end of regexp"); /* "Can't happen". */
a0d0e21e
LW
3630 /* NOTREACHED */
3631 }
a687059c 3632
a0d0e21e 3633 return(ret);
a687059c
LW
3634}
3635
3636/*
3637 - regbranch - one alternative of an | operator
3638 *
3639 * Implements the concatenation operator.
3640 */
76e3520e 3641STATIC regnode *
830247a4 3642S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first)
a687059c 3643{
c277df42
IZ
3644 register regnode *ret;
3645 register regnode *chain = NULL;
3646 register regnode *latest;
3647 I32 flags = 0, c = 0;
a0d0e21e 3648
b81d288d 3649 if (first)
c277df42
IZ
3650 ret = NULL;
3651 else {
b81d288d 3652 if (!SIZE_ONLY && RExC_extralen)
830247a4 3653 ret = reganode(pRExC_state, BRANCHJ,0);
fac92740 3654 else {
830247a4 3655 ret = reg_node(pRExC_state, BRANCH);
fac92740
MJD
3656 Set_Node_Length(ret, 1);
3657 }
c277df42
IZ
3658 }
3659
b81d288d 3660 if (!first && SIZE_ONLY)
830247a4 3661 RExC_extralen += 1; /* BRANCHJ */
b81d288d 3662
c277df42 3663 *flagp = WORST; /* Tentatively. */
a0d0e21e 3664
830247a4
IZ
3665 RExC_parse--;
3666 nextchar(pRExC_state);
3667 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
a0d0e21e 3668 flags &= ~TRYAGAIN;
830247a4 3669 latest = regpiece(pRExC_state, &flags);
a0d0e21e
LW
3670 if (latest == NULL) {
3671 if (flags & TRYAGAIN)
3672 continue;
3673 return(NULL);
a0ed51b3
LW
3674 }
3675 else if (ret == NULL)
c277df42 3676 ret = latest;
a0d0e21e 3677 *flagp |= flags&HASWIDTH;
c277df42 3678 if (chain == NULL) /* First piece. */
a0d0e21e
LW
3679 *flagp |= flags&SPSTART;
3680 else {
830247a4
IZ
3681 RExC_naughty++;
3682 regtail(pRExC_state, chain, latest);
a687059c 3683 }
a0d0e21e 3684 chain = latest;
c277df42
IZ
3685 c++;
3686 }
3687 if (chain == NULL) { /* Loop ran zero times. */
830247a4 3688 chain = reg_node(pRExC_state, NOTHING);
c277df42
IZ
3689 if (ret == NULL)
3690 ret = chain;
3691 }
3692 if (c == 1) {
3693 *flagp |= flags&SIMPLE;
a0d0e21e 3694 }
a687059c 3695
a0d0e21e 3696 return(ret);
a687059c
LW
3697}
3698
3699/*
3700 - regpiece - something followed by possible [*+?]
3701 *
3702 * Note that the branching code sequences used for ? and the general cases
3703 * of * and + are somewhat optimized: they use the same NOTHING node as
3704 * both the endmarker for their branch list and the body of the last branch.
3705 * It might seem that this node could be dispensed with entirely, but the
3706 * endmarker role is not redundant.
3707 */
76e3520e 3708STATIC regnode *
830247a4 3709S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
a687059c 3710{
c277df42 3711 register regnode *ret;
a0d0e21e
LW
3712 register char op;
3713 register char *next;
3714 I32 flags;
830247a4 3715 char *origparse = RExC_parse;
a0d0e21e
LW
3716 char *maxpos;
3717 I32 min;
c277df42 3718 I32 max = REG_INFTY;
fac92740 3719 char *parse_start;
a0d0e21e 3720
830247a4 3721 ret = regatom(pRExC_state, &flags);
a0d0e21e
LW
3722 if (ret == NULL) {
3723 if (flags & TRYAGAIN)
3724 *flagp |= TRYAGAIN;
3725 return(NULL);
3726 }
3727
830247a4 3728 op = *RExC_parse;
a0d0e21e 3729
830247a4 3730 if (op == '{' && regcurly(RExC_parse)) {
fac92740 3731 parse_start = RExC_parse; /* MJD */
830247a4 3732 next = RExC_parse + 1;
a0d0e21e
LW
3733 maxpos = Nullch;
3734 while (isDIGIT(*next) || *next == ',') {
3735 if (*next == ',') {
3736 if (maxpos)
3737 break;
3738 else
3739 maxpos = next;
a687059c 3740 }
a0d0e21e
LW
3741 next++;
3742 }
3743 if (*next == '}') { /* got one */
3744 if (!maxpos)
3745 maxpos = next;
830247a4
IZ
3746 RExC_parse++;
3747 min = atoi(RExC_parse);
a0d0e21e
LW
3748 if (*maxpos == ',')
3749 maxpos++;
3750 else
830247a4 3751 maxpos = RExC_parse;
a0d0e21e
LW
3752 max = atoi(maxpos);
3753 if (!max && *maxpos != '0')
c277df42
IZ
3754 max = REG_INFTY; /* meaning "infinity" */
3755 else if (max >= REG_INFTY)
8615cb43 3756 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
830247a4
IZ
3757 RExC_parse = next;
3758 nextchar(pRExC_state);
a0d0e21e
LW
3759
3760 do_curly:
3761 if ((flags&SIMPLE)) {
830247a4
IZ
3762 RExC_naughty += 2 + RExC_naughty / 2;
3763 reginsert(pRExC_state, CURLY, ret);
fac92740
MJD
3764 Set_Node_Offset(ret, parse_start+1); /* MJD */
3765 Set_Node_Cur_Length(ret);
a0d0e21e
LW
3766 }
3767 else {
830247a4 3768 regnode *w = reg_node(pRExC_state, WHILEM);
2c2d71f5
JH
3769
3770 w->flags = 0;
830247a4
IZ
3771 regtail(pRExC_state, ret, w);
3772 if (!SIZE_ONLY && RExC_extralen) {
3773 reginsert(pRExC_state, LONGJMP,ret);
3774 reginsert(pRExC_state, NOTHING,ret);
c277df42
IZ
3775 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
3776 }
830247a4 3777 reginsert(pRExC_state, CURLYX,ret);
fac92740
MJD
3778 /* MJD hk */
3779 Set_Node_Offset(ret, parse_start+1);
3780 Set_Node_Length(ret,
3781 op == '{' ? (RExC_parse - parse_start) : 1);
3782
830247a4 3783 if (!SIZE_ONLY && RExC_extralen)
c277df42 3784 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
830247a4 3785 regtail(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
c277df42 3786 if (SIZE_ONLY)
830247a4
IZ
3787 RExC_whilem_seen++, RExC_extralen += 3;
3788 RExC_naughty += 4 + RExC_naughty; /* compound interest */
a0d0e21e 3789 }
c277df42 3790 ret->flags = 0;
a0d0e21e
LW
3791
3792 if (min > 0)
821b33a5
IZ
3793 *flagp = WORST;
3794 if (max > 0)
3795 *flagp |= HASWIDTH;
a0d0e21e 3796 if (max && max < min)
8615cb43 3797 vFAIL("Can't do {n,m} with n > m");
c277df42 3798 if (!SIZE_ONLY) {
eb160463
GS
3799 ARG1_SET(ret, (U16)min);
3800 ARG2_SET(ret, (U16)max);
a687059c 3801 }
a687059c 3802
a0d0e21e 3803 goto nest_check;
a687059c 3804 }
a0d0e21e 3805 }
a687059c 3806
a0d0e21e
LW
3807 if (!ISMULT1(op)) {
3808 *flagp = flags;
a687059c 3809 return(ret);
a0d0e21e 3810 }
bb20fd44 3811
c277df42 3812#if 0 /* Now runtime fix should be reliable. */
b45f050a
JF
3813
3814 /* if this is reinstated, don't forget to put this back into perldiag:
3815
3816 =item Regexp *+ operand could be empty at {#} in regex m/%s/
3817
3818 (F) The part of the regexp subject to either the * or + quantifier
3819 could match an empty string. The {#} shows in the regular
3820 expression about where the problem was discovered.
3821
3822 */
3823
bb20fd44 3824 if (!(flags&HASWIDTH) && op != '?')
b45f050a 3825 vFAIL("Regexp *+ operand could be empty");
b81d288d 3826#endif
bb20fd44 3827
fac92740 3828 parse_start = RExC_parse;
830247a4 3829 nextchar(pRExC_state);
a0d0e21e 3830
821b33a5 3831 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
a0d0e21e
LW
3832
3833 if (op == '*' && (flags&SIMPLE)) {
830247a4 3834 reginsert(pRExC_state, STAR, ret);
c277df42 3835 ret->flags = 0;
830247a4 3836 RExC_naughty += 4;
a0d0e21e
LW
3837 }
3838 else if (op == '*') {
3839 min = 0;
3840 goto do_curly;
a0ed51b3
LW
3841 }
3842 else if (op == '+' && (flags&SIMPLE)) {
830247a4 3843 reginsert(pRExC_state, PLUS, ret);
c277df42 3844 ret->flags = 0;
830247a4 3845 RExC_naughty += 3;
a0d0e21e
LW
3846 }
3847 else if (op == '+') {
3848 min = 1;
3849 goto do_curly;
a0ed51b3
LW
3850 }
3851 else if (op == '?') {
a0d0e21e
LW
3852 min = 0; max = 1;
3853 goto do_curly;
3854 }
3855 nest_check:
e476b1b5 3856 if (ckWARN(WARN_REGEXP) && !SIZE_ONLY && !(flags&HASWIDTH) && max > REG_INFTY/3) {
830247a4 3857 vWARN3(RExC_parse,
b45f050a 3858 "%.*s matches null string many times",
830247a4 3859 RExC_parse - origparse,
b45f050a 3860 origparse);
a0d0e21e
LW
3861 }
3862
830247a4
IZ
3863 if (*RExC_parse == '?') {
3864 nextchar(pRExC_state);
3865 reginsert(pRExC_state, MINMOD, ret);
3866 regtail(pRExC_state, ret, ret + NODE_STEP_REGNODE);
a0d0e21e 3867 }
830247a4
IZ
3868 if (ISMULT2(RExC_parse)) {
3869 RExC_parse++;
b45f050a
JF
3870 vFAIL("Nested quantifiers");
3871 }
a0d0e21e
LW
3872
3873 return(ret);
a687059c
LW
3874}
3875
3876/*
3877 - regatom - the lowest level
3878 *
3879 * Optimization: gobbles an entire sequence of ordinary characters so that
3880 * it can turn them into a single node, which is smaller to store and
3881 * faster to run. Backslashed characters are exceptions, each becoming a
3882 * separate node; the code is simpler that way and it's not worth fixing.
3883 *
b45f050a 3884 * [Yes, it is worth fixing, some scripts can run twice the speed.] */
76e3520e 3885STATIC regnode *
830247a4 3886S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
a687059c 3887{
c277df42 3888 register regnode *ret = 0;
a0d0e21e 3889 I32 flags;
45948336 3890 char *parse_start = RExC_parse;
a0d0e21e
LW
3891
3892 *flagp = WORST; /* Tentatively. */
3893
3894tryagain:
830247a4 3895 switch (*RExC_parse) {
a0d0e21e 3896 case '^':
830247a4
IZ
3897 RExC_seen_zerolen++;
3898 nextchar(pRExC_state);
e2509266 3899 if (RExC_flags & PMf_MULTILINE)
830247a4 3900 ret = reg_node(pRExC_state, MBOL);
e2509266 3901 else if (RExC_flags & PMf_SINGLELINE)
830247a4 3902 ret = reg_node(pRExC_state, SBOL);
a0d0e21e 3903 else
830247a4 3904 ret = reg_node(pRExC_state, BOL);
fac92740 3905 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
3906 break;
3907 case '$':
830247a4 3908 nextchar(pRExC_state);
b81d288d 3909 if (*RExC_parse)
830247a4 3910 RExC_seen_zerolen++;
e2509266 3911 if (RExC_flags & PMf_MULTILINE)
830247a4 3912 ret = reg_node(pRExC_state, MEOL);
e2509266 3913 else if (RExC_flags & PMf_SINGLELINE)
830247a4 3914 ret = reg_node(pRExC_state, SEOL);
a0d0e21e 3915 else
830247a4 3916 ret = reg_node(pRExC_state, EOL);
fac92740 3917 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
3918 break;
3919 case '.':
830247a4 3920 nextchar(pRExC_state);
e2509266 3921 if (RExC_flags & PMf_SINGLELINE)
ffc61ed2
JH
3922 ret = reg_node(pRExC_state, SANY);
3923 else
3924 ret = reg_node(pRExC_state, REG_ANY);
3925 *flagp |= HASWIDTH|SIMPLE;
830247a4 3926 RExC_naughty++;
fac92740 3927 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
3928 break;
3929 case '[':
b45f050a 3930 {
830247a4 3931 char *oregcomp_parse = ++RExC_parse;
ffc61ed2 3932 ret = regclass(pRExC_state);
830247a4
IZ
3933 if (*RExC_parse != ']') {
3934 RExC_parse = oregcomp_parse;
b45f050a
JF
3935 vFAIL("Unmatched [");
3936 }
830247a4 3937 nextchar(pRExC_state);
a0d0e21e 3938 *flagp |= HASWIDTH|SIMPLE;
fac92740 3939 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
a0d0e21e 3940 break;
b45f050a 3941 }
a0d0e21e 3942 case '(':
830247a4
IZ
3943 nextchar(pRExC_state);
3944 ret = reg(pRExC_state, 1, &flags);
a0d0e21e 3945 if (ret == NULL) {
bf93d4cc 3946 if (flags & TRYAGAIN) {
830247a4 3947 if (RExC_parse == RExC_end) {
bf93d4cc
GS
3948 /* Make parent create an empty node if needed. */
3949 *flagp |= TRYAGAIN;
3950 return(NULL);
3951 }
a0d0e21e 3952 goto tryagain;
bf93d4cc 3953 }
a0d0e21e
LW
3954 return(NULL);
3955 }
c277df42 3956 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE);
a0d0e21e
LW
3957 break;
3958 case '|':
3959 case ')':
3960 if (flags & TRYAGAIN) {
3961 *flagp |= TRYAGAIN;
3962 return NULL;
3963 }
b45f050a 3964 vFAIL("Internal urp");
a0d0e21e
LW
3965 /* Supposed to be caught earlier. */
3966 break;
85afd4ae 3967 case '{':
830247a4
IZ
3968 if (!regcurly(RExC_parse)) {
3969 RExC_parse++;
85afd4ae
CS
3970 goto defchar;
3971 }
3972 /* FALL THROUGH */
a0d0e21e
LW
3973 case '?':
3974 case '+':
3975 case '*':
830247a4 3976 RExC_parse++;
b45f050a 3977 vFAIL("Quantifier follows nothing");
a0d0e21e
LW
3978 break;
3979 case '\\':
830247a4 3980 switch (*++RExC_parse) {
a0d0e21e 3981 case 'A':
830247a4
IZ
3982 RExC_seen_zerolen++;
3983 ret = reg_node(pRExC_state, SBOL);
a0d0e21e 3984 *flagp |= SIMPLE;
830247a4 3985 nextchar(pRExC_state);
fac92740 3986 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
3987 break;
3988 case 'G':
830247a4
IZ
3989 ret = reg_node(pRExC_state, GPOS);
3990 RExC_seen |= REG_SEEN_GPOS;
a0d0e21e 3991 *flagp |= SIMPLE;
830247a4 3992 nextchar(pRExC_state);
fac92740 3993 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
3994 break;
3995 case 'Z':
830247a4 3996 ret = reg_node(pRExC_state, SEOL);
a0d0e21e 3997 *flagp |= SIMPLE;
a1917ab9 3998 RExC_seen_zerolen++; /* Do not optimize RE away */
830247a4 3999 nextchar(pRExC_state);
a0d0e21e 4000 break;
b85d18e9 4001 case 'z':
830247a4 4002 ret = reg_node(pRExC_state, EOS);
b85d18e9 4003 *flagp |= SIMPLE;
830247a4
IZ
4004 RExC_seen_zerolen++; /* Do not optimize RE away */
4005 nextchar(pRExC_state);
fac92740 4006 Set_Node_Length(ret, 2); /* MJD */
b85d18e9 4007 break;
4a2d328f 4008 case 'C':
f33976b4
DB
4009 ret = reg_node(pRExC_state, CANY);
4010 RExC_seen |= REG_SEEN_CANY;
a0ed51b3 4011 *flagp |= HASWIDTH|SIMPLE;
830247a4 4012 nextchar(pRExC_state);
fac92740 4013 Set_Node_Length(ret, 2); /* MJD */
a0ed51b3
LW
4014 break;
4015 case 'X':
830247a4 4016 ret = reg_node(pRExC_state, CLUMP);
a0ed51b3 4017 *flagp |= HASWIDTH;
830247a4 4018 nextchar(pRExC_state);
fac92740 4019 Set_Node_Length(ret, 2); /* MJD */
a0ed51b3 4020 break;
a0d0e21e 4021 case 'w':
eb160463 4022 ret = reg_node(pRExC_state, (U8)(LOC ? ALNUML : ALNUM));
a0d0e21e 4023 *flagp |= HASWIDTH|SIMPLE;
830247a4 4024 nextchar(pRExC_state);
fac92740 4025 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
4026 break;
4027 case 'W':
eb160463 4028 ret = reg_node(pRExC_state, (U8)(LOC ? NALNUML : NALNUM));
a0d0e21e 4029 *flagp |= HASWIDTH|SIMPLE;
830247a4 4030 nextchar(pRExC_state);
fac92740 4031 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
4032 break;
4033 case 'b':
830247a4
IZ
4034 RExC_seen_zerolen++;
4035 RExC_seen |= REG_SEEN_LOOKBEHIND;
eb160463 4036 ret = reg_node(pRExC_state, (U8)(LOC ? BOUNDL : BOUND));
a0d0e21e 4037 *flagp |= SIMPLE;
830247a4 4038 nextchar(pRExC_state);
fac92740 4039 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
4040 break;
4041 case 'B':
830247a4
IZ
4042 RExC_seen_zerolen++;
4043 RExC_seen |= REG_SEEN_LOOKBEHIND;
eb160463 4044 ret = reg_node(pRExC_state, (U8)(LOC ? NBOUNDL : NBOUND));
a0d0e21e 4045 *flagp |= SIMPLE;
830247a4 4046 nextchar(pRExC_state);
fac92740 4047 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
4048 break;
4049 case 's':
eb160463 4050 ret = reg_node(pRExC_state, (U8)(LOC ? SPACEL : SPACE));
a0d0e21e 4051 *flagp |= HASWIDTH|SIMPLE;
830247a4 4052 nextchar(pRExC_state);
fac92740 4053 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
4054 break;
4055 case 'S':
eb160463 4056 ret = reg_node(pRExC_state, (U8)(LOC ? NSPACEL : NSPACE));
a0d0e21e 4057 *flagp |= HASWIDTH|SIMPLE;
830247a4 4058 nextchar(pRExC_state);
fac92740 4059 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
4060 break;
4061 case 'd':
ffc61ed2 4062 ret = reg_node(pRExC_state, DIGIT);
a0d0e21e 4063 *flagp |= HASWIDTH|SIMPLE;
830247a4 4064 nextchar(pRExC_state);
fac92740 4065 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
4066 break;
4067 case 'D':
ffc61ed2 4068 ret = reg_node(pRExC_state, NDIGIT);
a0d0e21e 4069 *flagp |= HASWIDTH|SIMPLE;
830247a4 4070 nextchar(pRExC_state);
fac92740 4071 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e 4072 break;
a14b48bc
LW
4073 case 'p':
4074 case 'P':
3568d838 4075 {
830247a4 4076 char* oldregxend = RExC_end;
ccb2c380 4077 char* parse_start = RExC_parse - 2;
a14b48bc 4078
830247a4 4079 if (RExC_parse[1] == '{') {
3568d838 4080 /* a lovely hack--pretend we saw [\pX] instead */
830247a4
IZ
4081 RExC_end = strchr(RExC_parse, '}');
4082 if (!RExC_end) {
0da60cf5 4083 U8 c = (U8)*RExC_parse;
830247a4
IZ
4084 RExC_parse += 2;
4085 RExC_end = oldregxend;
0da60cf5 4086 vFAIL2("Missing right brace on \\%c{}", c);
b45f050a 4087 }
830247a4 4088 RExC_end++;
a14b48bc 4089 }
af6f566e 4090 else {
830247a4 4091 RExC_end = RExC_parse + 2;
af6f566e
HS
4092 if (RExC_end > oldregxend)
4093 RExC_end = oldregxend;
4094 }
830247a4 4095 RExC_parse--;
a14b48bc 4096
ffc61ed2 4097 ret = regclass(pRExC_state);
a14b48bc 4098
830247a4
IZ
4099 RExC_end = oldregxend;
4100 RExC_parse--;
ccb2c380
MP
4101
4102 Set_Node_Offset(ret, parse_start + 2);
4103 Set_Node_Cur_Length(ret);
830247a4 4104 nextchar(pRExC_state);
a14b48bc
LW
4105 *flagp |= HASWIDTH|SIMPLE;
4106 }
4107 break;
a0d0e21e
LW
4108 case 'n':
4109 case 'r':
4110 case 't':
4111 case 'f':
4112 case 'e':
4113 case 'a':
4114 case 'x':
4115 case 'c':
4116 case '0':
4117 goto defchar;
4118 case '1': case '2': case '3': case '4':
4119 case '5': case '6': case '7': case '8': case '9':
4120 {
830247a4 4121 I32 num = atoi(RExC_parse);
a0d0e21e 4122
830247a4 4123 if (num > 9 && num >= RExC_npar)
a0d0e21e
LW
4124 goto defchar;
4125 else {
fac92740 4126 char * parse_start = RExC_parse - 1; /* MJD */
830247a4
IZ
4127 while (isDIGIT(*RExC_parse))
4128 RExC_parse++;
b45f050a 4129
eb160463 4130 if (!SIZE_ONLY && num > (I32)RExC_rx->nparens)
9baa0206 4131 vFAIL("Reference to nonexistent group");
830247a4 4132 RExC_sawback = 1;
eb160463
GS
4133 ret = reganode(pRExC_state,
4134 (U8)(FOLD ? (LOC ? REFFL : REFF) : REF),
4135 num);
a0d0e21e 4136 *flagp |= HASWIDTH;
fac92740
MJD
4137
4138 /* override incorrect value set in reganode MJD */
4139 Set_Node_Offset(ret, parse_start+1);
4140 Set_Node_Cur_Length(ret); /* MJD */
830247a4
IZ
4141 RExC_parse--;
4142 nextchar(pRExC_state);
a0d0e21e
LW
4143 }
4144 }
4145 break;
4146 case '\0':
830247a4 4147 if (RExC_parse >= RExC_end)
b45f050a 4148 FAIL("Trailing \\");
a0d0e21e
LW
4149 /* FALL THROUGH */
4150 default:
c9f97d15
IZ
4151 /* Do not generate `unrecognized' warnings here, we fall
4152 back into the quick-grab loop below */
45948336 4153 parse_start--;
a0d0e21e
LW
4154 goto defchar;
4155 }
4156 break;
4633a7c4
LW
4157
4158 case '#':
e2509266 4159 if (RExC_flags & PMf_EXTENDED) {
830247a4
IZ
4160 while (RExC_parse < RExC_end && *RExC_parse != '\n') RExC_parse++;
4161 if (RExC_parse < RExC_end)
4633a7c4
LW
4162 goto tryagain;
4163 }
4164 /* FALL THROUGH */
4165
a0d0e21e 4166 default: {
ba210ebe 4167 register STRLEN len;
58ae7d3f 4168 register UV ender;
a0d0e21e 4169 register char *p;
c277df42 4170 char *oldp, *s;
ba210ebe 4171 STRLEN numlen;
80aecb99 4172 STRLEN foldlen;
89ebb4a3 4173 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
f06dbbb7
JH
4174
4175 parse_start = RExC_parse - 1;
a0d0e21e 4176
830247a4 4177 RExC_parse++;
a0d0e21e
LW
4178
4179 defchar:
58ae7d3f 4180 ender = 0;
eb160463
GS
4181 ret = reg_node(pRExC_state,
4182 (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
cd439c50 4183 s = STRING(ret);
830247a4
IZ
4184 for (len = 0, p = RExC_parse - 1;
4185 len < 127 && p < RExC_end;
a0d0e21e
LW
4186 len++)
4187 {
4188 oldp = p;
5b5a24f7 4189
e2509266 4190 if (RExC_flags & PMf_EXTENDED)
830247a4 4191 p = regwhite(p, RExC_end);
a0d0e21e
LW
4192 switch (*p) {
4193 case '^':
4194 case '$':
4195 case '.':
4196 case '[':
4197 case '(':
4198 case ')':
4199 case '|':
4200 goto loopdone;
4201 case '\\':
4202 switch (*++p) {
4203 case 'A':
1ed8eac0
JF
4204 case 'C':
4205 case 'X':
a0d0e21e
LW
4206 case 'G':
4207 case 'Z':
b85d18e9 4208 case 'z':
a0d0e21e
LW
4209 case 'w':
4210 case 'W':
4211 case 'b':
4212 case 'B':
4213 case 's':
4214 case 'S':
4215 case 'd':
4216 case 'D':
a14b48bc
LW
4217 case 'p':
4218 case 'P':
a0d0e21e
LW
4219 --p;
4220 goto loopdone;
4221 case 'n':
4222 ender = '\n';
4223 p++;
a687059c 4224 break;
a0d0e21e
LW
4225 case 'r':
4226 ender = '\r';
4227 p++;
a687059c 4228 break;
a0d0e21e
LW
4229 case 't':
4230 ender = '\t';
4231 p++;
a687059c 4232 break;
a0d0e21e
LW
4233 case 'f':
4234 ender = '\f';
4235 p++;
a687059c 4236 break;
a0d0e21e 4237 case 'e':
c7f1f016 4238 ender = ASCII_TO_NATIVE('\033');
a0d0e21e 4239 p++;
a687059c 4240 break;
a0d0e21e 4241 case 'a':
c7f1f016 4242 ender = ASCII_TO_NATIVE('\007');
a0d0e21e 4243 p++;
a687059c 4244 break;
a0d0e21e 4245 case 'x':
a0ed51b3
LW
4246 if (*++p == '{') {
4247 char* e = strchr(p, '}');
b81d288d 4248
b45f050a 4249 if (!e) {
830247a4 4250 RExC_parse = p + 1;
b45f050a
JF
4251 vFAIL("Missing right brace on \\x{}");
4252 }
de5f0749 4253 else {
a4c04bdc
NC
4254 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
4255 | PERL_SCAN_DISALLOW_PREFIX;
53305cf1
NC
4256 numlen = e - p - 1;
4257 ender = grok_hex(p + 1, &numlen, &flags, NULL);
aaa80028
JH
4258 if (ender > 0xff)
4259 RExC_utf8 = 1;
a0ed51b3
LW
4260 p = e + 1;
4261 }
a0ed51b3
LW
4262 }
4263 else {
a4c04bdc 4264 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
53305cf1
NC
4265 numlen = 2;
4266 ender = grok_hex(p, &numlen, &flags, NULL);
a0ed51b3
LW
4267 p += numlen;
4268 }
a687059c 4269 break;
a0d0e21e
LW
4270 case 'c':
4271 p++;
bbce6d69 4272 ender = UCHARAT(p++);
4273 ender = toCTRL(ender);
a687059c 4274 break;
a0d0e21e
LW
4275 case '0': case '1': case '2': case '3':case '4':
4276 case '5': case '6': case '7': case '8':case '9':
4277 if (*p == '0' ||
830247a4 4278 (isDIGIT(p[1]) && atoi(p) >= RExC_npar) ) {
53305cf1
NC
4279 I32 flags = 0;
4280 numlen = 3;
4281 ender = grok_oct(p, &numlen, &flags, NULL);
a0d0e21e
LW
4282 p += numlen;
4283 }
4284 else {
4285 --p;
4286 goto loopdone;
a687059c
LW
4287 }
4288 break;
a0d0e21e 4289 case '\0':
830247a4 4290 if (p >= RExC_end)
b45f050a 4291 FAIL("Trailing \\");
a687059c 4292 /* FALL THROUGH */
a0d0e21e 4293 default:
e476b1b5 4294 if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(*p))
4193bef7 4295 vWARN2(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
a0ed51b3 4296 goto normal_default;
a0d0e21e
LW
4297 }
4298 break;
a687059c 4299 default:
a0ed51b3 4300 normal_default:
fd400ab9 4301 if (UTF8_IS_START(*p) && UTF) {
5e12f4fb 4302 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
ba210ebe 4303 &numlen, 0);
a0ed51b3
LW
4304 p += numlen;
4305 }
4306 else
4307 ender = *p++;
a0d0e21e 4308 break;
a687059c 4309 }
e2509266 4310 if (RExC_flags & PMf_EXTENDED)
830247a4 4311 p = regwhite(p, RExC_end);
60a8b682
JH
4312 if (UTF && FOLD) {
4313 /* Prime the casefolded buffer. */
ac7e0132 4314 ender = toFOLD_uni(ender, tmpbuf, &foldlen);
60a8b682 4315 }
a0d0e21e
LW
4316 if (ISMULT2(p)) { /* Back off on ?+*. */
4317 if (len)
4318 p = oldp;
16ea2a2e 4319 else if (UTF) {
0ebc6274
JH
4320 STRLEN unilen;
4321
80aecb99 4322 if (FOLD) {
60a8b682 4323 /* Emit all the Unicode characters. */
80aecb99
JH
4324 for (foldbuf = tmpbuf;
4325 foldlen;
4326 foldlen -= numlen) {
4327 ender = utf8_to_uvchr(foldbuf, &numlen);
9dc45d57 4328 if (numlen > 0) {
0ebc6274
JH
4329 reguni(pRExC_state, ender, s, &unilen);
4330 s += unilen;
4331 len += unilen;
4332 /* In EBCDIC the numlen
4333 * and unilen can differ. */
9dc45d57 4334 foldbuf += numlen;
47654450
JH
4335 if (numlen >= foldlen)
4336 break;
9dc45d57
JH
4337 }
4338 else
4339 break; /* "Can't happen." */
80aecb99
JH
4340 }
4341 }
4342 else {
0ebc6274 4343 reguni(pRExC_state, ender, s, &unilen);
9ede7db1 4344 if (unilen > 0) {
0ebc6274
JH
4345 s += unilen;
4346 len += unilen;
9dc45d57 4347 }
80aecb99 4348 }
a0ed51b3 4349 }
a0d0e21e
LW
4350 else {
4351 len++;
eb160463 4352 REGC((char)ender, s++);
a0d0e21e
LW
4353 }
4354 break;
a687059c 4355 }
16ea2a2e 4356 if (UTF) {
0ebc6274
JH
4357 STRLEN unilen;
4358
80aecb99 4359 if (FOLD) {
60a8b682 4360 /* Emit all the Unicode characters. */
80aecb99
JH
4361 for (foldbuf = tmpbuf;
4362 foldlen;
4363 foldlen -= numlen) {
4364 ender = utf8_to_uvchr(foldbuf, &numlen);
9dc45d57 4365 if (numlen > 0) {
0ebc6274
JH
4366 reguni(pRExC_state, ender, s, &unilen);
4367 len += unilen;
4368 s += unilen;
4369 /* In EBCDIC the numlen
4370 * and unilen can differ. */
9dc45d57 4371 foldbuf += numlen;
47654450
JH
4372 if (numlen >= foldlen)
4373 break;
9dc45d57
JH
4374 }
4375 else
4376 break;
80aecb99
JH
4377 }
4378 }
4379 else {
0ebc6274 4380 reguni(pRExC_state, ender, s, &unilen);
9ede7db1 4381 if (unilen > 0) {
0ebc6274
JH
4382 s += unilen;
4383 len += unilen;
9dc45d57 4384 }
80aecb99
JH
4385 }
4386 len--;
a0ed51b3
LW
4387 }
4388 else
eb160463 4389 REGC((char)ender, s++);
a0d0e21e
LW
4390 }
4391 loopdone:
830247a4 4392 RExC_parse = p - 1;
fac92740 4393 Set_Node_Cur_Length(ret); /* MJD */
830247a4 4394 nextchar(pRExC_state);
793db0cb
JH
4395 {
4396 /* len is STRLEN which is unsigned, need to copy to signed */
4397 IV iv = len;
4398 if (iv < 0)
4399 vFAIL("Internal disaster");
4400 }
a0d0e21e
LW
4401 if (len > 0)
4402 *flagp |= HASWIDTH;
090f7165 4403 if (len == 1 && UNI_IS_INVARIANT(ender))
a0d0e21e 4404 *flagp |= SIMPLE;
c277df42 4405 if (!SIZE_ONLY)
cd439c50
IZ
4406 STR_LEN(ret) = len;
4407 if (SIZE_ONLY)
830247a4 4408 RExC_size += STR_SZ(len);
cd439c50 4409 else
830247a4 4410 RExC_emit += STR_SZ(len);
a687059c 4411 }
a0d0e21e
LW
4412 break;
4413 }
a687059c 4414
60a8b682
JH
4415 /* If the encoding pragma is in effect recode the text of
4416 * any EXACT-kind nodes. */
22c54be3 4417 if (PL_encoding && PL_regkind[(U8)OP(ret)] == EXACT) {
d0063567
DK
4418 STRLEN oldlen = STR_LEN(ret);
4419 SV *sv = sv_2mortal(newSVpvn(STRING(ret), oldlen));
4420
4421 if (RExC_utf8)
4422 SvUTF8_on(sv);
4423 if (sv_utf8_downgrade(sv, TRUE)) {
4424 char *s = sv_recode_to_utf8(sv, PL_encoding);
4425 STRLEN newlen = SvCUR(sv);
4426
4427 if (SvUTF8(sv))
4428 RExC_utf8 = 1;
4429 if (!SIZE_ONLY) {
a3621e74
YO
4430 GET_RE_DEBUG_FLAGS_DECL;
4431 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "recode %*s to %*s\n",
d0063567
DK
4432 (int)oldlen, STRING(ret),
4433 (int)newlen, s));
4434 Copy(s, STRING(ret), newlen, char);
4435 STR_LEN(ret) += newlen - oldlen;
4436 RExC_emit += STR_SZ(newlen) - STR_SZ(oldlen);
4437 } else
4438 RExC_size += STR_SZ(newlen) - STR_SZ(oldlen);
4439 }
a72c7584
JH
4440 }
4441
a0d0e21e 4442 return(ret);
a687059c
LW
4443}
4444
873ef191 4445STATIC char *
cea2e8a9 4446S_regwhite(pTHX_ char *p, char *e)
5b5a24f7
CS
4447{
4448 while (p < e) {
4449 if (isSPACE(*p))
4450 ++p;
4451 else if (*p == '#') {
4452 do {
4453 p++;
4454 } while (p < e && *p != '\n');
4455 }
4456 else
4457 break;
4458 }
4459 return p;
4460}
4461
b8c5462f
JH
4462/* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
4463 Character classes ([:foo:]) can also be negated ([:^foo:]).
4464 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
4465 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
beeb77fc 4466 but trigger failures because they are currently unimplemented. */
9a86a77b
JH
4467
4468#define POSIXCC_DONE(c) ((c) == ':')
4469#define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
4470#define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
4471
b8c5462f 4472STATIC I32
830247a4 4473S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
620e46c5
JH
4474{
4475 char *posixcc = 0;
936ed897 4476 I32 namedclass = OOB_NAMEDCLASS;
620e46c5 4477
830247a4 4478 if (value == '[' && RExC_parse + 1 < RExC_end &&
620e46c5 4479 /* I smell either [: or [= or [. -- POSIX has been here, right? */
9a86a77b
JH
4480 POSIXCC(UCHARAT(RExC_parse))) {
4481 char c = UCHARAT(RExC_parse);
830247a4 4482 char* s = RExC_parse++;
b81d288d 4483
9a86a77b 4484 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
830247a4
IZ
4485 RExC_parse++;
4486 if (RExC_parse == RExC_end)
620e46c5 4487 /* Grandfather lone [:, [=, [. */
830247a4 4488 RExC_parse = s;
620e46c5 4489 else {
830247a4 4490 char* t = RExC_parse++; /* skip over the c */
b8c5462f 4491
80916619
NC
4492 assert(*t == c);
4493
9a86a77b 4494 if (UCHARAT(RExC_parse) == ']') {
830247a4 4495 RExC_parse++; /* skip over the ending ] */
b8c5462f
JH
4496 posixcc = s + 1;
4497 if (*s == ':') {
4498 I32 complement = *posixcc == '^' ? *posixcc++ : 0;
80916619
NC
4499 I32 skip = t - posixcc;
4500
4501 /* Initially switch on the length of the name. */
4502 switch (skip) {
4503 case 4:
4504 if (memEQ(posixcc, "word", 4)) {
4505 /* this is not POSIX, this is the Perl \w */;
4506 namedclass
4507 = complement ? ANYOF_NALNUM : ANYOF_ALNUM;
4508 }
cc4319de 4509 break;
80916619
NC
4510 case 5:
4511 /* Names all of length 5. */
4512 /* alnum alpha ascii blank cntrl digit graph lower
4513 print punct space upper */
4514 /* Offset 4 gives the best switch position. */
4515 switch (posixcc[4]) {
4516 case 'a':
4517 if (memEQ(posixcc, "alph", 4)) {
4518 /* a */
4519 namedclass
4520 = complement ? ANYOF_NALPHA : ANYOF_ALPHA;
4521 }
4522 break;
4523 case 'e':
4524 if (memEQ(posixcc, "spac", 4)) {
4525 /* e */
4526 namedclass
4527 = complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
4528 }
4529 break;
4530 case 'h':
4531 if (memEQ(posixcc, "grap", 4)) {
4532 /* h */
4533 namedclass
4534 = complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
4535 }
4536 break;
4537 case 'i':
4538 if (memEQ(posixcc, "asci", 4)) {
4539 /* i */
4540 namedclass
4541 = complement ? ANYOF_NASCII : ANYOF_ASCII;
4542 }
4543 break;
4544 case 'k':
4545 if (memEQ(posixcc, "blan", 4)) {
4546 /* k */
4547 namedclass
4548 = complement ? ANYOF_NBLANK : ANYOF_BLANK;
4549 }
4550 break;
4551 case 'l':
4552 if (memEQ(posixcc, "cntr", 4)) {
4553 /* l */
4554 namedclass
4555 = complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
4556 }
4557 break;
4558 case 'm':
4559 if (memEQ(posixcc, "alnu", 4)) {
4560 /* m */
4561 namedclass
4562 = complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
4563 }
4564 break;
4565 case 'r':
4566 if (memEQ(posixcc, "lowe", 4)) {
4567 /* r */
4568 namedclass
4569 = complement ? ANYOF_NLOWER : ANYOF_LOWER;
4570 }
4571 if (memEQ(posixcc, "uppe", 4)) {
8fdec511 4572 /* r */
80916619
NC
4573 namedclass
4574 = complement ? ANYOF_NUPPER : ANYOF_UPPER;
4575 }
4576 break;
4577 case 't':
4578 if (memEQ(posixcc, "digi", 4)) {
4579 /* t */
4580 namedclass
4581 = complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
4582 }
4583 if (memEQ(posixcc, "prin", 4)) {
8fdec511 4584 /* t */
80916619
NC
4585 namedclass
4586 = complement ? ANYOF_NPRINT : ANYOF_PRINT;
4587 }
4588 if (memEQ(posixcc, "punc", 4)) {
4589 /* t */
4590 namedclass
4591 = complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
4592 }
4593 break;
b8c5462f
JH
4594 }
4595 break;
80916619
NC
4596 case 6:
4597 if (memEQ(posixcc, "xdigit", 6)) {
4598 namedclass
4599 = complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
b8c5462f
JH
4600 }
4601 break;
4602 }
80916619
NC
4603
4604 if (namedclass == OOB_NAMEDCLASS)
b45f050a
JF
4605 {
4606 Simple_vFAIL3("POSIX class [:%.*s:] unknown",
4607 t - s - 1, s + 1);
4608 }
80916619
NC
4609 assert (posixcc[skip] == ':');
4610 assert (posixcc[skip+1] == ']');
b45f050a 4611 } else if (!SIZE_ONLY) {
b8c5462f 4612 /* [[=foo=]] and [[.foo.]] are still future. */
b45f050a 4613
830247a4 4614 /* adjust RExC_parse so the warning shows after
b45f050a 4615 the class closes */
9a86a77b 4616 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
830247a4 4617 RExC_parse++;
b45f050a
JF
4618 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
4619 }
b8c5462f
JH
4620 } else {
4621 /* Maternal grandfather:
4622 * "[:" ending in ":" but not in ":]" */
830247a4 4623 RExC_parse = s;
767d463e 4624 }
620e46c5
JH
4625 }
4626 }
4627
b8c5462f
JH
4628 return namedclass;
4629}
4630
4631STATIC void
830247a4 4632S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
b8c5462f 4633{
b938889d 4634 if (!SIZE_ONLY && POSIXCC(UCHARAT(RExC_parse))) {
830247a4 4635 char *s = RExC_parse;
93733859 4636 char c = *s++;
b8c5462f
JH
4637
4638 while(*s && isALNUM(*s))
4639 s++;
4640 if (*s && c == *s && s[1] == ']') {
cd84f5b2
RGS
4641 if (ckWARN(WARN_REGEXP))
4642 vWARN3(s+2,
4643 "POSIX syntax [%c %c] belongs inside character classes",
4644 c, c);
b45f050a
JF
4645
4646 /* [[=foo=]] and [[.foo.]] are still future. */
9a86a77b 4647 if (POSIXCC_NOTYET(c)) {
830247a4 4648 /* adjust RExC_parse so the error shows after
b45f050a 4649 the class closes */
9a86a77b 4650 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
b45f050a
JF
4651 ;
4652 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
4653 }
b8c5462f
JH
4654 }
4655 }
620e46c5
JH
4656}
4657
76e3520e 4658STATIC regnode *
830247a4 4659S_regclass(pTHX_ RExC_state_t *pRExC_state)
a687059c 4660{
ffc61ed2 4661 register UV value;
9a86a77b 4662 register UV nextvalue;
3568d838 4663 register IV prevvalue = OOB_UNICODE;
ffc61ed2 4664 register IV range = 0;
c277df42 4665 register regnode *ret;
ba210ebe 4666 STRLEN numlen;
ffc61ed2 4667 IV namedclass;
9c5ffd7c 4668 char *rangebegin = 0;
936ed897 4669 bool need_class = 0;
9c5ffd7c 4670 SV *listsv = Nullsv;
ffc61ed2
JH
4671 register char *e;
4672 UV n;
9e55ce06
JH
4673 bool optimize_invert = TRUE;
4674 AV* unicode_alternate = 0;
1b2d223b
JH
4675#ifdef EBCDIC
4676 UV literal_endpoint = 0;
4677#endif
ffc61ed2
JH
4678
4679 ret = reganode(pRExC_state, ANYOF, 0);
4680
4681 if (!SIZE_ONLY)
4682 ANYOF_FLAGS(ret) = 0;
4683
9a86a77b 4684 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
ffc61ed2
JH
4685 RExC_naughty++;
4686 RExC_parse++;
4687 if (!SIZE_ONLY)
4688 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
4689 }
a0d0e21e 4690
936ed897 4691 if (SIZE_ONLY)
830247a4 4692 RExC_size += ANYOF_SKIP;
936ed897 4693 else {
830247a4 4694 RExC_emit += ANYOF_SKIP;
936ed897
IZ
4695 if (FOLD)
4696 ANYOF_FLAGS(ret) |= ANYOF_FOLD;
4697 if (LOC)
4698 ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
ffc61ed2
JH
4699 ANYOF_BITMAP_ZERO(ret);
4700 listsv = newSVpvn("# comment\n", 10);
a0d0e21e 4701 }
b8c5462f 4702
9a86a77b
JH
4703 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
4704
b938889d 4705 if (!SIZE_ONLY && POSIXCC(nextvalue))
830247a4 4706 checkposixcc(pRExC_state);
b8c5462f 4707
f064b6ad
HS
4708 /* allow 1st char to be ] (allowing it to be - is dealt with later) */
4709 if (UCHARAT(RExC_parse) == ']')
4710 goto charclassloop;
ffc61ed2 4711
9a86a77b 4712 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
ffc61ed2
JH
4713
4714 charclassloop:
4715
4716 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
4717
73b437c8 4718 if (!range)
830247a4 4719 rangebegin = RExC_parse;
ffc61ed2 4720 if (UTF) {
5e12f4fb 4721 value = utf8n_to_uvchr((U8*)RExC_parse,
3568d838
JH
4722 RExC_end - RExC_parse,
4723 &numlen, 0);
ffc61ed2
JH
4724 RExC_parse += numlen;
4725 }
4726 else
4727 value = UCHARAT(RExC_parse++);
9a86a77b
JH
4728 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
4729 if (value == '[' && POSIXCC(nextvalue))
830247a4 4730 namedclass = regpposixcc(pRExC_state, value);
620e46c5 4731 else if (value == '\\') {
ffc61ed2 4732 if (UTF) {
5e12f4fb 4733 value = utf8n_to_uvchr((U8*)RExC_parse,
ffc61ed2
JH
4734 RExC_end - RExC_parse,
4735 &numlen, 0);
4736 RExC_parse += numlen;
4737 }
4738 else
4739 value = UCHARAT(RExC_parse++);
470c3474 4740 /* Some compilers cannot handle switching on 64-bit integer
ffc61ed2 4741 * values, therefore value cannot be an UV. Yes, this will
e2962f66
JH
4742 * be a problem later if we want switch on Unicode.
4743 * A similar issue a little bit later when switching on
4744 * namedclass. --jhi */
ffc61ed2 4745 switch ((I32)value) {
b8c5462f
JH
4746 case 'w': namedclass = ANYOF_ALNUM; break;
4747 case 'W': namedclass = ANYOF_NALNUM; break;
4748 case 's': namedclass = ANYOF_SPACE; break;
4749 case 'S': namedclass = ANYOF_NSPACE; break;
4750 case 'd': namedclass = ANYOF_DIGIT; break;
4751 case 'D': namedclass = ANYOF_NDIGIT; break;
ffc61ed2
JH
4752 case 'p':
4753 case 'P':
af6f566e 4754 if (RExC_parse >= RExC_end)
2a4859cd 4755 vFAIL2("Empty \\%c{}", (U8)value);
ffc61ed2 4756 if (*RExC_parse == '{') {
0da60cf5 4757 U8 c = (U8)value;
ffc61ed2
JH
4758 e = strchr(RExC_parse++, '}');
4759 if (!e)
0da60cf5 4760 vFAIL2("Missing right brace on \\%c{}", c);
ab13f0c7
JH
4761 while (isSPACE(UCHARAT(RExC_parse)))
4762 RExC_parse++;
4763 if (e == RExC_parse)
0da60cf5 4764 vFAIL2("Empty \\%c{}", c);
ffc61ed2 4765 n = e - RExC_parse;
ab13f0c7
JH
4766 while (isSPACE(UCHARAT(RExC_parse + n - 1)))
4767 n--;
ffc61ed2
JH
4768 }
4769 else {
4770 e = RExC_parse;
4771 n = 1;
4772 }
4773 if (!SIZE_ONLY) {
ab13f0c7
JH
4774 if (UCHARAT(RExC_parse) == '^') {
4775 RExC_parse++;
4776 n--;
4777 value = value == 'p' ? 'P' : 'p'; /* toggle */
4778 while (isSPACE(UCHARAT(RExC_parse))) {
4779 RExC_parse++;
4780 n--;
4781 }
4782 }
ffc61ed2 4783 if (value == 'p')
ab13f0c7
JH
4784 Perl_sv_catpvf(aTHX_ listsv,
4785 "+utf8::%.*s\n", (int)n, RExC_parse);
ffc61ed2 4786 else
ab13f0c7
JH
4787 Perl_sv_catpvf(aTHX_ listsv,
4788 "!utf8::%.*s\n", (int)n, RExC_parse);
ffc61ed2
JH
4789 }
4790 RExC_parse = e + 1;
4791 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
f81125e2
JP
4792 namedclass = ANYOF_MAX; /* no official name, but it's named */
4793 break;
b8c5462f
JH
4794 case 'n': value = '\n'; break;
4795 case 'r': value = '\r'; break;
4796 case 't': value = '\t'; break;
4797 case 'f': value = '\f'; break;
4798 case 'b': value = '\b'; break;
c7f1f016
NIS
4799 case 'e': value = ASCII_TO_NATIVE('\033');break;
4800 case 'a': value = ASCII_TO_NATIVE('\007');break;
b8c5462f 4801 case 'x':
ffc61ed2 4802 if (*RExC_parse == '{') {
a4c04bdc
NC
4803 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
4804 | PERL_SCAN_DISALLOW_PREFIX;
ffc61ed2 4805 e = strchr(RExC_parse++, '}');
b81d288d 4806 if (!e)
ffc61ed2 4807 vFAIL("Missing right brace on \\x{}");
53305cf1
NC
4808
4809 numlen = e - RExC_parse;
4810 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
ffc61ed2
JH
4811 RExC_parse = e + 1;
4812 }
4813 else {
a4c04bdc 4814 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
53305cf1
NC
4815 numlen = 2;
4816 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
ffc61ed2
JH
4817 RExC_parse += numlen;
4818 }
b8c5462f
JH
4819 break;
4820 case 'c':
830247a4 4821 value = UCHARAT(RExC_parse++);
b8c5462f
JH
4822 value = toCTRL(value);
4823 break;
4824 case '0': case '1': case '2': case '3': case '4':
4825 case '5': case '6': case '7': case '8': case '9':
53305cf1
NC
4826 {
4827 I32 flags = 0;
4828 numlen = 3;
4829 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
830247a4 4830 RExC_parse += numlen;
b8c5462f 4831 break;
53305cf1 4832 }
1028017a 4833 default:
e476b1b5 4834 if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(value))
ffc61ed2
JH
4835 vWARN2(RExC_parse,
4836 "Unrecognized escape \\%c in character class passed through",
4837 (int)value);
1028017a 4838 break;
b8c5462f 4839 }
ffc61ed2 4840 } /* end of \blah */
1b2d223b
JH
4841#ifdef EBCDIC
4842 else
4843 literal_endpoint++;
4844#endif
ffc61ed2
JH
4845
4846 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
4847
4848 if (!SIZE_ONLY && !need_class)
936ed897 4849 ANYOF_CLASS_ZERO(ret);
ffc61ed2 4850
936ed897 4851 need_class = 1;
ffc61ed2
JH
4852
4853 /* a bad range like a-\d, a-[:digit:] ? */
4854 if (range) {
73b437c8 4855 if (!SIZE_ONLY) {
e476b1b5 4856 if (ckWARN(WARN_REGEXP))
830247a4 4857 vWARN4(RExC_parse,
b45f050a 4858 "False [] range \"%*.*s\"",
830247a4
IZ
4859 RExC_parse - rangebegin,
4860 RExC_parse - rangebegin,
b45f050a 4861 rangebegin);
3568d838
JH
4862 if (prevvalue < 256) {
4863 ANYOF_BITMAP_SET(ret, prevvalue);
ffc61ed2
JH
4864 ANYOF_BITMAP_SET(ret, '-');
4865 }
4866 else {
4867 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
4868 Perl_sv_catpvf(aTHX_ listsv,
3568d838 4869 "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
ffc61ed2 4870 }
b8c5462f 4871 }
ffc61ed2
JH
4872
4873 range = 0; /* this was not a true range */
73b437c8 4874 }
ffc61ed2 4875
73b437c8 4876 if (!SIZE_ONLY) {
c49a72a9
NC
4877 const char *what = NULL;
4878 char yesno = 0;
4879
3568d838
JH
4880 if (namedclass > OOB_NAMEDCLASS)
4881 optimize_invert = FALSE;
e2962f66
JH
4882 /* Possible truncation here but in some 64-bit environments
4883 * the compiler gets heartburn about switch on 64-bit values.
4884 * A similar issue a little earlier when switching on value.
98f323fa 4885 * --jhi */
e2962f66 4886 switch ((I32)namedclass) {
73b437c8
JH
4887 case ANYOF_ALNUM:
4888 if (LOC)
936ed897 4889 ANYOF_CLASS_SET(ret, ANYOF_ALNUM);
73b437c8
JH
4890 else {
4891 for (value = 0; value < 256; value++)
4892 if (isALNUM(value))
936ed897 4893 ANYOF_BITMAP_SET(ret, value);
73b437c8 4894 }
c49a72a9
NC
4895 yesno = '+';
4896 what = "Word";
73b437c8
JH
4897 break;
4898 case ANYOF_NALNUM:
4899 if (LOC)
936ed897 4900 ANYOF_CLASS_SET(ret, ANYOF_NALNUM);
73b437c8
JH
4901 else {
4902 for (value = 0; value < 256; value++)
4903 if (!isALNUM(value))
936ed897 4904 ANYOF_BITMAP_SET(ret, value);
73b437c8 4905 }
c49a72a9
NC
4906 yesno = '!';
4907 what = "Word";
73b437c8 4908 break;
ffc61ed2 4909 case ANYOF_ALNUMC:
73b437c8 4910 if (LOC)
ffc61ed2 4911 ANYOF_CLASS_SET(ret, ANYOF_ALNUMC);
73b437c8
JH
4912 else {
4913 for (value = 0; value < 256; value++)
ffc61ed2 4914 if (isALNUMC(value))
936ed897 4915 ANYOF_BITMAP_SET(ret, value);
73b437c8 4916 }
c49a72a9
NC
4917 yesno = '+';
4918 what = "Alnum";
73b437c8
JH
4919 break;
4920 case ANYOF_NALNUMC:
4921 if (LOC)
936ed897 4922 ANYOF_CLASS_SET(ret, ANYOF_NALNUMC);
73b437c8
JH
4923 else {
4924 for (value = 0; value < 256; value++)
4925 if (!isALNUMC(value))
936ed897 4926 ANYOF_BITMAP_SET(ret, value);
73b437c8 4927 }
c49a72a9
NC
4928 yesno = '!';
4929 what = "Alnum";
73b437c8
JH
4930 break;
4931 case ANYOF_ALPHA:
4932 if (LOC)
936ed897 4933 ANYOF_CLASS_SET(ret, ANYOF_ALPHA);
73b437c8
JH
4934 else {
4935 for (value = 0; value < 256; value++)
4936 if (isALPHA(value))
936ed897 4937 ANYOF_BITMAP_SET(ret, value);
73b437c8 4938 }
c49a72a9
NC
4939 yesno = '+';
4940 what = "Alpha";
73b437c8
JH
4941 break;
4942 case ANYOF_NALPHA:
4943 if (LOC)
936ed897 4944 ANYOF_CLASS_SET(ret, ANYOF_NALPHA);
73b437c8
JH
4945 else {
4946 for (value = 0; value < 256; value++)
4947 if (!isALPHA(value))
936ed897 4948 ANYOF_BITMAP_SET(ret, value);
73b437c8 4949 }
c49a72a9
NC
4950 yesno = '!';
4951 what = "Alpha";
73b437c8
JH
4952 break;
4953 case ANYOF_ASCII:
4954 if (LOC)
936ed897 4955 ANYOF_CLASS_SET(ret, ANYOF_ASCII);
73b437c8 4956 else {
c7f1f016 4957#ifndef EBCDIC
1ba5c669
JH
4958 for (value = 0; value < 128; value++)
4959 ANYOF_BITMAP_SET(ret, value);
4960#else /* EBCDIC */
ffbc6a93 4961 for (value = 0; value < 256; value++) {
3a3c4447
JH
4962 if (isASCII(value))
4963 ANYOF_BITMAP_SET(ret, value);
ffbc6a93 4964 }
1ba5c669 4965#endif /* EBCDIC */
73b437c8 4966 }
c49a72a9
NC
4967 yesno = '+';
4968 what = "ASCII";
73b437c8
JH
4969 break;
4970 case ANYOF_NASCII:
4971 if (LOC)
936ed897 4972 ANYOF_CLASS_SET(ret, ANYOF_NASCII);
73b437c8 4973 else {
c7f1f016 4974#ifndef EBCDIC
1ba5c669
JH
4975 for (value = 128; value < 256; value++)
4976 ANYOF_BITMAP_SET(ret, value);
4977#else /* EBCDIC */
ffbc6a93 4978 for (value = 0; value < 256; value++) {
3a3c4447
JH
4979 if (!isASCII(value))
4980 ANYOF_BITMAP_SET(ret, value);
ffbc6a93 4981 }
1ba5c669 4982#endif /* EBCDIC */
73b437c8 4983 }
c49a72a9
NC
4984 yesno = '!';
4985 what = "ASCII";
73b437c8 4986 break;
aaa51d5e
JF
4987 case ANYOF_BLANK:
4988 if (LOC)
4989 ANYOF_CLASS_SET(ret, ANYOF_BLANK);
4990 else {
4991 for (value = 0; value < 256; value++)
4992 if (isBLANK(value))
4993 ANYOF_BITMAP_SET(ret, value);
4994 }
c49a72a9
NC
4995 yesno = '+';
4996 what = "Blank";
aaa51d5e
JF
4997 break;
4998 case ANYOF_NBLANK:
4999 if (LOC)
5000 ANYOF_CLASS_SET(ret, ANYOF_NBLANK);
5001 else {
5002 for (value = 0; value < 256; value++)
5003 if (!isBLANK(value))
5004 ANYOF_BITMAP_SET(ret, value);
5005 }
c49a72a9
NC
5006 yesno = '!';
5007 what = "Blank";
aaa51d5e 5008 break;
73b437c8
JH
5009 case ANYOF_CNTRL:
5010 if (LOC)
936ed897 5011 ANYOF_CLASS_SET(ret, ANYOF_CNTRL);
73b437c8
JH
5012 else {
5013 for (value = 0; value < 256; value++)
5014 if (isCNTRL(value))
936ed897 5015 ANYOF_BITMAP_SET(ret, value);
73b437c8 5016 }
c49a72a9
NC
5017 yesno = '+';
5018 what = "Cntrl";
73b437c8
JH
5019 break;
5020 case ANYOF_NCNTRL:
5021 if (LOC)
936ed897 5022 ANYOF_CLASS_SET(ret, ANYOF_NCNTRL);
73b437c8
JH
5023 else {
5024 for (value = 0; value < 256; value++)
5025 if (!isCNTRL(value))
936ed897 5026 ANYOF_BITMAP_SET(ret, value);
73b437c8 5027 }
c49a72a9
NC
5028 yesno = '!';
5029 what = "Cntrl";
ffc61ed2
JH
5030 break;
5031 case ANYOF_DIGIT:
5032 if (LOC)
5033 ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
5034 else {
5035 /* consecutive digits assumed */
5036 for (value = '0'; value <= '9'; value++)
5037 ANYOF_BITMAP_SET(ret, value);
5038 }
c49a72a9
NC
5039 yesno = '+';
5040 what = "Digit";
ffc61ed2
JH
5041 break;
5042 case ANYOF_NDIGIT:
5043 if (LOC)
5044 ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
5045 else {
5046 /* consecutive digits assumed */
5047 for (value = 0; value < '0'; value++)
5048 ANYOF_BITMAP_SET(ret, value);
5049 for (value = '9' + 1; value < 256; value++)
5050 ANYOF_BITMAP_SET(ret, value);
5051 }
c49a72a9
NC
5052 yesno = '!';
5053 what = "Digit";
73b437c8
JH
5054 break;
5055 case ANYOF_GRAPH:
5056 if (LOC)
936ed897 5057 ANYOF_CLASS_SET(ret, ANYOF_GRAPH);
73b437c8
JH
5058 else {
5059 for (value = 0; value < 256; value++)
5060 if (isGRAPH(value))
936ed897 5061 ANYOF_BITMAP_SET(ret, value);
73b437c8 5062 }
c49a72a9
NC
5063 yesno = '+';
5064 what = "Graph";
73b437c8
JH
5065 break;
5066 case ANYOF_NGRAPH:
5067 if (LOC)
936ed897 5068 ANYOF_CLASS_SET(ret, ANYOF_NGRAPH);
73b437c8
JH
5069 else {
5070 for (value = 0; value < 256; value++)
5071 if (!isGRAPH(value))
936ed897 5072 ANYOF_BITMAP_SET(ret, value);
73b437c8 5073 }
c49a72a9
NC
5074 yesno = '!';
5075 what = "Graph";
73b437c8
JH
5076 break;
5077 case ANYOF_LOWER:
5078 if (LOC)
936ed897 5079 ANYOF_CLASS_SET(ret, ANYOF_LOWER);
73b437c8
JH
5080 else {
5081 for (value = 0; value < 256; value++)
5082 if (isLOWER(value))
936ed897 5083 ANYOF_BITMAP_SET(ret, value);
73b437c8 5084 }
c49a72a9
NC
5085 yesno = '+';
5086 what = "Lower";
73b437c8
JH
5087 break;
5088 case ANYOF_NLOWER:
5089 if (LOC)
936ed897 5090 ANYOF_CLASS_SET(ret, ANYOF_NLOWER);
73b437c8
JH
5091 else {
5092 for (value = 0; value < 256; value++)
5093 if (!isLOWER(value))
936ed897 5094 ANYOF_BITMAP_SET(ret, value);
73b437c8 5095 }
c49a72a9
NC
5096 yesno = '!';
5097 what = "Lower";
73b437c8
JH
5098 break;
5099 case ANYOF_PRINT:
5100 if (LOC)
936ed897 5101 ANYOF_CLASS_SET(ret, ANYOF_PRINT);
73b437c8
JH
5102 else {
5103 for (value = 0; value < 256; value++)
5104 if (isPRINT(value))
936ed897 5105 ANYOF_BITMAP_SET(ret, value);
73b437c8 5106 }
c49a72a9
NC
5107 yesno = '+';
5108 what = "Print";
73b437c8
JH
5109 break;
5110 case ANYOF_NPRINT:
5111 if (LOC)
936ed897 5112 ANYOF_CLASS_SET(ret, ANYOF_NPRINT);
73b437c8
JH
5113 else {
5114 for (value = 0; value < 256; value++)
5115 if (!isPRINT(value))
936ed897 5116 ANYOF_BITMAP_SET(ret, value);
73b437c8 5117 }
c49a72a9
NC
5118 yesno = '!';
5119 what = "Print";
73b437c8 5120 break;
aaa51d5e
JF
5121 case ANYOF_PSXSPC:
5122 if (LOC)
5123 ANYOF_CLASS_SET(ret, ANYOF_PSXSPC);
5124 else {
5125 for (value = 0; value < 256; value++)
5126 if (isPSXSPC(value))
5127 ANYOF_BITMAP_SET(ret, value);
5128 }
c49a72a9
NC
5129 yesno = '+';
5130 what = "Space";
aaa51d5e
JF
5131 break;
5132 case ANYOF_NPSXSPC:
5133 if (LOC)
5134 ANYOF_CLASS_SET(ret, ANYOF_NPSXSPC);
5135 else {
5136 for (value = 0; value < 256; value++)
5137 if (!isPSXSPC(value))
5138 ANYOF_BITMAP_SET(ret, value);
5139 }
c49a72a9
NC
5140 yesno = '!';
5141 what = "Space";
aaa51d5e 5142 break;
73b437c8
JH
5143 case ANYOF_PUNCT:
5144 if (LOC)
936ed897 5145 ANYOF_CLASS_SET(ret, ANYOF_PUNCT);
73b437c8
JH
5146 else {
5147 for (value = 0; value < 256; value++)
5148 if (isPUNCT(value))
936ed897 5149 ANYOF_BITMAP_SET(ret, value);
73b437c8 5150 }
c49a72a9
NC
5151 yesno = '+';
5152 what = "Punct";
73b437c8
JH
5153 break;
5154 case ANYOF_NPUNCT:
5155 if (LOC)
936ed897 5156 ANYOF_CLASS_SET(ret, ANYOF_NPUNCT);
73b437c8
JH
5157 else {
5158 for (value = 0; value < 256; value++)
5159 if (!isPUNCT(value))
936ed897 5160 ANYOF_BITMAP_SET(ret, value);
73b437c8 5161 }
c49a72a9
NC
5162 yesno = '!';
5163 what = "Punct";
ffc61ed2
JH
5164 break;
5165 case ANYOF_SPACE:
5166 if (LOC)
5167 ANYOF_CLASS_SET(ret, ANYOF_SPACE);
5168 else {
5169 for (value = 0; value < 256; value++)
5170 if (isSPACE(value))
5171 ANYOF_BITMAP_SET(ret, value);
5172 }
c49a72a9
NC
5173 yesno = '+';
5174 what = "SpacePerl";
ffc61ed2
JH
5175 break;
5176 case ANYOF_NSPACE:
5177 if (LOC)
5178 ANYOF_CLASS_SET(ret, ANYOF_NSPACE);
5179 else {
5180 for (value = 0; value < 256; value++)
5181 if (!isSPACE(value))
5182 ANYOF_BITMAP_SET(ret, value);
5183 }
c49a72a9
NC
5184 yesno = '!';
5185 what = "SpacePerl";
73b437c8
JH
5186 break;
5187 case ANYOF_UPPER:
5188 if (LOC)
936ed897 5189 ANYOF_CLASS_SET(ret, ANYOF_UPPER);
73b437c8
JH
5190 else {
5191 for (value = 0; value < 256; value++)
5192 if (isUPPER(value))
936ed897 5193 ANYOF_BITMAP_SET(ret, value);
73b437c8 5194 }
c49a72a9
NC
5195 yesno = '+';
5196 what = "Upper";
73b437c8
JH
5197 break;
5198 case ANYOF_NUPPER:
5199 if (LOC)
936ed897 5200 ANYOF_CLASS_SET(ret, ANYOF_NUPPER);
73b437c8
JH
5201 else {
5202 for (value = 0; value < 256; value++)
5203 if (!isUPPER(value))
936ed897 5204 ANYOF_BITMAP_SET(ret, value);
73b437c8 5205 }
c49a72a9
NC
5206 yesno = '!';
5207 what = "Upper";
73b437c8
JH
5208 break;
5209 case ANYOF_XDIGIT:
5210 if (LOC)
936ed897 5211 ANYOF_CLASS_SET(ret, ANYOF_XDIGIT);
73b437c8
JH
5212 else {
5213 for (value = 0; value < 256; value++)
5214 if (isXDIGIT(value))
936ed897 5215 ANYOF_BITMAP_SET(ret, value);
73b437c8 5216 }
c49a72a9
NC
5217 yesno = '+';
5218 what = "XDigit";
73b437c8
JH
5219 break;
5220 case ANYOF_NXDIGIT:
5221 if (LOC)
936ed897 5222 ANYOF_CLASS_SET(ret, ANYOF_NXDIGIT);
73b437c8
JH
5223 else {
5224 for (value = 0; value < 256; value++)
5225 if (!isXDIGIT(value))
936ed897 5226 ANYOF_BITMAP_SET(ret, value);
73b437c8 5227 }
c49a72a9
NC
5228 yesno = '!';
5229 what = "XDigit";
73b437c8 5230 break;
f81125e2
JP
5231 case ANYOF_MAX:
5232 /* this is to handle \p and \P */
5233 break;
73b437c8 5234 default:
b45f050a 5235 vFAIL("Invalid [::] class");
73b437c8 5236 break;
b8c5462f 5237 }
c49a72a9
NC
5238 if (what) {
5239 /* Strings such as "+utf8::isWord\n" */
5240 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::Is%s\n", yesno, what);
5241 }
b8c5462f 5242 if (LOC)
936ed897 5243 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
73b437c8 5244 continue;
a0d0e21e 5245 }
ffc61ed2
JH
5246 } /* end of namedclass \blah */
5247
a0d0e21e 5248 if (range) {
eb160463 5249 if (prevvalue > (IV)value) /* b-a */ {
b45f050a 5250 Simple_vFAIL4("Invalid [] range \"%*.*s\"",
830247a4
IZ
5251 RExC_parse - rangebegin,
5252 RExC_parse - rangebegin,
b45f050a 5253 rangebegin);
3568d838 5254 range = 0; /* not a valid range */
73b437c8 5255 }
a0d0e21e
LW
5256 }
5257 else {
3568d838 5258 prevvalue = value; /* save the beginning of the range */
830247a4
IZ
5259 if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
5260 RExC_parse[1] != ']') {
5261 RExC_parse++;
ffc61ed2
JH
5262
5263 /* a bad range like \w-, [:word:]- ? */
5264 if (namedclass > OOB_NAMEDCLASS) {
e476b1b5 5265 if (ckWARN(WARN_REGEXP))
830247a4 5266 vWARN4(RExC_parse,
b45f050a 5267 "False [] range \"%*.*s\"",
830247a4
IZ
5268 RExC_parse - rangebegin,
5269 RExC_parse - rangebegin,
b45f050a 5270 rangebegin);
73b437c8 5271 if (!SIZE_ONLY)
936ed897 5272 ANYOF_BITMAP_SET(ret, '-');
73b437c8 5273 } else
ffc61ed2
JH
5274 range = 1; /* yeah, it's a range! */
5275 continue; /* but do it the next time */
a0d0e21e 5276 }
a687059c 5277 }
ffc61ed2 5278
93733859 5279 /* now is the next time */
ae5c130c 5280 if (!SIZE_ONLY) {
3568d838
JH
5281 IV i;
5282
5283 if (prevvalue < 256) {
5284 IV ceilvalue = value < 256 ? value : 255;
5285
5286#ifdef EBCDIC
1b2d223b
JH
5287 /* In EBCDIC [\x89-\x91] should include
5288 * the \x8e but [i-j] should not. */
5289 if (literal_endpoint == 2 &&
5290 ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
5291 (isUPPER(prevvalue) && isUPPER(ceilvalue))))
ffc61ed2 5292 {
3568d838
JH
5293 if (isLOWER(prevvalue)) {
5294 for (i = prevvalue; i <= ceilvalue; i++)
ffc61ed2
JH
5295 if (isLOWER(i))
5296 ANYOF_BITMAP_SET(ret, i);
5297 } else {
3568d838 5298 for (i = prevvalue; i <= ceilvalue; i++)
ffc61ed2
JH
5299 if (isUPPER(i))
5300 ANYOF_BITMAP_SET(ret, i);
5301 }
8ada0baa 5302 }
ffc61ed2 5303 else
8ada0baa 5304#endif
a5961de5
JH
5305 for (i = prevvalue; i <= ceilvalue; i++)
5306 ANYOF_BITMAP_SET(ret, i);
3568d838 5307 }
a5961de5 5308 if (value > 255 || UTF) {
b08decb7
JH
5309 UV prevnatvalue = NATIVE_TO_UNI(prevvalue);
5310 UV natvalue = NATIVE_TO_UNI(value);
5311
ffc61ed2 5312 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
b08decb7 5313 if (prevnatvalue < natvalue) { /* what about > ? */
ffc61ed2 5314 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
b08decb7
JH
5315 prevnatvalue, natvalue);
5316 }
5317 else if (prevnatvalue == natvalue) {
5318 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue);
09091399 5319 if (FOLD) {
89ebb4a3 5320 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
254ba52a 5321 STRLEN foldlen;
2f3bf011 5322 UV f = to_uni_fold(natvalue, foldbuf, &foldlen);
254ba52a 5323
c840d2a2
JH
5324 /* If folding and foldable and a single
5325 * character, insert also the folded version
5326 * to the charclass. */
9e55ce06 5327 if (f != value) {
eb160463 5328 if (foldlen == (STRLEN)UNISKIP(f))
9e55ce06
JH
5329 Perl_sv_catpvf(aTHX_ listsv,
5330 "%04"UVxf"\n", f);
5331 else {
5332 /* Any multicharacter foldings
5333 * require the following transform:
5334 * [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst)
5335 * where E folds into "pq" and F folds
5336 * into "rst", all other characters
5337 * fold to single characters. We save
5338 * away these multicharacter foldings,
5339 * to be later saved as part of the
5340 * additional "s" data. */
5341 SV *sv;
5342
5343 if (!unicode_alternate)
5344 unicode_alternate = newAV();
5345 sv = newSVpvn((char*)foldbuf, foldlen);
5346 SvUTF8_on(sv);
5347 av_push(unicode_alternate, sv);
5348 }
5349 }
254ba52a 5350
60a8b682
JH
5351 /* If folding and the value is one of the Greek
5352 * sigmas insert a few more sigmas to make the
5353 * folding rules of the sigmas to work right.
5354 * Note that not all the possible combinations
5355 * are handled here: some of them are handled
9e55ce06
JH
5356 * by the standard folding rules, and some of
5357 * them (literal or EXACTF cases) are handled
5358 * during runtime in regexec.c:S_find_byclass(). */
09091399
JH
5359 if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) {
5360 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 5361 (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA);
09091399 5362 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 5363 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
09091399
JH
5364 }
5365 else if (value == UNICODE_GREEK_CAPITAL_LETTER_SIGMA)
5366 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 5367 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
09091399
JH
5368 }
5369 }
ffc61ed2 5370 }
1b2d223b
JH
5371#ifdef EBCDIC
5372 literal_endpoint = 0;
5373#endif
8ada0baa 5374 }
ffc61ed2
JH
5375
5376 range = 0; /* this range (if it was one) is done now */
a0d0e21e 5377 }
ffc61ed2 5378
936ed897 5379 if (need_class) {
4f66b38d 5380 ANYOF_FLAGS(ret) |= ANYOF_LARGE;
936ed897 5381 if (SIZE_ONLY)
830247a4 5382 RExC_size += ANYOF_CLASS_ADD_SKIP;
936ed897 5383 else
830247a4 5384 RExC_emit += ANYOF_CLASS_ADD_SKIP;
936ed897 5385 }
ffc61ed2 5386
ae5c130c 5387 /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
b8c5462f 5388 if (!SIZE_ONLY &&
ffc61ed2 5389 /* If the only flag is folding (plus possibly inversion). */
516a5887
JH
5390 ((ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD)
5391 ) {
a0ed51b3 5392 for (value = 0; value < 256; ++value) {
936ed897 5393 if (ANYOF_BITMAP_TEST(ret, value)) {
eb160463 5394 UV fold = PL_fold[value];
ffc61ed2
JH
5395
5396 if (fold != value)
5397 ANYOF_BITMAP_SET(ret, fold);
ae5c130c
GS
5398 }
5399 }
936ed897 5400 ANYOF_FLAGS(ret) &= ~ANYOF_FOLD;
ae5c130c 5401 }
ffc61ed2 5402
ae5c130c 5403 /* optimize inverted simple patterns (e.g. [^a-z]) */
3568d838 5404 if (!SIZE_ONLY && optimize_invert &&
ffc61ed2
JH
5405 /* If the only flag is inversion. */
5406 (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
b8c5462f 5407 for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
936ed897 5408 ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL;
1aa99e6b 5409 ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
ae5c130c 5410 }
a0d0e21e 5411
b81d288d 5412 if (!SIZE_ONLY) {
fde631ed 5413 AV *av = newAV();
ffc61ed2
JH
5414 SV *rv;
5415
9e55ce06 5416 /* The 0th element stores the character class description
6a0407ee 5417 * in its textual form: used later (regexec.c:Perl_regclass_swash())
9e55ce06
JH
5418 * to initialize the appropriate swash (which gets stored in
5419 * the 1st element), and also useful for dumping the regnode.
5420 * The 2nd element stores the multicharacter foldings,
6a0407ee 5421 * used later (regexec.c:S_reginclass()). */
ffc61ed2
JH
5422 av_store(av, 0, listsv);
5423 av_store(av, 1, NULL);
9e55ce06 5424 av_store(av, 2, (SV*)unicode_alternate);
ffc61ed2 5425 rv = newRV_noinc((SV*)av);
19860706 5426 n = add_data(pRExC_state, 1, "s");
830247a4 5427 RExC_rx->data->data[n] = (void*)rv;
ffc61ed2 5428 ARG_SET(ret, n);
a0ed51b3
LW
5429 }
5430
5431 return ret;
5432}
5433
76e3520e 5434STATIC char*
830247a4 5435S_nextchar(pTHX_ RExC_state_t *pRExC_state)
a0d0e21e 5436{
830247a4 5437 char* retval = RExC_parse++;
a0d0e21e 5438
4633a7c4 5439 for (;;) {
830247a4
IZ
5440 if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
5441 RExC_parse[2] == '#') {
e994fd66
AE
5442 while (*RExC_parse != ')') {
5443 if (RExC_parse == RExC_end)
5444 FAIL("Sequence (?#... not terminated");
830247a4 5445 RExC_parse++;
e994fd66 5446 }
830247a4 5447 RExC_parse++;
4633a7c4
LW
5448 continue;
5449 }
e2509266 5450 if (RExC_flags & PMf_EXTENDED) {
830247a4
IZ
5451 if (isSPACE(*RExC_parse)) {
5452 RExC_parse++;
748a9306
LW
5453 continue;
5454 }
830247a4 5455 else if (*RExC_parse == '#') {
e994fd66
AE
5456 while (RExC_parse < RExC_end)
5457 if (*RExC_parse++ == '\n') break;
748a9306
LW
5458 continue;
5459 }
748a9306 5460 }
4633a7c4 5461 return retval;
a0d0e21e 5462 }
a687059c
LW
5463}
5464
5465/*
c277df42 5466- reg_node - emit a node
a0d0e21e 5467*/
76e3520e 5468STATIC regnode * /* Location. */
830247a4 5469S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
a687059c 5470{
c277df42
IZ
5471 register regnode *ret;
5472 register regnode *ptr;
a687059c 5473
830247a4 5474 ret = RExC_emit;
c277df42 5475 if (SIZE_ONLY) {
830247a4
IZ
5476 SIZE_ALIGN(RExC_size);
5477 RExC_size += 1;
a0d0e21e
LW
5478 return(ret);
5479 }
a687059c 5480
c277df42 5481 NODE_ALIGN_FILL(ret);
a0d0e21e 5482 ptr = ret;
c277df42 5483 FILL_ADVANCE_NODE(ptr, op);
fac92740 5484 if (RExC_offsets) { /* MJD */
ccb2c380 5485 MJD_OFFSET_DEBUG(("%s:%u: (op %s) %s %u <- %u (len %u) (max %u).\n",
fac92740
MJD
5486 "reg_node", __LINE__,
5487 reg_name[op],
5488 RExC_emit - RExC_emit_start > RExC_offsets[0]
5489 ? "Overwriting end of array!\n" : "OK",
5490 RExC_emit - RExC_emit_start,
5491 RExC_parse - RExC_start,
5492 RExC_offsets[0]));
ccb2c380 5493 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
fac92740
MJD
5494 }
5495
830247a4 5496 RExC_emit = ptr;
a687059c 5497
a0d0e21e 5498 return(ret);
a687059c
LW
5499}
5500
5501/*
a0d0e21e
LW
5502- reganode - emit a node with an argument
5503*/
76e3520e 5504STATIC regnode * /* Location. */
830247a4 5505S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
fe14fcc3 5506{
c277df42
IZ
5507 register regnode *ret;
5508 register regnode *ptr;
fe14fcc3 5509
830247a4 5510 ret = RExC_emit;
c277df42 5511 if (SIZE_ONLY) {
830247a4
IZ
5512 SIZE_ALIGN(RExC_size);
5513 RExC_size += 2;
a0d0e21e
LW
5514 return(ret);
5515 }
fe14fcc3 5516
c277df42 5517 NODE_ALIGN_FILL(ret);
a0d0e21e 5518 ptr = ret;
c277df42 5519 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
fac92740 5520 if (RExC_offsets) { /* MJD */
ccb2c380 5521 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %u <- %u (max %u).\n",
fac92740 5522 "reganode",
ccb2c380
MP
5523 __LINE__,
5524 reg_name[op],
fac92740
MJD
5525 RExC_emit - RExC_emit_start > RExC_offsets[0] ?
5526 "Overwriting end of array!\n" : "OK",
5527 RExC_emit - RExC_emit_start,
5528 RExC_parse - RExC_start,
5529 RExC_offsets[0]));
ccb2c380 5530 Set_Cur_Node_Offset;
fac92740
MJD
5531 }
5532
830247a4 5533 RExC_emit = ptr;
fe14fcc3 5534
a0d0e21e 5535 return(ret);
fe14fcc3
LW
5536}
5537
5538/*
cd439c50 5539- reguni - emit (if appropriate) a Unicode character
a0ed51b3
LW
5540*/
5541STATIC void
830247a4 5542S_reguni(pTHX_ RExC_state_t *pRExC_state, UV uv, char* s, STRLEN* lenp)
a0ed51b3 5543{
5e12f4fb 5544 *lenp = SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
a0ed51b3
LW
5545}
5546
5547/*
a0d0e21e
LW
5548- reginsert - insert an operator in front of already-emitted operand
5549*
5550* Means relocating the operand.
5551*/
76e3520e 5552STATIC void
830247a4 5553S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd)
a687059c 5554{
c277df42
IZ
5555 register regnode *src;
5556 register regnode *dst;
5557 register regnode *place;
5558 register int offset = regarglen[(U8)op];
b81d288d 5559
22c35a8c 5560/* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
c277df42
IZ
5561
5562 if (SIZE_ONLY) {
830247a4 5563 RExC_size += NODE_STEP_REGNODE + offset;
a0d0e21e
LW
5564 return;
5565 }
a687059c 5566
830247a4
IZ
5567 src = RExC_emit;
5568 RExC_emit += NODE_STEP_REGNODE + offset;
5569 dst = RExC_emit;
fac92740 5570 while (src > opnd) {
c277df42 5571 StructCopy(--src, --dst, regnode);
fac92740 5572 if (RExC_offsets) { /* MJD 20010112 */
ccb2c380 5573 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %u -> %u (max %u).\n",
fac92740 5574 "reg_insert",
ccb2c380
MP
5575 __LINE__,
5576 reg_name[op],
fac92740
MJD
5577 dst - RExC_emit_start > RExC_offsets[0]
5578 ? "Overwriting end of array!\n" : "OK",
5579 src - RExC_emit_start,
5580 dst - RExC_emit_start,
5581 RExC_offsets[0]));
ccb2c380
MP
5582 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
5583 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
fac92740
MJD
5584 }
5585 }
5586
a0d0e21e
LW
5587
5588 place = opnd; /* Op node, where operand used to be. */
fac92740 5589 if (RExC_offsets) { /* MJD */
ccb2c380 5590 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %u <- %u (max %u).\n",
fac92740 5591 "reginsert",
ccb2c380
MP
5592 __LINE__,
5593 reg_name[op],
fac92740
MJD
5594 place - RExC_emit_start > RExC_offsets[0]
5595 ? "Overwriting end of array!\n" : "OK",
5596 place - RExC_emit_start,
5597 RExC_parse - RExC_start,
5598 RExC_offsets[0]));
ccb2c380 5599 Set_Node_Offset(place, RExC_parse);
45948336 5600 Set_Node_Length(place, 1);
fac92740 5601 }
c277df42
IZ
5602 src = NEXTOPER(place);
5603 FILL_ADVANCE_NODE(place, op);
5604 Zero(src, offset, regnode);
a687059c
LW
5605}
5606
5607/*
c277df42 5608- regtail - set the next-pointer at the end of a node chain of p to val.
a0d0e21e 5609*/
76e3520e 5610STATIC void
830247a4 5611S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, regnode *val)
a687059c 5612{
c277df42
IZ
5613 register regnode *scan;
5614 register regnode *temp;
a0d0e21e 5615
c277df42 5616 if (SIZE_ONLY)
a0d0e21e
LW
5617 return;
5618
5619 /* Find last node. */
5620 scan = p;
5621 for (;;) {
5622 temp = regnext(scan);
5623 if (temp == NULL)
5624 break;
5625 scan = temp;
5626 }
a687059c 5627
c277df42
IZ
5628 if (reg_off_by_arg[OP(scan)]) {
5629 ARG_SET(scan, val - scan);
a0ed51b3
LW
5630 }
5631 else {
c277df42
IZ
5632 NEXT_OFF(scan) = val - scan;
5633 }
a687059c
LW
5634}
5635
5636/*
a0d0e21e
LW
5637- regoptail - regtail on operand of first argument; nop if operandless
5638*/
76e3520e 5639STATIC void
830247a4 5640S_regoptail(pTHX_ RExC_state_t *pRExC_state, regnode *p, regnode *val)
a687059c 5641{
a0d0e21e 5642 /* "Operandless" and "op != BRANCH" are synonymous in practice. */
c277df42
IZ
5643 if (p == NULL || SIZE_ONLY)
5644 return;
22c35a8c 5645 if (PL_regkind[(U8)OP(p)] == BRANCH) {
830247a4 5646 regtail(pRExC_state, NEXTOPER(p), val);
a0ed51b3 5647 }
22c35a8c 5648 else if ( PL_regkind[(U8)OP(p)] == BRANCHJ) {
830247a4 5649 regtail(pRExC_state, NEXTOPER(NEXTOPER(p)), val);
a0ed51b3
LW
5650 }
5651 else
a0d0e21e 5652 return;
a687059c
LW
5653}
5654
5655/*
5656 - regcurly - a little FSA that accepts {\d+,?\d*}
5657 */
79072805 5658STATIC I32
cea2e8a9 5659S_regcurly(pTHX_ register char *s)
a687059c
LW
5660{
5661 if (*s++ != '{')
5662 return FALSE;
f0fcb552 5663 if (!isDIGIT(*s))
a687059c 5664 return FALSE;
f0fcb552 5665 while (isDIGIT(*s))
a687059c
LW
5666 s++;
5667 if (*s == ',')
5668 s++;
f0fcb552 5669 while (isDIGIT(*s))
a687059c
LW
5670 s++;
5671 if (*s != '}')
5672 return FALSE;
5673 return TRUE;
5674}
5675
a687059c 5676
8fa7f367
JH
5677#ifdef DEBUGGING
5678
76e3520e 5679STATIC regnode *
cea2e8a9 5680S_dumpuntil(pTHX_ regnode *start, regnode *node, regnode *last, SV* sv, I32 l)
c277df42 5681{
f248d071 5682 register U8 op = EXACT; /* Arbitrary non-END op. */
155aba94 5683 register regnode *next;
c277df42
IZ
5684
5685 while (op != END && (!last || node < last)) {
5686 /* While that wasn't END last time... */
5687
5688 NODE_ALIGN(node);
5689 op = OP(node);
5690 if (op == CLOSE)
5691 l--;
5692 next = regnext(node);
5693 /* Where, what. */
5694 if (OP(node) == OPTIMIZED)
5695 goto after_print;
5696 regprop(sv, node);
b900a521 5697 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
f1dbda3d 5698 (int)(2*l + 1), "", SvPVX(sv));
c277df42
IZ
5699 if (next == NULL) /* Next ptr. */
5700 PerlIO_printf(Perl_debug_log, "(0)");
b81d288d 5701 else
b900a521 5702 PerlIO_printf(Perl_debug_log, "(%"IVdf")", (IV)(next - start));
c277df42
IZ
5703 (void)PerlIO_putc(Perl_debug_log, '\n');
5704 after_print:
22c35a8c 5705 if (PL_regkind[(U8)op] == BRANCHJ) {
b81d288d
AB
5706 register regnode *nnode = (OP(next) == LONGJMP
5707 ? regnext(next)
c277df42
IZ
5708 : next);
5709 if (last && nnode > last)
5710 nnode = last;
5711 node = dumpuntil(start, NEXTOPER(NEXTOPER(node)), nnode, sv, l + 1);
a0ed51b3 5712 }
22c35a8c 5713 else if (PL_regkind[(U8)op] == BRANCH) {
c277df42 5714 node = dumpuntil(start, NEXTOPER(node), next, sv, l + 1);
a0ed51b3 5715 }
a3621e74 5716 else if ( PL_regkind[(U8)op] == TRIE ) {
e1ec3a88
AL
5717 const I32 n = ARG(node);
5718 const reg_trie_data *trie = (reg_trie_data*)PL_regdata->data[n];
5719 const I32 arry_len = av_len(trie->words)+1;
a3621e74 5720 I32 word_idx;
a3621e74 5721 PerlIO_printf(Perl_debug_log,
e4584336 5722 "%*s[Words:%d Chars Stored:%d Unique Chars:%d States:%"IVdf"%s]\n",
a3621e74
YO
5723 (int)(2*(l+3)), "",
5724 trie->wordcount,
5725 trie->charcount,
5726 trie->uniquecharcount,
e4584336 5727 (IV)trie->laststate-1,
a3621e74
YO
5728 node->flags ? " EVAL mode" : "");
5729
5730 for (word_idx=0; word_idx < arry_len; word_idx++) {
5731 SV **elem_ptr=av_fetch(trie->words,word_idx,0);
5732 if (elem_ptr) {
5733 PerlIO_printf(Perl_debug_log, "%*s<%s%s%s>\n",
5734 (int)(2*(l+4)), "",
5735 PL_colors[0],
5736 SvPV_nolen(*elem_ptr),
5737 PL_colors[1]
5738 );
5739 /*
5740 if (next == NULL)
5741 PerlIO_printf(Perl_debug_log, "(0)\n");
5742 else
5743 PerlIO_printf(Perl_debug_log, "(%"IVdf")\n", (IV)(next - start));
5744 */
5745 }
5746
5747 }
5748
5749 node = NEXTOPER(node);
5750 node += regarglen[(U8)op];
5751
5752 }
a0ed51b3 5753 else if ( op == CURLY) { /* `next' might be very big: optimizer */
c277df42
IZ
5754 node = dumpuntil(start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
5755 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1, sv, l + 1);
a0ed51b3 5756 }
22c35a8c 5757 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
c277df42
IZ
5758 node = dumpuntil(start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
5759 next, sv, l + 1);
a0ed51b3
LW
5760 }
5761 else if ( op == PLUS || op == STAR) {
c277df42 5762 node = dumpuntil(start, NEXTOPER(node), NEXTOPER(node) + 1, sv, l + 1);
a0ed51b3
LW
5763 }
5764 else if (op == ANYOF) {
4f66b38d
HS
5765 /* arglen 1 + class block */
5766 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_LARGE)
5767 ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
5768 node = NEXTOPER(node);
a0ed51b3 5769 }
22c35a8c 5770 else if (PL_regkind[(U8)op] == EXACT) {
c277df42 5771 /* Literal string, where present. */
cd439c50 5772 node += NODE_SZ_STR(node) - 1;
c277df42 5773 node = NEXTOPER(node);
a0ed51b3
LW
5774 }
5775 else {
c277df42
IZ
5776 node = NEXTOPER(node);
5777 node += regarglen[(U8)op];
5778 }
5779 if (op == CURLYX || op == OPEN)
5780 l++;
5781 else if (op == WHILEM)
5782 l--;
5783 }
5784 return node;
5785}
5786
8fa7f367
JH
5787#endif /* DEBUGGING */
5788
a687059c 5789/*
fd181c75 5790 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
a687059c
LW
5791 */
5792void
864dbfa3 5793Perl_regdump(pTHX_ regexp *r)
a687059c 5794{
35ff7856 5795#ifdef DEBUGGING
46fc3d4c 5796 SV *sv = sv_newmortal();
a687059c 5797
c277df42 5798 (void)dumpuntil(r->program, r->program + 1, NULL, sv, 0);
a0d0e21e
LW
5799
5800 /* Header fields of interest. */
c277df42 5801 if (r->anchored_substr)
7b0972df 5802 PerlIO_printf(Perl_debug_log,
b81d288d 5803 "anchored `%s%.*s%s'%s at %"IVdf" ",
3280af22 5804 PL_colors[0],
7b0972df 5805 (int)(SvCUR(r->anchored_substr) - (SvTAIL(r->anchored_substr)!=0)),
b81d288d 5806 SvPVX(r->anchored_substr),
3280af22 5807 PL_colors[1],
c277df42 5808 SvTAIL(r->anchored_substr) ? "$" : "",
7b0972df 5809 (IV)r->anchored_offset);
33b8afdf
JH
5810 else if (r->anchored_utf8)
5811 PerlIO_printf(Perl_debug_log,
5812 "anchored utf8 `%s%.*s%s'%s at %"IVdf" ",
5813 PL_colors[0],
5814 (int)(SvCUR(r->anchored_utf8) - (SvTAIL(r->anchored_utf8)!=0)),
5815 SvPVX(r->anchored_utf8),
5816 PL_colors[1],
5817 SvTAIL(r->anchored_utf8) ? "$" : "",
5818 (IV)r->anchored_offset);
c277df42 5819 if (r->float_substr)
7b0972df 5820 PerlIO_printf(Perl_debug_log,
b81d288d 5821 "floating `%s%.*s%s'%s at %"IVdf"..%"UVuf" ",
3280af22 5822 PL_colors[0],
b81d288d 5823 (int)(SvCUR(r->float_substr) - (SvTAIL(r->float_substr)!=0)),
2c2d71f5 5824 SvPVX(r->float_substr),
3280af22 5825 PL_colors[1],
c277df42 5826 SvTAIL(r->float_substr) ? "$" : "",
7b0972df 5827 (IV)r->float_min_offset, (UV)r->float_max_offset);
33b8afdf
JH
5828 else if (r->float_utf8)
5829 PerlIO_printf(Perl_debug_log,
5830 "floating utf8 `%s%.*s%s'%s at %"IVdf"..%"UVuf" ",
5831 PL_colors[0],
5832 (int)(SvCUR(r->float_utf8) - (SvTAIL(r->float_utf8)!=0)),
5833 SvPVX(r->float_utf8),
5834 PL_colors[1],
5835 SvTAIL(r->float_utf8) ? "$" : "",
5836 (IV)r->float_min_offset, (UV)r->float_max_offset);
5837 if (r->check_substr || r->check_utf8)
b81d288d
AB
5838 PerlIO_printf(Perl_debug_log,
5839 r->check_substr == r->float_substr
33b8afdf 5840 && r->check_utf8 == r->float_utf8
c277df42
IZ
5841 ? "(checking floating" : "(checking anchored");
5842 if (r->reganch & ROPT_NOSCAN)
5843 PerlIO_printf(Perl_debug_log, " noscan");
5844 if (r->reganch & ROPT_CHECK_ALL)
5845 PerlIO_printf(Perl_debug_log, " isall");
33b8afdf 5846 if (r->check_substr || r->check_utf8)
c277df42
IZ
5847 PerlIO_printf(Perl_debug_log, ") ");
5848
46fc3d4c 5849 if (r->regstclass) {
5850 regprop(sv, r->regstclass);
5851 PerlIO_printf(Perl_debug_log, "stclass `%s' ", SvPVX(sv));
5852 }
774d564b 5853 if (r->reganch & ROPT_ANCH) {
5854 PerlIO_printf(Perl_debug_log, "anchored");
5855 if (r->reganch & ROPT_ANCH_BOL)
5856 PerlIO_printf(Perl_debug_log, "(BOL)");
c277df42
IZ
5857 if (r->reganch & ROPT_ANCH_MBOL)
5858 PerlIO_printf(Perl_debug_log, "(MBOL)");
cad2e5aa
JH
5859 if (r->reganch & ROPT_ANCH_SBOL)
5860 PerlIO_printf(Perl_debug_log, "(SBOL)");
774d564b 5861 if (r->reganch & ROPT_ANCH_GPOS)
5862 PerlIO_printf(Perl_debug_log, "(GPOS)");
5863 PerlIO_putc(Perl_debug_log, ' ');
5864 }
c277df42
IZ
5865 if (r->reganch & ROPT_GPOS_SEEN)
5866 PerlIO_printf(Perl_debug_log, "GPOS ");
a0d0e21e 5867 if (r->reganch & ROPT_SKIP)
760ac839 5868 PerlIO_printf(Perl_debug_log, "plus ");
a0d0e21e 5869 if (r->reganch & ROPT_IMPLICIT)
760ac839 5870 PerlIO_printf(Perl_debug_log, "implicit ");
760ac839 5871 PerlIO_printf(Perl_debug_log, "minlen %ld ", (long) r->minlen);
ce862d02
IZ
5872 if (r->reganch & ROPT_EVAL_SEEN)
5873 PerlIO_printf(Perl_debug_log, "with eval ");
760ac839 5874 PerlIO_printf(Perl_debug_log, "\n");
fac92740 5875 if (r->offsets) {
e4584336
RB
5876 U32 i;
5877 const U32 len = r->offsets[0];
a3621e74
YO
5878 GET_RE_DEBUG_FLAGS_DECL;
5879 DEBUG_OFFSETS_r({
e4584336
RB
5880 PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)r->offsets[0]);
5881 for (i = 1; i <= len; i++)
5882 PerlIO_printf(Perl_debug_log, "%"UVuf"[%"UVuf"] ",
5883 (UV)r->offsets[i*2-1], (UV)r->offsets[i*2]);
5884 PerlIO_printf(Perl_debug_log, "\n");
a3621e74 5885 });
fac92740 5886 }
17c3b450 5887#endif /* DEBUGGING */
a687059c
LW
5888}
5889
8fa7f367
JH
5890#ifdef DEBUGGING
5891
653099ff
GS
5892STATIC void
5893S_put_byte(pTHX_ SV *sv, int c)
5894{
7be5a6cf 5895 if (isCNTRL(c) || c == 255 || !isPRINT(c))
653099ff
GS
5896 Perl_sv_catpvf(aTHX_ sv, "\\%o", c);
5897 else if (c == '-' || c == ']' || c == '\\' || c == '^')
5898 Perl_sv_catpvf(aTHX_ sv, "\\%c", c);
5899 else
5900 Perl_sv_catpvf(aTHX_ sv, "%c", c);
5901}
5902
8fa7f367
JH
5903#endif /* DEBUGGING */
5904
a3621e74 5905
a687059c 5906/*
a0d0e21e
LW
5907- regprop - printable representation of opcode
5908*/
46fc3d4c 5909void
864dbfa3 5910Perl_regprop(pTHX_ SV *sv, regnode *o)
a687059c 5911{
35ff7856 5912#ifdef DEBUGGING
9b155405 5913 register int k;
a0d0e21e 5914
54dc92de 5915 sv_setpvn(sv, "", 0);
9b155405 5916 if (OP(o) >= reg_num) /* regnode.type is unsigned */
830247a4
IZ
5917 /* It would be nice to FAIL() here, but this may be called from
5918 regexec.c, and it would be hard to supply pRExC_state. */
5919 Perl_croak(aTHX_ "Corrupted regexp opcode");
bfed75c6 5920 sv_catpv(sv, reg_name[OP(o)]); /* Take off const! */
9b155405
IZ
5921
5922 k = PL_regkind[(U8)OP(o)];
5923
2a782b5b
JH
5924 if (k == EXACT) {
5925 SV *dsv = sv_2mortal(newSVpvn("", 0));
c728cb41
JH
5926 /* Using is_utf8_string() is a crude hack but it may
5927 * be the best for now since we have no flag "this EXACTish
5928 * node was UTF-8" --jhi */
5929 bool do_utf8 = is_utf8_string((U8*)STRING(o), STR_LEN(o));
8a989385 5930 char *s = do_utf8 ?
c728cb41
JH
5931 pv_uni_display(dsv, (U8*)STRING(o), STR_LEN(o), 60,
5932 UNI_DISPLAY_REGEX) :
2a782b5b 5933 STRING(o);
e1ec3a88 5934 const int len = do_utf8 ?
2a782b5b
JH
5935 strlen(s) :
5936 STR_LEN(o);
5937 Perl_sv_catpvf(aTHX_ sv, " <%s%.*s%s>",
5938 PL_colors[0],
5939 len, s,
5940 PL_colors[1]);
a3621e74
YO
5941 } else if (k == TRIE) {/*
5942 this isn't always safe, as Pl_regdata may not be for this regex yet
5943 (depending on where its called from) so its being moved to dumpuntil
5944 I32 n = ARG(o);
5945 reg_trie_data *trie=(reg_trie_data*)PL_regdata->data[n];
5946 Perl_sv_catpvf(aTHX_ sv, " (W:%d L:%d C:%d S:%d)",
5947 trie->wordcount,
5948 trie->charcount,
5949 trie->uniquecharcount,
5950 trie->laststate);
5951 */
5952 } else if (k == CURLY) {
cb434fcc 5953 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
cea2e8a9
GS
5954 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
5955 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
a0d0e21e 5956 }
2c2d71f5
JH
5957 else if (k == WHILEM && o->flags) /* Ordinal/of */
5958 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
9b155405 5959 else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP )
894356b3 5960 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
9b155405 5961 else if (k == LOGICAL)
04ebc1ab 5962 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
653099ff
GS
5963 else if (k == ANYOF) {
5964 int i, rangestart = -1;
ffc61ed2 5965 U8 flags = ANYOF_FLAGS(o);
a6d05634 5966 const char * const anyofs[] = { /* Should be synchronized with
19860706 5967 * ANYOF_ #xdefines in regcomp.h */
653099ff
GS
5968 "\\w",
5969 "\\W",
5970 "\\s",
5971 "\\S",
5972 "\\d",
5973 "\\D",
5974 "[:alnum:]",
5975 "[:^alnum:]",
5976 "[:alpha:]",
5977 "[:^alpha:]",
5978 "[:ascii:]",
5979 "[:^ascii:]",
5980 "[:ctrl:]",
5981 "[:^ctrl:]",
5982 "[:graph:]",
5983 "[:^graph:]",
5984 "[:lower:]",
5985 "[:^lower:]",
5986 "[:print:]",
5987 "[:^print:]",
5988 "[:punct:]",
5989 "[:^punct:]",
5990 "[:upper:]",
aaa51d5e 5991 "[:^upper:]",
653099ff 5992 "[:xdigit:]",
aaa51d5e
JF
5993 "[:^xdigit:]",
5994 "[:space:]",
5995 "[:^space:]",
5996 "[:blank:]",
5997 "[:^blank:]"
653099ff
GS
5998 };
5999
19860706 6000 if (flags & ANYOF_LOCALE)
653099ff 6001 sv_catpv(sv, "{loc}");
19860706 6002 if (flags & ANYOF_FOLD)
653099ff
GS
6003 sv_catpv(sv, "{i}");
6004 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
19860706 6005 if (flags & ANYOF_INVERT)
653099ff 6006 sv_catpv(sv, "^");
ffc61ed2
JH
6007 for (i = 0; i <= 256; i++) {
6008 if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
6009 if (rangestart == -1)
6010 rangestart = i;
6011 } else if (rangestart != -1) {
6012 if (i <= rangestart + 3)
6013 for (; rangestart < i; rangestart++)
653099ff 6014 put_byte(sv, rangestart);
ffc61ed2
JH
6015 else {
6016 put_byte(sv, rangestart);
6017 sv_catpv(sv, "-");
6018 put_byte(sv, i - 1);
653099ff 6019 }
ffc61ed2 6020 rangestart = -1;
653099ff 6021 }
847a199f 6022 }
ffc61ed2
JH
6023
6024 if (o->flags & ANYOF_CLASS)
6025 for (i = 0; i < sizeof(anyofs)/sizeof(char*); i++)
6026 if (ANYOF_CLASS_TEST(o,i))
6027 sv_catpv(sv, anyofs[i]);
6028
6029 if (flags & ANYOF_UNICODE)
6030 sv_catpv(sv, "{unicode}");
1aa99e6b 6031 else if (flags & ANYOF_UNICODE_ALL)
2a782b5b 6032 sv_catpv(sv, "{unicode_all}");
ffc61ed2
JH
6033
6034 {
6035 SV *lv;
9e55ce06 6036 SV *sw = regclass_swash(o, FALSE, &lv, 0);
b81d288d 6037
ffc61ed2
JH
6038 if (lv) {
6039 if (sw) {
89ebb4a3 6040 U8 s[UTF8_MAXBYTES_CASE+1];
b81d288d 6041
ffc61ed2 6042 for (i = 0; i <= 256; i++) { /* just the first 256 */
2b9d42f0 6043 U8 *e = uvchr_to_utf8(s, i);
ffc61ed2 6044
3568d838 6045 if (i < 256 && swash_fetch(sw, s, TRUE)) {
ffc61ed2
JH
6046 if (rangestart == -1)
6047 rangestart = i;
6048 } else if (rangestart != -1) {
6049 U8 *p;
b81d288d 6050
ffc61ed2
JH
6051 if (i <= rangestart + 3)
6052 for (; rangestart < i; rangestart++) {
2b9d42f0 6053 for(e = uvchr_to_utf8(s, rangestart), p = s; p < e; p++)
ffc61ed2
JH
6054 put_byte(sv, *p);
6055 }
6056 else {
2b9d42f0 6057 for (e = uvchr_to_utf8(s, rangestart), p = s; p < e; p++)
ffc61ed2
JH
6058 put_byte(sv, *p);
6059 sv_catpv(sv, "-");
2b9d42f0 6060 for (e = uvchr_to_utf8(s, i - 1), p = s; p < e; p++)
ffc61ed2
JH
6061 put_byte(sv, *p);
6062 }
6063 rangestart = -1;
6064 }
19860706 6065 }
ffc61ed2
JH
6066
6067 sv_catpv(sv, "..."); /* et cetera */
19860706 6068 }
fde631ed 6069
ffc61ed2 6070 {
2e0de35c 6071 char *s = savesvpv(lv);
ffc61ed2 6072 char *origs = s;
b81d288d 6073
ffc61ed2 6074 while(*s && *s != '\n') s++;
b81d288d 6075
ffc61ed2
JH
6076 if (*s == '\n') {
6077 char *t = ++s;
6078
6079 while (*s) {
6080 if (*s == '\n')
6081 *s = ' ';
6082 s++;
6083 }
6084 if (s[-1] == ' ')
6085 s[-1] = 0;
6086
6087 sv_catpv(sv, t);
fde631ed 6088 }
b81d288d 6089
ffc61ed2 6090 Safefree(origs);
fde631ed
JH
6091 }
6092 }
653099ff 6093 }
ffc61ed2 6094
653099ff
GS
6095 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
6096 }
9b155405 6097 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
cea2e8a9 6098 Perl_sv_catpvf(aTHX_ sv, "[-%d]", o->flags);
17c3b450 6099#endif /* DEBUGGING */
35ff7856 6100}
a687059c 6101
cad2e5aa
JH
6102SV *
6103Perl_re_intuit_string(pTHX_ regexp *prog)
6104{ /* Assume that RE_INTUIT is set */
a3621e74
YO
6105 GET_RE_DEBUG_FLAGS_DECL;
6106 DEBUG_COMPILE_r(
cad2e5aa 6107 { STRLEN n_a;
33b8afdf
JH
6108 char *s = SvPV(prog->check_substr
6109 ? prog->check_substr : prog->check_utf8, n_a);
cad2e5aa
JH
6110
6111 if (!PL_colorset) reginitcolors();
6112 PerlIO_printf(Perl_debug_log,
33b8afdf
JH
6113 "%sUsing REx %ssubstr:%s `%s%.60s%s%s'\n",
6114 PL_colors[4],
6115 prog->check_substr ? "" : "utf8 ",
6116 PL_colors[5],PL_colors[0],
cad2e5aa
JH
6117 s,
6118 PL_colors[1],
6119 (strlen(s) > 60 ? "..." : ""));
6120 } );
6121
33b8afdf 6122 return prog->check_substr ? prog->check_substr : prog->check_utf8;
cad2e5aa
JH
6123}
6124
2b69d0c2 6125void
864dbfa3 6126Perl_pregfree(pTHX_ struct regexp *r)
a687059c 6127{
27da23d5 6128 dVAR;
9e55ce06
JH
6129#ifdef DEBUGGING
6130 SV *dsv = PERL_DEBUG_PAD_ZERO(0);
a3621e74 6131 SV *re_debug_flags=get_sv(RE_DEBUG_FLAGS,0);
9e55ce06 6132#endif
7821416a 6133
a3621e74 6134
7821416a
IZ
6135 if (!r || (--r->refcnt > 0))
6136 return;
a3621e74 6137 DEBUG_r(if (re_debug_flags && (SvIV(re_debug_flags) & RE_DEBUG_COMPILE)) {
e1ec3a88
AL
6138 const char *s = (r->reganch & ROPT_UTF8)
6139 ? pv_uni_display(dsv, (U8*)r->precomp, r->prelen, 60, UNI_DISPLAY_REGEX)
9f369894 6140 : pv_display(dsv, r->precomp, r->prelen, 0, 60);
e1ec3a88 6141 const int len = SvCUR(dsv);
9e55ce06
JH
6142 if (!PL_colorset)
6143 reginitcolors();
6144 PerlIO_printf(Perl_debug_log,
a3621e74 6145 "%sFreeing REx:%s %s%*.*s%s%s\n",
9e55ce06
JH
6146 PL_colors[4],PL_colors[5],PL_colors[0],
6147 len, len, s,
6148 PL_colors[1],
6149 len > 60 ? "..." : "");
6150 });
cad2e5aa 6151
c277df42 6152 if (r->precomp)
a0d0e21e 6153 Safefree(r->precomp);
fac92740
MJD
6154 if (r->offsets) /* 20010421 MJD */
6155 Safefree(r->offsets);
ed252734
NC
6156 RX_MATCH_COPY_FREE(r);
6157#ifdef PERL_COPY_ON_WRITE
6158 if (r->saved_copy)
6159 SvREFCNT_dec(r->saved_copy);
6160#endif
a193d654
GS
6161 if (r->substrs) {
6162 if (r->anchored_substr)
6163 SvREFCNT_dec(r->anchored_substr);
33b8afdf
JH
6164 if (r->anchored_utf8)
6165 SvREFCNT_dec(r->anchored_utf8);
a193d654
GS
6166 if (r->float_substr)
6167 SvREFCNT_dec(r->float_substr);
33b8afdf
JH
6168 if (r->float_utf8)
6169 SvREFCNT_dec(r->float_utf8);
2779dcf1 6170 Safefree(r->substrs);
a193d654 6171 }
c277df42
IZ
6172 if (r->data) {
6173 int n = r->data->count;
f3548bdc
DM
6174 PAD* new_comppad = NULL;
6175 PAD* old_comppad;
4026c95a 6176 PADOFFSET refcnt;
dfad63ad 6177
c277df42 6178 while (--n >= 0) {
261faec3 6179 /* If you add a ->what type here, update the comment in regcomp.h */
c277df42
IZ
6180 switch (r->data->what[n]) {
6181 case 's':
6182 SvREFCNT_dec((SV*)r->data->data[n]);
6183 break;
653099ff
GS
6184 case 'f':
6185 Safefree(r->data->data[n]);
6186 break;
dfad63ad
HS
6187 case 'p':
6188 new_comppad = (AV*)r->data->data[n];
6189 break;
c277df42 6190 case 'o':
dfad63ad 6191 if (new_comppad == NULL)
cea2e8a9 6192 Perl_croak(aTHX_ "panic: pregfree comppad");
f3548bdc
DM
6193 PAD_SAVE_LOCAL(old_comppad,
6194 /* Watch out for global destruction's random ordering. */
6195 (SvTYPE(new_comppad) == SVt_PVAV) ?
6196 new_comppad : Null(PAD *)
6197 );
b34c0dd4 6198 OP_REFCNT_LOCK;
4026c95a
SH
6199 refcnt = OpREFCNT_dec((OP_4tree*)r->data->data[n]);
6200 OP_REFCNT_UNLOCK;
6201 if (!refcnt)
9b978d73 6202 op_free((OP_4tree*)r->data->data[n]);
9b978d73 6203
f3548bdc 6204 PAD_RESTORE_LOCAL(old_comppad);
dfad63ad
HS
6205 SvREFCNT_dec((SV*)new_comppad);
6206 new_comppad = NULL;
c277df42
IZ
6207 break;
6208 case 'n':
9e55ce06 6209 break;
a3621e74
YO
6210 case 't':
6211 {
6212 reg_trie_data *trie=(reg_trie_data*)r->data->data[n];
6213 U32 refcount;
6214 OP_REFCNT_LOCK;
6215 refcount = trie->refcount--;
6216 OP_REFCNT_UNLOCK;
6217 if ( !refcount ) {
6218 if (trie->charmap)
6219 Safefree(trie->charmap);
6220 if (trie->widecharmap)
6221 SvREFCNT_dec((SV*)trie->widecharmap);
6222 if (trie->states)
6223 Safefree(trie->states);
6224 if (trie->trans)
6225 Safefree(trie->trans);
6226#ifdef DEBUGGING
6227 if (trie->words)
6228 SvREFCNT_dec((SV*)trie->words);
6229 if (trie->revcharmap)
6230 SvREFCNT_dec((SV*)trie->revcharmap);
6231#endif
6232 Safefree(r->data->data[n]); /* do this last!!!! */
6233 }
6234 break;
6235 }
c277df42 6236 default:
830247a4 6237 Perl_croak(aTHX_ "panic: regfree data code '%c'", r->data->what[n]);
c277df42
IZ
6238 }
6239 }
6240 Safefree(r->data->what);
6241 Safefree(r->data);
a0d0e21e
LW
6242 }
6243 Safefree(r->startp);
6244 Safefree(r->endp);
6245 Safefree(r);
a687059c 6246}
c277df42
IZ
6247
6248/*
6249 - regnext - dig the "next" pointer out of a node
c277df42
IZ
6250 */
6251regnode *
864dbfa3 6252Perl_regnext(pTHX_ register regnode *p)
c277df42
IZ
6253{
6254 register I32 offset;
6255
3280af22 6256 if (p == &PL_regdummy)
c277df42
IZ
6257 return(NULL);
6258
6259 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
6260 if (offset == 0)
6261 return(NULL);
6262
c277df42 6263 return(p+offset);
c277df42
IZ
6264}
6265
01f988be 6266STATIC void
cea2e8a9 6267S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
c277df42
IZ
6268{
6269 va_list args;
6270 STRLEN l1 = strlen(pat1);
6271 STRLEN l2 = strlen(pat2);
6272 char buf[512];
06bf62c7 6273 SV *msv;
73d840c0 6274 const char *message;
c277df42
IZ
6275
6276 if (l1 > 510)
6277 l1 = 510;
6278 if (l1 + l2 > 510)
6279 l2 = 510 - l1;
6280 Copy(pat1, buf, l1 , char);
6281 Copy(pat2, buf + l1, l2 , char);
3b818b81
GS
6282 buf[l1 + l2] = '\n';
6283 buf[l1 + l2 + 1] = '\0';
8736538c
AS
6284#ifdef I_STDARG
6285 /* ANSI variant takes additional second argument */
c277df42 6286 va_start(args, pat2);
8736538c
AS
6287#else
6288 va_start(args);
6289#endif
5a844595 6290 msv = vmess(buf, &args);
c277df42 6291 va_end(args);
06bf62c7 6292 message = SvPV(msv,l1);
c277df42
IZ
6293 if (l1 > 512)
6294 l1 = 512;
6295 Copy(message, buf, l1 , char);
197cf9b9 6296 buf[l1-1] = '\0'; /* Overwrite \n */
cea2e8a9 6297 Perl_croak(aTHX_ "%s", buf);
c277df42 6298}
a0ed51b3
LW
6299
6300/* XXX Here's a total kludge. But we need to re-enter for swash routines. */
6301
6302void
864dbfa3 6303Perl_save_re_context(pTHX)
b81d288d 6304{
830247a4 6305 SAVEI32(PL_reg_flags); /* from regexec.c */
a0ed51b3 6306 SAVEPPTR(PL_bostr);
a0ed51b3
LW
6307 SAVEPPTR(PL_reginput); /* String-input pointer. */
6308 SAVEPPTR(PL_regbol); /* Beginning of input, for ^ check. */
6309 SAVEPPTR(PL_regeol); /* End of input, for $ check. */
7766f137
GS
6310 SAVEVPTR(PL_regstartp); /* Pointer to startp array. */
6311 SAVEVPTR(PL_regendp); /* Ditto for endp. */
6312 SAVEVPTR(PL_reglastparen); /* Similarly for lastparen. */
a5db57d6 6313 SAVEVPTR(PL_reglastcloseparen); /* Similarly for lastcloseparen. */
a0ed51b3 6314 SAVEPPTR(PL_regtill); /* How far we are required to go. */
b81d288d 6315 SAVEGENERICPV(PL_reg_start_tmp); /* from regexec.c */
a0ed51b3 6316 PL_reg_start_tmp = 0;
a0ed51b3
LW
6317 SAVEI32(PL_reg_start_tmpl); /* from regexec.c */
6318 PL_reg_start_tmpl = 0;
7766f137 6319 SAVEVPTR(PL_regdata);
a0ed51b3
LW
6320 SAVEI32(PL_reg_eval_set); /* from regexec.c */
6321 SAVEI32(PL_regnarrate); /* from regexec.c */
7766f137 6322 SAVEVPTR(PL_regprogram); /* from regexec.c */
a0ed51b3 6323 SAVEINT(PL_regindent); /* from regexec.c */
7766f137
GS
6324 SAVEVPTR(PL_regcc); /* from regexec.c */
6325 SAVEVPTR(PL_curcop);
7766f137
GS
6326 SAVEVPTR(PL_reg_call_cc); /* from regexec.c */
6327 SAVEVPTR(PL_reg_re); /* from regexec.c */
54b6e2fa
IZ
6328 SAVEPPTR(PL_reg_ganch); /* from regexec.c */
6329 SAVESPTR(PL_reg_sv); /* from regexec.c */
9febdf04 6330 SAVEBOOL(PL_reg_match_utf8); /* from regexec.c */
7766f137 6331 SAVEVPTR(PL_reg_magic); /* from regexec.c */
54b6e2fa 6332 SAVEI32(PL_reg_oldpos); /* from regexec.c */
7766f137
GS
6333 SAVEVPTR(PL_reg_oldcurpm); /* from regexec.c */
6334 SAVEVPTR(PL_reg_curpm); /* from regexec.c */
a5db57d6
GS
6335 SAVEPPTR(PL_reg_oldsaved); /* old saved substr during match */
6336 PL_reg_oldsaved = Nullch;
6337 SAVEI32(PL_reg_oldsavedlen); /* old length of saved substr during match */
6338 PL_reg_oldsavedlen = 0;
ed252734
NC
6339#ifdef PERL_COPY_ON_WRITE
6340 SAVESPTR(PL_nrs);
6341 PL_nrs = Nullsv;
6342#endif
a5db57d6
GS
6343 SAVEI32(PL_reg_maxiter); /* max wait until caching pos */
6344 PL_reg_maxiter = 0;
6345 SAVEI32(PL_reg_leftiter); /* wait until caching pos */
6346 PL_reg_leftiter = 0;
6347 SAVEGENERICPV(PL_reg_poscache); /* cache of pos of WHILEM */
6348 PL_reg_poscache = Nullch;
6349 SAVEI32(PL_reg_poscache_size); /* size of pos cache of WHILEM */
6350 PL_reg_poscache_size = 0;
6351 SAVEPPTR(PL_regprecomp); /* uncompiled string. */
5fb7366e 6352 SAVEI32(PL_regnpar); /* () count. */
e49a9654 6353 SAVEI32(PL_regsize); /* from regexec.c */
ada6e8a9
AMS
6354
6355 {
6356 /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
a8514157 6357 U32 i;
ada6e8a9
AMS
6358 GV *mgv;
6359 REGEXP *rx;
cf28e18a 6360 char digits[TYPE_CHARS(long)];
ada6e8a9
AMS
6361
6362 if (PL_curpm && (rx = PM_GETRE(PL_curpm))) {
6363 for (i = 1; i <= rx->nparens; i++) {
d994d9a1 6364 sprintf(digits, "%lu", (long)i);
ada6e8a9
AMS
6365 if ((mgv = gv_fetchpv(digits, FALSE, SVt_PV)))
6366 save_scalar(mgv);
6367 }
6368 }
6369 }
6370
54b6e2fa 6371#ifdef DEBUGGING
b81d288d 6372 SAVEPPTR(PL_reg_starttry); /* from regexec.c */
54b6e2fa 6373#endif
a0ed51b3 6374}
51371543 6375
51371543 6376static void
acfe0abc 6377clear_re(pTHX_ void *r)
51371543
GS
6378{
6379 ReREFCNT_dec((regexp *)r);
6380}
ffbc6a93 6381
241d1a3b
NC
6382/*
6383 * Local variables:
6384 * c-indentation-style: bsd
6385 * c-basic-offset: 4
6386 * indent-tabs-mode: t
6387 * End:
6388 *
c49a72a9 6389 * vim: shiftwidth=4:
241d1a3b 6390*/