This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Mention (un)?pack byte-order modifiers in perldelta
[perl5.git] / regcomp.c
CommitLineData
a0d0e21e
LW
1/* regcomp.c
2 */
3
4/*
5 * "A fair jaw-cracker dwarf-language must be." --Samwise Gamgee
6 */
7
61296642
DM
8/* This file contains functions for compiling a regular expression. See
9 * also regexec.c which funnily enough, contains functions for executing
166f8a29 10 * a regular expression.
e4a054ea
DM
11 *
12 * This file is also copied at build time to ext/re/re_comp.c, where
13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14 * This causes the main functions to be compiled under new names and with
15 * debugging support added, which makes "use re 'debug'" work.
166f8a29
DM
16 */
17
a687059c
LW
18/* NOTE: this is derived from Henry Spencer's regexp code, and should not
19 * confused with the original package (see point 3 below). Thanks, Henry!
20 */
21
22/* Additional note: this code is very heavily munged from Henry's version
23 * in places. In some spots I've traded clarity for efficiency, so don't
24 * blame Henry for some of the lack of readability.
25 */
26
e50aee73
AD
27/* The names of the functions have been changed from regcomp and
28 * regexec to pregcomp and pregexec in order to avoid conflicts
29 * with the POSIX routines of the same names.
30*/
31
b9d5759e
AD
32#ifdef PERL_EXT_RE_BUILD
33/* need to replace pregcomp et al, so enable that */
34# ifndef PERL_IN_XSUB_RE
35# define PERL_IN_XSUB_RE
36# endif
37/* need access to debugger hooks */
cad2e5aa 38# if defined(PERL_EXT_RE_DEBUG) && !defined(DEBUGGING)
b9d5759e
AD
39# define DEBUGGING
40# endif
41#endif
42
43#ifdef PERL_IN_XSUB_RE
d06ea78c 44/* We *really* need to overwrite these symbols: */
56953603
IZ
45# define Perl_pregcomp my_regcomp
46# define Perl_regdump my_regdump
47# define Perl_regprop my_regprop
d06ea78c 48# define Perl_pregfree my_regfree
cad2e5aa
JH
49# define Perl_re_intuit_string my_re_intuit_string
50/* *These* symbols are masked to allow static link. */
d06ea78c 51# define Perl_regnext my_regnext
f0b8d043 52# define Perl_save_re_context my_save_re_context
b81d288d 53# define Perl_reginitcolors my_reginitcolors
c5be433b
GS
54
55# define PERL_NO_GET_CONTEXT
b81d288d 56#endif
56953603 57
f0fcb552 58/*SUPPRESS 112*/
a687059c 59/*
e50aee73 60 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
61 *
62 * Copyright (c) 1986 by University of Toronto.
63 * Written by Henry Spencer. Not derived from licensed software.
64 *
65 * Permission is granted to anyone to use this software for any
66 * purpose on any computer system, and to redistribute it freely,
67 * subject to the following restrictions:
68 *
69 * 1. The author is not responsible for the consequences of use of
70 * this software, no matter how awful, even if they arise
71 * from defects in it.
72 *
73 * 2. The origin of this software must not be misrepresented, either
74 * by explicit claim or by omission.
75 *
76 * 3. Altered versions must be plainly marked as such, and must not
77 * be misrepresented as being the original software.
78 *
79 *
80 **** Alterations to Henry's code are...
81 ****
4bb101f2 82 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
241d1a3b 83 **** 2000, 2001, 2002, 2003, 2004, 2005, by Larry Wall and others
a687059c 84 ****
9ef589d8
LW
85 **** You may distribute under the terms of either the GNU General Public
86 **** License or the Artistic License, as specified in the README file.
87
a687059c
LW
88 *
89 * Beware that some of this code is subtly aware of the way operator
90 * precedence is structured in regular expressions. Serious changes in
91 * regular-expression syntax might require a total rethink.
92 */
93#include "EXTERN.h"
864dbfa3 94#define PERL_IN_REGCOMP_C
a687059c 95#include "perl.h"
d06ea78c 96
acfe0abc 97#ifndef PERL_IN_XSUB_RE
d06ea78c
GS
98# include "INTERN.h"
99#endif
c277df42
IZ
100
101#define REG_COMP_C
a687059c
LW
102#include "regcomp.h"
103
d4cce5f1 104#ifdef op
11343788 105#undef op
d4cce5f1 106#endif /* op */
11343788 107
fe14fcc3 108#ifdef MSDOS
7e4e8c89 109# if defined(BUGGY_MSC6)
fe14fcc3 110 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
7e4e8c89 111# pragma optimize("a",off)
fe14fcc3 112 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
7e4e8c89
NC
113# pragma optimize("w",on )
114# endif /* BUGGY_MSC6 */
fe14fcc3
LW
115#endif /* MSDOS */
116
a687059c
LW
117#ifndef STATIC
118#define STATIC static
119#endif
120
830247a4 121typedef struct RExC_state_t {
e2509266 122 U32 flags; /* are we folding, multilining? */
830247a4
IZ
123 char *precomp; /* uncompiled string. */
124 regexp *rx;
fac92740 125 char *start; /* Start of input for compile */
830247a4
IZ
126 char *end; /* End of input for compile */
127 char *parse; /* Input-scan pointer. */
128 I32 whilem_seen; /* number of WHILEM in this expr */
fac92740 129 regnode *emit_start; /* Start of emitted-code area */
ffc61ed2 130 regnode *emit; /* Code-emit pointer; &regdummy = don't = compiling */
830247a4
IZ
131 I32 naughty; /* How bad is this pattern? */
132 I32 sawback; /* Did we see \1, ...? */
133 U32 seen;
134 I32 size; /* Code size. */
135 I32 npar; /* () count. */
136 I32 extralen;
137 I32 seen_zerolen;
138 I32 seen_evals;
1aa99e6b 139 I32 utf8;
830247a4
IZ
140#if ADD_TO_REGEXEC
141 char *starttry; /* -Dr: where regtry was called. */
142#define RExC_starttry (pRExC_state->starttry)
143#endif
144} RExC_state_t;
145
e2509266 146#define RExC_flags (pRExC_state->flags)
830247a4
IZ
147#define RExC_precomp (pRExC_state->precomp)
148#define RExC_rx (pRExC_state->rx)
fac92740 149#define RExC_start (pRExC_state->start)
830247a4
IZ
150#define RExC_end (pRExC_state->end)
151#define RExC_parse (pRExC_state->parse)
152#define RExC_whilem_seen (pRExC_state->whilem_seen)
fac92740 153#define RExC_offsets (pRExC_state->rx->offsets) /* I am not like the others */
830247a4 154#define RExC_emit (pRExC_state->emit)
fac92740 155#define RExC_emit_start (pRExC_state->emit_start)
830247a4
IZ
156#define RExC_naughty (pRExC_state->naughty)
157#define RExC_sawback (pRExC_state->sawback)
158#define RExC_seen (pRExC_state->seen)
159#define RExC_size (pRExC_state->size)
160#define RExC_npar (pRExC_state->npar)
161#define RExC_extralen (pRExC_state->extralen)
162#define RExC_seen_zerolen (pRExC_state->seen_zerolen)
163#define RExC_seen_evals (pRExC_state->seen_evals)
1aa99e6b 164#define RExC_utf8 (pRExC_state->utf8)
830247a4 165
a687059c
LW
166#define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
167#define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
168 ((*s) == '{' && regcurly(s)))
a687059c 169
35c8bce7
LW
170#ifdef SPSTART
171#undef SPSTART /* dratted cpp namespace... */
172#endif
a687059c
LW
173/*
174 * Flags to be passed up and down.
175 */
a687059c 176#define WORST 0 /* Worst case. */
821b33a5 177#define HASWIDTH 0x1 /* Known to match non-null strings. */
a0d0e21e
LW
178#define SIMPLE 0x2 /* Simple enough to be STAR/PLUS operand. */
179#define SPSTART 0x4 /* Starts with * or +. */
180#define TRYAGAIN 0x8 /* Weeded out a declaration. */
a687059c 181
2c2d71f5
JH
182/* Length of a variant. */
183
184typedef struct scan_data_t {
185 I32 len_min;
186 I32 len_delta;
187 I32 pos_min;
188 I32 pos_delta;
189 SV *last_found;
190 I32 last_end; /* min value, <0 unless valid. */
191 I32 last_start_min;
192 I32 last_start_max;
193 SV **longest; /* Either &l_fixed, or &l_float. */
194 SV *longest_fixed;
195 I32 offset_fixed;
196 SV *longest_float;
197 I32 offset_float_min;
198 I32 offset_float_max;
199 I32 flags;
200 I32 whilem_c;
cb434fcc 201 I32 *last_closep;
653099ff 202 struct regnode_charclass_class *start_class;
2c2d71f5
JH
203} scan_data_t;
204
a687059c 205/*
e50aee73 206 * Forward declarations for pregcomp()'s friends.
a687059c 207 */
a0d0e21e 208
b81d288d 209static scan_data_t zero_scan_data = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
cb434fcc 210 0, 0, 0, 0, 0, 0};
c277df42
IZ
211
212#define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
213#define SF_BEFORE_SEOL 0x1
214#define SF_BEFORE_MEOL 0x2
215#define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
216#define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
217
09b7f37c
CB
218#ifdef NO_UNARY_PLUS
219# define SF_FIX_SHIFT_EOL (0+2)
220# define SF_FL_SHIFT_EOL (0+4)
221#else
222# define SF_FIX_SHIFT_EOL (+2)
223# define SF_FL_SHIFT_EOL (+4)
224#endif
c277df42
IZ
225
226#define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
227#define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
228
229#define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
230#define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
231#define SF_IS_INF 0x40
232#define SF_HAS_PAR 0x80
233#define SF_IN_PAR 0x100
234#define SF_HAS_EVAL 0x200
4bfe0158 235#define SCF_DO_SUBSTR 0x400
653099ff
GS
236#define SCF_DO_STCLASS_AND 0x0800
237#define SCF_DO_STCLASS_OR 0x1000
238#define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
e1901655 239#define SCF_WHILEM_VISITED_POS 0x2000
c277df42 240
eb160463 241#define UTF (RExC_utf8 != 0)
e2509266
JH
242#define LOC ((RExC_flags & PMf_LOCALE) != 0)
243#define FOLD ((RExC_flags & PMf_FOLD) != 0)
a0ed51b3 244
ffc61ed2 245#define OOB_UNICODE 12345678
93733859 246#define OOB_NAMEDCLASS -1
b8c5462f 247
a0ed51b3
LW
248#define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
249#define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
250
8615cb43 251
b45f050a
JF
252/* length of regex to show in messages that don't mark a position within */
253#define RegexLengthToShowInErrorMessages 127
254
255/*
256 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
257 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
258 * op/pragma/warn/regcomp.
259 */
7253e4e3
RK
260#define MARKER1 "<-- HERE" /* marker as it appears in the description */
261#define MARKER2 " <-- HERE " /* marker as it appears within the regex */
b81d288d 262
7253e4e3 263#define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
b45f050a
JF
264
265/*
266 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
267 * arg. Show regex, up to a maximum length. If it's too long, chop and add
268 * "...".
269 */
ccb2c380 270#define FAIL(msg) STMT_START { \
bfed75c6 271 const char *ellipses = ""; \
ccb2c380
MP
272 IV len = RExC_end - RExC_precomp; \
273 \
274 if (!SIZE_ONLY) \
275 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
276 if (len > RegexLengthToShowInErrorMessages) { \
277 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
278 len = RegexLengthToShowInErrorMessages - 10; \
279 ellipses = "..."; \
280 } \
281 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
282 msg, (int)len, RExC_precomp, ellipses); \
283} STMT_END
8615cb43 284
b45f050a
JF
285/*
286 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
287 * args. Show regex, up to a maximum length. If it's too long, chop and add
288 * "...".
289 */
ccb2c380 290#define FAIL2(pat,msg) STMT_START { \
bfed75c6 291 const char *ellipses = ""; \
ccb2c380
MP
292 IV len = RExC_end - RExC_precomp; \
293 \
294 if (!SIZE_ONLY) \
295 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
296 if (len > RegexLengthToShowInErrorMessages) { \
297 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
298 len = RegexLengthToShowInErrorMessages - 10; \
299 ellipses = "..."; \
300 } \
301 S_re_croak2(aTHX_ pat, " in regex m/%.*s%s/", \
302 msg, (int)len, RExC_precomp, ellipses); \
303} STMT_END
b45f050a
JF
304
305
306/*
307 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
308 */
ccb2c380
MP
309#define Simple_vFAIL(m) STMT_START { \
310 IV offset = RExC_parse - RExC_precomp; \
311 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
312 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
313} STMT_END
b45f050a
JF
314
315/*
316 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
317 */
ccb2c380
MP
318#define vFAIL(m) STMT_START { \
319 if (!SIZE_ONLY) \
320 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
321 Simple_vFAIL(m); \
322} STMT_END
b45f050a
JF
323
324/*
325 * Like Simple_vFAIL(), but accepts two arguments.
326 */
ccb2c380
MP
327#define Simple_vFAIL2(m,a1) STMT_START { \
328 IV offset = RExC_parse - RExC_precomp; \
329 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
330 (int)offset, RExC_precomp, RExC_precomp + offset); \
331} STMT_END
b45f050a
JF
332
333/*
334 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
335 */
ccb2c380
MP
336#define vFAIL2(m,a1) STMT_START { \
337 if (!SIZE_ONLY) \
338 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
339 Simple_vFAIL2(m, a1); \
340} STMT_END
b45f050a
JF
341
342
343/*
344 * Like Simple_vFAIL(), but accepts three arguments.
345 */
ccb2c380
MP
346#define Simple_vFAIL3(m, a1, a2) STMT_START { \
347 IV offset = RExC_parse - RExC_precomp; \
348 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
349 (int)offset, RExC_precomp, RExC_precomp + offset); \
350} STMT_END
b45f050a
JF
351
352/*
353 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
354 */
ccb2c380
MP
355#define vFAIL3(m,a1,a2) STMT_START { \
356 if (!SIZE_ONLY) \
357 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
358 Simple_vFAIL3(m, a1, a2); \
359} STMT_END
b45f050a
JF
360
361/*
362 * Like Simple_vFAIL(), but accepts four arguments.
363 */
ccb2c380
MP
364#define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
365 IV offset = RExC_parse - RExC_precomp; \
366 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
367 (int)offset, RExC_precomp, RExC_precomp + offset); \
368} STMT_END
b45f050a
JF
369
370/*
371 * Like Simple_vFAIL(), but accepts five arguments.
372 */
ccb2c380
MP
373#define Simple_vFAIL5(m, a1, a2, a3, a4) STMT_START { \
374 IV offset = RExC_parse - RExC_precomp; \
375 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, a4, \
376 (int)offset, RExC_precomp, RExC_precomp + offset); \
377} STMT_END
378
379
380#define vWARN(loc,m) STMT_START { \
381 IV offset = loc - RExC_precomp; \
382 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \
383 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
384} STMT_END
385
386#define vWARNdep(loc,m) STMT_START { \
387 IV offset = loc - RExC_precomp; \
388 Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
389 "%s" REPORT_LOCATION, \
390 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
391} STMT_END
392
393
394#define vWARN2(loc, m, a1) STMT_START { \
395 IV offset = loc - RExC_precomp; \
396 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
397 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
398} STMT_END
399
400#define vWARN3(loc, m, a1, a2) STMT_START { \
401 IV offset = loc - RExC_precomp; \
402 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
403 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
404} STMT_END
405
406#define vWARN4(loc, m, a1, a2, a3) STMT_START { \
407 IV offset = loc - RExC_precomp; \
408 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
409 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
410} STMT_END
411
412#define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
413 IV offset = loc - RExC_precomp; \
414 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
415 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
416} STMT_END
9d1d55b5 417
8615cb43 418
cd439c50 419/* Allow for side effects in s */
ccb2c380
MP
420#define REGC(c,s) STMT_START { \
421 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
422} STMT_END
cd439c50 423
fac92740
MJD
424/* Macros for recording node offsets. 20001227 mjd@plover.com
425 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
426 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
427 * Element 0 holds the number n.
428 */
429
430#define MJD_OFFSET_DEBUG(x)
a3621e74 431/* #define MJD_OFFSET_DEBUG(x) DEBUG_r(Perl_warn_nocontext x) */
ccb2c380
MP
432
433
434#define Set_Node_Offset_To_R(node,byte) STMT_START { \
435 if (! SIZE_ONLY) { \
436 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
437 __LINE__, (node), (byte))); \
438 if((node) < 0) { \
439 Perl_croak(aTHX_ "value of node is %d in Offset macro", node); \
440 } else { \
441 RExC_offsets[2*(node)-1] = (byte); \
442 } \
443 } \
444} STMT_END
445
446#define Set_Node_Offset(node,byte) \
447 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
448#define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
449
450#define Set_Node_Length_To_R(node,len) STMT_START { \
451 if (! SIZE_ONLY) { \
452 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
453 __LINE__, (node), (len))); \
454 if((node) < 0) { \
455 Perl_croak(aTHX_ "value of node is %d in Length macro", node); \
456 } else { \
457 RExC_offsets[2*(node)] = (len); \
458 } \
459 } \
460} STMT_END
461
462#define Set_Node_Length(node,len) \
463 Set_Node_Length_To_R((node)-RExC_emit_start, len)
464#define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
465#define Set_Node_Cur_Length(node) \
466 Set_Node_Length(node, RExC_parse - parse_start)
fac92740
MJD
467
468/* Get offsets and lengths */
469#define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
470#define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
471
acfe0abc 472static void clear_re(pTHX_ void *r);
4327152a 473
653099ff
GS
474/* Mark that we cannot extend a found fixed substring at this point.
475 Updata the longest found anchored substring and the longest found
476 floating substrings if needed. */
477
4327152a 478STATIC void
830247a4 479S_scan_commit(pTHX_ RExC_state_t *pRExC_state, scan_data_t *data)
c277df42 480{
a0ed51b3
LW
481 STRLEN l = CHR_SVLEN(data->last_found);
482 STRLEN old_l = CHR_SVLEN(*data->longest);
b81d288d 483
c277df42 484 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
6b43b216 485 SvSetMagicSV(*data->longest, data->last_found);
c277df42
IZ
486 if (*data->longest == data->longest_fixed) {
487 data->offset_fixed = l ? data->last_start_min : data->pos_min;
488 if (data->flags & SF_BEFORE_EOL)
b81d288d 489 data->flags
c277df42
IZ
490 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
491 else
492 data->flags &= ~SF_FIX_BEFORE_EOL;
a0ed51b3
LW
493 }
494 else {
c277df42 495 data->offset_float_min = l ? data->last_start_min : data->pos_min;
b81d288d
AB
496 data->offset_float_max = (l
497 ? data->last_start_max
c277df42 498 : data->pos_min + data->pos_delta);
9051bda5
HS
499 if ((U32)data->offset_float_max > (U32)I32_MAX)
500 data->offset_float_max = I32_MAX;
c277df42 501 if (data->flags & SF_BEFORE_EOL)
b81d288d 502 data->flags
c277df42
IZ
503 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
504 else
505 data->flags &= ~SF_FL_BEFORE_EOL;
506 }
507 }
508 SvCUR_set(data->last_found, 0);
0eda9292
JH
509 {
510 SV * sv = data->last_found;
511 MAGIC *mg =
512 SvUTF8(sv) && SvMAGICAL(sv) ? mg_find(sv, PERL_MAGIC_utf8) : NULL;
513 if (mg && mg->mg_len > 0)
514 mg->mg_len = 0;
515 }
c277df42
IZ
516 data->last_end = -1;
517 data->flags &= ~SF_BEFORE_EOL;
518}
519
653099ff
GS
520/* Can match anything (initialization) */
521STATIC void
830247a4 522S_cl_anything(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 523{
653099ff 524 ANYOF_CLASS_ZERO(cl);
f8bef550 525 ANYOF_BITMAP_SETALL(cl);
1aa99e6b 526 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
653099ff
GS
527 if (LOC)
528 cl->flags |= ANYOF_LOCALE;
529}
530
531/* Can match anything (initialization) */
532STATIC int
533S_cl_is_anything(pTHX_ struct regnode_charclass_class *cl)
534{
535 int value;
536
aaa51d5e 537 for (value = 0; value <= ANYOF_MAX; value += 2)
653099ff
GS
538 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
539 return 1;
1aa99e6b
IH
540 if (!(cl->flags & ANYOF_UNICODE_ALL))
541 return 0;
f8bef550
NC
542 if (!ANYOF_BITMAP_TESTALLSET(cl))
543 return 0;
653099ff
GS
544 return 1;
545}
546
547/* Can match anything (initialization) */
548STATIC void
830247a4 549S_cl_init(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 550{
8ecf7187 551 Zero(cl, 1, struct regnode_charclass_class);
653099ff 552 cl->type = ANYOF;
830247a4 553 cl_anything(pRExC_state, cl);
653099ff
GS
554}
555
556STATIC void
830247a4 557S_cl_init_zero(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 558{
8ecf7187 559 Zero(cl, 1, struct regnode_charclass_class);
653099ff 560 cl->type = ANYOF;
830247a4 561 cl_anything(pRExC_state, cl);
653099ff
GS
562 if (LOC)
563 cl->flags |= ANYOF_LOCALE;
564}
565
566/* 'And' a given class with another one. Can create false positives */
567/* We assume that cl is not inverted */
568STATIC void
569S_cl_and(pTHX_ struct regnode_charclass_class *cl,
570 struct regnode_charclass_class *and_with)
571{
653099ff
GS
572 if (!(and_with->flags & ANYOF_CLASS)
573 && !(cl->flags & ANYOF_CLASS)
574 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
575 && !(and_with->flags & ANYOF_FOLD)
576 && !(cl->flags & ANYOF_FOLD)) {
577 int i;
578
579 if (and_with->flags & ANYOF_INVERT)
580 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
581 cl->bitmap[i] &= ~and_with->bitmap[i];
582 else
583 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
584 cl->bitmap[i] &= and_with->bitmap[i];
585 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
586 if (!(and_with->flags & ANYOF_EOS))
587 cl->flags &= ~ANYOF_EOS;
1aa99e6b 588
14ebb1a2
JH
589 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE &&
590 !(and_with->flags & ANYOF_INVERT)) {
1aa99e6b
IH
591 cl->flags &= ~ANYOF_UNICODE_ALL;
592 cl->flags |= ANYOF_UNICODE;
593 ARG_SET(cl, ARG(and_with));
594 }
14ebb1a2
JH
595 if (!(and_with->flags & ANYOF_UNICODE_ALL) &&
596 !(and_with->flags & ANYOF_INVERT))
1aa99e6b 597 cl->flags &= ~ANYOF_UNICODE_ALL;
14ebb1a2
JH
598 if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)) &&
599 !(and_with->flags & ANYOF_INVERT))
1aa99e6b 600 cl->flags &= ~ANYOF_UNICODE;
653099ff
GS
601}
602
603/* 'OR' a given class with another one. Can create false positives */
604/* We assume that cl is not inverted */
605STATIC void
830247a4 606S_cl_or(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, struct regnode_charclass_class *or_with)
653099ff 607{
653099ff
GS
608 if (or_with->flags & ANYOF_INVERT) {
609 /* We do not use
610 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
611 * <= (B1 | !B2) | (CL1 | !CL2)
612 * which is wasteful if CL2 is small, but we ignore CL2:
613 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
614 * XXXX Can we handle case-fold? Unclear:
615 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
616 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
617 */
618 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
619 && !(or_with->flags & ANYOF_FOLD)
620 && !(cl->flags & ANYOF_FOLD) ) {
621 int i;
622
623 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
624 cl->bitmap[i] |= ~or_with->bitmap[i];
625 } /* XXXX: logic is complicated otherwise */
626 else {
830247a4 627 cl_anything(pRExC_state, cl);
653099ff
GS
628 }
629 } else {
630 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
631 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
b81d288d 632 && (!(or_with->flags & ANYOF_FOLD)
653099ff
GS
633 || (cl->flags & ANYOF_FOLD)) ) {
634 int i;
635
636 /* OR char bitmap and class bitmap separately */
637 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
638 cl->bitmap[i] |= or_with->bitmap[i];
639 if (or_with->flags & ANYOF_CLASS) {
640 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
641 cl->classflags[i] |= or_with->classflags[i];
642 cl->flags |= ANYOF_CLASS;
643 }
644 }
645 else { /* XXXX: logic is complicated, leave it along for a moment. */
830247a4 646 cl_anything(pRExC_state, cl);
653099ff
GS
647 }
648 }
649 if (or_with->flags & ANYOF_EOS)
650 cl->flags |= ANYOF_EOS;
1aa99e6b
IH
651
652 if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
653 ARG(cl) != ARG(or_with)) {
654 cl->flags |= ANYOF_UNICODE_ALL;
655 cl->flags &= ~ANYOF_UNICODE;
656 }
657 if (or_with->flags & ANYOF_UNICODE_ALL) {
658 cl->flags |= ANYOF_UNICODE_ALL;
659 cl->flags &= ~ANYOF_UNICODE;
660 }
653099ff
GS
661}
662
5d1c421c 663/*
a3621e74
YO
664
665 make_trie(startbranch,first,last,tail,flags)
666 startbranch: the first branch in the whole branch sequence
667 first : start branch of sequence of branch-exact nodes.
668 May be the same as startbranch
669 last : Thing following the last branch.
670 May be the same as tail.
671 tail : item following the branch sequence
672 flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/
673
674Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
675
676A trie is an N'ary tree where the branches are determined by digital
677decomposition of the key. IE, at the root node you look up the 1st character and
678follow that branch repeat until you find the end of the branches. Nodes can be
679marked as "accepting" meaning they represent a complete word. Eg:
680
681 /he|she|his|hers/
682
683would convert into the following structure. Numbers represent states, letters
684following numbers represent valid transitions on the letter from that state, if
685the number is in square brackets it represents an accepting state, otherwise it
686will be in parenthesis.
687
688 +-h->+-e->[3]-+-r->(8)-+-s->[9]
689 | |
690 | (2)
691 | |
692 (1) +-i->(6)-+-s->[7]
693 |
694 +-s->(3)-+-h->(4)-+-e->[5]
695
696 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
697
698This shows that when matching against the string 'hers' we will begin at state 1
699read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
700then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
701is also accepting. Thus we know that we can match both 'he' and 'hers' with a
702single traverse. We store a mapping from accepting to state to which word was
703matched, and then when we have multiple possibilities we try to complete the
704rest of the regex in the order in which they occured in the alternation.
705
706The only prior NFA like behaviour that would be changed by the TRIE support is
707the silent ignoring of duplicate alternations which are of the form:
708
709 / (DUPE|DUPE) X? (?{ ... }) Y /x
710
711Thus EVAL blocks follwing a trie may be called a different number of times with
712and without the optimisation. With the optimisations dupes will be silently
713ignored. This inconsistant behaviour of EVAL type nodes is well established as
714the following demonstrates:
715
716 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
717
718which prints out 'word' three times, but
719
720 'words'=~/(word|word|word)(?{ print $1 })S/
721
722which doesnt print it out at all. This is due to other optimisations kicking in.
723
724Example of what happens on a structural level:
725
726The regexp /(ac|ad|ab)+/ will produce the folowing debug output:
727
728 1: CURLYM[1] {1,32767}(18)
729 5: BRANCH(8)
730 6: EXACT <ac>(16)
731 8: BRANCH(11)
732 9: EXACT <ad>(16)
733 11: BRANCH(14)
734 12: EXACT <ab>(16)
735 16: SUCCEED(0)
736 17: NOTHING(18)
737 18: END(0)
738
739This would be optimizable with startbranch=5, first=5, last=16, tail=16
740and should turn into:
741
742 1: CURLYM[1] {1,32767}(18)
743 5: TRIE(16)
744 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
745 <ac>
746 <ad>
747 <ab>
748 16: SUCCEED(0)
749 17: NOTHING(18)
750 18: END(0)
751
752Cases where tail != last would be like /(?foo|bar)baz/:
753
754 1: BRANCH(4)
755 2: EXACT <foo>(8)
756 4: BRANCH(7)
757 5: EXACT <bar>(8)
758 7: TAIL(8)
759 8: EXACT <baz>(10)
760 10: END(0)
761
762which would be optimizable with startbranch=1, first=1, last=7, tail=8
763and would end up looking like:
764
765 1: TRIE(8)
766 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
767 <foo>
768 <bar>
769 7: TAIL(8)
770 8: EXACT <baz>(10)
771 10: END(0)
772
773*/
774
775#define TRIE_DEBUG_CHAR \
776 DEBUG_TRIE_COMPILE_r({ \
777 SV *tmp; \
778 if ( UTF ) { \
779 tmp = newSVpv( "", 0 ); \
780 pv_uni_display( tmp, uc, len, 60, UNI_DISPLAY_REGEX ); \
781 } else { \
782 tmp = Perl_newSVpvf_nocontext( "%c", uvc ); \
783 } \
784 av_push( trie->revcharmap, tmp ); \
785 })
786
787#define TRIE_READ_CHAR STMT_START { \
788 if ( UTF ) { \
789 if ( folder ) { \
790 if ( foldlen > 0 ) { \
791 uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags ); \
792 foldlen -= len; \
793 scan += len; \
794 len = 0; \
795 } else { \
796 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags); \
797 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
798 foldlen -= UNISKIP( uvc ); \
799 scan = foldbuf + UNISKIP( uvc ); \
800 } \
801 } else { \
802 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags); \
803 } \
804 } else { \
805 uvc = (U32)*uc; \
806 len = 1; \
807 } \
808} STMT_END
809
810
811#define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
812#define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
813#define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
814#define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
815
816#define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
817 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
818 TRIE_LIST_LEN( state ) *= 2; \
819 Renew( trie->states[ state ].trans.list, \
820 TRIE_LIST_LEN( state ), reg_trie_trans_le ); \
821 } \
822 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
823 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
824 TRIE_LIST_CUR( state )++; \
825} STMT_END
826
827#define TRIE_LIST_NEW(state) STMT_START { \
828 Newz( 1023, trie->states[ state ].trans.list, \
829 4, reg_trie_trans_le ); \
830 TRIE_LIST_CUR( state ) = 1; \
831 TRIE_LIST_LEN( state ) = 4; \
832} STMT_END
833
834STATIC I32
835S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 flags)
836{
837 /* first pass, loop through and scan words */
838 reg_trie_data *trie;
839 regnode *cur;
840 U32 uniflags = ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY;
841 STRLEN len = 0;
842 UV uvc = 0;
843 U16 curword = 0;
844 U32 next_alloc = 0;
845 /* we just use folder as a flag in utf8 */
846 const U8 *folder=( flags == EXACTF
847 ? PL_fold
848 : ( flags == EXACTFL
849 ? PL_fold_locale
850 : NULL
851 )
852 );
853
854 U32 data_slot = add_data( pRExC_state, 1, "t" );
855 SV *re_trie_maxbuff;
856
857 GET_RE_DEBUG_FLAGS_DECL;
858
859 Newz( 848200, trie, 1, reg_trie_data );
860 trie->refcount = 1;
861 RExC_rx->data->data[ data_slot ] = (void*)trie;
862 Newz( 848201, trie->charmap, 256, U16 );
863 DEBUG_r({
864 trie->words = newAV();
865 trie->revcharmap = newAV();
866 });
867
868
0111c4fd 869 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
a3621e74 870 if (!SvIOK(re_trie_maxbuff)) {
0111c4fd 871 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
a3621e74
YO
872 }
873
874 /* -- First loop and Setup --
875
876 We first traverse the branches and scan each word to determine if it
877 contains widechars, and how many unique chars there are, this is
878 important as we have to build a table with at least as many columns as we
879 have unique chars.
880
881 We use an array of integers to represent the character codes 0..255
882 (trie->charmap) and we use a an HV* to store unicode characters. We use the
883 native representation of the character value as the key and IV's for the
884 coded index.
885
886 *TODO* If we keep track of how many times each character is used we can
887 remap the columns so that the table compression later on is more
888 efficient in terms of memory by ensuring most common value is in the
889 middle and the least common are on the outside. IMO this would be better
890 than a most to least common mapping as theres a decent chance the most
891 common letter will share a node with the least common, meaning the node
892 will not be compressable. With a middle is most common approach the worst
893 case is when we have the least common nodes twice.
894
895 */
896
897
898 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
899 regnode *noper = NEXTOPER( cur );
900 U8 *uc = (U8*)STRING( noper );
901 U8 *e = uc + STR_LEN( noper );
902 STRLEN foldlen = 0;
903 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
904 U8 *scan;
905
906 for ( ; uc < e ; uc += len ) {
907 trie->charcount++;
908 TRIE_READ_CHAR;
909 if ( uvc < 256 ) {
910 if ( !trie->charmap[ uvc ] ) {
911 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
912 if ( folder )
913 trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
914 TRIE_DEBUG_CHAR;
915 }
916 } else {
917 SV** svpp;
918 if ( !trie->widecharmap )
919 trie->widecharmap = newHV();
920
921 svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 1 );
922
923 if ( !svpp )
924 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%X", uvc );
925
926 if ( !SvTRUE( *svpp ) ) {
927 sv_setiv( *svpp, ++trie->uniquecharcount );
928 TRIE_DEBUG_CHAR;
929 }
930 }
931 }
932 trie->wordcount++;
933 } /* end first pass */
934 DEBUG_TRIE_COMPILE_r(
935 PerlIO_printf( Perl_debug_log, "TRIE(%s): W:%d C:%d Uq:%d \n",
936 ( trie->widecharmap ? "UTF8" : "NATIVE" ), trie->wordcount,
937 trie->charcount, trie->uniquecharcount )
938 );
939
940
941 /*
942 We now know what we are dealing with in terms of unique chars and
943 string sizes so we can calculate how much memory a naive
0111c4fd
RGS
944 representation using a flat table will take. If it's over a reasonable
945 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
a3621e74
YO
946 conservative but potentially much slower representation using an array
947 of lists.
948
949 At the end we convert both representations into the same compressed
950 form that will be used in regexec.c for matching with. The latter
951 is a form that cannot be used to construct with but has memory
952 properties similar to the list form and access properties similar
953 to the table form making it both suitable for fast searches and
954 small enough that its feasable to store for the duration of a program.
955
956 See the comment in the code where the compressed table is produced
957 inplace from the flat tabe representation for an explanation of how
958 the compression works.
959
960 */
961
962
963 if ( (IV)( ( trie->charcount + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
964 /*
965 Second Pass -- Array Of Lists Representation
966
967 Each state will be represented by a list of charid:state records
968 (reg_trie_trans_le) the first such element holds the CUR and LEN
969 points of the allocated array. (See defines above).
970
971 We build the initial structure using the lists, and then convert
972 it into the compressed table form which allows faster lookups
973 (but cant be modified once converted).
974
975
976 */
977
978
979 STRLEN transcount = 1;
980
981 Newz( 848204, trie->states, trie->charcount + 2, reg_trie_state );
982 TRIE_LIST_NEW(1);
983 next_alloc = 2;
984
985 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
986
987 regnode *noper = NEXTOPER( cur );
988 U8 *uc = (U8*)STRING( noper );
989 U8 *e = uc + STR_LEN( noper );
990 U32 state = 1; /* required init */
991 U16 charid = 0; /* sanity init */
992 U8 *scan = (U8*)NULL; /* sanity init */
993 STRLEN foldlen = 0; /* required init */
994 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
995
996
997 for ( ; uc < e ; uc += len ) {
998
999 TRIE_READ_CHAR;
1000
1001 if ( uvc < 256 ) {
1002 charid = trie->charmap[ uvc ];
1003 } else {
1004 SV** svpp=(SV**)NULL;
1005 svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 0);
1006 if ( !svpp ) {
1007 charid = 0;
1008 } else {
1009 charid=(U16)SvIV( *svpp );
1010 }
1011 }
1012 if ( charid ) {
1013
1014 U16 check;
1015 U32 newstate = 0;
1016
1017 charid--;
1018 if ( !trie->states[ state ].trans.list ) {
1019 TRIE_LIST_NEW( state );
1020 }
1021 for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1022 if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1023 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1024 break;
1025 }
1026 }
1027 if ( ! newstate ) {
1028 newstate = next_alloc++;
1029 TRIE_LIST_PUSH( state, charid, newstate );
1030 transcount++;
1031 }
1032 state = newstate;
1033
1034 } else {
1035 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %d", uvc );
1036 }
1037 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1038 }
1039
1040 if ( !trie->states[ state ].wordnum ) {
1041 /* we havent inserted this word into the structure yet. */
1042 trie->states[ state ].wordnum = ++curword;
1043
1044 DEBUG_r({
1045 /* store the word for dumping */
1046 SV* tmp = newSVpvn( STRING( noper ), STR_LEN( noper ) );
1047 if ( UTF ) SvUTF8_on( tmp );
1048 av_push( trie->words, tmp );
1049 });
1050
1051 } else {
1052 /* Its a dupe. So ignore it. */
1053 }
1054
1055 } /* end second pass */
1056
1057 trie->laststate = next_alloc;
1058 Renew( trie->states, next_alloc, reg_trie_state );
1059
1060 DEBUG_TRIE_COMPILE_MORE_r({
1061 U32 state;
1062 U16 charid;
1063
1064 /*
1065 print out the table precompression.
1066 */
1067
1068 PerlIO_printf( Perl_debug_log, "\nState :Word | Transition Data\n" );
1069 PerlIO_printf( Perl_debug_log, "------:-----+-----------------" );
1070
1071 for( state=1 ; state < next_alloc ; state ++ ) {
1072
1073 PerlIO_printf( Perl_debug_log, "\n %04X :", state );
1074 if ( ! trie->states[ state ].wordnum ) {
1075 PerlIO_printf( Perl_debug_log, "%5s| ","");
1076 } else {
1077 PerlIO_printf( Perl_debug_log, "W%04X| ",
1078 trie->states[ state ].wordnum
1079 );
1080 }
1081 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
1082 SV **tmp = av_fetch( trie->revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
1083 PerlIO_printf( Perl_debug_log, "%s:%3X=%04X | ",
1084 SvPV_nolen( *tmp ),
1085 TRIE_LIST_ITEM(state,charid).forid,
1086 TRIE_LIST_ITEM(state,charid).newstate
1087 );
1088 }
1089
1090 }
1091 PerlIO_printf( Perl_debug_log, "\n\n" );
1092 });
1093
1094 Newz( 848203, trie->trans, transcount ,reg_trie_trans );
1095 {
1096 U32 state;
1097 U16 idx;
1098 U32 tp = 0;
1099 U32 zp = 0;
1100
1101
1102 for( state=1 ; state < next_alloc ; state ++ ) {
1103 U32 base=0;
1104
1105 /*
1106 DEBUG_TRIE_COMPILE_MORE_r(
1107 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1108 );
1109 */
1110
1111 if (trie->states[state].trans.list) {
1112 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1113 U16 maxid=minid;
1114
1115
1116 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1117 if ( TRIE_LIST_ITEM( state, idx).forid < minid ) {
1118 minid=TRIE_LIST_ITEM( state, idx).forid;
1119 } else if ( TRIE_LIST_ITEM( state, idx).forid > maxid ) {
1120 maxid=TRIE_LIST_ITEM( state, idx).forid;
1121 }
1122 }
1123 if ( transcount < tp + maxid - minid + 1) {
1124 transcount *= 2;
1125 Renew( trie->trans, transcount, reg_trie_trans );
1126 Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1127 }
1128 base = trie->uniquecharcount + tp - minid;
1129 if ( maxid == minid ) {
1130 U32 set = 0;
1131 for ( ; zp < tp ; zp++ ) {
1132 if ( ! trie->trans[ zp ].next ) {
1133 base = trie->uniquecharcount + zp - minid;
1134 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1135 trie->trans[ zp ].check = state;
1136 set = 1;
1137 break;
1138 }
1139 }
1140 if ( !set ) {
1141 trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1142 trie->trans[ tp ].check = state;
1143 tp++;
1144 zp = tp;
1145 }
1146 } else {
1147 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1148 U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
1149 trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1150 trie->trans[ tid ].check = state;
1151 }
1152 tp += ( maxid - minid + 1 );
1153 }
1154 Safefree(trie->states[ state ].trans.list);
1155 }
1156 /*
1157 DEBUG_TRIE_COMPILE_MORE_r(
1158 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1159 );
1160 */
1161 trie->states[ state ].trans.base=base;
1162 }
cc601c31 1163 trie->lasttrans = tp + 1;
a3621e74
YO
1164 }
1165 } else {
1166 /*
1167 Second Pass -- Flat Table Representation.
1168
1169 we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1170 We know that we will need Charcount+1 trans at most to store the data
1171 (one row per char at worst case) So we preallocate both structures
1172 assuming worst case.
1173
1174 We then construct the trie using only the .next slots of the entry
1175 structs.
1176
1177 We use the .check field of the first entry of the node temporarily to
1178 make compression both faster and easier by keeping track of how many non
1179 zero fields are in the node.
1180
1181 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1182 transition.
1183
1184 There are two terms at use here: state as a TRIE_NODEIDX() which is a
1185 number representing the first entry of the node, and state as a
1186 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1187 TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1188 are 2 entrys per node. eg:
1189
1190 A B A B
1191 1. 2 4 1. 3 7
1192 2. 0 3 3. 0 5
1193 3. 0 0 5. 0 0
1194 4. 0 0 7. 0 0
1195
1196 The table is internally in the right hand, idx form. However as we also
1197 have to deal with the states array which is indexed by nodenum we have to
1198 use TRIE_NODENUM() to convert.
1199
1200 */
1201
1202 Newz( 848203, trie->trans, ( trie->charcount + 1 ) * trie->uniquecharcount + 1,
1203 reg_trie_trans );
1204 Newz( 848204, trie->states, trie->charcount + 2, reg_trie_state );
1205 next_alloc = trie->uniquecharcount + 1;
1206
1207 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1208
1209 regnode *noper = NEXTOPER( cur );
1210 U8 *uc = (U8*)STRING( noper );
1211 U8 *e = uc + STR_LEN( noper );
1212
1213 U32 state = 1; /* required init */
1214
1215 U16 charid = 0; /* sanity init */
1216 U32 accept_state = 0; /* sanity init */
1217 U8 *scan = (U8*)NULL; /* sanity init */
1218
1219 STRLEN foldlen = 0; /* required init */
1220 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1221
1222
1223 for ( ; uc < e ; uc += len ) {
1224
1225 TRIE_READ_CHAR;
1226
1227 if ( uvc < 256 ) {
1228 charid = trie->charmap[ uvc ];
1229 } else {
1230 SV** svpp=(SV**)NULL;
1231 svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 0);
1232 if ( !svpp ) {
1233 charid = 0;
1234 } else {
1235 charid=(U16)SvIV( *svpp );
1236 }
1237 }
1238 if ( charid ) {
1239 charid--;
1240 if ( !trie->trans[ state + charid ].next ) {
1241 trie->trans[ state + charid ].next = next_alloc;
1242 trie->trans[ state ].check++;
1243 next_alloc += trie->uniquecharcount;
1244 }
1245 state = trie->trans[ state + charid ].next;
1246 } else {
1247 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %d", uvc );
1248 }
1249 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1250 }
1251
1252 accept_state = TRIE_NODENUM( state );
1253 if ( !trie->states[ accept_state ].wordnum ) {
1254 /* we havent inserted this word into the structure yet. */
1255 trie->states[ accept_state ].wordnum = ++curword;
1256
1257 DEBUG_r({
1258 /* store the word for dumping */
1259 SV* tmp = newSVpvn( STRING( noper ), STR_LEN( noper ) );
1260 if ( UTF ) SvUTF8_on( tmp );
1261 av_push( trie->words, tmp );
1262 });
1263
1264 } else {
1265 /* Its a dupe. So ignore it. */
1266 }
1267
1268 } /* end second pass */
1269
1270 DEBUG_TRIE_COMPILE_MORE_r({
1271 /*
1272 print out the table precompression so that we can do a visual check
1273 that they are identical.
1274 */
1275 U32 state;
1276 U16 charid;
1277 PerlIO_printf( Perl_debug_log, "\nChar : " );
1278
1279 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1280 SV **tmp = av_fetch( trie->revcharmap, charid, 0);
1281 if ( tmp ) {
1282 PerlIO_printf( Perl_debug_log, "%4.4s ", SvPV_nolen( *tmp ) );
1283 }
1284 }
1285
1286 PerlIO_printf( Perl_debug_log, "\nState+-" );
1287
1288 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
1289 PerlIO_printf( Perl_debug_log, "%4s-", "----" );
1290 }
1291
1292 PerlIO_printf( Perl_debug_log, "\n" );
1293
1294 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1295
1296 PerlIO_printf( Perl_debug_log, "%04X : ", TRIE_NODENUM( state ) );
1297
1298 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1299 PerlIO_printf( Perl_debug_log, "%04X ",
1300 SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next ) );
1301 }
1302 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1303 PerlIO_printf( Perl_debug_log, " (%04X)\n", trie->trans[ state ].check );
1304 } else {
1305 PerlIO_printf( Perl_debug_log, " (%04X) W%04X\n", trie->trans[ state ].check,
1306 trie->states[ TRIE_NODENUM( state ) ].wordnum );
1307 }
1308 }
1309 PerlIO_printf( Perl_debug_log, "\n\n" );
1310 });
1311 {
1312 /*
1313 * Inplace compress the table.*
1314
1315 For sparse data sets the table constructed by the trie algorithm will
1316 be mostly 0/FAIL transitions or to put it another way mostly empty.
1317 (Note that leaf nodes will not contain any transitions.)
1318
1319 This algorithm compresses the tables by eliminating most such
1320 transitions, at the cost of a modest bit of extra work during lookup:
1321
1322 - Each states[] entry contains a .base field which indicates the
1323 index in the state[] array wheres its transition data is stored.
1324
1325 - If .base is 0 there are no valid transitions from that node.
1326
1327 - If .base is nonzero then charid is added to it to find an entry in
1328 the trans array.
1329
1330 -If trans[states[state].base+charid].check!=state then the
1331 transition is taken to be a 0/Fail transition. Thus if there are fail
1332 transitions at the front of the node then the .base offset will point
1333 somewhere inside the previous nodes data (or maybe even into a node
1334 even earlier), but the .check field determines if the transition is
1335 valid.
1336
1337 The following process inplace converts the table to the compressed
1338 table: We first do not compress the root node 1,and mark its all its
1339 .check pointers as 1 and set its .base pointer as 1 as well. This
1340 allows to do a DFA construction from the compressed table later, and
1341 ensures that any .base pointers we calculate later are greater than
1342 0.
1343
1344 - We set 'pos' to indicate the first entry of the second node.
1345
1346 - We then iterate over the columns of the node, finding the first and
1347 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
1348 and set the .check pointers accordingly, and advance pos
1349 appropriately and repreat for the next node. Note that when we copy
1350 the next pointers we have to convert them from the original
1351 NODEIDX form to NODENUM form as the former is not valid post
1352 compression.
1353
1354 - If a node has no transitions used we mark its base as 0 and do not
1355 advance the pos pointer.
1356
1357 - If a node only has one transition we use a second pointer into the
1358 structure to fill in allocated fail transitions from other states.
1359 This pointer is independent of the main pointer and scans forward
1360 looking for null transitions that are allocated to a state. When it
1361 finds one it writes the single transition into the "hole". If the
1362 pointer doesnt find one the single transition is appeneded as normal.
1363
1364 - Once compressed we can Renew/realloc the structures to release the
1365 excess space.
1366
1367 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
1368 specifically Fig 3.47 and the associated pseudocode.
1369
1370 demq
1371 */
1372 U32 laststate = TRIE_NODENUM( next_alloc );
1373 U32 used , state, charid;
1374 U32 pos = 0, zp=0;
1375 trie->laststate = laststate;
1376
1377 for ( state = 1 ; state < laststate ; state++ ) {
1378 U8 flag = 0;
1379 U32 stateidx = TRIE_NODEIDX( state );
1380 U32 o_used=trie->trans[ stateidx ].check;
1381 used = trie->trans[ stateidx ].check;
1382 trie->trans[ stateidx ].check = 0;
1383
1384 for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
1385 if ( flag || trie->trans[ stateidx + charid ].next ) {
1386 if ( trie->trans[ stateidx + charid ].next ) {
1387 if (o_used == 1) {
1388 for ( ; zp < pos ; zp++ ) {
1389 if ( ! trie->trans[ zp ].next ) {
1390 break;
1391 }
1392 }
1393 trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
1394 trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1395 trie->trans[ zp ].check = state;
1396 if ( ++zp > pos ) pos = zp;
1397 break;
1398 }
1399 used--;
1400 }
1401 if ( !flag ) {
1402 flag = 1;
1403 trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
1404 }
1405 trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1406 trie->trans[ pos ].check = state;
1407 pos++;
1408 }
1409 }
1410 }
cc601c31 1411 trie->lasttrans = pos + 1;
a3621e74
YO
1412 Renew( trie->states, laststate + 1, reg_trie_state);
1413 DEBUG_TRIE_COMPILE_MORE_r(
1414 PerlIO_printf( Perl_debug_log, " Alloc: %d Orig: %d elements, Final:%d. Savings of %%%5.2f\n",
1415 ( ( trie->charcount + 1 ) * trie->uniquecharcount + 1 ), next_alloc, pos,
1416 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
1417 );
1418
1419 } /* end table compress */
1420 }
cc601c31
YO
1421 /* resize the trans array to remove unused space */
1422 Renew( trie->trans, trie->lasttrans, reg_trie_trans);
a3621e74
YO
1423
1424 DEBUG_TRIE_COMPILE_r({
1425 U32 state;
1426 /*
1427 Now we print it out again, in a slightly different form as there is additional
1428 info we want to be able to see when its compressed. They are close enough for
1429 visual comparison though.
1430 */
1431 PerlIO_printf( Perl_debug_log, "\nChar : %-6s%-6s%-4s ","Match","Base","Ofs" );
1432
1433 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
1434 SV **tmp = av_fetch( trie->revcharmap, state, 0);
1435 if ( tmp ) {
1436 PerlIO_printf( Perl_debug_log, "%4.4s ", SvPV_nolen( *tmp ) );
1437 }
1438 }
1439 PerlIO_printf( Perl_debug_log, "\n-----:-----------------------");
cc601c31 1440
a3621e74
YO
1441 for( state = 0 ; state < trie->uniquecharcount ; state++ )
1442 PerlIO_printf( Perl_debug_log, "-----");
1443 PerlIO_printf( Perl_debug_log, "\n");
cc601c31 1444
a3621e74
YO
1445 for( state = 1 ; state < trie->laststate ; state++ ) {
1446 U32 base = trie->states[ state ].trans.base;
1447
1448 PerlIO_printf( Perl_debug_log, "#%04X ", state);
1449
1450 if ( trie->states[ state ].wordnum ) {
1451 PerlIO_printf( Perl_debug_log, " W%04X", trie->states[ state ].wordnum );
1452 } else {
1453 PerlIO_printf( Perl_debug_log, "%6s", "" );
1454 }
1455
1456 PerlIO_printf( Perl_debug_log, " @%04X ", base );
1457
1458 if ( base ) {
1459 U32 ofs = 0;
1460
cc601c31
YO
1461 while( ( base + ofs < trie->uniquecharcount ) ||
1462 ( base + ofs - trie->uniquecharcount < trie->lasttrans
1463 && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
a3621e74
YO
1464 ofs++;
1465
1466 PerlIO_printf( Perl_debug_log, "+%02X[ ", ofs);
1467
1468 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
cc601c31
YO
1469 if ( ( base + ofs >= trie->uniquecharcount ) &&
1470 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
a3621e74
YO
1471 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1472 {
1473 PerlIO_printf( Perl_debug_log, "%04X ",
1474 trie->trans[ base + ofs - trie->uniquecharcount ].next );
1475 } else {
1476 PerlIO_printf( Perl_debug_log, "%4s "," 0" );
1477 }
1478 }
1479
1480 PerlIO_printf( Perl_debug_log, "]", ofs);
1481
1482 }
1483 PerlIO_printf( Perl_debug_log, "\n" );
1484 }
1485 });
1486
1487 {
1488 /* now finally we "stitch in" the new TRIE node
1489 This means we convert either the first branch or the first Exact,
1490 depending on whether the thing following (in 'last') is a branch
1491 or not and whther first is the startbranch (ie is it a sub part of
1492 the alternation or is it the whole thing.)
1493 Assuming its a sub part we conver the EXACT otherwise we convert
1494 the whole branch sequence, including the first.
1495 */
1496 regnode *convert;
1497
1498
1499
1500
1501 if ( first == startbranch && OP( last ) != BRANCH ) {
1502 convert = first;
1503 } else {
1504 convert = NEXTOPER( first );
1505 NEXT_OFF( first ) = (U16)(last - first);
1506 }
1507
1508 OP( convert ) = TRIE + (U8)( flags - EXACT );
1509 NEXT_OFF( convert ) = (U16)(tail - convert);
1510 ARG_SET( convert, data_slot );
1511
1512 /* tells us if we need to handle accept buffers specially */
1513 convert->flags = ( RExC_seen_evals ? 1 : 0 );
1514
1515
1516 /* needed for dumping*/
1517 DEBUG_r({
1518 regnode *optimize = convert + NODE_STEP_REGNODE + regarglen[ TRIE ];
1519 /* We now need to mark all of the space originally used by the
1520 branches as optimized away. This keeps the dumpuntil from
1521 throwing a wobbly as it doesnt use regnext() to traverse the
1522 opcodes.
1523 */
1524 while( optimize < last ) {
1525 OP( optimize ) = OPTIMIZED;
1526 optimize++;
1527 }
1528 });
1529 } /* end node insert */
1530 return 1;
1531}
1532
1533
1534
1535/*
5d1c421c
JH
1536 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
1537 * These need to be revisited when a newer toolchain becomes available.
1538 */
1539#if defined(__sparc64__) && defined(__GNUC__)
1540# if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
1541# undef SPARC64_GCC_WORKAROUND
1542# define SPARC64_GCC_WORKAROUND 1
1543# endif
1544#endif
1545
653099ff
GS
1546/* REx optimizer. Converts nodes into quickier variants "in place".
1547 Finds fixed substrings. */
1548
c277df42
IZ
1549/* Stops at toplevel WHILEM as well as at `last'. At end *scanp is set
1550 to the position after last scanned or to NULL. */
1551
a3621e74 1552
76e3520e 1553STATIC I32
a3621e74 1554S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, regnode *last, scan_data_t *data, U32 flags, U32 depth)
c277df42
IZ
1555 /* scanp: Start here (read-write). */
1556 /* deltap: Write maxlen-minlen here. */
1557 /* last: Stop before this one. */
1558{
1559 I32 min = 0, pars = 0, code;
1560 regnode *scan = *scanp, *next;
1561 I32 delta = 0;
1562 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
aca2d497 1563 int is_inf_internal = 0; /* The studied chunk is infinite */
c277df42
IZ
1564 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
1565 scan_data_t data_fake;
653099ff 1566 struct regnode_charclass_class and_with; /* Valid if flags & SCF_DO_STCLASS_OR */
a3621e74
YO
1567 SV *re_trie_maxbuff = NULL;
1568
1569 GET_RE_DEBUG_FLAGS_DECL;
b81d288d 1570
c277df42
IZ
1571 while (scan && OP(scan) != END && scan < last) {
1572 /* Peephole optimizer: */
a3621e74
YO
1573 DEBUG_OPTIMISE_r({
1574 SV *mysv=sv_newmortal();
1575 regprop( mysv, scan);
1576 PerlIO_printf(Perl_debug_log, "%*speep: %s (0x%08X)\n",depth*2,"",SvPV_nolen(mysv),scan);
1577 });
c277df42 1578
22c35a8c 1579 if (PL_regkind[(U8)OP(scan)] == EXACT) {
653099ff 1580 /* Merge several consecutive EXACTish nodes into one. */
c277df42
IZ
1581 regnode *n = regnext(scan);
1582 U32 stringok = 1;
1583#ifdef DEBUGGING
1584 regnode *stop = scan;
b81d288d 1585#endif
c277df42 1586
cd439c50 1587 next = scan + NODE_SZ_STR(scan);
c277df42
IZ
1588 /* Skip NOTHING, merge EXACT*. */
1589 while (n &&
b81d288d 1590 ( PL_regkind[(U8)OP(n)] == NOTHING ||
c277df42
IZ
1591 (stringok && (OP(n) == OP(scan))))
1592 && NEXT_OFF(n)
1593 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
1594 if (OP(n) == TAIL || n > next)
1595 stringok = 0;
22c35a8c 1596 if (PL_regkind[(U8)OP(n)] == NOTHING) {
c277df42
IZ
1597 NEXT_OFF(scan) += NEXT_OFF(n);
1598 next = n + NODE_STEP_REGNODE;
1599#ifdef DEBUGGING
1600 if (stringok)
1601 stop = n;
b81d288d 1602#endif
c277df42 1603 n = regnext(n);
a0ed51b3 1604 }
f49d4d0f 1605 else if (stringok) {
cd439c50 1606 int oldl = STR_LEN(scan);
c277df42 1607 regnode *nnext = regnext(n);
f49d4d0f 1608
b81d288d 1609 if (oldl + STR_LEN(n) > U8_MAX)
c277df42
IZ
1610 break;
1611 NEXT_OFF(scan) += NEXT_OFF(n);
cd439c50
IZ
1612 STR_LEN(scan) += STR_LEN(n);
1613 next = n + NODE_SZ_STR(n);
c277df42 1614 /* Now we can overwrite *n : */
f49d4d0f 1615 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
c277df42 1616#ifdef DEBUGGING
f49d4d0f 1617 stop = next - 1;
b81d288d 1618#endif
c277df42
IZ
1619 n = nnext;
1620 }
1621 }
61a36c01 1622
a3621e74 1623 if (UTF && ( OP(scan) == EXACTF ) && ( STR_LEN(scan) >= 6 ) ) {
61a36c01
JH
1624/*
1625 Two problematic code points in Unicode casefolding of EXACT nodes:
1626
1627 U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
1628 U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
1629
1630 which casefold to
1631
1632 Unicode UTF-8
1633
1634 U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
1635 U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
1636
1637 This means that in case-insensitive matching (or "loose matching",
1638 as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
1639 length of the above casefolded versions) can match a target string
1640 of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
1641 This would rather mess up the minimum length computation.
1642
1643 What we'll do is to look for the tail four bytes, and then peek
1644 at the preceding two bytes to see whether we need to decrease
1645 the minimum length by four (six minus two).
1646
1647 Thanks to the design of UTF-8, there cannot be false matches:
1648 A sequence of valid UTF-8 bytes cannot be a subsequence of
1649 another valid sequence of UTF-8 bytes.
1650
1651*/
1652 char *s0 = STRING(scan), *s, *t;
1653 char *s1 = s0 + STR_LEN(scan) - 1, *s2 = s1 - 4;
bfed75c6
AL
1654 const char *t0 = "\xcc\x88\xcc\x81";
1655 const char *t1 = t0 + 3;
61a36c01
JH
1656
1657 for (s = s0 + 2;
1658 s < s2 && (t = ninstr(s, s1, t0, t1));
1659 s = t + 4) {
1660 if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
1661 ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
1662 min -= 4;
1663 }
1664 }
1665
c277df42
IZ
1666#ifdef DEBUGGING
1667 /* Allow dumping */
cd439c50 1668 n = scan + NODE_SZ_STR(scan);
c277df42 1669 while (n <= stop) {
22c35a8c 1670 if (PL_regkind[(U8)OP(n)] != NOTHING || OP(n) == NOTHING) {
c277df42
IZ
1671 OP(n) = OPTIMIZED;
1672 NEXT_OFF(n) = 0;
1673 }
1674 n++;
1675 }
653099ff 1676#endif
c277df42 1677 }
a3621e74
YO
1678
1679
1680
653099ff
GS
1681 /* Follow the next-chain of the current node and optimize
1682 away all the NOTHINGs from it. */
c277df42 1683 if (OP(scan) != CURLYX) {
048cfca1
GS
1684 int max = (reg_off_by_arg[OP(scan)]
1685 ? I32_MAX
1686 /* I32 may be smaller than U16 on CRAYs! */
1687 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
c277df42
IZ
1688 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
1689 int noff;
1690 regnode *n = scan;
b81d288d 1691
c277df42
IZ
1692 /* Skip NOTHING and LONGJMP. */
1693 while ((n = regnext(n))
22c35a8c 1694 && ((PL_regkind[(U8)OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
c277df42
IZ
1695 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
1696 && off + noff < max)
1697 off += noff;
1698 if (reg_off_by_arg[OP(scan)])
1699 ARG(scan) = off;
b81d288d 1700 else
c277df42
IZ
1701 NEXT_OFF(scan) = off;
1702 }
a3621e74 1703
653099ff
GS
1704 /* The principal pseudo-switch. Cannot be a switch, since we
1705 look into several different things. */
b81d288d 1706 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
c277df42
IZ
1707 || OP(scan) == IFTHEN || OP(scan) == SUSPEND) {
1708 next = regnext(scan);
1709 code = OP(scan);
a3621e74 1710 /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
b81d288d
AB
1711
1712 if (OP(next) == code || code == IFTHEN || code == SUSPEND) {
c277df42 1713 I32 max1 = 0, min1 = I32_MAX, num = 0;
653099ff 1714 struct regnode_charclass_class accum;
a3621e74 1715 regnode *startbranch=scan;
c277df42 1716
653099ff 1717 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
830247a4 1718 scan_commit(pRExC_state, data); /* Cannot merge strings after this. */
653099ff 1719 if (flags & SCF_DO_STCLASS)
830247a4 1720 cl_init_zero(pRExC_state, &accum);
a3621e74 1721
c277df42 1722 while (OP(scan) == code) {
830247a4 1723 I32 deltanext, minnext, f = 0, fake;
653099ff 1724 struct regnode_charclass_class this_class;
c277df42
IZ
1725
1726 num++;
1727 data_fake.flags = 0;
b81d288d 1728 if (data) {
2c2d71f5 1729 data_fake.whilem_c = data->whilem_c;
cb434fcc
IZ
1730 data_fake.last_closep = data->last_closep;
1731 }
1732 else
1733 data_fake.last_closep = &fake;
c277df42
IZ
1734 next = regnext(scan);
1735 scan = NEXTOPER(scan);
1736 if (code != BRANCH)
1737 scan = NEXTOPER(scan);
653099ff 1738 if (flags & SCF_DO_STCLASS) {
830247a4 1739 cl_init(pRExC_state, &this_class);
653099ff
GS
1740 data_fake.start_class = &this_class;
1741 f = SCF_DO_STCLASS_AND;
b81d288d 1742 }
e1901655
IZ
1743 if (flags & SCF_WHILEM_VISITED_POS)
1744 f |= SCF_WHILEM_VISITED_POS;
a3621e74 1745
653099ff 1746 /* we suppose the run is continuous, last=next...*/
830247a4 1747 minnext = study_chunk(pRExC_state, &scan, &deltanext,
a3621e74 1748 next, &data_fake, f,depth+1);
b81d288d 1749 if (min1 > minnext)
c277df42
IZ
1750 min1 = minnext;
1751 if (max1 < minnext + deltanext)
1752 max1 = minnext + deltanext;
1753 if (deltanext == I32_MAX)
aca2d497 1754 is_inf = is_inf_internal = 1;
c277df42
IZ
1755 scan = next;
1756 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
1757 pars++;
405ff068 1758 if (data && (data_fake.flags & SF_HAS_EVAL))
c277df42 1759 data->flags |= SF_HAS_EVAL;
2c2d71f5
JH
1760 if (data)
1761 data->whilem_c = data_fake.whilem_c;
653099ff 1762 if (flags & SCF_DO_STCLASS)
830247a4 1763 cl_or(pRExC_state, &accum, &this_class);
b81d288d 1764 if (code == SUSPEND)
c277df42
IZ
1765 break;
1766 }
1767 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
1768 min1 = 0;
1769 if (flags & SCF_DO_SUBSTR) {
1770 data->pos_min += min1;
1771 data->pos_delta += max1 - min1;
1772 if (max1 != min1 || is_inf)
1773 data->longest = &(data->longest_float);
1774 }
1775 min += min1;
1776 delta += max1 - min1;
653099ff 1777 if (flags & SCF_DO_STCLASS_OR) {
830247a4 1778 cl_or(pRExC_state, data->start_class, &accum);
653099ff
GS
1779 if (min1) {
1780 cl_and(data->start_class, &and_with);
1781 flags &= ~SCF_DO_STCLASS;
1782 }
1783 }
1784 else if (flags & SCF_DO_STCLASS_AND) {
de0c8cb8
GS
1785 if (min1) {
1786 cl_and(data->start_class, &accum);
653099ff 1787 flags &= ~SCF_DO_STCLASS;
de0c8cb8
GS
1788 }
1789 else {
b81d288d 1790 /* Switch to OR mode: cache the old value of
de0c8cb8
GS
1791 * data->start_class */
1792 StructCopy(data->start_class, &and_with,
1793 struct regnode_charclass_class);
1794 flags &= ~SCF_DO_STCLASS_AND;
1795 StructCopy(&accum, data->start_class,
1796 struct regnode_charclass_class);
1797 flags |= SCF_DO_STCLASS_OR;
1798 data->start_class->flags |= ANYOF_EOS;
1799 }
653099ff 1800 }
a3621e74
YO
1801
1802 /* demq.
1803
1804 Assuming this was/is a branch we are dealing with: 'scan' now
1805 points at the item that follows the branch sequence, whatever
1806 it is. We now start at the beginning of the sequence and look
1807 for subsequences of
1808
1809 BRANCH->EXACT=>X
1810 BRANCH->EXACT=>X
1811
1812 which would be constructed from a pattern like /A|LIST|OF|WORDS/
1813
1814 If we can find such a subseqence we need to turn the first
1815 element into a trie and then add the subsequent branch exact
1816 strings to the trie.
1817
1818 We have two cases
1819
1820 1. patterns where the whole set of branch can be converted to a trie,
1821
1822 2. patterns where only a subset of the alternations can be
1823 converted to a trie.
1824
1825 In case 1 we can replace the whole set with a single regop
1826 for the trie. In case 2 we need to keep the start and end
1827 branchs so
1828
1829 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
1830 becomes BRANCH TRIE; BRANCH X;
1831
1832 Hypthetically when we know the regex isnt anchored we can
1833 turn a case 1 into a DFA and let it rip... Every time it finds a match
1834 it would just call its tail, no WHILEM/CURLY needed.
1835
1836 */
0111c4fd
RGS
1837 if (DO_TRIE) {
1838 if (!re_trie_maxbuff) {
1839 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
1840 if (!SvIOK(re_trie_maxbuff))
1841 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
1842 }
a3621e74
YO
1843 if ( SvIV(re_trie_maxbuff)>=0 && OP( startbranch )==BRANCH ) {
1844 regnode *cur;
1845 regnode *first = (regnode *)NULL;
1846 regnode *last = (regnode *)NULL;
1847 regnode *tail = scan;
1848 U8 optype = 0;
1849 U32 count=0;
1850
1851#ifdef DEBUGGING
1852 SV *mysv = sv_newmortal(); /* for dumping */
1853#endif
1854 /* var tail is used because there may be a TAIL
1855 regop in the way. Ie, the exacts will point to the
1856 thing following the TAIL, but the last branch will
1857 point at the TAIL. So we advance tail. If we
1858 have nested (?:) we may have to move through several
1859 tails.
1860 */
1861
1862 while ( OP( tail ) == TAIL ) {
1863 /* this is the TAIL generated by (?:) */
1864 tail = regnext( tail );
1865 }
1866
1867 DEBUG_OPTIMISE_r({
1868 regprop( mysv, tail );
1869 PerlIO_printf( Perl_debug_log, "%*s%s%s%s\n",
1870 depth * 2 + 2, "", "Tail node is:", SvPV_nolen( mysv ),
1871 (RExC_seen_evals) ? "[EVAL]" : ""
1872 );
1873 });
1874 /*
1875
1876 step through the branches, cur represents each
1877 branch, noper is the first thing to be matched
1878 as part of that branch and noper_next is the
1879 regnext() of that node. if noper is an EXACT
1880 and noper_next is the same as scan (our current
1881 position in the regex) then the EXACT branch is
1882 a possible optimization target. Once we have
1883 two or more consequetive such branches we can
1884 create a trie of the EXACT's contents and stich
1885 it in place. If the sequence represents all of
1886 the branches we eliminate the whole thing and
1887 replace it with a single TRIE. If it is a
1888 subsequence then we need to stitch it in. This
1889 means the first branch has to remain, and needs
1890 to be repointed at the item on the branch chain
1891 following the last branch optimized. This could
1892 be either a BRANCH, in which case the
1893 subsequence is internal, or it could be the
1894 item following the branch sequence in which
1895 case the subsequence is at the end.
1896
1897 */
1898
1899 /* dont use tail as the end marker for this traverse */
1900 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
1901 regnode *noper = NEXTOPER( cur );
1902 regnode *noper_next = regnext( noper );
1903
a3621e74
YO
1904 DEBUG_OPTIMISE_r({
1905 regprop( mysv, cur);
1906 PerlIO_printf( Perl_debug_log, "%*s%s",
1907 depth * 2 + 2," ", SvPV_nolen( mysv ) );
1908
1909 regprop( mysv, noper);
1910 PerlIO_printf( Perl_debug_log, " -> %s",
1911 SvPV_nolen(mysv));
1912
1913 if ( noper_next ) {
1914 regprop( mysv, noper_next );
1915 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
1916 SvPV_nolen(mysv));
1917 }
1918 PerlIO_printf( Perl_debug_log, "0x%p,0x%p,0x%p)\n",
1919 first, last, cur );
1920 });
1921 if ( ( first ? OP( noper ) == optype
1922 : PL_regkind[ (U8)OP( noper ) ] == EXACT )
1923 && noper_next == tail && count<U16_MAX)
1924 {
1925 count++;
1926 if ( !first ) {
1927 first = cur;
1928 optype = OP( noper );
1929 } else {
1930 DEBUG_OPTIMISE_r(
1931 if (!last ) {
1932 regprop( mysv, first);
1933 PerlIO_printf( Perl_debug_log, "%*s%s",
1934 depth * 2 + 2, "F:", SvPV_nolen( mysv ) );
1935 regprop( mysv, NEXTOPER(first) );
1936 PerlIO_printf( Perl_debug_log, " -> %s\n",
1937 SvPV_nolen( mysv ) );
1938 }
1939 );
1940 last = cur;
1941 DEBUG_OPTIMISE_r({
1942 regprop( mysv, cur);
1943 PerlIO_printf( Perl_debug_log, "%*s%s",
1944 depth * 2 + 2, "N:", SvPV_nolen( mysv ) );
1945 regprop( mysv, noper );
1946 PerlIO_printf( Perl_debug_log, " -> %s\n",
1947 SvPV_nolen( mysv ) );
1948 });
1949 }
1950 } else {
1951 if ( last ) {
1952 DEBUG_OPTIMISE_r(
1953 PerlIO_printf( Perl_debug_log, "%*s%s\n",
1954 depth * 2 + 2, "E:", "**END**" );
1955 );
1956 make_trie( pRExC_state, startbranch, first, cur, tail, optype );
1957 }
1958 if ( PL_regkind[ (U8)OP( noper ) ] == EXACT
1959 && noper_next == tail )
1960 {
1961 count = 1;
1962 first = cur;
1963 optype = OP( noper );
1964 } else {
1965 count = 0;
1966 first = NULL;
1967 optype = 0;
1968 }
1969 last = NULL;
1970 }
1971 }
1972 DEBUG_OPTIMISE_r({
1973 regprop( mysv, cur);
1974 PerlIO_printf( Perl_debug_log,
1975 "%*s%s\t(0x%p,0x%p,0x%p)\n", depth * 2 + 2,
1976 " ", SvPV_nolen( mysv ), first, last, cur);
1977
1978 });
1979 if ( last ) {
1980 DEBUG_OPTIMISE_r(
1981 PerlIO_printf( Perl_debug_log, "%*s%s\n",
1982 depth * 2 + 2, "E:", "==END==" );
1983 );
1984 make_trie( pRExC_state, startbranch, first, scan, tail, optype );
1985 }
1986 }
1987 }
a0ed51b3 1988 }
a3621e74 1989 else if ( code == BRANCHJ ) { /* single branch is optimized. */
c277df42 1990 scan = NEXTOPER(NEXTOPER(scan));
a3621e74 1991 } else /* single branch is optimized. */
c277df42
IZ
1992 scan = NEXTOPER(scan);
1993 continue;
a0ed51b3
LW
1994 }
1995 else if (OP(scan) == EXACT) {
cd439c50 1996 I32 l = STR_LEN(scan);
1aa99e6b 1997 UV uc = *((U8*)STRING(scan));
a0ed51b3 1998 if (UTF) {
1aa99e6b
IH
1999 U8 *s = (U8*)STRING(scan);
2000 l = utf8_length(s, s + l);
9041c2e3 2001 uc = utf8_to_uvchr(s, NULL);
a0ed51b3
LW
2002 }
2003 min += l;
c277df42 2004 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
c277df42
IZ
2005 /* The code below prefers earlier match for fixed
2006 offset, later match for variable offset. */
2007 if (data->last_end == -1) { /* Update the start info. */
2008 data->last_start_min = data->pos_min;
2009 data->last_start_max = is_inf
b81d288d 2010 ? I32_MAX : data->pos_min + data->pos_delta;
c277df42 2011 }
cd439c50 2012 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
0eda9292
JH
2013 {
2014 SV * sv = data->last_found;
2015 MAGIC *mg = SvUTF8(sv) && SvMAGICAL(sv) ?
2016 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2017 if (mg && mg->mg_len >= 0)
5e43f467
JH
2018 mg->mg_len += utf8_length((U8*)STRING(scan),
2019 (U8*)STRING(scan)+STR_LEN(scan));
0eda9292 2020 }
33b8afdf
JH
2021 if (UTF)
2022 SvUTF8_on(data->last_found);
c277df42
IZ
2023 data->last_end = data->pos_min + l;
2024 data->pos_min += l; /* As in the first entry. */
2025 data->flags &= ~SF_BEFORE_EOL;
2026 }
653099ff
GS
2027 if (flags & SCF_DO_STCLASS_AND) {
2028 /* Check whether it is compatible with what we know already! */
2029 int compat = 1;
2030
1aa99e6b 2031 if (uc >= 0x100 ||
516a5887 2032 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
1aa99e6b 2033 && !ANYOF_BITMAP_TEST(data->start_class, uc)
653099ff 2034 && (!(data->start_class->flags & ANYOF_FOLD)
1aa99e6b 2035 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
516a5887 2036 )
653099ff
GS
2037 compat = 0;
2038 ANYOF_CLASS_ZERO(data->start_class);
2039 ANYOF_BITMAP_ZERO(data->start_class);
2040 if (compat)
1aa99e6b 2041 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff 2042 data->start_class->flags &= ~ANYOF_EOS;
9b877dbb
IH
2043 if (uc < 0x100)
2044 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
653099ff
GS
2045 }
2046 else if (flags & SCF_DO_STCLASS_OR) {
2047 /* false positive possible if the class is case-folded */
1aa99e6b 2048 if (uc < 0x100)
9b877dbb
IH
2049 ANYOF_BITMAP_SET(data->start_class, uc);
2050 else
2051 data->start_class->flags |= ANYOF_UNICODE_ALL;
653099ff
GS
2052 data->start_class->flags &= ~ANYOF_EOS;
2053 cl_and(data->start_class, &and_with);
2054 }
2055 flags &= ~SCF_DO_STCLASS;
a0ed51b3 2056 }
653099ff 2057 else if (PL_regkind[(U8)OP(scan)] == EXACT) { /* But OP != EXACT! */
cd439c50 2058 I32 l = STR_LEN(scan);
1aa99e6b 2059 UV uc = *((U8*)STRING(scan));
653099ff
GS
2060
2061 /* Search for fixed substrings supports EXACT only. */
b81d288d 2062 if (flags & SCF_DO_SUBSTR)
830247a4 2063 scan_commit(pRExC_state, data);
a0ed51b3 2064 if (UTF) {
1aa99e6b
IH
2065 U8 *s = (U8 *)STRING(scan);
2066 l = utf8_length(s, s + l);
9041c2e3 2067 uc = utf8_to_uvchr(s, NULL);
a0ed51b3
LW
2068 }
2069 min += l;
c277df42 2070 if (data && (flags & SCF_DO_SUBSTR))
a0ed51b3 2071 data->pos_min += l;
653099ff
GS
2072 if (flags & SCF_DO_STCLASS_AND) {
2073 /* Check whether it is compatible with what we know already! */
2074 int compat = 1;
2075
1aa99e6b 2076 if (uc >= 0x100 ||
516a5887 2077 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
1aa99e6b 2078 && !ANYOF_BITMAP_TEST(data->start_class, uc)
516a5887 2079 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
653099ff
GS
2080 compat = 0;
2081 ANYOF_CLASS_ZERO(data->start_class);
2082 ANYOF_BITMAP_ZERO(data->start_class);
2083 if (compat) {
1aa99e6b 2084 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff
GS
2085 data->start_class->flags &= ~ANYOF_EOS;
2086 data->start_class->flags |= ANYOF_FOLD;
2087 if (OP(scan) == EXACTFL)
2088 data->start_class->flags |= ANYOF_LOCALE;
2089 }
2090 }
2091 else if (flags & SCF_DO_STCLASS_OR) {
2092 if (data->start_class->flags & ANYOF_FOLD) {
2093 /* false positive possible if the class is case-folded.
2094 Assume that the locale settings are the same... */
1aa99e6b
IH
2095 if (uc < 0x100)
2096 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff
GS
2097 data->start_class->flags &= ~ANYOF_EOS;
2098 }
2099 cl_and(data->start_class, &and_with);
2100 }
2101 flags &= ~SCF_DO_STCLASS;
a0ed51b3 2102 }
bfed75c6 2103 else if (strchr((const char*)PL_varies,OP(scan))) {
9c5ffd7c 2104 I32 mincount, maxcount, minnext, deltanext, fl = 0;
aa7a4b56 2105 I32 f = flags, pos_before = 0;
c277df42 2106 regnode *oscan = scan;
653099ff
GS
2107 struct regnode_charclass_class this_class;
2108 struct regnode_charclass_class *oclass = NULL;
727f22e3 2109 I32 next_is_eval = 0;
653099ff 2110
22c35a8c 2111 switch (PL_regkind[(U8)OP(scan)]) {
653099ff 2112 case WHILEM: /* End of (?:...)* . */
c277df42
IZ
2113 scan = NEXTOPER(scan);
2114 goto finish;
2115 case PLUS:
653099ff 2116 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
c277df42 2117 next = NEXTOPER(scan);
653099ff 2118 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
b81d288d
AB
2119 mincount = 1;
2120 maxcount = REG_INFTY;
c277df42
IZ
2121 next = regnext(scan);
2122 scan = NEXTOPER(scan);
2123 goto do_curly;
2124 }
2125 }
2126 if (flags & SCF_DO_SUBSTR)
2127 data->pos_min++;
2128 min++;
2129 /* Fall through. */
2130 case STAR:
653099ff
GS
2131 if (flags & SCF_DO_STCLASS) {
2132 mincount = 0;
b81d288d 2133 maxcount = REG_INFTY;
653099ff
GS
2134 next = regnext(scan);
2135 scan = NEXTOPER(scan);
2136 goto do_curly;
2137 }
b81d288d 2138 is_inf = is_inf_internal = 1;
c277df42
IZ
2139 scan = regnext(scan);
2140 if (flags & SCF_DO_SUBSTR) {
830247a4 2141 scan_commit(pRExC_state, data); /* Cannot extend fixed substrings */
c277df42
IZ
2142 data->longest = &(data->longest_float);
2143 }
2144 goto optimize_curly_tail;
2145 case CURLY:
b81d288d 2146 mincount = ARG1(scan);
c277df42
IZ
2147 maxcount = ARG2(scan);
2148 next = regnext(scan);
cb434fcc
IZ
2149 if (OP(scan) == CURLYX) {
2150 I32 lp = (data ? *(data->last_closep) : 0);
a3621e74 2151 scan->flags = ((lp <= U8_MAX) ? (U8)lp : U8_MAX);
cb434fcc 2152 }
c277df42 2153 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
727f22e3 2154 next_is_eval = (OP(scan) == EVAL);
c277df42
IZ
2155 do_curly:
2156 if (flags & SCF_DO_SUBSTR) {
830247a4 2157 if (mincount == 0) scan_commit(pRExC_state,data); /* Cannot extend fixed substrings */
c277df42
IZ
2158 pos_before = data->pos_min;
2159 }
2160 if (data) {
2161 fl = data->flags;
2162 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
2163 if (is_inf)
2164 data->flags |= SF_IS_INF;
2165 }
653099ff 2166 if (flags & SCF_DO_STCLASS) {
830247a4 2167 cl_init(pRExC_state, &this_class);
653099ff
GS
2168 oclass = data->start_class;
2169 data->start_class = &this_class;
2170 f |= SCF_DO_STCLASS_AND;
2171 f &= ~SCF_DO_STCLASS_OR;
2172 }
e1901655
IZ
2173 /* These are the cases when once a subexpression
2174 fails at a particular position, it cannot succeed
2175 even after backtracking at the enclosing scope.
b81d288d 2176
e1901655
IZ
2177 XXXX what if minimal match and we are at the
2178 initial run of {n,m}? */
2179 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
2180 f &= ~SCF_WHILEM_VISITED_POS;
653099ff 2181
c277df42 2182 /* This will finish on WHILEM, setting scan, or on NULL: */
b81d288d 2183 minnext = study_chunk(pRExC_state, &scan, &deltanext, last, data,
a3621e74
YO
2184 (mincount == 0
2185 ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
653099ff
GS
2186
2187 if (flags & SCF_DO_STCLASS)
2188 data->start_class = oclass;
2189 if (mincount == 0 || minnext == 0) {
2190 if (flags & SCF_DO_STCLASS_OR) {
830247a4 2191 cl_or(pRExC_state, data->start_class, &this_class);
653099ff
GS
2192 }
2193 else if (flags & SCF_DO_STCLASS_AND) {
b81d288d 2194 /* Switch to OR mode: cache the old value of
653099ff
GS
2195 * data->start_class */
2196 StructCopy(data->start_class, &and_with,
2197 struct regnode_charclass_class);
2198 flags &= ~SCF_DO_STCLASS_AND;
2199 StructCopy(&this_class, data->start_class,
2200 struct regnode_charclass_class);
2201 flags |= SCF_DO_STCLASS_OR;
2202 data->start_class->flags |= ANYOF_EOS;
2203 }
2204 } else { /* Non-zero len */
2205 if (flags & SCF_DO_STCLASS_OR) {
830247a4 2206 cl_or(pRExC_state, data->start_class, &this_class);
653099ff
GS
2207 cl_and(data->start_class, &and_with);
2208 }
2209 else if (flags & SCF_DO_STCLASS_AND)
2210 cl_and(data->start_class, &this_class);
2211 flags &= ~SCF_DO_STCLASS;
2212 }
c277df42
IZ
2213 if (!scan) /* It was not CURLYX, but CURLY. */
2214 scan = next;
84037bb0 2215 if (ckWARN(WARN_REGEXP)
727f22e3
JP
2216 /* ? quantifier ok, except for (?{ ... }) */
2217 && (next_is_eval || !(mincount == 0 && maxcount == 1))
84037bb0 2218 && (minnext == 0) && (deltanext == 0)
99799961 2219 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
17feb5d5 2220 && maxcount <= REG_INFTY/3) /* Complement check for big count */
b45f050a 2221 {
830247a4 2222 vWARN(RExC_parse,
b45f050a
JF
2223 "Quantifier unexpected on zero-length expression");
2224 }
2225
c277df42 2226 min += minnext * mincount;
b81d288d 2227 is_inf_internal |= ((maxcount == REG_INFTY
155aba94
GS
2228 && (minnext + deltanext) > 0)
2229 || deltanext == I32_MAX);
aca2d497 2230 is_inf |= is_inf_internal;
c277df42
IZ
2231 delta += (minnext + deltanext) * maxcount - minnext * mincount;
2232
2233 /* Try powerful optimization CURLYX => CURLYN. */
b81d288d 2234 if ( OP(oscan) == CURLYX && data
c277df42
IZ
2235 && data->flags & SF_IN_PAR
2236 && !(data->flags & SF_HAS_EVAL)
2237 && !deltanext && minnext == 1 ) {
2238 /* Try to optimize to CURLYN. */
2239 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
497b47a8
JH
2240 regnode *nxt1 = nxt;
2241#ifdef DEBUGGING
2242 regnode *nxt2;
2243#endif
c277df42
IZ
2244
2245 /* Skip open. */
2246 nxt = regnext(nxt);
bfed75c6 2247 if (!strchr((const char*)PL_simple,OP(nxt))
22c35a8c 2248 && !(PL_regkind[(U8)OP(nxt)] == EXACT
b81d288d 2249 && STR_LEN(nxt) == 1))
c277df42 2250 goto nogo;
497b47a8 2251#ifdef DEBUGGING
c277df42 2252 nxt2 = nxt;
497b47a8 2253#endif
c277df42 2254 nxt = regnext(nxt);
b81d288d 2255 if (OP(nxt) != CLOSE)
c277df42
IZ
2256 goto nogo;
2257 /* Now we know that nxt2 is the only contents: */
eb160463 2258 oscan->flags = (U8)ARG(nxt);
c277df42
IZ
2259 OP(oscan) = CURLYN;
2260 OP(nxt1) = NOTHING; /* was OPEN. */
2261#ifdef DEBUGGING
2262 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
2263 NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
2264 NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
2265 OP(nxt) = OPTIMIZED; /* was CLOSE. */
2266 OP(nxt + 1) = OPTIMIZED; /* was count. */
2267 NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
b81d288d 2268#endif
c277df42 2269 }
c277df42
IZ
2270 nogo:
2271
2272 /* Try optimization CURLYX => CURLYM. */
b81d288d 2273 if ( OP(oscan) == CURLYX && data
c277df42 2274 && !(data->flags & SF_HAS_PAR)
c277df42 2275 && !(data->flags & SF_HAS_EVAL)
0e788c72
HS
2276 && !deltanext /* atom is fixed width */
2277 && minnext != 0 /* CURLYM can't handle zero width */
2278 ) {
c277df42
IZ
2279 /* XXXX How to optimize if data == 0? */
2280 /* Optimize to a simpler form. */
2281 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
2282 regnode *nxt2;
2283
2284 OP(oscan) = CURLYM;
2285 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
b81d288d 2286 && (OP(nxt2) != WHILEM))
c277df42
IZ
2287 nxt = nxt2;
2288 OP(nxt2) = SUCCEED; /* Whas WHILEM */
c277df42
IZ
2289 /* Need to optimize away parenths. */
2290 if (data->flags & SF_IN_PAR) {
2291 /* Set the parenth number. */
2292 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
2293
b81d288d 2294 if (OP(nxt) != CLOSE)
b45f050a 2295 FAIL("Panic opt close");
eb160463 2296 oscan->flags = (U8)ARG(nxt);
c277df42
IZ
2297 OP(nxt1) = OPTIMIZED; /* was OPEN. */
2298 OP(nxt) = OPTIMIZED; /* was CLOSE. */
2299#ifdef DEBUGGING
2300 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
2301 OP(nxt + 1) = OPTIMIZED; /* was count. */
2302 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
2303 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
b81d288d 2304#endif
c277df42
IZ
2305#if 0
2306 while ( nxt1 && (OP(nxt1) != WHILEM)) {
2307 regnode *nnxt = regnext(nxt1);
b81d288d 2308
c277df42
IZ
2309 if (nnxt == nxt) {
2310 if (reg_off_by_arg[OP(nxt1)])
2311 ARG_SET(nxt1, nxt2 - nxt1);
2312 else if (nxt2 - nxt1 < U16_MAX)
2313 NEXT_OFF(nxt1) = nxt2 - nxt1;
2314 else
2315 OP(nxt) = NOTHING; /* Cannot beautify */
2316 }
2317 nxt1 = nnxt;
2318 }
2319#endif
2320 /* Optimize again: */
b81d288d 2321 study_chunk(pRExC_state, &nxt1, &deltanext, nxt,
a3621e74 2322 NULL, 0,depth+1);
a0ed51b3
LW
2323 }
2324 else
c277df42 2325 oscan->flags = 0;
c277df42 2326 }
e1901655
IZ
2327 else if ((OP(oscan) == CURLYX)
2328 && (flags & SCF_WHILEM_VISITED_POS)
2329 /* See the comment on a similar expression above.
2330 However, this time it not a subexpression
2331 we care about, but the expression itself. */
2332 && (maxcount == REG_INFTY)
2333 && data && ++data->whilem_c < 16) {
2334 /* This stays as CURLYX, we can put the count/of pair. */
2c2d71f5
JH
2335 /* Find WHILEM (as in regexec.c) */
2336 regnode *nxt = oscan + NEXT_OFF(oscan);
2337
2338 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
2339 nxt += ARG(nxt);
eb160463
GS
2340 PREVOPER(nxt)->flags = (U8)(data->whilem_c
2341 | (RExC_whilem_seen << 4)); /* On WHILEM */
2c2d71f5 2342 }
b81d288d 2343 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
c277df42
IZ
2344 pars++;
2345 if (flags & SCF_DO_SUBSTR) {
2346 SV *last_str = Nullsv;
2347 int counted = mincount != 0;
2348
2349 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
5d1c421c
JH
2350#if defined(SPARC64_GCC_WORKAROUND)
2351 I32 b = 0;
2352 STRLEN l = 0;
2353 char *s = NULL;
2354 I32 old = 0;
2355
2356 if (pos_before >= data->last_start_min)
2357 b = pos_before;
2358 else
2359 b = data->last_start_min;
2360
2361 l = 0;
2362 s = SvPV(data->last_found, l);
2363 old = b - data->last_start_min;
2364
2365#else
b81d288d 2366 I32 b = pos_before >= data->last_start_min
c277df42
IZ
2367 ? pos_before : data->last_start_min;
2368 STRLEN l;
2369 char *s = SvPV(data->last_found, l);
a0ed51b3 2370 I32 old = b - data->last_start_min;
5d1c421c 2371#endif
a0ed51b3
LW
2372
2373 if (UTF)
2374 old = utf8_hop((U8*)s, old) - (U8*)s;
c277df42 2375
a0ed51b3 2376 l -= old;
c277df42 2377 /* Get the added string: */
79cb57f6 2378 last_str = newSVpvn(s + old, l);
0e933229
IH
2379 if (UTF)
2380 SvUTF8_on(last_str);
c277df42
IZ
2381 if (deltanext == 0 && pos_before == b) {
2382 /* What was added is a constant string */
2383 if (mincount > 1) {
2384 SvGROW(last_str, (mincount * l) + 1);
b81d288d 2385 repeatcpy(SvPVX(last_str) + l,
c277df42
IZ
2386 SvPVX(last_str), l, mincount - 1);
2387 SvCUR(last_str) *= mincount;
2388 /* Add additional parts. */
b81d288d 2389 SvCUR_set(data->last_found,
c277df42
IZ
2390 SvCUR(data->last_found) - l);
2391 sv_catsv(data->last_found, last_str);
0eda9292
JH
2392 {
2393 SV * sv = data->last_found;
2394 MAGIC *mg =
2395 SvUTF8(sv) && SvMAGICAL(sv) ?
2396 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2397 if (mg && mg->mg_len >= 0)
2398 mg->mg_len += CHR_SVLEN(last_str);
2399 }
c277df42
IZ
2400 data->last_end += l * (mincount - 1);
2401 }
2a8d9689
HS
2402 } else {
2403 /* start offset must point into the last copy */
2404 data->last_start_min += minnext * (mincount - 1);
c152dc43
HS
2405 data->last_start_max += is_inf ? I32_MAX
2406 : (maxcount - 1) * (minnext + data->pos_delta);
c277df42
IZ
2407 }
2408 }
2409 /* It is counted once already... */
2410 data->pos_min += minnext * (mincount - counted);
2411 data->pos_delta += - counted * deltanext +
2412 (minnext + deltanext) * maxcount - minnext * mincount;
2413 if (mincount != maxcount) {
653099ff
GS
2414 /* Cannot extend fixed substrings found inside
2415 the group. */
830247a4 2416 scan_commit(pRExC_state,data);
c277df42
IZ
2417 if (mincount && last_str) {
2418 sv_setsv(data->last_found, last_str);
2419 data->last_end = data->pos_min;
b81d288d 2420 data->last_start_min =
a0ed51b3 2421 data->pos_min - CHR_SVLEN(last_str);
b81d288d
AB
2422 data->last_start_max = is_inf
2423 ? I32_MAX
c277df42 2424 : data->pos_min + data->pos_delta
a0ed51b3 2425 - CHR_SVLEN(last_str);
c277df42
IZ
2426 }
2427 data->longest = &(data->longest_float);
2428 }
aca2d497 2429 SvREFCNT_dec(last_str);
c277df42 2430 }
405ff068 2431 if (data && (fl & SF_HAS_EVAL))
c277df42
IZ
2432 data->flags |= SF_HAS_EVAL;
2433 optimize_curly_tail:
c277df42 2434 if (OP(oscan) != CURLYX) {
22c35a8c 2435 while (PL_regkind[(U8)OP(next = regnext(oscan))] == NOTHING
c277df42
IZ
2436 && NEXT_OFF(next))
2437 NEXT_OFF(oscan) += NEXT_OFF(next);
2438 }
c277df42 2439 continue;
653099ff 2440 default: /* REF and CLUMP only? */
c277df42 2441 if (flags & SCF_DO_SUBSTR) {
830247a4 2442 scan_commit(pRExC_state,data); /* Cannot expect anything... */
c277df42
IZ
2443 data->longest = &(data->longest_float);
2444 }
aca2d497 2445 is_inf = is_inf_internal = 1;
653099ff 2446 if (flags & SCF_DO_STCLASS_OR)
830247a4 2447 cl_anything(pRExC_state, data->start_class);
653099ff 2448 flags &= ~SCF_DO_STCLASS;
c277df42
IZ
2449 break;
2450 }
a0ed51b3 2451 }
bfed75c6 2452 else if (strchr((const char*)PL_simple,OP(scan))) {
9c5ffd7c 2453 int value = 0;
653099ff 2454
c277df42 2455 if (flags & SCF_DO_SUBSTR) {
830247a4 2456 scan_commit(pRExC_state,data);
c277df42
IZ
2457 data->pos_min++;
2458 }
2459 min++;
653099ff
GS
2460 if (flags & SCF_DO_STCLASS) {
2461 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
2462
2463 /* Some of the logic below assumes that switching
2464 locale on will only add false positives. */
2465 switch (PL_regkind[(U8)OP(scan)]) {
653099ff 2466 case SANY:
653099ff
GS
2467 default:
2468 do_default:
2469 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
2470 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
830247a4 2471 cl_anything(pRExC_state, data->start_class);
653099ff
GS
2472 break;
2473 case REG_ANY:
2474 if (OP(scan) == SANY)
2475 goto do_default;
2476 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
2477 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
2478 || (data->start_class->flags & ANYOF_CLASS));
830247a4 2479 cl_anything(pRExC_state, data->start_class);
653099ff
GS
2480 }
2481 if (flags & SCF_DO_STCLASS_AND || !value)
2482 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
2483 break;
2484 case ANYOF:
2485 if (flags & SCF_DO_STCLASS_AND)
2486 cl_and(data->start_class,
2487 (struct regnode_charclass_class*)scan);
2488 else
830247a4 2489 cl_or(pRExC_state, data->start_class,
653099ff
GS
2490 (struct regnode_charclass_class*)scan);
2491 break;
2492 case ALNUM:
2493 if (flags & SCF_DO_STCLASS_AND) {
2494 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2495 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
2496 for (value = 0; value < 256; value++)
2497 if (!isALNUM(value))
2498 ANYOF_BITMAP_CLEAR(data->start_class, value);
2499 }
2500 }
2501 else {
2502 if (data->start_class->flags & ANYOF_LOCALE)
2503 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
2504 else {
2505 for (value = 0; value < 256; value++)
2506 if (isALNUM(value))
b81d288d 2507 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
2508 }
2509 }
2510 break;
2511 case ALNUML:
2512 if (flags & SCF_DO_STCLASS_AND) {
2513 if (data->start_class->flags & ANYOF_LOCALE)
2514 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
2515 }
2516 else {
2517 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
2518 data->start_class->flags |= ANYOF_LOCALE;
2519 }
2520 break;
2521 case NALNUM:
2522 if (flags & SCF_DO_STCLASS_AND) {
2523 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2524 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
2525 for (value = 0; value < 256; value++)
2526 if (isALNUM(value))
2527 ANYOF_BITMAP_CLEAR(data->start_class, value);
2528 }
2529 }
2530 else {
2531 if (data->start_class->flags & ANYOF_LOCALE)
2532 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
2533 else {
2534 for (value = 0; value < 256; value++)
2535 if (!isALNUM(value))
b81d288d 2536 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
2537 }
2538 }
2539 break;
2540 case NALNUML:
2541 if (flags & SCF_DO_STCLASS_AND) {
2542 if (data->start_class->flags & ANYOF_LOCALE)
2543 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
2544 }
2545 else {
2546 data->start_class->flags |= ANYOF_LOCALE;
2547 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
2548 }
2549 break;
2550 case SPACE:
2551 if (flags & SCF_DO_STCLASS_AND) {
2552 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2553 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
2554 for (value = 0; value < 256; value++)
2555 if (!isSPACE(value))
2556 ANYOF_BITMAP_CLEAR(data->start_class, value);
2557 }
2558 }
2559 else {
2560 if (data->start_class->flags & ANYOF_LOCALE)
2561 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
2562 else {
2563 for (value = 0; value < 256; value++)
2564 if (isSPACE(value))
b81d288d 2565 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
2566 }
2567 }
2568 break;
2569 case SPACEL:
2570 if (flags & SCF_DO_STCLASS_AND) {
2571 if (data->start_class->flags & ANYOF_LOCALE)
2572 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
2573 }
2574 else {
2575 data->start_class->flags |= ANYOF_LOCALE;
2576 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
2577 }
2578 break;
2579 case NSPACE:
2580 if (flags & SCF_DO_STCLASS_AND) {
2581 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2582 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
2583 for (value = 0; value < 256; value++)
2584 if (isSPACE(value))
2585 ANYOF_BITMAP_CLEAR(data->start_class, value);
2586 }
2587 }
2588 else {
2589 if (data->start_class->flags & ANYOF_LOCALE)
2590 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
2591 else {
2592 for (value = 0; value < 256; value++)
2593 if (!isSPACE(value))
b81d288d 2594 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
2595 }
2596 }
2597 break;
2598 case NSPACEL:
2599 if (flags & SCF_DO_STCLASS_AND) {
2600 if (data->start_class->flags & ANYOF_LOCALE) {
2601 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
2602 for (value = 0; value < 256; value++)
2603 if (!isSPACE(value))
2604 ANYOF_BITMAP_CLEAR(data->start_class, value);
2605 }
2606 }
2607 else {
2608 data->start_class->flags |= ANYOF_LOCALE;
2609 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
2610 }
2611 break;
2612 case DIGIT:
2613 if (flags & SCF_DO_STCLASS_AND) {
2614 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
2615 for (value = 0; value < 256; value++)
2616 if (!isDIGIT(value))
2617 ANYOF_BITMAP_CLEAR(data->start_class, value);
2618 }
2619 else {
2620 if (data->start_class->flags & ANYOF_LOCALE)
2621 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
2622 else {
2623 for (value = 0; value < 256; value++)
2624 if (isDIGIT(value))
b81d288d 2625 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
2626 }
2627 }
2628 break;
2629 case NDIGIT:
2630 if (flags & SCF_DO_STCLASS_AND) {
2631 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
2632 for (value = 0; value < 256; value++)
2633 if (isDIGIT(value))
2634 ANYOF_BITMAP_CLEAR(data->start_class, value);
2635 }
2636 else {
2637 if (data->start_class->flags & ANYOF_LOCALE)
2638 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
2639 else {
2640 for (value = 0; value < 256; value++)
2641 if (!isDIGIT(value))
b81d288d 2642 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
2643 }
2644 }
2645 break;
2646 }
2647 if (flags & SCF_DO_STCLASS_OR)
2648 cl_and(data->start_class, &and_with);
2649 flags &= ~SCF_DO_STCLASS;
2650 }
a0ed51b3 2651 }
22c35a8c 2652 else if (PL_regkind[(U8)OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
c277df42
IZ
2653 data->flags |= (OP(scan) == MEOL
2654 ? SF_BEFORE_MEOL
2655 : SF_BEFORE_SEOL);
a0ed51b3 2656 }
653099ff
GS
2657 else if ( PL_regkind[(U8)OP(scan)] == BRANCHJ
2658 /* Lookbehind, or need to calculate parens/evals/stclass: */
2659 && (scan->flags || data || (flags & SCF_DO_STCLASS))
c277df42 2660 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
653099ff 2661 /* Lookahead/lookbehind */
cb434fcc 2662 I32 deltanext, minnext, fake = 0;
c277df42 2663 regnode *nscan;
653099ff
GS
2664 struct regnode_charclass_class intrnl;
2665 int f = 0;
c277df42
IZ
2666
2667 data_fake.flags = 0;
b81d288d 2668 if (data) {
2c2d71f5 2669 data_fake.whilem_c = data->whilem_c;
cb434fcc
IZ
2670 data_fake.last_closep = data->last_closep;
2671 }
2672 else
2673 data_fake.last_closep = &fake;
653099ff
GS
2674 if ( flags & SCF_DO_STCLASS && !scan->flags
2675 && OP(scan) == IFMATCH ) { /* Lookahead */
830247a4 2676 cl_init(pRExC_state, &intrnl);
653099ff 2677 data_fake.start_class = &intrnl;
e1901655 2678 f |= SCF_DO_STCLASS_AND;
653099ff 2679 }
e1901655
IZ
2680 if (flags & SCF_WHILEM_VISITED_POS)
2681 f |= SCF_WHILEM_VISITED_POS;
c277df42
IZ
2682 next = regnext(scan);
2683 nscan = NEXTOPER(NEXTOPER(scan));
a3621e74 2684 minnext = study_chunk(pRExC_state, &nscan, &deltanext, last, &data_fake, f,depth+1);
c277df42
IZ
2685 if (scan->flags) {
2686 if (deltanext) {
9baa0206 2687 vFAIL("Variable length lookbehind not implemented");
a0ed51b3
LW
2688 }
2689 else if (minnext > U8_MAX) {
9baa0206 2690 vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
c277df42 2691 }
eb160463 2692 scan->flags = (U8)minnext;
c277df42
IZ
2693 }
2694 if (data && data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
2695 pars++;
405ff068 2696 if (data && (data_fake.flags & SF_HAS_EVAL))
c277df42 2697 data->flags |= SF_HAS_EVAL;
2c2d71f5
JH
2698 if (data)
2699 data->whilem_c = data_fake.whilem_c;
e1901655 2700 if (f & SCF_DO_STCLASS_AND) {
653099ff
GS
2701 int was = (data->start_class->flags & ANYOF_EOS);
2702
2703 cl_and(data->start_class, &intrnl);
2704 if (was)
2705 data->start_class->flags |= ANYOF_EOS;
2706 }
a0ed51b3
LW
2707 }
2708 else if (OP(scan) == OPEN) {
c277df42 2709 pars++;
a0ed51b3 2710 }
cb434fcc 2711 else if (OP(scan) == CLOSE) {
eb160463 2712 if ((I32)ARG(scan) == is_par) {
cb434fcc 2713 next = regnext(scan);
c277df42 2714
cb434fcc
IZ
2715 if ( next && (OP(next) != WHILEM) && next < last)
2716 is_par = 0; /* Disable optimization */
2717 }
2718 if (data)
2719 *(data->last_closep) = ARG(scan);
a0ed51b3
LW
2720 }
2721 else if (OP(scan) == EVAL) {
c277df42
IZ
2722 if (data)
2723 data->flags |= SF_HAS_EVAL;
2724 }
96776eda 2725 else if (OP(scan) == LOGICAL && scan->flags == 2) { /* Embedded follows */
0f5d15d6 2726 if (flags & SCF_DO_SUBSTR) {
830247a4 2727 scan_commit(pRExC_state,data);
0f5d15d6
IZ
2728 data->longest = &(data->longest_float);
2729 }
2730 is_inf = is_inf_internal = 1;
653099ff 2731 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
830247a4 2732 cl_anything(pRExC_state, data->start_class);
96776eda 2733 flags &= ~SCF_DO_STCLASS;
0f5d15d6 2734 }
c277df42
IZ
2735 /* Else: zero-length, ignore. */
2736 scan = regnext(scan);
2737 }
2738
2739 finish:
2740 *scanp = scan;
aca2d497 2741 *deltap = is_inf_internal ? I32_MAX : delta;
b81d288d 2742 if (flags & SCF_DO_SUBSTR && is_inf)
c277df42
IZ
2743 data->pos_delta = I32_MAX - data->pos_min;
2744 if (is_par > U8_MAX)
2745 is_par = 0;
2746 if (is_par && pars==1 && data) {
2747 data->flags |= SF_IN_PAR;
2748 data->flags &= ~SF_HAS_PAR;
a0ed51b3
LW
2749 }
2750 else if (pars && data) {
c277df42
IZ
2751 data->flags |= SF_HAS_PAR;
2752 data->flags &= ~SF_IN_PAR;
2753 }
653099ff
GS
2754 if (flags & SCF_DO_STCLASS_OR)
2755 cl_and(data->start_class, &and_with);
c277df42
IZ
2756 return min;
2757}
2758
76e3520e 2759STATIC I32
bfed75c6 2760S_add_data(pTHX_ RExC_state_t *pRExC_state, I32 n, const char *s)
c277df42 2761{
830247a4 2762 if (RExC_rx->data) {
b81d288d
AB
2763 Renewc(RExC_rx->data,
2764 sizeof(*RExC_rx->data) + sizeof(void*) * (RExC_rx->data->count + n - 1),
c277df42 2765 char, struct reg_data);
830247a4
IZ
2766 Renew(RExC_rx->data->what, RExC_rx->data->count + n, U8);
2767 RExC_rx->data->count += n;
a0ed51b3
LW
2768 }
2769 else {
830247a4 2770 Newc(1207, RExC_rx->data, sizeof(*RExC_rx->data) + sizeof(void*) * (n - 1),
c277df42 2771 char, struct reg_data);
830247a4
IZ
2772 New(1208, RExC_rx->data->what, n, U8);
2773 RExC_rx->data->count = n;
c277df42 2774 }
830247a4
IZ
2775 Copy(s, RExC_rx->data->what + RExC_rx->data->count - n, n, U8);
2776 return RExC_rx->data->count - n;
c277df42
IZ
2777}
2778
d88dccdf 2779void
864dbfa3 2780Perl_reginitcolors(pTHX)
d88dccdf 2781{
d88dccdf
IZ
2782 int i = 0;
2783 char *s = PerlEnv_getenv("PERL_RE_COLORS");
b81d288d 2784
d88dccdf
IZ
2785 if (s) {
2786 PL_colors[0] = s = savepv(s);
2787 while (++i < 6) {
2788 s = strchr(s, '\t');
2789 if (s) {
2790 *s = '\0';
2791 PL_colors[i] = ++s;
2792 }
2793 else
c712d376 2794 PL_colors[i] = s = "";
d88dccdf
IZ
2795 }
2796 } else {
b81d288d 2797 while (i < 6)
d88dccdf
IZ
2798 PL_colors[i++] = "";
2799 }
2800 PL_colorset = 1;
2801}
2802
8615cb43 2803
a687059c 2804/*
e50aee73 2805 - pregcomp - compile a regular expression into internal code
a687059c
LW
2806 *
2807 * We can't allocate space until we know how big the compiled form will be,
2808 * but we can't compile it (and thus know how big it is) until we've got a
2809 * place to put the code. So we cheat: we compile it twice, once with code
2810 * generation turned off and size counting turned on, and once "for real".
2811 * This also means that we don't allocate space until we are sure that the
2812 * thing really will compile successfully, and we never have to move the
2813 * code and thus invalidate pointers into it. (Note that it has to be in
2814 * one piece because free() must be able to free it all.) [NB: not true in perl]
2815 *
2816 * Beware that the optimization-preparation code in here knows about some
2817 * of the structure of the compiled regexp. [I'll say.]
2818 */
2819regexp *
864dbfa3 2820Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
a687059c 2821{
a0d0e21e 2822 register regexp *r;
c277df42 2823 regnode *scan;
c277df42 2824 regnode *first;
a0d0e21e 2825 I32 flags;
a0d0e21e
LW
2826 I32 minlen = 0;
2827 I32 sawplus = 0;
2828 I32 sawopen = 0;
2c2d71f5 2829 scan_data_t data;
830247a4
IZ
2830 RExC_state_t RExC_state;
2831 RExC_state_t *pRExC_state = &RExC_state;
a0d0e21e 2832
a3621e74
YO
2833 GET_RE_DEBUG_FLAGS_DECL;
2834
a0d0e21e 2835 if (exp == NULL)
c277df42 2836 FAIL("NULL regexp argument");
a0d0e21e 2837
a5961de5 2838 RExC_utf8 = pm->op_pmdynflags & PMdf_CMP_UTF8;
a0ed51b3 2839
5cfc7842 2840 RExC_precomp = exp;
a3621e74
YO
2841 DEBUG_r(if (!PL_colorset) reginitcolors());
2842 DEBUG_COMPILE_r({
2843 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s \"%s%*s%s\"\n",
a5961de5
JH
2844 PL_colors[4],PL_colors[5],PL_colors[0],
2845 (int)(xend - exp), RExC_precomp, PL_colors[1]);
2846 });
e2509266 2847 RExC_flags = pm->op_pmflags;
830247a4 2848 RExC_sawback = 0;
bbce6d69 2849
830247a4
IZ
2850 RExC_seen = 0;
2851 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
2852 RExC_seen_evals = 0;
2853 RExC_extralen = 0;
c277df42 2854
bbce6d69 2855 /* First pass: determine size, legality. */
830247a4 2856 RExC_parse = exp;
fac92740 2857 RExC_start = exp;
830247a4
IZ
2858 RExC_end = xend;
2859 RExC_naughty = 0;
2860 RExC_npar = 1;
2861 RExC_size = 0L;
2862 RExC_emit = &PL_regdummy;
2863 RExC_whilem_seen = 0;
85ddcde9
JH
2864#if 0 /* REGC() is (currently) a NOP at the first pass.
2865 * Clever compilers notice this and complain. --jhi */
830247a4 2866 REGC((U8)REG_MAGIC, (char*)RExC_emit);
85ddcde9 2867#endif
830247a4 2868 if (reg(pRExC_state, 0, &flags) == NULL) {
830247a4 2869 RExC_precomp = Nullch;
a0d0e21e
LW
2870 return(NULL);
2871 }
a3621e74 2872 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "size %"IVdf" ", (IV)RExC_size));
c277df42 2873
c277df42
IZ
2874 /* Small enough for pointer-storage convention?
2875 If extralen==0, this means that we will not need long jumps. */
830247a4
IZ
2876 if (RExC_size >= 0x10000L && RExC_extralen)
2877 RExC_size += RExC_extralen;
c277df42 2878 else
830247a4
IZ
2879 RExC_extralen = 0;
2880 if (RExC_whilem_seen > 15)
2881 RExC_whilem_seen = 15;
a0d0e21e 2882
bbce6d69 2883 /* Allocate space and initialize. */
830247a4 2884 Newc(1001, r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode),
c277df42 2885 char, regexp);
a0d0e21e 2886 if (r == NULL)
b45f050a
JF
2887 FAIL("Regexp out of space");
2888
0f79a09d
GS
2889#ifdef DEBUGGING
2890 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
830247a4 2891 Zero(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode), char);
0f79a09d 2892#endif
c277df42 2893 r->refcnt = 1;
bbce6d69 2894 r->prelen = xend - exp;
5cfc7842 2895 r->precomp = savepvn(RExC_precomp, r->prelen);
cf93c79d 2896 r->subbeg = NULL;
ed252734
NC
2897#ifdef PERL_COPY_ON_WRITE
2898 r->saved_copy = Nullsv;
2899#endif
cf93c79d 2900 r->reganch = pm->op_pmflags & PMf_COMPILETIME;
830247a4 2901 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
4327152a
IZ
2902
2903 r->substrs = 0; /* Useful during FAIL. */
2904 r->startp = 0; /* Useful during FAIL. */
2905 r->endp = 0; /* Useful during FAIL. */
2906
fac92740
MJD
2907 Newz(1304, r->offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
2908 if (r->offsets) {
2909 r->offsets[0] = RExC_size;
2910 }
a3621e74 2911 DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
392fbf5d 2912 "%s %"UVuf" bytes for offset annotations.\n",
fac92740 2913 r->offsets ? "Got" : "Couldn't get",
392fbf5d 2914 (UV)((2*RExC_size+1) * sizeof(U32))));
fac92740 2915
830247a4 2916 RExC_rx = r;
bbce6d69 2917
2918 /* Second pass: emit code. */
e2509266 2919 RExC_flags = pm->op_pmflags; /* don't let top level (?i) bleed */
830247a4
IZ
2920 RExC_parse = exp;
2921 RExC_end = xend;
2922 RExC_naughty = 0;
2923 RExC_npar = 1;
fac92740 2924 RExC_emit_start = r->program;
830247a4 2925 RExC_emit = r->program;
2cd61cdb 2926 /* Store the count of eval-groups for security checks: */
eb160463 2927 RExC_emit->next_off = (U16)((RExC_seen_evals > U16_MAX) ? U16_MAX : RExC_seen_evals);
830247a4 2928 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
c277df42 2929 r->data = 0;
830247a4 2930 if (reg(pRExC_state, 0, &flags) == NULL)
a0d0e21e
LW
2931 return(NULL);
2932
a3621e74 2933
a0d0e21e 2934 /* Dig out information for optimizations. */
cf93c79d 2935 r->reganch = pm->op_pmflags & PMf_COMPILETIME; /* Again? */
e2509266 2936 pm->op_pmflags = RExC_flags;
a0ed51b3 2937 if (UTF)
5ff6fc6d 2938 r->reganch |= ROPT_UTF8; /* Unicode in it? */
c277df42 2939 r->regstclass = NULL;
830247a4 2940 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
a0ed51b3 2941 r->reganch |= ROPT_NAUGHTY;
c277df42 2942 scan = r->program + 1; /* First BRANCH. */
2779dcf1
IZ
2943
2944 /* XXXX To minimize changes to RE engine we always allocate
2945 3-units-long substrs field. */
2946 Newz(1004, r->substrs, 1, struct reg_substr_data);
2947
2c2d71f5 2948 StructCopy(&zero_scan_data, &data, scan_data_t);
653099ff 2949 /* XXXX Should not we check for something else? Usually it is OPEN1... */
c277df42 2950 if (OP(scan) != BRANCH) { /* Only one top-level choice. */
c277df42 2951 I32 fake;
c5254dd6 2952 STRLEN longest_float_length, longest_fixed_length;
653099ff
GS
2953 struct regnode_charclass_class ch_class;
2954 int stclass_flag;
cb434fcc 2955 I32 last_close = 0;
a0d0e21e
LW
2956
2957 first = scan;
c277df42 2958 /* Skip introductions and multiplicators >= 1. */
a0d0e21e 2959 while ((OP(first) == OPEN && (sawopen = 1)) ||
653099ff 2960 /* An OR of *one* alternative - should not happen now. */
a0d0e21e
LW
2961 (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) ||
2962 (OP(first) == PLUS) ||
2963 (OP(first) == MINMOD) ||
653099ff 2964 /* An {n,m} with n>0 */
22c35a8c 2965 (PL_regkind[(U8)OP(first)] == CURLY && ARG1(first) > 0) ) {
a0d0e21e
LW
2966 if (OP(first) == PLUS)
2967 sawplus = 1;
2968 else
2969 first += regarglen[(U8)OP(first)];
2970 first = NEXTOPER(first);
a687059c
LW
2971 }
2972
a0d0e21e
LW
2973 /* Starting-point info. */
2974 again:
653099ff 2975 if (PL_regkind[(U8)OP(first)] == EXACT) {
1aa99e6b
IH
2976 if (OP(first) == EXACT)
2977 ; /* Empty, get anchored substr later. */
2978 else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
b3c9acc1
IZ
2979 r->regstclass = first;
2980 }
bfed75c6 2981 else if (strchr((const char*)PL_simple,OP(first)))
a0d0e21e 2982 r->regstclass = first;
22c35a8c
GS
2983 else if (PL_regkind[(U8)OP(first)] == BOUND ||
2984 PL_regkind[(U8)OP(first)] == NBOUND)
a0d0e21e 2985 r->regstclass = first;
22c35a8c 2986 else if (PL_regkind[(U8)OP(first)] == BOL) {
cad2e5aa
JH
2987 r->reganch |= (OP(first) == MBOL
2988 ? ROPT_ANCH_MBOL
2989 : (OP(first) == SBOL
2990 ? ROPT_ANCH_SBOL
2991 : ROPT_ANCH_BOL));
a0d0e21e 2992 first = NEXTOPER(first);
774d564b 2993 goto again;
2994 }
2995 else if (OP(first) == GPOS) {
2996 r->reganch |= ROPT_ANCH_GPOS;
2997 first = NEXTOPER(first);
2998 goto again;
a0d0e21e 2999 }
e09294f4 3000 else if (!sawopen && (OP(first) == STAR &&
22c35a8c 3001 PL_regkind[(U8)OP(NEXTOPER(first))] == REG_ANY) &&
a0d0e21e
LW
3002 !(r->reganch & ROPT_ANCH) )
3003 {
3004 /* turn .* into ^.* with an implied $*=1 */
cad2e5aa
JH
3005 int type = OP(NEXTOPER(first));
3006
ffc61ed2 3007 if (type == REG_ANY)
cad2e5aa
JH
3008 type = ROPT_ANCH_MBOL;
3009 else
3010 type = ROPT_ANCH_SBOL;
3011
3012 r->reganch |= type | ROPT_IMPLICIT;
a0d0e21e 3013 first = NEXTOPER(first);
774d564b 3014 goto again;
a0d0e21e 3015 }
b81d288d 3016 if (sawplus && (!sawopen || !RExC_sawback)
830247a4 3017 && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
cad2e5aa
JH
3018 /* x+ must match at the 1st pos of run of x's */
3019 r->reganch |= ROPT_SKIP;
a0d0e21e 3020
c277df42 3021 /* Scan is after the zeroth branch, first is atomic matcher. */
a3621e74 3022 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
b900a521 3023 (IV)(first - scan + 1)));
a0d0e21e
LW
3024 /*
3025 * If there's something expensive in the r.e., find the
3026 * longest literal string that must appear and make it the
3027 * regmust. Resolve ties in favor of later strings, since
3028 * the regstart check works with the beginning of the r.e.
3029 * and avoiding duplication strengthens checking. Not a
3030 * strong reason, but sufficient in the absence of others.
3031 * [Now we resolve ties in favor of the earlier string if
c277df42 3032 * it happens that c_offset_min has been invalidated, since the
a0d0e21e
LW
3033 * earlier string may buy us something the later one won't.]
3034 */
a0d0e21e 3035 minlen = 0;
a687059c 3036
79cb57f6
GS
3037 data.longest_fixed = newSVpvn("",0);
3038 data.longest_float = newSVpvn("",0);
3039 data.last_found = newSVpvn("",0);
c277df42
IZ
3040 data.longest = &(data.longest_fixed);
3041 first = scan;
653099ff 3042 if (!r->regstclass) {
830247a4 3043 cl_init(pRExC_state, &ch_class);
653099ff
GS
3044 data.start_class = &ch_class;
3045 stclass_flag = SCF_DO_STCLASS_AND;
3046 } else /* XXXX Check for BOUND? */
3047 stclass_flag = 0;
cb434fcc 3048 data.last_closep = &last_close;
653099ff 3049
830247a4 3050 minlen = study_chunk(pRExC_state, &first, &fake, scan + RExC_size, /* Up to end */
a3621e74 3051 &data, SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
830247a4 3052 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
b81d288d 3053 && data.last_start_min == 0 && data.last_end > 0
830247a4
IZ
3054 && !RExC_seen_zerolen
3055 && (!(RExC_seen & REG_SEEN_GPOS) || (r->reganch & ROPT_ANCH_GPOS)))
c277df42 3056 r->reganch |= ROPT_CHECK_ALL;
830247a4 3057 scan_commit(pRExC_state, &data);
c277df42
IZ
3058 SvREFCNT_dec(data.last_found);
3059
a0ed51b3 3060 longest_float_length = CHR_SVLEN(data.longest_float);
c5254dd6 3061 if (longest_float_length
c277df42
IZ
3062 || (data.flags & SF_FL_BEFORE_EOL
3063 && (!(data.flags & SF_FL_BEFORE_MEOL)
e2509266 3064 || (RExC_flags & PMf_MULTILINE)))) {
cf93c79d
IZ
3065 int t;
3066
a0ed51b3 3067 if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
aca2d497
IZ
3068 && data.offset_fixed == data.offset_float_min
3069 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
3070 goto remove_float; /* As in (a)+. */
3071
33b8afdf
JH
3072 if (SvUTF8(data.longest_float)) {
3073 r->float_utf8 = data.longest_float;
3074 r->float_substr = Nullsv;
3075 } else {
3076 r->float_substr = data.longest_float;
3077 r->float_utf8 = Nullsv;
3078 }
c277df42
IZ
3079 r->float_min_offset = data.offset_float_min;
3080 r->float_max_offset = data.offset_float_max;
cf93c79d
IZ
3081 t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
3082 && (!(data.flags & SF_FL_BEFORE_MEOL)
e2509266 3083 || (RExC_flags & PMf_MULTILINE)));
33b8afdf 3084 fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
a0ed51b3
LW
3085 }
3086 else {
aca2d497 3087 remove_float:
33b8afdf 3088 r->float_substr = r->float_utf8 = Nullsv;
c277df42 3089 SvREFCNT_dec(data.longest_float);
c5254dd6 3090 longest_float_length = 0;
a0d0e21e 3091 }
c277df42 3092
a0ed51b3 3093 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
c5254dd6 3094 if (longest_fixed_length
c277df42
IZ
3095 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
3096 && (!(data.flags & SF_FIX_BEFORE_MEOL)
e2509266 3097 || (RExC_flags & PMf_MULTILINE)))) {
cf93c79d
IZ
3098 int t;
3099
33b8afdf
JH
3100 if (SvUTF8(data.longest_fixed)) {
3101 r->anchored_utf8 = data.longest_fixed;
3102 r->anchored_substr = Nullsv;
3103 } else {
3104 r->anchored_substr = data.longest_fixed;
3105 r->anchored_utf8 = Nullsv;
3106 }
c277df42 3107 r->anchored_offset = data.offset_fixed;
cf93c79d
IZ
3108 t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
3109 && (!(data.flags & SF_FIX_BEFORE_MEOL)
e2509266 3110 || (RExC_flags & PMf_MULTILINE)));
33b8afdf 3111 fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
a0ed51b3
LW
3112 }
3113 else {
33b8afdf 3114 r->anchored_substr = r->anchored_utf8 = Nullsv;
c277df42 3115 SvREFCNT_dec(data.longest_fixed);
c5254dd6 3116 longest_fixed_length = 0;
a0d0e21e 3117 }
b81d288d 3118 if (r->regstclass
ffc61ed2 3119 && (OP(r->regstclass) == REG_ANY || OP(r->regstclass) == SANY))
653099ff 3120 r->regstclass = NULL;
33b8afdf
JH
3121 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
3122 && stclass_flag
653099ff 3123 && !(data.start_class->flags & ANYOF_EOS)
eb160463
GS
3124 && !cl_is_anything(data.start_class))
3125 {
830247a4 3126 I32 n = add_data(pRExC_state, 1, "f");
653099ff 3127
b81d288d 3128 New(1006, RExC_rx->data->data[n], 1,
653099ff
GS
3129 struct regnode_charclass_class);
3130 StructCopy(data.start_class,
830247a4 3131 (struct regnode_charclass_class*)RExC_rx->data->data[n],
653099ff 3132 struct regnode_charclass_class);
830247a4 3133 r->regstclass = (regnode*)RExC_rx->data->data[n];
653099ff 3134 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
ffc61ed2 3135 PL_regdata = r->data; /* for regprop() */
a3621e74 3136 DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
9c5ffd7c
JH
3137 regprop(sv, (regnode*)data.start_class);
3138 PerlIO_printf(Perl_debug_log,
3139 "synthetic stclass `%s'.\n",
3140 SvPVX(sv));});
653099ff 3141 }
c277df42
IZ
3142
3143 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
c5254dd6 3144 if (longest_fixed_length > longest_float_length) {
c277df42 3145 r->check_substr = r->anchored_substr;
33b8afdf 3146 r->check_utf8 = r->anchored_utf8;
c277df42
IZ
3147 r->check_offset_min = r->check_offset_max = r->anchored_offset;
3148 if (r->reganch & ROPT_ANCH_SINGLE)
3149 r->reganch |= ROPT_NOSCAN;
a0ed51b3
LW
3150 }
3151 else {
c277df42 3152 r->check_substr = r->float_substr;
33b8afdf 3153 r->check_utf8 = r->float_utf8;
c277df42
IZ
3154 r->check_offset_min = data.offset_float_min;
3155 r->check_offset_max = data.offset_float_max;
a0d0e21e 3156 }
30382c73
IZ
3157 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
3158 This should be changed ASAP! */
33b8afdf 3159 if ((r->check_substr || r->check_utf8) && !(r->reganch & ROPT_ANCH_GPOS)) {
cad2e5aa 3160 r->reganch |= RE_USE_INTUIT;
33b8afdf 3161 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
cad2e5aa
JH
3162 r->reganch |= RE_INTUIT_TAIL;
3163 }
a0ed51b3
LW
3164 }
3165 else {
c277df42
IZ
3166 /* Several toplevels. Best we can is to set minlen. */
3167 I32 fake;
653099ff 3168 struct regnode_charclass_class ch_class;
cb434fcc 3169 I32 last_close = 0;
c277df42 3170
a3621e74 3171 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "\n"));
c277df42 3172 scan = r->program + 1;
830247a4 3173 cl_init(pRExC_state, &ch_class);
653099ff 3174 data.start_class = &ch_class;
cb434fcc 3175 data.last_closep = &last_close;
a3621e74 3176 minlen = study_chunk(pRExC_state, &scan, &fake, scan + RExC_size, &data, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
33b8afdf
JH
3177 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
3178 = r->float_substr = r->float_utf8 = Nullsv;
653099ff 3179 if (!(data.start_class->flags & ANYOF_EOS)
eb160463
GS
3180 && !cl_is_anything(data.start_class))
3181 {
830247a4 3182 I32 n = add_data(pRExC_state, 1, "f");
653099ff 3183
b81d288d 3184 New(1006, RExC_rx->data->data[n], 1,
653099ff
GS
3185 struct regnode_charclass_class);
3186 StructCopy(data.start_class,
830247a4 3187 (struct regnode_charclass_class*)RExC_rx->data->data[n],
653099ff 3188 struct regnode_charclass_class);
830247a4 3189 r->regstclass = (regnode*)RExC_rx->data->data[n];
653099ff 3190 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
a3621e74 3191 DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
9c5ffd7c
JH
3192 regprop(sv, (regnode*)data.start_class);
3193 PerlIO_printf(Perl_debug_log,
3194 "synthetic stclass `%s'.\n",
3195 SvPVX(sv));});
653099ff 3196 }
a0d0e21e
LW
3197 }
3198
a0d0e21e 3199 r->minlen = minlen;
b81d288d 3200 if (RExC_seen & REG_SEEN_GPOS)
c277df42 3201 r->reganch |= ROPT_GPOS_SEEN;
830247a4 3202 if (RExC_seen & REG_SEEN_LOOKBEHIND)
c277df42 3203 r->reganch |= ROPT_LOOKBEHIND_SEEN;
830247a4 3204 if (RExC_seen & REG_SEEN_EVAL)
ce862d02 3205 r->reganch |= ROPT_EVAL_SEEN;
f33976b4
DB
3206 if (RExC_seen & REG_SEEN_CANY)
3207 r->reganch |= ROPT_CANY_SEEN;
830247a4
IZ
3208 Newz(1002, r->startp, RExC_npar, I32);
3209 Newz(1002, r->endp, RExC_npar, I32);
ffc61ed2 3210 PL_regdata = r->data; /* for regprop() */
a3621e74 3211 DEBUG_COMPILE_r(regdump(r));
a0d0e21e 3212 return(r);
a687059c
LW
3213}
3214
3215/*
3216 - reg - regular expression, i.e. main body or parenthesized thing
3217 *
3218 * Caller must absorb opening parenthesis.
3219 *
3220 * Combining parenthesis handling with the base level of regular expression
3221 * is a trifle forced, but the need to tie the tails of the branches to what
3222 * follows makes it hard to avoid.
3223 */
76e3520e 3224STATIC regnode *
830247a4 3225S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp)
c277df42 3226 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
a687059c 3227{
c277df42
IZ
3228 register regnode *ret; /* Will be the head of the group. */
3229 register regnode *br;
3230 register regnode *lastbr;
3231 register regnode *ender = 0;
a0d0e21e 3232 register I32 parno = 0;
e2509266 3233 I32 flags, oregflags = RExC_flags, have_branch = 0, open = 0;
9d1d55b5
JP
3234
3235 /* for (?g), (?gc), and (?o) warnings; warning
3236 about (?c) will warn about (?g) -- japhy */
3237
3238 I32 wastedflags = 0x00,
3239 wasted_o = 0x01,
3240 wasted_g = 0x02,
3241 wasted_gc = 0x02 | 0x04,
3242 wasted_c = 0x04;
3243
fac92740 3244 char * parse_start = RExC_parse; /* MJD */
830247a4 3245 char *oregcomp_parse = RExC_parse;
c277df42 3246 char c;
a0d0e21e 3247
821b33a5 3248 *flagp = 0; /* Tentatively. */
a0d0e21e 3249
9d1d55b5 3250
a0d0e21e
LW
3251 /* Make an OPEN node, if parenthesized. */
3252 if (paren) {
fac92740 3253 if (*RExC_parse == '?') { /* (?...) */
2b36a5a0
JH
3254 U32 posflags = 0, negflags = 0;
3255 U32 *flagsp = &posflags;
0f5d15d6 3256 int logical = 0;
830247a4 3257 char *seqstart = RExC_parse;
ca9dfc88 3258
830247a4
IZ
3259 RExC_parse++;
3260 paren = *RExC_parse++;
c277df42 3261 ret = NULL; /* For look-ahead/behind. */
a0d0e21e 3262 switch (paren) {
fac92740 3263 case '<': /* (?<...) */
830247a4 3264 RExC_seen |= REG_SEEN_LOOKBEHIND;
b81d288d 3265 if (*RExC_parse == '!')
c277df42 3266 paren = ',';
b81d288d 3267 if (*RExC_parse != '=' && *RExC_parse != '!')
c277df42 3268 goto unknown;
830247a4 3269 RExC_parse++;
fac92740
MJD
3270 case '=': /* (?=...) */
3271 case '!': /* (?!...) */
830247a4 3272 RExC_seen_zerolen++;
fac92740
MJD
3273 case ':': /* (?:...) */
3274 case '>': /* (?>...) */
a0d0e21e 3275 break;
fac92740
MJD
3276 case '$': /* (?$...) */
3277 case '@': /* (?@...) */
8615cb43 3278 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
a0d0e21e 3279 break;
fac92740 3280 case '#': /* (?#...) */
830247a4
IZ
3281 while (*RExC_parse && *RExC_parse != ')')
3282 RExC_parse++;
3283 if (*RExC_parse != ')')
c277df42 3284 FAIL("Sequence (?#... not terminated");
830247a4 3285 nextchar(pRExC_state);
a0d0e21e
LW
3286 *flagp = TRYAGAIN;
3287 return NULL;
fac92740 3288 case 'p': /* (?p...) */
9014280d 3289 if (SIZE_ONLY && ckWARN2(WARN_DEPRECATED, WARN_REGEXP))
f9373011 3290 vWARNdep(RExC_parse, "(?p{}) is deprecated - use (??{})");
8c8ad484 3291 /* FALL THROUGH*/
fac92740 3292 case '?': /* (??...) */
0f5d15d6 3293 logical = 1;
438a3801
YST
3294 if (*RExC_parse != '{')
3295 goto unknown;
830247a4 3296 paren = *RExC_parse++;
0f5d15d6 3297 /* FALL THROUGH */
fac92740 3298 case '{': /* (?{...}) */
c277df42 3299 {
c277df42
IZ
3300 I32 count = 1, n = 0;
3301 char c;
830247a4 3302 char *s = RExC_parse;
c277df42
IZ
3303 SV *sv;
3304 OP_4tree *sop, *rop;
3305
830247a4
IZ
3306 RExC_seen_zerolen++;
3307 RExC_seen |= REG_SEEN_EVAL;
3308 while (count && (c = *RExC_parse)) {
3309 if (c == '\\' && RExC_parse[1])
3310 RExC_parse++;
b81d288d 3311 else if (c == '{')
c277df42 3312 count++;
b81d288d 3313 else if (c == '}')
c277df42 3314 count--;
830247a4 3315 RExC_parse++;
c277df42 3316 }
830247a4 3317 if (*RExC_parse != ')')
b45f050a 3318 {
b81d288d 3319 RExC_parse = s;
b45f050a
JF
3320 vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
3321 }
c277df42 3322 if (!SIZE_ONLY) {
f3548bdc 3323 PAD *pad;
b81d288d
AB
3324
3325 if (RExC_parse - 1 - s)
830247a4 3326 sv = newSVpvn(s, RExC_parse - 1 - s);
c277df42 3327 else
79cb57f6 3328 sv = newSVpvn("", 0);
c277df42 3329
569233ed
SB
3330 ENTER;
3331 Perl_save_re_context(aTHX);
f3548bdc 3332 rop = sv_compile_2op(sv, &sop, "re", &pad);
9b978d73
DM
3333 sop->op_private |= OPpREFCOUNTED;
3334 /* re_dup will OpREFCNT_inc */
3335 OpREFCNT_set(sop, 1);
569233ed 3336 LEAVE;
c277df42 3337
830247a4
IZ
3338 n = add_data(pRExC_state, 3, "nop");
3339 RExC_rx->data->data[n] = (void*)rop;
3340 RExC_rx->data->data[n+1] = (void*)sop;
f3548bdc 3341 RExC_rx->data->data[n+2] = (void*)pad;
c277df42 3342 SvREFCNT_dec(sv);
a0ed51b3 3343 }
e24b16f9 3344 else { /* First pass */
830247a4 3345 if (PL_reginterp_cnt < ++RExC_seen_evals
923e4eb5 3346 && IN_PERL_RUNTIME)
2cd61cdb
IZ
3347 /* No compiled RE interpolated, has runtime
3348 components ===> unsafe. */
3349 FAIL("Eval-group not allowed at runtime, use re 'eval'");
5b61d3f7 3350 if (PL_tainting && PL_tainted)
cc6b7395 3351 FAIL("Eval-group in insecure regular expression");
923e4eb5 3352 if (IN_PERL_COMPILETIME)
b5c19bd7 3353 PL_cv_has_eval = 1;
c277df42 3354 }
b5c19bd7 3355
830247a4 3356 nextchar(pRExC_state);
0f5d15d6 3357 if (logical) {
830247a4 3358 ret = reg_node(pRExC_state, LOGICAL);
0f5d15d6
IZ
3359 if (!SIZE_ONLY)
3360 ret->flags = 2;
830247a4 3361 regtail(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
fac92740 3362 /* deal with the length of this later - MJD */
0f5d15d6
IZ
3363 return ret;
3364 }
ccb2c380
MP
3365 ret = reganode(pRExC_state, EVAL, n);
3366 Set_Node_Length(ret, RExC_parse - parse_start + 1);
3367 Set_Node_Offset(ret, parse_start);
3368 return ret;
c277df42 3369 }
fac92740 3370 case '(': /* (?(?{...})...) and (?(?=...)...) */
c277df42 3371 {
fac92740 3372 if (RExC_parse[0] == '?') { /* (?(?...)) */
b81d288d
AB
3373 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
3374 || RExC_parse[1] == '<'
830247a4 3375 || RExC_parse[1] == '{') { /* Lookahead or eval. */
c277df42
IZ
3376 I32 flag;
3377
830247a4 3378 ret = reg_node(pRExC_state, LOGICAL);
0f5d15d6
IZ
3379 if (!SIZE_ONLY)
3380 ret->flags = 1;
830247a4 3381 regtail(pRExC_state, ret, reg(pRExC_state, 1, &flag));
c277df42 3382 goto insert_if;
b81d288d 3383 }
a0ed51b3 3384 }
830247a4 3385 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
fac92740 3386 /* (?(1)...) */
830247a4 3387 parno = atoi(RExC_parse++);
c277df42 3388
830247a4
IZ
3389 while (isDIGIT(*RExC_parse))
3390 RExC_parse++;
fac92740
MJD
3391 ret = reganode(pRExC_state, GROUPP, parno);
3392
830247a4 3393 if ((c = *nextchar(pRExC_state)) != ')')
b45f050a 3394 vFAIL("Switch condition not recognized");
c277df42 3395 insert_if:
830247a4
IZ
3396 regtail(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
3397 br = regbranch(pRExC_state, &flags, 1);
c277df42 3398 if (br == NULL)
830247a4 3399 br = reganode(pRExC_state, LONGJMP, 0);
c277df42 3400 else
830247a4
IZ
3401 regtail(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
3402 c = *nextchar(pRExC_state);
d1b80229
IZ
3403 if (flags&HASWIDTH)
3404 *flagp |= HASWIDTH;
c277df42 3405 if (c == '|') {
830247a4
IZ
3406 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
3407 regbranch(pRExC_state, &flags, 1);
3408 regtail(pRExC_state, ret, lastbr);
d1b80229
IZ
3409 if (flags&HASWIDTH)
3410 *flagp |= HASWIDTH;
830247a4 3411 c = *nextchar(pRExC_state);
a0ed51b3
LW
3412 }
3413 else
c277df42
IZ
3414 lastbr = NULL;
3415 if (c != ')')
8615cb43 3416 vFAIL("Switch (?(condition)... contains too many branches");
830247a4
IZ
3417 ender = reg_node(pRExC_state, TAIL);
3418 regtail(pRExC_state, br, ender);
c277df42 3419 if (lastbr) {
830247a4
IZ
3420 regtail(pRExC_state, lastbr, ender);
3421 regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
a0ed51b3
LW
3422 }
3423 else
830247a4 3424 regtail(pRExC_state, ret, ender);
c277df42 3425 return ret;
a0ed51b3
LW
3426 }
3427 else {
830247a4 3428 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
c277df42
IZ
3429 }
3430 }
1b1626e4 3431 case 0:
830247a4 3432 RExC_parse--; /* for vFAIL to print correctly */
8615cb43 3433 vFAIL("Sequence (? incomplete");
1b1626e4 3434 break;
a0d0e21e 3435 default:
830247a4 3436 --RExC_parse;
fac92740 3437 parse_flags: /* (?i) */
830247a4 3438 while (*RExC_parse && strchr("iogcmsx", *RExC_parse)) {
9d1d55b5
JP
3439 /* (?g), (?gc) and (?o) are useless here
3440 and must be globally applied -- japhy */
3441
3442 if (*RExC_parse == 'o' || *RExC_parse == 'g') {
3443 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
3444 I32 wflagbit = *RExC_parse == 'o' ? wasted_o : wasted_g;
3445 if (! (wastedflags & wflagbit) ) {
3446 wastedflags |= wflagbit;
3447 vWARN5(
3448 RExC_parse + 1,
3449 "Useless (%s%c) - %suse /%c modifier",
3450 flagsp == &negflags ? "?-" : "?",
3451 *RExC_parse,
3452 flagsp == &negflags ? "don't " : "",
3453 *RExC_parse
3454 );
3455 }
3456 }
3457 }
3458 else if (*RExC_parse == 'c') {
3459 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
3460 if (! (wastedflags & wasted_c) ) {
3461 wastedflags |= wasted_gc;
3462 vWARN3(
3463 RExC_parse + 1,
3464 "Useless (%sc) - %suse /gc modifier",
3465 flagsp == &negflags ? "?-" : "?",
3466 flagsp == &negflags ? "don't " : ""
3467 );
3468 }
3469 }
3470 }
3471 else { pmflag(flagsp, *RExC_parse); }
3472
830247a4 3473 ++RExC_parse;
ca9dfc88 3474 }
830247a4 3475 if (*RExC_parse == '-') {
ca9dfc88 3476 flagsp = &negflags;
9d1d55b5 3477 wastedflags = 0; /* reset so (?g-c) warns twice */
830247a4 3478 ++RExC_parse;
ca9dfc88 3479 goto parse_flags;
48c036b1 3480 }
e2509266
JH
3481 RExC_flags |= posflags;
3482 RExC_flags &= ~negflags;
830247a4
IZ
3483 if (*RExC_parse == ':') {
3484 RExC_parse++;
ca9dfc88
IZ
3485 paren = ':';
3486 break;
3487 }
c277df42 3488 unknown:
830247a4
IZ
3489 if (*RExC_parse != ')') {
3490 RExC_parse++;
3491 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
b45f050a 3492 }
830247a4 3493 nextchar(pRExC_state);
a0d0e21e
LW
3494 *flagp = TRYAGAIN;
3495 return NULL;
3496 }
3497 }
fac92740 3498 else { /* (...) */
830247a4
IZ
3499 parno = RExC_npar;
3500 RExC_npar++;
3501 ret = reganode(pRExC_state, OPEN, parno);
fac92740
MJD
3502 Set_Node_Length(ret, 1); /* MJD */
3503 Set_Node_Offset(ret, RExC_parse); /* MJD */
c277df42 3504 open = 1;
a0d0e21e 3505 }
a0ed51b3 3506 }
fac92740 3507 else /* ! paren */
a0d0e21e
LW
3508 ret = NULL;
3509
3510 /* Pick up the branches, linking them together. */
fac92740 3511 parse_start = RExC_parse; /* MJD */
830247a4 3512 br = regbranch(pRExC_state, &flags, 1);
fac92740
MJD
3513 /* branch_len = (paren != 0); */
3514
a0d0e21e
LW
3515 if (br == NULL)
3516 return(NULL);
830247a4
IZ
3517 if (*RExC_parse == '|') {
3518 if (!SIZE_ONLY && RExC_extralen) {
3519 reginsert(pRExC_state, BRANCHJ, br);
a0ed51b3 3520 }
fac92740 3521 else { /* MJD */
830247a4 3522 reginsert(pRExC_state, BRANCH, br);
fac92740
MJD
3523 Set_Node_Length(br, paren != 0);
3524 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
3525 }
c277df42
IZ
3526 have_branch = 1;
3527 if (SIZE_ONLY)
830247a4 3528 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
a0ed51b3
LW
3529 }
3530 else if (paren == ':') {
c277df42
IZ
3531 *flagp |= flags&SIMPLE;
3532 }
3533 if (open) { /* Starts with OPEN. */
830247a4 3534 regtail(pRExC_state, ret, br); /* OPEN -> first. */
a0ed51b3
LW
3535 }
3536 else if (paren != '?') /* Not Conditional */
a0d0e21e 3537 ret = br;
32a0ca98 3538 *flagp |= flags & (SPSTART | HASWIDTH);
c277df42 3539 lastbr = br;
830247a4
IZ
3540 while (*RExC_parse == '|') {
3541 if (!SIZE_ONLY && RExC_extralen) {
3542 ender = reganode(pRExC_state, LONGJMP,0);
3543 regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
c277df42
IZ
3544 }
3545 if (SIZE_ONLY)
830247a4
IZ
3546 RExC_extralen += 2; /* Account for LONGJMP. */
3547 nextchar(pRExC_state);
3548 br = regbranch(pRExC_state, &flags, 0);
fac92740 3549
a687059c 3550 if (br == NULL)
a0d0e21e 3551 return(NULL);
830247a4 3552 regtail(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
c277df42 3553 lastbr = br;
821b33a5
IZ
3554 if (flags&HASWIDTH)
3555 *flagp |= HASWIDTH;
a687059c 3556 *flagp |= flags&SPSTART;
a0d0e21e
LW
3557 }
3558
c277df42
IZ
3559 if (have_branch || paren != ':') {
3560 /* Make a closing node, and hook it on the end. */
3561 switch (paren) {
3562 case ':':
830247a4 3563 ender = reg_node(pRExC_state, TAIL);
c277df42
IZ
3564 break;
3565 case 1:
830247a4 3566 ender = reganode(pRExC_state, CLOSE, parno);
fac92740
MJD
3567 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
3568 Set_Node_Length(ender,1); /* MJD */
c277df42
IZ
3569 break;
3570 case '<':
c277df42
IZ
3571 case ',':
3572 case '=':
3573 case '!':
c277df42 3574 *flagp &= ~HASWIDTH;
821b33a5
IZ
3575 /* FALL THROUGH */
3576 case '>':
830247a4 3577 ender = reg_node(pRExC_state, SUCCEED);
c277df42
IZ
3578 break;
3579 case 0:
830247a4 3580 ender = reg_node(pRExC_state, END);
c277df42
IZ
3581 break;
3582 }
830247a4 3583 regtail(pRExC_state, lastbr, ender);
a0d0e21e 3584
c277df42
IZ
3585 if (have_branch) {
3586 /* Hook the tails of the branches to the closing node. */
3587 for (br = ret; br != NULL; br = regnext(br)) {
830247a4 3588 regoptail(pRExC_state, br, ender);
c277df42
IZ
3589 }
3590 }
a0d0e21e 3591 }
c277df42
IZ
3592
3593 {
bfed75c6
AL
3594 const char *p;
3595 static const char parens[] = "=!<,>";
c277df42
IZ
3596
3597 if (paren && (p = strchr(parens, paren))) {
eb160463 3598 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
c277df42
IZ
3599 int flag = (p - parens) > 1;
3600
3601 if (paren == '>')
3602 node = SUSPEND, flag = 0;
830247a4 3603 reginsert(pRExC_state, node,ret);
45948336
EP
3604 Set_Node_Cur_Length(ret);
3605 Set_Node_Offset(ret, parse_start + 1);
c277df42 3606 ret->flags = flag;
830247a4 3607 regtail(pRExC_state, ret, reg_node(pRExC_state, TAIL));
c277df42 3608 }
a0d0e21e
LW
3609 }
3610
3611 /* Check for proper termination. */
ce3e6498 3612 if (paren) {
e2509266 3613 RExC_flags = oregflags;
830247a4
IZ
3614 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
3615 RExC_parse = oregcomp_parse;
380a0633 3616 vFAIL("Unmatched (");
ce3e6498 3617 }
a0ed51b3 3618 }
830247a4
IZ
3619 else if (!paren && RExC_parse < RExC_end) {
3620 if (*RExC_parse == ')') {
3621 RExC_parse++;
380a0633 3622 vFAIL("Unmatched )");
a0ed51b3
LW
3623 }
3624 else
b45f050a 3625 FAIL("Junk on end of regexp"); /* "Can't happen". */
a0d0e21e
LW
3626 /* NOTREACHED */
3627 }
a687059c 3628
a0d0e21e 3629 return(ret);
a687059c
LW
3630}
3631
3632/*
3633 - regbranch - one alternative of an | operator
3634 *
3635 * Implements the concatenation operator.
3636 */
76e3520e 3637STATIC regnode *
830247a4 3638S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first)
a687059c 3639{
c277df42
IZ
3640 register regnode *ret;
3641 register regnode *chain = NULL;
3642 register regnode *latest;
3643 I32 flags = 0, c = 0;
a0d0e21e 3644
b81d288d 3645 if (first)
c277df42
IZ
3646 ret = NULL;
3647 else {
b81d288d 3648 if (!SIZE_ONLY && RExC_extralen)
830247a4 3649 ret = reganode(pRExC_state, BRANCHJ,0);
fac92740 3650 else {
830247a4 3651 ret = reg_node(pRExC_state, BRANCH);
fac92740
MJD
3652 Set_Node_Length(ret, 1);
3653 }
c277df42
IZ
3654 }
3655
b81d288d 3656 if (!first && SIZE_ONLY)
830247a4 3657 RExC_extralen += 1; /* BRANCHJ */
b81d288d 3658
c277df42 3659 *flagp = WORST; /* Tentatively. */
a0d0e21e 3660
830247a4
IZ
3661 RExC_parse--;
3662 nextchar(pRExC_state);
3663 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
a0d0e21e 3664 flags &= ~TRYAGAIN;
830247a4 3665 latest = regpiece(pRExC_state, &flags);
a0d0e21e
LW
3666 if (latest == NULL) {
3667 if (flags & TRYAGAIN)
3668 continue;
3669 return(NULL);
a0ed51b3
LW
3670 }
3671 else if (ret == NULL)
c277df42 3672 ret = latest;
a0d0e21e 3673 *flagp |= flags&HASWIDTH;
c277df42 3674 if (chain == NULL) /* First piece. */
a0d0e21e
LW
3675 *flagp |= flags&SPSTART;
3676 else {
830247a4
IZ
3677 RExC_naughty++;
3678 regtail(pRExC_state, chain, latest);
a687059c 3679 }
a0d0e21e 3680 chain = latest;
c277df42
IZ
3681 c++;
3682 }
3683 if (chain == NULL) { /* Loop ran zero times. */
830247a4 3684 chain = reg_node(pRExC_state, NOTHING);
c277df42
IZ
3685 if (ret == NULL)
3686 ret = chain;
3687 }
3688 if (c == 1) {
3689 *flagp |= flags&SIMPLE;
a0d0e21e 3690 }
a687059c 3691
a0d0e21e 3692 return(ret);
a687059c
LW
3693}
3694
3695/*
3696 - regpiece - something followed by possible [*+?]
3697 *
3698 * Note that the branching code sequences used for ? and the general cases
3699 * of * and + are somewhat optimized: they use the same NOTHING node as
3700 * both the endmarker for their branch list and the body of the last branch.
3701 * It might seem that this node could be dispensed with entirely, but the
3702 * endmarker role is not redundant.
3703 */
76e3520e 3704STATIC regnode *
830247a4 3705S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
a687059c 3706{
c277df42 3707 register regnode *ret;
a0d0e21e
LW
3708 register char op;
3709 register char *next;
3710 I32 flags;
830247a4 3711 char *origparse = RExC_parse;
a0d0e21e
LW
3712 char *maxpos;
3713 I32 min;
c277df42 3714 I32 max = REG_INFTY;
fac92740 3715 char *parse_start;
a0d0e21e 3716
830247a4 3717 ret = regatom(pRExC_state, &flags);
a0d0e21e
LW
3718 if (ret == NULL) {
3719 if (flags & TRYAGAIN)
3720 *flagp |= TRYAGAIN;
3721 return(NULL);
3722 }
3723
830247a4 3724 op = *RExC_parse;
a0d0e21e 3725
830247a4 3726 if (op == '{' && regcurly(RExC_parse)) {
fac92740 3727 parse_start = RExC_parse; /* MJD */
830247a4 3728 next = RExC_parse + 1;
a0d0e21e
LW
3729 maxpos = Nullch;
3730 while (isDIGIT(*next) || *next == ',') {
3731 if (*next == ',') {
3732 if (maxpos)
3733 break;
3734 else
3735 maxpos = next;
a687059c 3736 }
a0d0e21e
LW
3737 next++;
3738 }
3739 if (*next == '}') { /* got one */
3740 if (!maxpos)
3741 maxpos = next;
830247a4
IZ
3742 RExC_parse++;
3743 min = atoi(RExC_parse);
a0d0e21e
LW
3744 if (*maxpos == ',')
3745 maxpos++;
3746 else
830247a4 3747 maxpos = RExC_parse;
a0d0e21e
LW
3748 max = atoi(maxpos);
3749 if (!max && *maxpos != '0')
c277df42
IZ
3750 max = REG_INFTY; /* meaning "infinity" */
3751 else if (max >= REG_INFTY)
8615cb43 3752 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
830247a4
IZ
3753 RExC_parse = next;
3754 nextchar(pRExC_state);
a0d0e21e
LW
3755
3756 do_curly:
3757 if ((flags&SIMPLE)) {
830247a4
IZ
3758 RExC_naughty += 2 + RExC_naughty / 2;
3759 reginsert(pRExC_state, CURLY, ret);
fac92740
MJD
3760 Set_Node_Offset(ret, parse_start+1); /* MJD */
3761 Set_Node_Cur_Length(ret);
a0d0e21e
LW
3762 }
3763 else {
830247a4 3764 regnode *w = reg_node(pRExC_state, WHILEM);
2c2d71f5
JH
3765
3766 w->flags = 0;
830247a4
IZ
3767 regtail(pRExC_state, ret, w);
3768 if (!SIZE_ONLY && RExC_extralen) {
3769 reginsert(pRExC_state, LONGJMP,ret);
3770 reginsert(pRExC_state, NOTHING,ret);
c277df42
IZ
3771 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
3772 }
830247a4 3773 reginsert(pRExC_state, CURLYX,ret);
fac92740
MJD
3774 /* MJD hk */
3775 Set_Node_Offset(ret, parse_start+1);
3776 Set_Node_Length(ret,
3777 op == '{' ? (RExC_parse - parse_start) : 1);
3778
830247a4 3779 if (!SIZE_ONLY && RExC_extralen)
c277df42 3780 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
830247a4 3781 regtail(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
c277df42 3782 if (SIZE_ONLY)
830247a4
IZ
3783 RExC_whilem_seen++, RExC_extralen += 3;
3784 RExC_naughty += 4 + RExC_naughty; /* compound interest */
a0d0e21e 3785 }
c277df42 3786 ret->flags = 0;
a0d0e21e
LW
3787
3788 if (min > 0)
821b33a5
IZ
3789 *flagp = WORST;
3790 if (max > 0)
3791 *flagp |= HASWIDTH;
a0d0e21e 3792 if (max && max < min)
8615cb43 3793 vFAIL("Can't do {n,m} with n > m");
c277df42 3794 if (!SIZE_ONLY) {
eb160463
GS
3795 ARG1_SET(ret, (U16)min);
3796 ARG2_SET(ret, (U16)max);
a687059c 3797 }
a687059c 3798
a0d0e21e 3799 goto nest_check;
a687059c 3800 }
a0d0e21e 3801 }
a687059c 3802
a0d0e21e
LW
3803 if (!ISMULT1(op)) {
3804 *flagp = flags;
a687059c 3805 return(ret);
a0d0e21e 3806 }
bb20fd44 3807
c277df42 3808#if 0 /* Now runtime fix should be reliable. */
b45f050a
JF
3809
3810 /* if this is reinstated, don't forget to put this back into perldiag:
3811
3812 =item Regexp *+ operand could be empty at {#} in regex m/%s/
3813
3814 (F) The part of the regexp subject to either the * or + quantifier
3815 could match an empty string. The {#} shows in the regular
3816 expression about where the problem was discovered.
3817
3818 */
3819
bb20fd44 3820 if (!(flags&HASWIDTH) && op != '?')
b45f050a 3821 vFAIL("Regexp *+ operand could be empty");
b81d288d 3822#endif
bb20fd44 3823
fac92740 3824 parse_start = RExC_parse;
830247a4 3825 nextchar(pRExC_state);
a0d0e21e 3826
821b33a5 3827 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
a0d0e21e
LW
3828
3829 if (op == '*' && (flags&SIMPLE)) {
830247a4 3830 reginsert(pRExC_state, STAR, ret);
c277df42 3831 ret->flags = 0;
830247a4 3832 RExC_naughty += 4;
a0d0e21e
LW
3833 }
3834 else if (op == '*') {
3835 min = 0;
3836 goto do_curly;
a0ed51b3
LW
3837 }
3838 else if (op == '+' && (flags&SIMPLE)) {
830247a4 3839 reginsert(pRExC_state, PLUS, ret);
c277df42 3840 ret->flags = 0;
830247a4 3841 RExC_naughty += 3;
a0d0e21e
LW
3842 }
3843 else if (op == '+') {
3844 min = 1;
3845 goto do_curly;
a0ed51b3
LW
3846 }
3847 else if (op == '?') {
a0d0e21e
LW
3848 min = 0; max = 1;
3849 goto do_curly;
3850 }
3851 nest_check:
e476b1b5 3852 if (ckWARN(WARN_REGEXP) && !SIZE_ONLY && !(flags&HASWIDTH) && max > REG_INFTY/3) {
830247a4 3853 vWARN3(RExC_parse,
b45f050a 3854 "%.*s matches null string many times",
830247a4 3855 RExC_parse - origparse,
b45f050a 3856 origparse);
a0d0e21e
LW
3857 }
3858
830247a4
IZ
3859 if (*RExC_parse == '?') {
3860 nextchar(pRExC_state);
3861 reginsert(pRExC_state, MINMOD, ret);
3862 regtail(pRExC_state, ret, ret + NODE_STEP_REGNODE);
a0d0e21e 3863 }
830247a4
IZ
3864 if (ISMULT2(RExC_parse)) {
3865 RExC_parse++;
b45f050a
JF
3866 vFAIL("Nested quantifiers");
3867 }
a0d0e21e
LW
3868
3869 return(ret);
a687059c
LW
3870}
3871
3872/*
3873 - regatom - the lowest level
3874 *
3875 * Optimization: gobbles an entire sequence of ordinary characters so that
3876 * it can turn them into a single node, which is smaller to store and
3877 * faster to run. Backslashed characters are exceptions, each becoming a
3878 * separate node; the code is simpler that way and it's not worth fixing.
3879 *
b45f050a 3880 * [Yes, it is worth fixing, some scripts can run twice the speed.] */
76e3520e 3881STATIC regnode *
830247a4 3882S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
a687059c 3883{
c277df42 3884 register regnode *ret = 0;
a0d0e21e 3885 I32 flags;
45948336 3886 char *parse_start = RExC_parse;
a0d0e21e
LW
3887
3888 *flagp = WORST; /* Tentatively. */
3889
3890tryagain:
830247a4 3891 switch (*RExC_parse) {
a0d0e21e 3892 case '^':
830247a4
IZ
3893 RExC_seen_zerolen++;
3894 nextchar(pRExC_state);
e2509266 3895 if (RExC_flags & PMf_MULTILINE)
830247a4 3896 ret = reg_node(pRExC_state, MBOL);
e2509266 3897 else if (RExC_flags & PMf_SINGLELINE)
830247a4 3898 ret = reg_node(pRExC_state, SBOL);
a0d0e21e 3899 else
830247a4 3900 ret = reg_node(pRExC_state, BOL);
fac92740 3901 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
3902 break;
3903 case '$':
830247a4 3904 nextchar(pRExC_state);
b81d288d 3905 if (*RExC_parse)
830247a4 3906 RExC_seen_zerolen++;
e2509266 3907 if (RExC_flags & PMf_MULTILINE)
830247a4 3908 ret = reg_node(pRExC_state, MEOL);
e2509266 3909 else if (RExC_flags & PMf_SINGLELINE)
830247a4 3910 ret = reg_node(pRExC_state, SEOL);
a0d0e21e 3911 else
830247a4 3912 ret = reg_node(pRExC_state, EOL);
fac92740 3913 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
3914 break;
3915 case '.':
830247a4 3916 nextchar(pRExC_state);
e2509266 3917 if (RExC_flags & PMf_SINGLELINE)
ffc61ed2
JH
3918 ret = reg_node(pRExC_state, SANY);
3919 else
3920 ret = reg_node(pRExC_state, REG_ANY);
3921 *flagp |= HASWIDTH|SIMPLE;
830247a4 3922 RExC_naughty++;
fac92740 3923 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
3924 break;
3925 case '[':
b45f050a 3926 {
830247a4 3927 char *oregcomp_parse = ++RExC_parse;
ffc61ed2 3928 ret = regclass(pRExC_state);
830247a4
IZ
3929 if (*RExC_parse != ']') {
3930 RExC_parse = oregcomp_parse;
b45f050a
JF
3931 vFAIL("Unmatched [");
3932 }
830247a4 3933 nextchar(pRExC_state);
a0d0e21e 3934 *flagp |= HASWIDTH|SIMPLE;
fac92740 3935 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
a0d0e21e 3936 break;
b45f050a 3937 }
a0d0e21e 3938 case '(':
830247a4
IZ
3939 nextchar(pRExC_state);
3940 ret = reg(pRExC_state, 1, &flags);
a0d0e21e 3941 if (ret == NULL) {
bf93d4cc 3942 if (flags & TRYAGAIN) {
830247a4 3943 if (RExC_parse == RExC_end) {
bf93d4cc
GS
3944 /* Make parent create an empty node if needed. */
3945 *flagp |= TRYAGAIN;
3946 return(NULL);
3947 }
a0d0e21e 3948 goto tryagain;
bf93d4cc 3949 }
a0d0e21e
LW
3950 return(NULL);
3951 }
c277df42 3952 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE);
a0d0e21e
LW
3953 break;
3954 case '|':
3955 case ')':
3956 if (flags & TRYAGAIN) {
3957 *flagp |= TRYAGAIN;
3958 return NULL;
3959 }
b45f050a 3960 vFAIL("Internal urp");
a0d0e21e
LW
3961 /* Supposed to be caught earlier. */
3962 break;
85afd4ae 3963 case '{':
830247a4
IZ
3964 if (!regcurly(RExC_parse)) {
3965 RExC_parse++;
85afd4ae
CS
3966 goto defchar;
3967 }
3968 /* FALL THROUGH */
a0d0e21e
LW
3969 case '?':
3970 case '+':
3971 case '*':
830247a4 3972 RExC_parse++;
b45f050a 3973 vFAIL("Quantifier follows nothing");
a0d0e21e
LW
3974 break;
3975 case '\\':
830247a4 3976 switch (*++RExC_parse) {
a0d0e21e 3977 case 'A':
830247a4
IZ
3978 RExC_seen_zerolen++;
3979 ret = reg_node(pRExC_state, SBOL);
a0d0e21e 3980 *flagp |= SIMPLE;
830247a4 3981 nextchar(pRExC_state);
fac92740 3982 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
3983 break;
3984 case 'G':
830247a4
IZ
3985 ret = reg_node(pRExC_state, GPOS);
3986 RExC_seen |= REG_SEEN_GPOS;
a0d0e21e 3987 *flagp |= SIMPLE;
830247a4 3988 nextchar(pRExC_state);
fac92740 3989 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
3990 break;
3991 case 'Z':
830247a4 3992 ret = reg_node(pRExC_state, SEOL);
a0d0e21e 3993 *flagp |= SIMPLE;
a1917ab9 3994 RExC_seen_zerolen++; /* Do not optimize RE away */
830247a4 3995 nextchar(pRExC_state);
a0d0e21e 3996 break;
b85d18e9 3997 case 'z':
830247a4 3998 ret = reg_node(pRExC_state, EOS);
b85d18e9 3999 *flagp |= SIMPLE;
830247a4
IZ
4000 RExC_seen_zerolen++; /* Do not optimize RE away */
4001 nextchar(pRExC_state);
fac92740 4002 Set_Node_Length(ret, 2); /* MJD */
b85d18e9 4003 break;
4a2d328f 4004 case 'C':
f33976b4
DB
4005 ret = reg_node(pRExC_state, CANY);
4006 RExC_seen |= REG_SEEN_CANY;
a0ed51b3 4007 *flagp |= HASWIDTH|SIMPLE;
830247a4 4008 nextchar(pRExC_state);
fac92740 4009 Set_Node_Length(ret, 2); /* MJD */
a0ed51b3
LW
4010 break;
4011 case 'X':
830247a4 4012 ret = reg_node(pRExC_state, CLUMP);
a0ed51b3 4013 *flagp |= HASWIDTH;
830247a4 4014 nextchar(pRExC_state);
fac92740 4015 Set_Node_Length(ret, 2); /* MJD */
a0ed51b3 4016 break;
a0d0e21e 4017 case 'w':
eb160463 4018 ret = reg_node(pRExC_state, (U8)(LOC ? ALNUML : ALNUM));
a0d0e21e 4019 *flagp |= HASWIDTH|SIMPLE;
830247a4 4020 nextchar(pRExC_state);
fac92740 4021 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
4022 break;
4023 case 'W':
eb160463 4024 ret = reg_node(pRExC_state, (U8)(LOC ? NALNUML : NALNUM));
a0d0e21e 4025 *flagp |= HASWIDTH|SIMPLE;
830247a4 4026 nextchar(pRExC_state);
fac92740 4027 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
4028 break;
4029 case 'b':
830247a4
IZ
4030 RExC_seen_zerolen++;
4031 RExC_seen |= REG_SEEN_LOOKBEHIND;
eb160463 4032 ret = reg_node(pRExC_state, (U8)(LOC ? BOUNDL : BOUND));
a0d0e21e 4033 *flagp |= SIMPLE;
830247a4 4034 nextchar(pRExC_state);
fac92740 4035 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
4036 break;
4037 case 'B':
830247a4
IZ
4038 RExC_seen_zerolen++;
4039 RExC_seen |= REG_SEEN_LOOKBEHIND;
eb160463 4040 ret = reg_node(pRExC_state, (U8)(LOC ? NBOUNDL : NBOUND));
a0d0e21e 4041 *flagp |= SIMPLE;
830247a4 4042 nextchar(pRExC_state);
fac92740 4043 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
4044 break;
4045 case 's':
eb160463 4046 ret = reg_node(pRExC_state, (U8)(LOC ? SPACEL : SPACE));
a0d0e21e 4047 *flagp |= HASWIDTH|SIMPLE;
830247a4 4048 nextchar(pRExC_state);
fac92740 4049 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
4050 break;
4051 case 'S':
eb160463 4052 ret = reg_node(pRExC_state, (U8)(LOC ? NSPACEL : NSPACE));
a0d0e21e 4053 *flagp |= HASWIDTH|SIMPLE;
830247a4 4054 nextchar(pRExC_state);
fac92740 4055 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
4056 break;
4057 case 'd':
ffc61ed2 4058 ret = reg_node(pRExC_state, DIGIT);
a0d0e21e 4059 *flagp |= HASWIDTH|SIMPLE;
830247a4 4060 nextchar(pRExC_state);
fac92740 4061 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
4062 break;
4063 case 'D':
ffc61ed2 4064 ret = reg_node(pRExC_state, NDIGIT);
a0d0e21e 4065 *flagp |= HASWIDTH|SIMPLE;
830247a4 4066 nextchar(pRExC_state);
fac92740 4067 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e 4068 break;
a14b48bc
LW
4069 case 'p':
4070 case 'P':
3568d838 4071 {
830247a4 4072 char* oldregxend = RExC_end;
ccb2c380 4073 char* parse_start = RExC_parse - 2;
a14b48bc 4074
830247a4 4075 if (RExC_parse[1] == '{') {
3568d838 4076 /* a lovely hack--pretend we saw [\pX] instead */
830247a4
IZ
4077 RExC_end = strchr(RExC_parse, '}');
4078 if (!RExC_end) {
0da60cf5 4079 U8 c = (U8)*RExC_parse;
830247a4
IZ
4080 RExC_parse += 2;
4081 RExC_end = oldregxend;
0da60cf5 4082 vFAIL2("Missing right brace on \\%c{}", c);
b45f050a 4083 }
830247a4 4084 RExC_end++;
a14b48bc 4085 }
af6f566e 4086 else {
830247a4 4087 RExC_end = RExC_parse + 2;
af6f566e
HS
4088 if (RExC_end > oldregxend)
4089 RExC_end = oldregxend;
4090 }
830247a4 4091 RExC_parse--;
a14b48bc 4092
ffc61ed2 4093 ret = regclass(pRExC_state);
a14b48bc 4094
830247a4
IZ
4095 RExC_end = oldregxend;
4096 RExC_parse--;
ccb2c380
MP
4097
4098 Set_Node_Offset(ret, parse_start + 2);
4099 Set_Node_Cur_Length(ret);
830247a4 4100 nextchar(pRExC_state);
a14b48bc
LW
4101 *flagp |= HASWIDTH|SIMPLE;
4102 }
4103 break;
a0d0e21e
LW
4104 case 'n':
4105 case 'r':
4106 case 't':
4107 case 'f':
4108 case 'e':
4109 case 'a':
4110 case 'x':
4111 case 'c':
4112 case '0':
4113 goto defchar;
4114 case '1': case '2': case '3': case '4':
4115 case '5': case '6': case '7': case '8': case '9':
4116 {
830247a4 4117 I32 num = atoi(RExC_parse);
a0d0e21e 4118
830247a4 4119 if (num > 9 && num >= RExC_npar)
a0d0e21e
LW
4120 goto defchar;
4121 else {
fac92740 4122 char * parse_start = RExC_parse - 1; /* MJD */
830247a4
IZ
4123 while (isDIGIT(*RExC_parse))
4124 RExC_parse++;
b45f050a 4125
eb160463 4126 if (!SIZE_ONLY && num > (I32)RExC_rx->nparens)
9baa0206 4127 vFAIL("Reference to nonexistent group");
830247a4 4128 RExC_sawback = 1;
eb160463
GS
4129 ret = reganode(pRExC_state,
4130 (U8)(FOLD ? (LOC ? REFFL : REFF) : REF),
4131 num);
a0d0e21e 4132 *flagp |= HASWIDTH;
fac92740
MJD
4133
4134 /* override incorrect value set in reganode MJD */
4135 Set_Node_Offset(ret, parse_start+1);
4136 Set_Node_Cur_Length(ret); /* MJD */
830247a4
IZ
4137 RExC_parse--;
4138 nextchar(pRExC_state);
a0d0e21e
LW
4139 }
4140 }
4141 break;
4142 case '\0':
830247a4 4143 if (RExC_parse >= RExC_end)
b45f050a 4144 FAIL("Trailing \\");
a0d0e21e
LW
4145 /* FALL THROUGH */
4146 default:
c9f97d15
IZ
4147 /* Do not generate `unrecognized' warnings here, we fall
4148 back into the quick-grab loop below */
45948336 4149 parse_start--;
a0d0e21e
LW
4150 goto defchar;
4151 }
4152 break;
4633a7c4
LW
4153
4154 case '#':
e2509266 4155 if (RExC_flags & PMf_EXTENDED) {
830247a4
IZ
4156 while (RExC_parse < RExC_end && *RExC_parse != '\n') RExC_parse++;
4157 if (RExC_parse < RExC_end)
4633a7c4
LW
4158 goto tryagain;
4159 }
4160 /* FALL THROUGH */
4161
a0d0e21e 4162 default: {
ba210ebe 4163 register STRLEN len;
58ae7d3f 4164 register UV ender;
a0d0e21e 4165 register char *p;
c277df42 4166 char *oldp, *s;
ba210ebe 4167 STRLEN numlen;
80aecb99 4168 STRLEN foldlen;
89ebb4a3 4169 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
f06dbbb7
JH
4170
4171 parse_start = RExC_parse - 1;
a0d0e21e 4172
830247a4 4173 RExC_parse++;
a0d0e21e
LW
4174
4175 defchar:
58ae7d3f 4176 ender = 0;
eb160463
GS
4177 ret = reg_node(pRExC_state,
4178 (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
cd439c50 4179 s = STRING(ret);
830247a4
IZ
4180 for (len = 0, p = RExC_parse - 1;
4181 len < 127 && p < RExC_end;
a0d0e21e
LW
4182 len++)
4183 {
4184 oldp = p;
5b5a24f7 4185
e2509266 4186 if (RExC_flags & PMf_EXTENDED)
830247a4 4187 p = regwhite(p, RExC_end);
a0d0e21e
LW
4188 switch (*p) {
4189 case '^':
4190 case '$':
4191 case '.':
4192 case '[':
4193 case '(':
4194 case ')':
4195 case '|':
4196 goto loopdone;
4197 case '\\':
4198 switch (*++p) {
4199 case 'A':
1ed8eac0
JF
4200 case 'C':
4201 case 'X':
a0d0e21e
LW
4202 case 'G':
4203 case 'Z':
b85d18e9 4204 case 'z':
a0d0e21e
LW
4205 case 'w':
4206 case 'W':
4207 case 'b':
4208 case 'B':
4209 case 's':
4210 case 'S':
4211 case 'd':
4212 case 'D':
a14b48bc
LW
4213 case 'p':
4214 case 'P':
a0d0e21e
LW
4215 --p;
4216 goto loopdone;
4217 case 'n':
4218 ender = '\n';
4219 p++;
a687059c 4220 break;
a0d0e21e
LW
4221 case 'r':
4222 ender = '\r';
4223 p++;
a687059c 4224 break;
a0d0e21e
LW
4225 case 't':
4226 ender = '\t';
4227 p++;
a687059c 4228 break;
a0d0e21e
LW
4229 case 'f':
4230 ender = '\f';
4231 p++;
a687059c 4232 break;
a0d0e21e 4233 case 'e':
c7f1f016 4234 ender = ASCII_TO_NATIVE('\033');
a0d0e21e 4235 p++;
a687059c 4236 break;
a0d0e21e 4237 case 'a':
c7f1f016 4238 ender = ASCII_TO_NATIVE('\007');
a0d0e21e 4239 p++;
a687059c 4240 break;
a0d0e21e 4241 case 'x':
a0ed51b3
LW
4242 if (*++p == '{') {
4243 char* e = strchr(p, '}');
b81d288d 4244
b45f050a 4245 if (!e) {
830247a4 4246 RExC_parse = p + 1;
b45f050a
JF
4247 vFAIL("Missing right brace on \\x{}");
4248 }
de5f0749 4249 else {
a4c04bdc
NC
4250 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
4251 | PERL_SCAN_DISALLOW_PREFIX;
53305cf1
NC
4252 numlen = e - p - 1;
4253 ender = grok_hex(p + 1, &numlen, &flags, NULL);
aaa80028
JH
4254 if (ender > 0xff)
4255 RExC_utf8 = 1;
a0ed51b3
LW
4256 p = e + 1;
4257 }
a0ed51b3
LW
4258 }
4259 else {
a4c04bdc 4260 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
53305cf1
NC
4261 numlen = 2;
4262 ender = grok_hex(p, &numlen, &flags, NULL);
a0ed51b3
LW
4263 p += numlen;
4264 }
a687059c 4265 break;
a0d0e21e
LW
4266 case 'c':
4267 p++;
bbce6d69 4268 ender = UCHARAT(p++);
4269 ender = toCTRL(ender);
a687059c 4270 break;
a0d0e21e
LW
4271 case '0': case '1': case '2': case '3':case '4':
4272 case '5': case '6': case '7': case '8':case '9':
4273 if (*p == '0' ||
830247a4 4274 (isDIGIT(p[1]) && atoi(p) >= RExC_npar) ) {
53305cf1
NC
4275 I32 flags = 0;
4276 numlen = 3;
4277 ender = grok_oct(p, &numlen, &flags, NULL);
a0d0e21e
LW
4278 p += numlen;
4279 }
4280 else {
4281 --p;
4282 goto loopdone;
a687059c
LW
4283 }
4284 break;
a0d0e21e 4285 case '\0':
830247a4 4286 if (p >= RExC_end)
b45f050a 4287 FAIL("Trailing \\");
a687059c 4288 /* FALL THROUGH */
a0d0e21e 4289 default:
e476b1b5 4290 if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(*p))
4193bef7 4291 vWARN2(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
a0ed51b3 4292 goto normal_default;
a0d0e21e
LW
4293 }
4294 break;
a687059c 4295 default:
a0ed51b3 4296 normal_default:
fd400ab9 4297 if (UTF8_IS_START(*p) && UTF) {
5e12f4fb 4298 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
ba210ebe 4299 &numlen, 0);
a0ed51b3
LW
4300 p += numlen;
4301 }
4302 else
4303 ender = *p++;
a0d0e21e 4304 break;
a687059c 4305 }
e2509266 4306 if (RExC_flags & PMf_EXTENDED)
830247a4 4307 p = regwhite(p, RExC_end);
60a8b682
JH
4308 if (UTF && FOLD) {
4309 /* Prime the casefolded buffer. */
ac7e0132 4310 ender = toFOLD_uni(ender, tmpbuf, &foldlen);
60a8b682 4311 }
a0d0e21e
LW
4312 if (ISMULT2(p)) { /* Back off on ?+*. */
4313 if (len)
4314 p = oldp;
16ea2a2e 4315 else if (UTF) {
0ebc6274
JH
4316 STRLEN unilen;
4317
80aecb99 4318 if (FOLD) {
60a8b682 4319 /* Emit all the Unicode characters. */
80aecb99
JH
4320 for (foldbuf = tmpbuf;
4321 foldlen;
4322 foldlen -= numlen) {
4323 ender = utf8_to_uvchr(foldbuf, &numlen);
9dc45d57 4324 if (numlen > 0) {
0ebc6274
JH
4325 reguni(pRExC_state, ender, s, &unilen);
4326 s += unilen;
4327 len += unilen;
4328 /* In EBCDIC the numlen
4329 * and unilen can differ. */
9dc45d57 4330 foldbuf += numlen;
47654450
JH
4331 if (numlen >= foldlen)
4332 break;
9dc45d57
JH
4333 }
4334 else
4335 break; /* "Can't happen." */
80aecb99
JH
4336 }
4337 }
4338 else {
0ebc6274 4339 reguni(pRExC_state, ender, s, &unilen);
9ede7db1 4340 if (unilen > 0) {
0ebc6274
JH
4341 s += unilen;
4342 len += unilen;
9dc45d57 4343 }
80aecb99 4344 }
a0ed51b3 4345 }
a0d0e21e
LW
4346 else {
4347 len++;
eb160463 4348 REGC((char)ender, s++);
a0d0e21e
LW
4349 }
4350 break;
a687059c 4351 }
16ea2a2e 4352 if (UTF) {
0ebc6274
JH
4353 STRLEN unilen;
4354
80aecb99 4355 if (FOLD) {
60a8b682 4356 /* Emit all the Unicode characters. */
80aecb99
JH
4357 for (foldbuf = tmpbuf;
4358 foldlen;
4359 foldlen -= numlen) {
4360 ender = utf8_to_uvchr(foldbuf, &numlen);
9dc45d57 4361 if (numlen > 0) {
0ebc6274
JH
4362 reguni(pRExC_state, ender, s, &unilen);
4363 len += unilen;
4364 s += unilen;
4365 /* In EBCDIC the numlen
4366 * and unilen can differ. */
9dc45d57 4367 foldbuf += numlen;
47654450
JH
4368 if (numlen >= foldlen)
4369 break;
9dc45d57
JH
4370 }
4371 else
4372 break;
80aecb99
JH
4373 }
4374 }
4375 else {
0ebc6274 4376 reguni(pRExC_state, ender, s, &unilen);
9ede7db1 4377 if (unilen > 0) {
0ebc6274
JH
4378 s += unilen;
4379 len += unilen;
9dc45d57 4380 }
80aecb99
JH
4381 }
4382 len--;
a0ed51b3
LW
4383 }
4384 else
eb160463 4385 REGC((char)ender, s++);
a0d0e21e
LW
4386 }
4387 loopdone:
830247a4 4388 RExC_parse = p - 1;
fac92740 4389 Set_Node_Cur_Length(ret); /* MJD */
830247a4 4390 nextchar(pRExC_state);
793db0cb
JH
4391 {
4392 /* len is STRLEN which is unsigned, need to copy to signed */
4393 IV iv = len;
4394 if (iv < 0)
4395 vFAIL("Internal disaster");
4396 }
a0d0e21e
LW
4397 if (len > 0)
4398 *flagp |= HASWIDTH;
090f7165 4399 if (len == 1 && UNI_IS_INVARIANT(ender))
a0d0e21e 4400 *flagp |= SIMPLE;
c277df42 4401 if (!SIZE_ONLY)
cd439c50
IZ
4402 STR_LEN(ret) = len;
4403 if (SIZE_ONLY)
830247a4 4404 RExC_size += STR_SZ(len);
cd439c50 4405 else
830247a4 4406 RExC_emit += STR_SZ(len);
a687059c 4407 }
a0d0e21e
LW
4408 break;
4409 }
a687059c 4410
60a8b682
JH
4411 /* If the encoding pragma is in effect recode the text of
4412 * any EXACT-kind nodes. */
22c54be3 4413 if (PL_encoding && PL_regkind[(U8)OP(ret)] == EXACT) {
d0063567
DK
4414 STRLEN oldlen = STR_LEN(ret);
4415 SV *sv = sv_2mortal(newSVpvn(STRING(ret), oldlen));
4416
4417 if (RExC_utf8)
4418 SvUTF8_on(sv);
4419 if (sv_utf8_downgrade(sv, TRUE)) {
4420 char *s = sv_recode_to_utf8(sv, PL_encoding);
4421 STRLEN newlen = SvCUR(sv);
4422
4423 if (SvUTF8(sv))
4424 RExC_utf8 = 1;
4425 if (!SIZE_ONLY) {
a3621e74
YO
4426 GET_RE_DEBUG_FLAGS_DECL;
4427 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "recode %*s to %*s\n",
d0063567
DK
4428 (int)oldlen, STRING(ret),
4429 (int)newlen, s));
4430 Copy(s, STRING(ret), newlen, char);
4431 STR_LEN(ret) += newlen - oldlen;
4432 RExC_emit += STR_SZ(newlen) - STR_SZ(oldlen);
4433 } else
4434 RExC_size += STR_SZ(newlen) - STR_SZ(oldlen);
4435 }
a72c7584
JH
4436 }
4437
a0d0e21e 4438 return(ret);
a687059c
LW
4439}
4440
873ef191 4441STATIC char *
cea2e8a9 4442S_regwhite(pTHX_ char *p, char *e)
5b5a24f7
CS
4443{
4444 while (p < e) {
4445 if (isSPACE(*p))
4446 ++p;
4447 else if (*p == '#') {
4448 do {
4449 p++;
4450 } while (p < e && *p != '\n');
4451 }
4452 else
4453 break;
4454 }
4455 return p;
4456}
4457
b8c5462f
JH
4458/* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
4459 Character classes ([:foo:]) can also be negated ([:^foo:]).
4460 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
4461 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
beeb77fc 4462 but trigger failures because they are currently unimplemented. */
9a86a77b
JH
4463
4464#define POSIXCC_DONE(c) ((c) == ':')
4465#define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
4466#define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
4467
b8c5462f 4468STATIC I32
830247a4 4469S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
620e46c5
JH
4470{
4471 char *posixcc = 0;
936ed897 4472 I32 namedclass = OOB_NAMEDCLASS;
620e46c5 4473
830247a4 4474 if (value == '[' && RExC_parse + 1 < RExC_end &&
620e46c5 4475 /* I smell either [: or [= or [. -- POSIX has been here, right? */
9a86a77b
JH
4476 POSIXCC(UCHARAT(RExC_parse))) {
4477 char c = UCHARAT(RExC_parse);
830247a4 4478 char* s = RExC_parse++;
b81d288d 4479
9a86a77b 4480 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
830247a4
IZ
4481 RExC_parse++;
4482 if (RExC_parse == RExC_end)
620e46c5 4483 /* Grandfather lone [:, [=, [. */
830247a4 4484 RExC_parse = s;
620e46c5 4485 else {
830247a4 4486 char* t = RExC_parse++; /* skip over the c */
b8c5462f 4487
80916619
NC
4488 assert(*t == c);
4489
9a86a77b 4490 if (UCHARAT(RExC_parse) == ']') {
830247a4 4491 RExC_parse++; /* skip over the ending ] */
b8c5462f
JH
4492 posixcc = s + 1;
4493 if (*s == ':') {
4494 I32 complement = *posixcc == '^' ? *posixcc++ : 0;
80916619
NC
4495 I32 skip = t - posixcc;
4496
4497 /* Initially switch on the length of the name. */
4498 switch (skip) {
4499 case 4:
4500 if (memEQ(posixcc, "word", 4)) {
4501 /* this is not POSIX, this is the Perl \w */;
4502 namedclass
4503 = complement ? ANYOF_NALNUM : ANYOF_ALNUM;
4504 }
cc4319de 4505 break;
80916619
NC
4506 case 5:
4507 /* Names all of length 5. */
4508 /* alnum alpha ascii blank cntrl digit graph lower
4509 print punct space upper */
4510 /* Offset 4 gives the best switch position. */
4511 switch (posixcc[4]) {
4512 case 'a':
4513 if (memEQ(posixcc, "alph", 4)) {
4514 /* a */
4515 namedclass
4516 = complement ? ANYOF_NALPHA : ANYOF_ALPHA;
4517 }
4518 break;
4519 case 'e':
4520 if (memEQ(posixcc, "spac", 4)) {
4521 /* e */
4522 namedclass
4523 = complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
4524 }
4525 break;
4526 case 'h':
4527 if (memEQ(posixcc, "grap", 4)) {
4528 /* h */
4529 namedclass
4530 = complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
4531 }
4532 break;
4533 case 'i':
4534 if (memEQ(posixcc, "asci", 4)) {
4535 /* i */
4536 namedclass
4537 = complement ? ANYOF_NASCII : ANYOF_ASCII;
4538 }
4539 break;
4540 case 'k':
4541 if (memEQ(posixcc, "blan", 4)) {
4542 /* k */
4543 namedclass
4544 = complement ? ANYOF_NBLANK : ANYOF_BLANK;
4545 }
4546 break;
4547 case 'l':
4548 if (memEQ(posixcc, "cntr", 4)) {
4549 /* l */
4550 namedclass
4551 = complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
4552 }
4553 break;
4554 case 'm':
4555 if (memEQ(posixcc, "alnu", 4)) {
4556 /* m */
4557 namedclass
4558 = complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
4559 }
4560 break;
4561 case 'r':
4562 if (memEQ(posixcc, "lowe", 4)) {
4563 /* r */
4564 namedclass
4565 = complement ? ANYOF_NLOWER : ANYOF_LOWER;
4566 }
4567 if (memEQ(posixcc, "uppe", 4)) {
8fdec511 4568 /* r */
80916619
NC
4569 namedclass
4570 = complement ? ANYOF_NUPPER : ANYOF_UPPER;
4571 }
4572 break;
4573 case 't':
4574 if (memEQ(posixcc, "digi", 4)) {
4575 /* t */
4576 namedclass
4577 = complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
4578 }
4579 if (memEQ(posixcc, "prin", 4)) {
8fdec511 4580 /* t */
80916619
NC
4581 namedclass
4582 = complement ? ANYOF_NPRINT : ANYOF_PRINT;
4583 }
4584 if (memEQ(posixcc, "punc", 4)) {
4585 /* t */
4586 namedclass
4587 = complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
4588 }
4589 break;
b8c5462f
JH
4590 }
4591 break;
80916619
NC
4592 case 6:
4593 if (memEQ(posixcc, "xdigit", 6)) {
4594 namedclass
4595 = complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
b8c5462f
JH
4596 }
4597 break;
4598 }
80916619
NC
4599
4600 if (namedclass == OOB_NAMEDCLASS)
b45f050a
JF
4601 {
4602 Simple_vFAIL3("POSIX class [:%.*s:] unknown",
4603 t - s - 1, s + 1);
4604 }
80916619
NC
4605 assert (posixcc[skip] == ':');
4606 assert (posixcc[skip+1] == ']');
b45f050a 4607 } else if (!SIZE_ONLY) {
b8c5462f 4608 /* [[=foo=]] and [[.foo.]] are still future. */
b45f050a 4609
830247a4 4610 /* adjust RExC_parse so the warning shows after
b45f050a 4611 the class closes */
9a86a77b 4612 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
830247a4 4613 RExC_parse++;
b45f050a
JF
4614 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
4615 }
b8c5462f
JH
4616 } else {
4617 /* Maternal grandfather:
4618 * "[:" ending in ":" but not in ":]" */
830247a4 4619 RExC_parse = s;
767d463e 4620 }
620e46c5
JH
4621 }
4622 }
4623
b8c5462f
JH
4624 return namedclass;
4625}
4626
4627STATIC void
830247a4 4628S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
b8c5462f 4629{
b938889d 4630 if (!SIZE_ONLY && POSIXCC(UCHARAT(RExC_parse))) {
830247a4 4631 char *s = RExC_parse;
93733859 4632 char c = *s++;
b8c5462f
JH
4633
4634 while(*s && isALNUM(*s))
4635 s++;
4636 if (*s && c == *s && s[1] == ']') {
cd84f5b2
RGS
4637 if (ckWARN(WARN_REGEXP))
4638 vWARN3(s+2,
4639 "POSIX syntax [%c %c] belongs inside character classes",
4640 c, c);
b45f050a
JF
4641
4642 /* [[=foo=]] and [[.foo.]] are still future. */
9a86a77b 4643 if (POSIXCC_NOTYET(c)) {
830247a4 4644 /* adjust RExC_parse so the error shows after
b45f050a 4645 the class closes */
9a86a77b 4646 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
b45f050a
JF
4647 ;
4648 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
4649 }
b8c5462f
JH
4650 }
4651 }
620e46c5
JH
4652}
4653
76e3520e 4654STATIC regnode *
830247a4 4655S_regclass(pTHX_ RExC_state_t *pRExC_state)
a687059c 4656{
ffc61ed2 4657 register UV value;
9a86a77b 4658 register UV nextvalue;
3568d838 4659 register IV prevvalue = OOB_UNICODE;
ffc61ed2 4660 register IV range = 0;
c277df42 4661 register regnode *ret;
ba210ebe 4662 STRLEN numlen;
ffc61ed2 4663 IV namedclass;
9c5ffd7c 4664 char *rangebegin = 0;
936ed897 4665 bool need_class = 0;
9c5ffd7c 4666 SV *listsv = Nullsv;
ffc61ed2
JH
4667 register char *e;
4668 UV n;
9e55ce06
JH
4669 bool optimize_invert = TRUE;
4670 AV* unicode_alternate = 0;
1b2d223b
JH
4671#ifdef EBCDIC
4672 UV literal_endpoint = 0;
4673#endif
ffc61ed2
JH
4674
4675 ret = reganode(pRExC_state, ANYOF, 0);
4676
4677 if (!SIZE_ONLY)
4678 ANYOF_FLAGS(ret) = 0;
4679
9a86a77b 4680 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
ffc61ed2
JH
4681 RExC_naughty++;
4682 RExC_parse++;
4683 if (!SIZE_ONLY)
4684 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
4685 }
a0d0e21e 4686
936ed897 4687 if (SIZE_ONLY)
830247a4 4688 RExC_size += ANYOF_SKIP;
936ed897 4689 else {
830247a4 4690 RExC_emit += ANYOF_SKIP;
936ed897
IZ
4691 if (FOLD)
4692 ANYOF_FLAGS(ret) |= ANYOF_FOLD;
4693 if (LOC)
4694 ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
ffc61ed2
JH
4695 ANYOF_BITMAP_ZERO(ret);
4696 listsv = newSVpvn("# comment\n", 10);
a0d0e21e 4697 }
b8c5462f 4698
9a86a77b
JH
4699 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
4700
b938889d 4701 if (!SIZE_ONLY && POSIXCC(nextvalue))
830247a4 4702 checkposixcc(pRExC_state);
b8c5462f 4703
f064b6ad
HS
4704 /* allow 1st char to be ] (allowing it to be - is dealt with later) */
4705 if (UCHARAT(RExC_parse) == ']')
4706 goto charclassloop;
ffc61ed2 4707
9a86a77b 4708 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
ffc61ed2
JH
4709
4710 charclassloop:
4711
4712 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
4713
73b437c8 4714 if (!range)
830247a4 4715 rangebegin = RExC_parse;
ffc61ed2 4716 if (UTF) {
5e12f4fb 4717 value = utf8n_to_uvchr((U8*)RExC_parse,
3568d838
JH
4718 RExC_end - RExC_parse,
4719 &numlen, 0);
ffc61ed2
JH
4720 RExC_parse += numlen;
4721 }
4722 else
4723 value = UCHARAT(RExC_parse++);
9a86a77b
JH
4724 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
4725 if (value == '[' && POSIXCC(nextvalue))
830247a4 4726 namedclass = regpposixcc(pRExC_state, value);
620e46c5 4727 else if (value == '\\') {
ffc61ed2 4728 if (UTF) {
5e12f4fb 4729 value = utf8n_to_uvchr((U8*)RExC_parse,
ffc61ed2
JH
4730 RExC_end - RExC_parse,
4731 &numlen, 0);
4732 RExC_parse += numlen;
4733 }
4734 else
4735 value = UCHARAT(RExC_parse++);
470c3474 4736 /* Some compilers cannot handle switching on 64-bit integer
ffc61ed2 4737 * values, therefore value cannot be an UV. Yes, this will
e2962f66
JH
4738 * be a problem later if we want switch on Unicode.
4739 * A similar issue a little bit later when switching on
4740 * namedclass. --jhi */
ffc61ed2 4741 switch ((I32)value) {
b8c5462f
JH
4742 case 'w': namedclass = ANYOF_ALNUM; break;
4743 case 'W': namedclass = ANYOF_NALNUM; break;
4744 case 's': namedclass = ANYOF_SPACE; break;
4745 case 'S': namedclass = ANYOF_NSPACE; break;
4746 case 'd': namedclass = ANYOF_DIGIT; break;
4747 case 'D': namedclass = ANYOF_NDIGIT; break;
ffc61ed2
JH
4748 case 'p':
4749 case 'P':
af6f566e 4750 if (RExC_parse >= RExC_end)
2a4859cd 4751 vFAIL2("Empty \\%c{}", (U8)value);
ffc61ed2 4752 if (*RExC_parse == '{') {
0da60cf5 4753 U8 c = (U8)value;
ffc61ed2
JH
4754 e = strchr(RExC_parse++, '}');
4755 if (!e)
0da60cf5 4756 vFAIL2("Missing right brace on \\%c{}", c);
ab13f0c7
JH
4757 while (isSPACE(UCHARAT(RExC_parse)))
4758 RExC_parse++;
4759 if (e == RExC_parse)
0da60cf5 4760 vFAIL2("Empty \\%c{}", c);
ffc61ed2 4761 n = e - RExC_parse;
ab13f0c7
JH
4762 while (isSPACE(UCHARAT(RExC_parse + n - 1)))
4763 n--;
ffc61ed2
JH
4764 }
4765 else {
4766 e = RExC_parse;
4767 n = 1;
4768 }
4769 if (!SIZE_ONLY) {
ab13f0c7
JH
4770 if (UCHARAT(RExC_parse) == '^') {
4771 RExC_parse++;
4772 n--;
4773 value = value == 'p' ? 'P' : 'p'; /* toggle */
4774 while (isSPACE(UCHARAT(RExC_parse))) {
4775 RExC_parse++;
4776 n--;
4777 }
4778 }
ffc61ed2 4779 if (value == 'p')
ab13f0c7
JH
4780 Perl_sv_catpvf(aTHX_ listsv,
4781 "+utf8::%.*s\n", (int)n, RExC_parse);
ffc61ed2 4782 else
ab13f0c7
JH
4783 Perl_sv_catpvf(aTHX_ listsv,
4784 "!utf8::%.*s\n", (int)n, RExC_parse);
ffc61ed2
JH
4785 }
4786 RExC_parse = e + 1;
4787 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
f81125e2
JP
4788 namedclass = ANYOF_MAX; /* no official name, but it's named */
4789 break;
b8c5462f
JH
4790 case 'n': value = '\n'; break;
4791 case 'r': value = '\r'; break;
4792 case 't': value = '\t'; break;
4793 case 'f': value = '\f'; break;
4794 case 'b': value = '\b'; break;
c7f1f016
NIS
4795 case 'e': value = ASCII_TO_NATIVE('\033');break;
4796 case 'a': value = ASCII_TO_NATIVE('\007');break;
b8c5462f 4797 case 'x':
ffc61ed2 4798 if (*RExC_parse == '{') {
a4c04bdc
NC
4799 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
4800 | PERL_SCAN_DISALLOW_PREFIX;
ffc61ed2 4801 e = strchr(RExC_parse++, '}');
b81d288d 4802 if (!e)
ffc61ed2 4803 vFAIL("Missing right brace on \\x{}");
53305cf1
NC
4804
4805 numlen = e - RExC_parse;
4806 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
ffc61ed2
JH
4807 RExC_parse = e + 1;
4808 }
4809 else {
a4c04bdc 4810 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
53305cf1
NC
4811 numlen = 2;
4812 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
ffc61ed2
JH
4813 RExC_parse += numlen;
4814 }
b8c5462f
JH
4815 break;
4816 case 'c':
830247a4 4817 value = UCHARAT(RExC_parse++);
b8c5462f
JH
4818 value = toCTRL(value);
4819 break;
4820 case '0': case '1': case '2': case '3': case '4':
4821 case '5': case '6': case '7': case '8': case '9':
53305cf1
NC
4822 {
4823 I32 flags = 0;
4824 numlen = 3;
4825 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
830247a4 4826 RExC_parse += numlen;
b8c5462f 4827 break;
53305cf1 4828 }
1028017a 4829 default:
e476b1b5 4830 if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(value))
ffc61ed2
JH
4831 vWARN2(RExC_parse,
4832 "Unrecognized escape \\%c in character class passed through",
4833 (int)value);
1028017a 4834 break;
b8c5462f 4835 }
ffc61ed2 4836 } /* end of \blah */
1b2d223b
JH
4837#ifdef EBCDIC
4838 else
4839 literal_endpoint++;
4840#endif
ffc61ed2
JH
4841
4842 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
4843
4844 if (!SIZE_ONLY && !need_class)
936ed897 4845 ANYOF_CLASS_ZERO(ret);
ffc61ed2 4846
936ed897 4847 need_class = 1;
ffc61ed2
JH
4848
4849 /* a bad range like a-\d, a-[:digit:] ? */
4850 if (range) {
73b437c8 4851 if (!SIZE_ONLY) {
e476b1b5 4852 if (ckWARN(WARN_REGEXP))
830247a4 4853 vWARN4(RExC_parse,
b45f050a 4854 "False [] range \"%*.*s\"",
830247a4
IZ
4855 RExC_parse - rangebegin,
4856 RExC_parse - rangebegin,
b45f050a 4857 rangebegin);
3568d838
JH
4858 if (prevvalue < 256) {
4859 ANYOF_BITMAP_SET(ret, prevvalue);
ffc61ed2
JH
4860 ANYOF_BITMAP_SET(ret, '-');
4861 }
4862 else {
4863 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
4864 Perl_sv_catpvf(aTHX_ listsv,
3568d838 4865 "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
ffc61ed2 4866 }
b8c5462f 4867 }
ffc61ed2
JH
4868
4869 range = 0; /* this was not a true range */
73b437c8 4870 }
ffc61ed2 4871
73b437c8 4872 if (!SIZE_ONLY) {
c49a72a9
NC
4873 const char *what = NULL;
4874 char yesno = 0;
4875
3568d838
JH
4876 if (namedclass > OOB_NAMEDCLASS)
4877 optimize_invert = FALSE;
e2962f66
JH
4878 /* Possible truncation here but in some 64-bit environments
4879 * the compiler gets heartburn about switch on 64-bit values.
4880 * A similar issue a little earlier when switching on value.
98f323fa 4881 * --jhi */
e2962f66 4882 switch ((I32)namedclass) {
73b437c8
JH
4883 case ANYOF_ALNUM:
4884 if (LOC)
936ed897 4885 ANYOF_CLASS_SET(ret, ANYOF_ALNUM);
73b437c8
JH
4886 else {
4887 for (value = 0; value < 256; value++)
4888 if (isALNUM(value))
936ed897 4889 ANYOF_BITMAP_SET(ret, value);
73b437c8 4890 }
c49a72a9
NC
4891 yesno = '+';
4892 what = "Word";
73b437c8
JH
4893 break;
4894 case ANYOF_NALNUM:
4895 if (LOC)
936ed897 4896 ANYOF_CLASS_SET(ret, ANYOF_NALNUM);
73b437c8
JH
4897 else {
4898 for (value = 0; value < 256; value++)
4899 if (!isALNUM(value))
936ed897 4900 ANYOF_BITMAP_SET(ret, value);
73b437c8 4901 }
c49a72a9
NC
4902 yesno = '!';
4903 what = "Word";
73b437c8 4904 break;
ffc61ed2 4905 case ANYOF_ALNUMC:
73b437c8 4906 if (LOC)
ffc61ed2 4907 ANYOF_CLASS_SET(ret, ANYOF_ALNUMC);
73b437c8
JH
4908 else {
4909 for (value = 0; value < 256; value++)
ffc61ed2 4910 if (isALNUMC(value))
936ed897 4911 ANYOF_BITMAP_SET(ret, value);
73b437c8 4912 }
c49a72a9
NC
4913 yesno = '+';
4914 what = "Alnum";
73b437c8
JH
4915 break;
4916 case ANYOF_NALNUMC:
4917 if (LOC)
936ed897 4918 ANYOF_CLASS_SET(ret, ANYOF_NALNUMC);
73b437c8
JH
4919 else {
4920 for (value = 0; value < 256; value++)
4921 if (!isALNUMC(value))
936ed897 4922 ANYOF_BITMAP_SET(ret, value);
73b437c8 4923 }
c49a72a9
NC
4924 yesno = '!';
4925 what = "Alnum";
73b437c8
JH
4926 break;
4927 case ANYOF_ALPHA:
4928 if (LOC)
936ed897 4929 ANYOF_CLASS_SET(ret, ANYOF_ALPHA);
73b437c8
JH
4930 else {
4931 for (value = 0; value < 256; value++)
4932 if (isALPHA(value))
936ed897 4933 ANYOF_BITMAP_SET(ret, value);
73b437c8 4934 }
c49a72a9
NC
4935 yesno = '+';
4936 what = "Alpha";
73b437c8
JH
4937 break;
4938 case ANYOF_NALPHA:
4939 if (LOC)
936ed897 4940 ANYOF_CLASS_SET(ret, ANYOF_NALPHA);
73b437c8
JH
4941 else {
4942 for (value = 0; value < 256; value++)
4943 if (!isALPHA(value))
936ed897 4944 ANYOF_BITMAP_SET(ret, value);
73b437c8 4945 }
c49a72a9
NC
4946 yesno = '!';
4947 what = "Alpha";
73b437c8
JH
4948 break;
4949 case ANYOF_ASCII:
4950 if (LOC)
936ed897 4951 ANYOF_CLASS_SET(ret, ANYOF_ASCII);
73b437c8 4952 else {
c7f1f016 4953#ifndef EBCDIC
1ba5c669
JH
4954 for (value = 0; value < 128; value++)
4955 ANYOF_BITMAP_SET(ret, value);
4956#else /* EBCDIC */
ffbc6a93 4957 for (value = 0; value < 256; value++) {
3a3c4447
JH
4958 if (isASCII(value))
4959 ANYOF_BITMAP_SET(ret, value);
ffbc6a93 4960 }
1ba5c669 4961#endif /* EBCDIC */
73b437c8 4962 }
c49a72a9
NC
4963 yesno = '+';
4964 what = "ASCII";
73b437c8
JH
4965 break;
4966 case ANYOF_NASCII:
4967 if (LOC)
936ed897 4968 ANYOF_CLASS_SET(ret, ANYOF_NASCII);
73b437c8 4969 else {
c7f1f016 4970#ifndef EBCDIC
1ba5c669
JH
4971 for (value = 128; value < 256; value++)
4972 ANYOF_BITMAP_SET(ret, value);
4973#else /* EBCDIC */
ffbc6a93 4974 for (value = 0; value < 256; value++) {
3a3c4447
JH
4975 if (!isASCII(value))
4976 ANYOF_BITMAP_SET(ret, value);
ffbc6a93 4977 }
1ba5c669 4978#endif /* EBCDIC */
73b437c8 4979 }
c49a72a9
NC
4980 yesno = '!';
4981 what = "ASCII";
73b437c8 4982 break;
aaa51d5e
JF
4983 case ANYOF_BLANK:
4984 if (LOC)
4985 ANYOF_CLASS_SET(ret, ANYOF_BLANK);
4986 else {
4987 for (value = 0; value < 256; value++)
4988 if (isBLANK(value))
4989 ANYOF_BITMAP_SET(ret, value);
4990 }
c49a72a9
NC
4991 yesno = '+';
4992 what = "Blank";
aaa51d5e
JF
4993 break;
4994 case ANYOF_NBLANK:
4995 if (LOC)
4996 ANYOF_CLASS_SET(ret, ANYOF_NBLANK);
4997 else {
4998 for (value = 0; value < 256; value++)
4999 if (!isBLANK(value))
5000 ANYOF_BITMAP_SET(ret, value);
5001 }
c49a72a9
NC
5002 yesno = '!';
5003 what = "Blank";
aaa51d5e 5004 break;
73b437c8
JH
5005 case ANYOF_CNTRL:
5006 if (LOC)
936ed897 5007 ANYOF_CLASS_SET(ret, ANYOF_CNTRL);
73b437c8
JH
5008 else {
5009 for (value = 0; value < 256; value++)
5010 if (isCNTRL(value))
936ed897 5011 ANYOF_BITMAP_SET(ret, value);
73b437c8 5012 }
c49a72a9
NC
5013 yesno = '+';
5014 what = "Cntrl";
73b437c8
JH
5015 break;
5016 case ANYOF_NCNTRL:
5017 if (LOC)
936ed897 5018 ANYOF_CLASS_SET(ret, ANYOF_NCNTRL);
73b437c8
JH
5019 else {
5020 for (value = 0; value < 256; value++)
5021 if (!isCNTRL(value))
936ed897 5022 ANYOF_BITMAP_SET(ret, value);
73b437c8 5023 }
c49a72a9
NC
5024 yesno = '!';
5025 what = "Cntrl";
ffc61ed2
JH
5026 break;
5027 case ANYOF_DIGIT:
5028 if (LOC)
5029 ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
5030 else {
5031 /* consecutive digits assumed */
5032 for (value = '0'; value <= '9'; value++)
5033 ANYOF_BITMAP_SET(ret, value);
5034 }
c49a72a9
NC
5035 yesno = '+';
5036 what = "Digit";
ffc61ed2
JH
5037 break;
5038 case ANYOF_NDIGIT:
5039 if (LOC)
5040 ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
5041 else {
5042 /* consecutive digits assumed */
5043 for (value = 0; value < '0'; value++)
5044 ANYOF_BITMAP_SET(ret, value);
5045 for (value = '9' + 1; value < 256; value++)
5046 ANYOF_BITMAP_SET(ret, value);
5047 }
c49a72a9
NC
5048 yesno = '!';
5049 what = "Digit";
73b437c8
JH
5050 break;
5051 case ANYOF_GRAPH:
5052 if (LOC)
936ed897 5053 ANYOF_CLASS_SET(ret, ANYOF_GRAPH);
73b437c8
JH
5054 else {
5055 for (value = 0; value < 256; value++)
5056 if (isGRAPH(value))
936ed897 5057 ANYOF_BITMAP_SET(ret, value);
73b437c8 5058 }
c49a72a9
NC
5059 yesno = '+';
5060 what = "Graph";
73b437c8
JH
5061 break;
5062 case ANYOF_NGRAPH:
5063 if (LOC)
936ed897 5064 ANYOF_CLASS_SET(ret, ANYOF_NGRAPH);
73b437c8
JH
5065 else {
5066 for (value = 0; value < 256; value++)
5067 if (!isGRAPH(value))
936ed897 5068 ANYOF_BITMAP_SET(ret, value);
73b437c8 5069 }
c49a72a9
NC
5070 yesno = '!';
5071 what = "Graph";
73b437c8
JH
5072 break;
5073 case ANYOF_LOWER:
5074 if (LOC)
936ed897 5075 ANYOF_CLASS_SET(ret, ANYOF_LOWER);
73b437c8
JH
5076 else {
5077 for (value = 0; value < 256; value++)
5078 if (isLOWER(value))
936ed897 5079 ANYOF_BITMAP_SET(ret, value);
73b437c8 5080 }
c49a72a9
NC
5081 yesno = '+';
5082 what = "Lower";
73b437c8
JH
5083 break;
5084 case ANYOF_NLOWER:
5085 if (LOC)
936ed897 5086 ANYOF_CLASS_SET(ret, ANYOF_NLOWER);
73b437c8
JH
5087 else {
5088 for (value = 0; value < 256; value++)
5089 if (!isLOWER(value))
936ed897 5090 ANYOF_BITMAP_SET(ret, value);
73b437c8 5091 }
c49a72a9
NC
5092 yesno = '!';
5093 what = "Lower";
73b437c8
JH
5094 break;
5095 case ANYOF_PRINT:
5096 if (LOC)
936ed897 5097 ANYOF_CLASS_SET(ret, ANYOF_PRINT);
73b437c8
JH
5098 else {
5099 for (value = 0; value < 256; value++)
5100 if (isPRINT(value))
936ed897 5101 ANYOF_BITMAP_SET(ret, value);
73b437c8 5102 }
c49a72a9
NC
5103 yesno = '+';
5104 what = "Print";
73b437c8
JH
5105 break;
5106 case ANYOF_NPRINT:
5107 if (LOC)
936ed897 5108 ANYOF_CLASS_SET(ret, ANYOF_NPRINT);
73b437c8
JH
5109 else {
5110 for (value = 0; value < 256; value++)
5111 if (!isPRINT(value))
936ed897 5112 ANYOF_BITMAP_SET(ret, value);
73b437c8 5113 }
c49a72a9
NC
5114 yesno = '!';
5115 what = "Print";
73b437c8 5116 break;
aaa51d5e
JF
5117 case ANYOF_PSXSPC:
5118 if (LOC)
5119 ANYOF_CLASS_SET(ret, ANYOF_PSXSPC);
5120 else {
5121 for (value = 0; value < 256; value++)
5122 if (isPSXSPC(value))
5123 ANYOF_BITMAP_SET(ret, value);
5124 }
c49a72a9
NC
5125 yesno = '+';
5126 what = "Space";
aaa51d5e
JF
5127 break;
5128 case ANYOF_NPSXSPC:
5129 if (LOC)
5130 ANYOF_CLASS_SET(ret, ANYOF_NPSXSPC);
5131 else {
5132 for (value = 0; value < 256; value++)
5133 if (!isPSXSPC(value))
5134 ANYOF_BITMAP_SET(ret, value);
5135 }
c49a72a9
NC
5136 yesno = '!';
5137 what = "Space";
aaa51d5e 5138 break;
73b437c8
JH
5139 case ANYOF_PUNCT:
5140 if (LOC)
936ed897 5141 ANYOF_CLASS_SET(ret, ANYOF_PUNCT);
73b437c8
JH
5142 else {
5143 for (value = 0; value < 256; value++)
5144 if (isPUNCT(value))
936ed897 5145 ANYOF_BITMAP_SET(ret, value);
73b437c8 5146 }
c49a72a9
NC
5147 yesno = '+';
5148 what = "Punct";
73b437c8
JH
5149 break;
5150 case ANYOF_NPUNCT:
5151 if (LOC)
936ed897 5152 ANYOF_CLASS_SET(ret, ANYOF_NPUNCT);
73b437c8
JH
5153 else {
5154 for (value = 0; value < 256; value++)
5155 if (!isPUNCT(value))
936ed897 5156 ANYOF_BITMAP_SET(ret, value);
73b437c8 5157 }
c49a72a9
NC
5158 yesno = '!';
5159 what = "Punct";
ffc61ed2
JH
5160 break;
5161 case ANYOF_SPACE:
5162 if (LOC)
5163 ANYOF_CLASS_SET(ret, ANYOF_SPACE);
5164 else {
5165 for (value = 0; value < 256; value++)
5166 if (isSPACE(value))
5167 ANYOF_BITMAP_SET(ret, value);
5168 }
c49a72a9
NC
5169 yesno = '+';
5170 what = "SpacePerl";
ffc61ed2
JH
5171 break;
5172 case ANYOF_NSPACE:
5173 if (LOC)
5174 ANYOF_CLASS_SET(ret, ANYOF_NSPACE);
5175 else {
5176 for (value = 0; value < 256; value++)
5177 if (!isSPACE(value))
5178 ANYOF_BITMAP_SET(ret, value);
5179 }
c49a72a9
NC
5180 yesno = '!';
5181 what = "SpacePerl";
73b437c8
JH
5182 break;
5183 case ANYOF_UPPER:
5184 if (LOC)
936ed897 5185 ANYOF_CLASS_SET(ret, ANYOF_UPPER);
73b437c8
JH
5186 else {
5187 for (value = 0; value < 256; value++)
5188 if (isUPPER(value))
936ed897 5189 ANYOF_BITMAP_SET(ret, value);
73b437c8 5190 }
c49a72a9
NC
5191 yesno = '+';
5192 what = "Upper";
73b437c8
JH
5193 break;
5194 case ANYOF_NUPPER:
5195 if (LOC)
936ed897 5196 ANYOF_CLASS_SET(ret, ANYOF_NUPPER);
73b437c8
JH
5197 else {
5198 for (value = 0; value < 256; value++)
5199 if (!isUPPER(value))
936ed897 5200 ANYOF_BITMAP_SET(ret, value);
73b437c8 5201 }
c49a72a9
NC
5202 yesno = '!';
5203 what = "Upper";
73b437c8
JH
5204 break;
5205 case ANYOF_XDIGIT:
5206 if (LOC)
936ed897 5207 ANYOF_CLASS_SET(ret, ANYOF_XDIGIT);
73b437c8
JH
5208 else {
5209 for (value = 0; value < 256; value++)
5210 if (isXDIGIT(value))
936ed897 5211 ANYOF_BITMAP_SET(ret, value);
73b437c8 5212 }
c49a72a9
NC
5213 yesno = '+';
5214 what = "XDigit";
73b437c8
JH
5215 break;
5216 case ANYOF_NXDIGIT:
5217 if (LOC)
936ed897 5218 ANYOF_CLASS_SET(ret, ANYOF_NXDIGIT);
73b437c8
JH
5219 else {
5220 for (value = 0; value < 256; value++)
5221 if (!isXDIGIT(value))
936ed897 5222 ANYOF_BITMAP_SET(ret, value);
73b437c8 5223 }
c49a72a9
NC
5224 yesno = '!';
5225 what = "XDigit";
73b437c8 5226 break;
f81125e2
JP
5227 case ANYOF_MAX:
5228 /* this is to handle \p and \P */
5229 break;
73b437c8 5230 default:
b45f050a 5231 vFAIL("Invalid [::] class");
73b437c8 5232 break;
b8c5462f 5233 }
c49a72a9
NC
5234 if (what) {
5235 /* Strings such as "+utf8::isWord\n" */
5236 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::Is%s\n", yesno, what);
5237 }
b8c5462f 5238 if (LOC)
936ed897 5239 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
73b437c8 5240 continue;
a0d0e21e 5241 }
ffc61ed2
JH
5242 } /* end of namedclass \blah */
5243
a0d0e21e 5244 if (range) {
eb160463 5245 if (prevvalue > (IV)value) /* b-a */ {
b45f050a 5246 Simple_vFAIL4("Invalid [] range \"%*.*s\"",
830247a4
IZ
5247 RExC_parse - rangebegin,
5248 RExC_parse - rangebegin,
b45f050a 5249 rangebegin);
3568d838 5250 range = 0; /* not a valid range */
73b437c8 5251 }
a0d0e21e
LW
5252 }
5253 else {
3568d838 5254 prevvalue = value; /* save the beginning of the range */
830247a4
IZ
5255 if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
5256 RExC_parse[1] != ']') {
5257 RExC_parse++;
ffc61ed2
JH
5258
5259 /* a bad range like \w-, [:word:]- ? */
5260 if (namedclass > OOB_NAMEDCLASS) {
e476b1b5 5261 if (ckWARN(WARN_REGEXP))
830247a4 5262 vWARN4(RExC_parse,
b45f050a 5263 "False [] range \"%*.*s\"",
830247a4
IZ
5264 RExC_parse - rangebegin,
5265 RExC_parse - rangebegin,
b45f050a 5266 rangebegin);
73b437c8 5267 if (!SIZE_ONLY)
936ed897 5268 ANYOF_BITMAP_SET(ret, '-');
73b437c8 5269 } else
ffc61ed2
JH
5270 range = 1; /* yeah, it's a range! */
5271 continue; /* but do it the next time */
a0d0e21e 5272 }
a687059c 5273 }
ffc61ed2 5274
93733859 5275 /* now is the next time */
ae5c130c 5276 if (!SIZE_ONLY) {
3568d838
JH
5277 IV i;
5278
5279 if (prevvalue < 256) {
5280 IV ceilvalue = value < 256 ? value : 255;
5281
5282#ifdef EBCDIC
1b2d223b
JH
5283 /* In EBCDIC [\x89-\x91] should include
5284 * the \x8e but [i-j] should not. */
5285 if (literal_endpoint == 2 &&
5286 ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
5287 (isUPPER(prevvalue) && isUPPER(ceilvalue))))
ffc61ed2 5288 {
3568d838
JH
5289 if (isLOWER(prevvalue)) {
5290 for (i = prevvalue; i <= ceilvalue; i++)
ffc61ed2
JH
5291 if (isLOWER(i))
5292 ANYOF_BITMAP_SET(ret, i);
5293 } else {
3568d838 5294 for (i = prevvalue; i <= ceilvalue; i++)
ffc61ed2
JH
5295 if (isUPPER(i))
5296 ANYOF_BITMAP_SET(ret, i);
5297 }
8ada0baa 5298 }
ffc61ed2 5299 else
8ada0baa 5300#endif
a5961de5
JH
5301 for (i = prevvalue; i <= ceilvalue; i++)
5302 ANYOF_BITMAP_SET(ret, i);
3568d838 5303 }
a5961de5 5304 if (value > 255 || UTF) {
b08decb7
JH
5305 UV prevnatvalue = NATIVE_TO_UNI(prevvalue);
5306 UV natvalue = NATIVE_TO_UNI(value);
5307
ffc61ed2 5308 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
b08decb7 5309 if (prevnatvalue < natvalue) { /* what about > ? */
ffc61ed2 5310 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
b08decb7
JH
5311 prevnatvalue, natvalue);
5312 }
5313 else if (prevnatvalue == natvalue) {
5314 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue);
09091399 5315 if (FOLD) {
89ebb4a3 5316 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
254ba52a 5317 STRLEN foldlen;
2f3bf011 5318 UV f = to_uni_fold(natvalue, foldbuf, &foldlen);
254ba52a 5319
c840d2a2
JH
5320 /* If folding and foldable and a single
5321 * character, insert also the folded version
5322 * to the charclass. */
9e55ce06 5323 if (f != value) {
eb160463 5324 if (foldlen == (STRLEN)UNISKIP(f))
9e55ce06
JH
5325 Perl_sv_catpvf(aTHX_ listsv,
5326 "%04"UVxf"\n", f);
5327 else {
5328 /* Any multicharacter foldings
5329 * require the following transform:
5330 * [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst)
5331 * where E folds into "pq" and F folds
5332 * into "rst", all other characters
5333 * fold to single characters. We save
5334 * away these multicharacter foldings,
5335 * to be later saved as part of the
5336 * additional "s" data. */
5337 SV *sv;
5338
5339 if (!unicode_alternate)
5340 unicode_alternate = newAV();
5341 sv = newSVpvn((char*)foldbuf, foldlen);
5342 SvUTF8_on(sv);
5343 av_push(unicode_alternate, sv);
5344 }
5345 }
254ba52a 5346
60a8b682
JH
5347 /* If folding and the value is one of the Greek
5348 * sigmas insert a few more sigmas to make the
5349 * folding rules of the sigmas to work right.
5350 * Note that not all the possible combinations
5351 * are handled here: some of them are handled
9e55ce06
JH
5352 * by the standard folding rules, and some of
5353 * them (literal or EXACTF cases) are handled
5354 * during runtime in regexec.c:S_find_byclass(). */
09091399
JH
5355 if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) {
5356 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 5357 (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA);
09091399 5358 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 5359 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
09091399
JH
5360 }
5361 else if (value == UNICODE_GREEK_CAPITAL_LETTER_SIGMA)
5362 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 5363 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
09091399
JH
5364 }
5365 }
ffc61ed2 5366 }
1b2d223b
JH
5367#ifdef EBCDIC
5368 literal_endpoint = 0;
5369#endif
8ada0baa 5370 }
ffc61ed2
JH
5371
5372 range = 0; /* this range (if it was one) is done now */
a0d0e21e 5373 }
ffc61ed2 5374
936ed897 5375 if (need_class) {
4f66b38d 5376 ANYOF_FLAGS(ret) |= ANYOF_LARGE;
936ed897 5377 if (SIZE_ONLY)
830247a4 5378 RExC_size += ANYOF_CLASS_ADD_SKIP;
936ed897 5379 else
830247a4 5380 RExC_emit += ANYOF_CLASS_ADD_SKIP;
936ed897 5381 }
ffc61ed2 5382
ae5c130c 5383 /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
b8c5462f 5384 if (!SIZE_ONLY &&
ffc61ed2 5385 /* If the only flag is folding (plus possibly inversion). */
516a5887
JH
5386 ((ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD)
5387 ) {
a0ed51b3 5388 for (value = 0; value < 256; ++value) {
936ed897 5389 if (ANYOF_BITMAP_TEST(ret, value)) {
eb160463 5390 UV fold = PL_fold[value];
ffc61ed2
JH
5391
5392 if (fold != value)
5393 ANYOF_BITMAP_SET(ret, fold);
ae5c130c
GS
5394 }
5395 }
936ed897 5396 ANYOF_FLAGS(ret) &= ~ANYOF_FOLD;
ae5c130c 5397 }
ffc61ed2 5398
ae5c130c 5399 /* optimize inverted simple patterns (e.g. [^a-z]) */
3568d838 5400 if (!SIZE_ONLY && optimize_invert &&
ffc61ed2
JH
5401 /* If the only flag is inversion. */
5402 (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
b8c5462f 5403 for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
936ed897 5404 ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL;
1aa99e6b 5405 ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
ae5c130c 5406 }
a0d0e21e 5407
b81d288d 5408 if (!SIZE_ONLY) {
fde631ed 5409 AV *av = newAV();
ffc61ed2
JH
5410 SV *rv;
5411
9e55ce06 5412 /* The 0th element stores the character class description
6a0407ee 5413 * in its textual form: used later (regexec.c:Perl_regclass_swash())
9e55ce06
JH
5414 * to initialize the appropriate swash (which gets stored in
5415 * the 1st element), and also useful for dumping the regnode.
5416 * The 2nd element stores the multicharacter foldings,
6a0407ee 5417 * used later (regexec.c:S_reginclass()). */
ffc61ed2
JH
5418 av_store(av, 0, listsv);
5419 av_store(av, 1, NULL);
9e55ce06 5420 av_store(av, 2, (SV*)unicode_alternate);
ffc61ed2 5421 rv = newRV_noinc((SV*)av);
19860706 5422 n = add_data(pRExC_state, 1, "s");
830247a4 5423 RExC_rx->data->data[n] = (void*)rv;
ffc61ed2 5424 ARG_SET(ret, n);
a0ed51b3
LW
5425 }
5426
5427 return ret;
5428}
5429
76e3520e 5430STATIC char*
830247a4 5431S_nextchar(pTHX_ RExC_state_t *pRExC_state)
a0d0e21e 5432{
830247a4 5433 char* retval = RExC_parse++;
a0d0e21e 5434
4633a7c4 5435 for (;;) {
830247a4
IZ
5436 if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
5437 RExC_parse[2] == '#') {
e994fd66
AE
5438 while (*RExC_parse != ')') {
5439 if (RExC_parse == RExC_end)
5440 FAIL("Sequence (?#... not terminated");
830247a4 5441 RExC_parse++;
e994fd66 5442 }
830247a4 5443 RExC_parse++;
4633a7c4
LW
5444 continue;
5445 }
e2509266 5446 if (RExC_flags & PMf_EXTENDED) {
830247a4
IZ
5447 if (isSPACE(*RExC_parse)) {
5448 RExC_parse++;
748a9306
LW
5449 continue;
5450 }
830247a4 5451 else if (*RExC_parse == '#') {
e994fd66
AE
5452 while (RExC_parse < RExC_end)
5453 if (*RExC_parse++ == '\n') break;
748a9306
LW
5454 continue;
5455 }
748a9306 5456 }
4633a7c4 5457 return retval;
a0d0e21e 5458 }
a687059c
LW
5459}
5460
5461/*
c277df42 5462- reg_node - emit a node
a0d0e21e 5463*/
76e3520e 5464STATIC regnode * /* Location. */
830247a4 5465S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
a687059c 5466{
c277df42
IZ
5467 register regnode *ret;
5468 register regnode *ptr;
a687059c 5469
830247a4 5470 ret = RExC_emit;
c277df42 5471 if (SIZE_ONLY) {
830247a4
IZ
5472 SIZE_ALIGN(RExC_size);
5473 RExC_size += 1;
a0d0e21e
LW
5474 return(ret);
5475 }
a687059c 5476
c277df42 5477 NODE_ALIGN_FILL(ret);
a0d0e21e 5478 ptr = ret;
c277df42 5479 FILL_ADVANCE_NODE(ptr, op);
fac92740 5480 if (RExC_offsets) { /* MJD */
ccb2c380 5481 MJD_OFFSET_DEBUG(("%s:%u: (op %s) %s %u <- %u (len %u) (max %u).\n",
fac92740
MJD
5482 "reg_node", __LINE__,
5483 reg_name[op],
5484 RExC_emit - RExC_emit_start > RExC_offsets[0]
5485 ? "Overwriting end of array!\n" : "OK",
5486 RExC_emit - RExC_emit_start,
5487 RExC_parse - RExC_start,
5488 RExC_offsets[0]));
ccb2c380 5489 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
fac92740
MJD
5490 }
5491
830247a4 5492 RExC_emit = ptr;
a687059c 5493
a0d0e21e 5494 return(ret);
a687059c
LW
5495}
5496
5497/*
a0d0e21e
LW
5498- reganode - emit a node with an argument
5499*/
76e3520e 5500STATIC regnode * /* Location. */
830247a4 5501S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
fe14fcc3 5502{
c277df42
IZ
5503 register regnode *ret;
5504 register regnode *ptr;
fe14fcc3 5505
830247a4 5506 ret = RExC_emit;
c277df42 5507 if (SIZE_ONLY) {
830247a4
IZ
5508 SIZE_ALIGN(RExC_size);
5509 RExC_size += 2;
a0d0e21e
LW
5510 return(ret);
5511 }
fe14fcc3 5512
c277df42 5513 NODE_ALIGN_FILL(ret);
a0d0e21e 5514 ptr = ret;
c277df42 5515 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
fac92740 5516 if (RExC_offsets) { /* MJD */
ccb2c380 5517 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %u <- %u (max %u).\n",
fac92740 5518 "reganode",
ccb2c380
MP
5519 __LINE__,
5520 reg_name[op],
fac92740
MJD
5521 RExC_emit - RExC_emit_start > RExC_offsets[0] ?
5522 "Overwriting end of array!\n" : "OK",
5523 RExC_emit - RExC_emit_start,
5524 RExC_parse - RExC_start,
5525 RExC_offsets[0]));
ccb2c380 5526 Set_Cur_Node_Offset;
fac92740
MJD
5527 }
5528
830247a4 5529 RExC_emit = ptr;
fe14fcc3 5530
a0d0e21e 5531 return(ret);
fe14fcc3
LW
5532}
5533
5534/*
cd439c50 5535- reguni - emit (if appropriate) a Unicode character
a0ed51b3
LW
5536*/
5537STATIC void
830247a4 5538S_reguni(pTHX_ RExC_state_t *pRExC_state, UV uv, char* s, STRLEN* lenp)
a0ed51b3 5539{
5e12f4fb 5540 *lenp = SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
a0ed51b3
LW
5541}
5542
5543/*
a0d0e21e
LW
5544- reginsert - insert an operator in front of already-emitted operand
5545*
5546* Means relocating the operand.
5547*/
76e3520e 5548STATIC void
830247a4 5549S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd)
a687059c 5550{
c277df42
IZ
5551 register regnode *src;
5552 register regnode *dst;
5553 register regnode *place;
5554 register int offset = regarglen[(U8)op];
b81d288d 5555
22c35a8c 5556/* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
c277df42
IZ
5557
5558 if (SIZE_ONLY) {
830247a4 5559 RExC_size += NODE_STEP_REGNODE + offset;
a0d0e21e
LW
5560 return;
5561 }
a687059c 5562
830247a4
IZ
5563 src = RExC_emit;
5564 RExC_emit += NODE_STEP_REGNODE + offset;
5565 dst = RExC_emit;
fac92740 5566 while (src > opnd) {
c277df42 5567 StructCopy(--src, --dst, regnode);
fac92740 5568 if (RExC_offsets) { /* MJD 20010112 */
ccb2c380 5569 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %u -> %u (max %u).\n",
fac92740 5570 "reg_insert",
ccb2c380
MP
5571 __LINE__,
5572 reg_name[op],
fac92740
MJD
5573 dst - RExC_emit_start > RExC_offsets[0]
5574 ? "Overwriting end of array!\n" : "OK",
5575 src - RExC_emit_start,
5576 dst - RExC_emit_start,
5577 RExC_offsets[0]));
ccb2c380
MP
5578 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
5579 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
fac92740
MJD
5580 }
5581 }
5582
a0d0e21e
LW
5583
5584 place = opnd; /* Op node, where operand used to be. */
fac92740 5585 if (RExC_offsets) { /* MJD */
ccb2c380 5586 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %u <- %u (max %u).\n",
fac92740 5587 "reginsert",
ccb2c380
MP
5588 __LINE__,
5589 reg_name[op],
fac92740
MJD
5590 place - RExC_emit_start > RExC_offsets[0]
5591 ? "Overwriting end of array!\n" : "OK",
5592 place - RExC_emit_start,
5593 RExC_parse - RExC_start,
5594 RExC_offsets[0]));
ccb2c380 5595 Set_Node_Offset(place, RExC_parse);
45948336 5596 Set_Node_Length(place, 1);
fac92740 5597 }
c277df42
IZ
5598 src = NEXTOPER(place);
5599 FILL_ADVANCE_NODE(place, op);
5600 Zero(src, offset, regnode);
a687059c
LW
5601}
5602
5603/*
c277df42 5604- regtail - set the next-pointer at the end of a node chain of p to val.
a0d0e21e 5605*/
76e3520e 5606STATIC void
830247a4 5607S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, regnode *val)
a687059c 5608{
c277df42
IZ
5609 register regnode *scan;
5610 register regnode *temp;
a0d0e21e 5611
c277df42 5612 if (SIZE_ONLY)
a0d0e21e
LW
5613 return;
5614
5615 /* Find last node. */
5616 scan = p;
5617 for (;;) {
5618 temp = regnext(scan);
5619 if (temp == NULL)
5620 break;
5621 scan = temp;
5622 }
a687059c 5623
c277df42
IZ
5624 if (reg_off_by_arg[OP(scan)]) {
5625 ARG_SET(scan, val - scan);
a0ed51b3
LW
5626 }
5627 else {
c277df42
IZ
5628 NEXT_OFF(scan) = val - scan;
5629 }
a687059c
LW
5630}
5631
5632/*
a0d0e21e
LW
5633- regoptail - regtail on operand of first argument; nop if operandless
5634*/
76e3520e 5635STATIC void
830247a4 5636S_regoptail(pTHX_ RExC_state_t *pRExC_state, regnode *p, regnode *val)
a687059c 5637{
a0d0e21e 5638 /* "Operandless" and "op != BRANCH" are synonymous in practice. */
c277df42
IZ
5639 if (p == NULL || SIZE_ONLY)
5640 return;
22c35a8c 5641 if (PL_regkind[(U8)OP(p)] == BRANCH) {
830247a4 5642 regtail(pRExC_state, NEXTOPER(p), val);
a0ed51b3 5643 }
22c35a8c 5644 else if ( PL_regkind[(U8)OP(p)] == BRANCHJ) {
830247a4 5645 regtail(pRExC_state, NEXTOPER(NEXTOPER(p)), val);
a0ed51b3
LW
5646 }
5647 else
a0d0e21e 5648 return;
a687059c
LW
5649}
5650
5651/*
5652 - regcurly - a little FSA that accepts {\d+,?\d*}
5653 */
79072805 5654STATIC I32
cea2e8a9 5655S_regcurly(pTHX_ register char *s)
a687059c
LW
5656{
5657 if (*s++ != '{')
5658 return FALSE;
f0fcb552 5659 if (!isDIGIT(*s))
a687059c 5660 return FALSE;
f0fcb552 5661 while (isDIGIT(*s))
a687059c
LW
5662 s++;
5663 if (*s == ',')
5664 s++;
f0fcb552 5665 while (isDIGIT(*s))
a687059c
LW
5666 s++;
5667 if (*s != '}')
5668 return FALSE;
5669 return TRUE;
5670}
5671
a687059c 5672
8fa7f367
JH
5673#ifdef DEBUGGING
5674
76e3520e 5675STATIC regnode *
cea2e8a9 5676S_dumpuntil(pTHX_ regnode *start, regnode *node, regnode *last, SV* sv, I32 l)
c277df42 5677{
f248d071 5678 register U8 op = EXACT; /* Arbitrary non-END op. */
155aba94 5679 register regnode *next;
c277df42
IZ
5680
5681 while (op != END && (!last || node < last)) {
5682 /* While that wasn't END last time... */
5683
5684 NODE_ALIGN(node);
5685 op = OP(node);
5686 if (op == CLOSE)
5687 l--;
5688 next = regnext(node);
5689 /* Where, what. */
5690 if (OP(node) == OPTIMIZED)
5691 goto after_print;
5692 regprop(sv, node);
b900a521 5693 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
f1dbda3d 5694 (int)(2*l + 1), "", SvPVX(sv));
c277df42
IZ
5695 if (next == NULL) /* Next ptr. */
5696 PerlIO_printf(Perl_debug_log, "(0)");
b81d288d 5697 else
b900a521 5698 PerlIO_printf(Perl_debug_log, "(%"IVdf")", (IV)(next - start));
c277df42
IZ
5699 (void)PerlIO_putc(Perl_debug_log, '\n');
5700 after_print:
22c35a8c 5701 if (PL_regkind[(U8)op] == BRANCHJ) {
b81d288d
AB
5702 register regnode *nnode = (OP(next) == LONGJMP
5703 ? regnext(next)
c277df42
IZ
5704 : next);
5705 if (last && nnode > last)
5706 nnode = last;
5707 node = dumpuntil(start, NEXTOPER(NEXTOPER(node)), nnode, sv, l + 1);
a0ed51b3 5708 }
22c35a8c 5709 else if (PL_regkind[(U8)op] == BRANCH) {
c277df42 5710 node = dumpuntil(start, NEXTOPER(node), next, sv, l + 1);
a0ed51b3 5711 }
a3621e74
YO
5712 else if ( PL_regkind[(U8)op] == TRIE ) {
5713 I32 n = ARG(node);
5714 reg_trie_data *trie=(reg_trie_data*)PL_regdata->data[n];
5715 I32 word_idx;
5716 I32 arry_len=av_len(trie->words)+1;
5717 PerlIO_printf(Perl_debug_log,
5718 "%*s[Words:%d Chars Stored:%d Unique Chars:%d States:%d%s]\n",
5719 (int)(2*(l+3)), "",
5720 trie->wordcount,
5721 trie->charcount,
5722 trie->uniquecharcount,
5723 trie->laststate-1,
5724 node->flags ? " EVAL mode" : "");
5725
5726 for (word_idx=0; word_idx < arry_len; word_idx++) {
5727 SV **elem_ptr=av_fetch(trie->words,word_idx,0);
5728 if (elem_ptr) {
5729 PerlIO_printf(Perl_debug_log, "%*s<%s%s%s>\n",
5730 (int)(2*(l+4)), "",
5731 PL_colors[0],
5732 SvPV_nolen(*elem_ptr),
5733 PL_colors[1]
5734 );
5735 /*
5736 if (next == NULL)
5737 PerlIO_printf(Perl_debug_log, "(0)\n");
5738 else
5739 PerlIO_printf(Perl_debug_log, "(%"IVdf")\n", (IV)(next - start));
5740 */
5741 }
5742
5743 }
5744
5745 node = NEXTOPER(node);
5746 node += regarglen[(U8)op];
5747
5748 }
a0ed51b3 5749 else if ( op == CURLY) { /* `next' might be very big: optimizer */
c277df42
IZ
5750 node = dumpuntil(start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
5751 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1, sv, l + 1);
a0ed51b3 5752 }
22c35a8c 5753 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
c277df42
IZ
5754 node = dumpuntil(start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
5755 next, sv, l + 1);
a0ed51b3
LW
5756 }
5757 else if ( op == PLUS || op == STAR) {
c277df42 5758 node = dumpuntil(start, NEXTOPER(node), NEXTOPER(node) + 1, sv, l + 1);
a0ed51b3
LW
5759 }
5760 else if (op == ANYOF) {
4f66b38d
HS
5761 /* arglen 1 + class block */
5762 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_LARGE)
5763 ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
5764 node = NEXTOPER(node);
a0ed51b3 5765 }
22c35a8c 5766 else if (PL_regkind[(U8)op] == EXACT) {
c277df42 5767 /* Literal string, where present. */
cd439c50 5768 node += NODE_SZ_STR(node) - 1;
c277df42 5769 node = NEXTOPER(node);
a0ed51b3
LW
5770 }
5771 else {
c277df42
IZ
5772 node = NEXTOPER(node);
5773 node += regarglen[(U8)op];
5774 }
5775 if (op == CURLYX || op == OPEN)
5776 l++;
5777 else if (op == WHILEM)
5778 l--;
5779 }
5780 return node;
5781}
5782
8fa7f367
JH
5783#endif /* DEBUGGING */
5784
a687059c 5785/*
fd181c75 5786 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
a687059c
LW
5787 */
5788void
864dbfa3 5789Perl_regdump(pTHX_ regexp *r)
a687059c 5790{
35ff7856 5791#ifdef DEBUGGING
46fc3d4c 5792 SV *sv = sv_newmortal();
a687059c 5793
c277df42 5794 (void)dumpuntil(r->program, r->program + 1, NULL, sv, 0);
a0d0e21e
LW
5795
5796 /* Header fields of interest. */
c277df42 5797 if (r->anchored_substr)
7b0972df 5798 PerlIO_printf(Perl_debug_log,
b81d288d 5799 "anchored `%s%.*s%s'%s at %"IVdf" ",
3280af22 5800 PL_colors[0],
7b0972df 5801 (int)(SvCUR(r->anchored_substr) - (SvTAIL(r->anchored_substr)!=0)),
b81d288d 5802 SvPVX(r->anchored_substr),
3280af22 5803 PL_colors[1],
c277df42 5804 SvTAIL(r->anchored_substr) ? "$" : "",
7b0972df 5805 (IV)r->anchored_offset);
33b8afdf
JH
5806 else if (r->anchored_utf8)
5807 PerlIO_printf(Perl_debug_log,
5808 "anchored utf8 `%s%.*s%s'%s at %"IVdf" ",
5809 PL_colors[0],
5810 (int)(SvCUR(r->anchored_utf8) - (SvTAIL(r->anchored_utf8)!=0)),
5811 SvPVX(r->anchored_utf8),
5812 PL_colors[1],
5813 SvTAIL(r->anchored_utf8) ? "$" : "",
5814 (IV)r->anchored_offset);
c277df42 5815 if (r->float_substr)
7b0972df 5816 PerlIO_printf(Perl_debug_log,
b81d288d 5817 "floating `%s%.*s%s'%s at %"IVdf"..%"UVuf" ",
3280af22 5818 PL_colors[0],
b81d288d 5819 (int)(SvCUR(r->float_substr) - (SvTAIL(r->float_substr)!=0)),
2c2d71f5 5820 SvPVX(r->float_substr),
3280af22 5821 PL_colors[1],
c277df42 5822 SvTAIL(r->float_substr) ? "$" : "",
7b0972df 5823 (IV)r->float_min_offset, (UV)r->float_max_offset);
33b8afdf
JH
5824 else if (r->float_utf8)
5825 PerlIO_printf(Perl_debug_log,
5826 "floating utf8 `%s%.*s%s'%s at %"IVdf"..%"UVuf" ",
5827 PL_colors[0],
5828 (int)(SvCUR(r->float_utf8) - (SvTAIL(r->float_utf8)!=0)),
5829 SvPVX(r->float_utf8),
5830 PL_colors[1],
5831 SvTAIL(r->float_utf8) ? "$" : "",
5832 (IV)r->float_min_offset, (UV)r->float_max_offset);
5833 if (r->check_substr || r->check_utf8)
b81d288d
AB
5834 PerlIO_printf(Perl_debug_log,
5835 r->check_substr == r->float_substr
33b8afdf 5836 && r->check_utf8 == r->float_utf8
c277df42
IZ
5837 ? "(checking floating" : "(checking anchored");
5838 if (r->reganch & ROPT_NOSCAN)
5839 PerlIO_printf(Perl_debug_log, " noscan");
5840 if (r->reganch & ROPT_CHECK_ALL)
5841 PerlIO_printf(Perl_debug_log, " isall");
33b8afdf 5842 if (r->check_substr || r->check_utf8)
c277df42
IZ
5843 PerlIO_printf(Perl_debug_log, ") ");
5844
46fc3d4c 5845 if (r->regstclass) {
5846 regprop(sv, r->regstclass);
5847 PerlIO_printf(Perl_debug_log, "stclass `%s' ", SvPVX(sv));
5848 }
774d564b 5849 if (r->reganch & ROPT_ANCH) {
5850 PerlIO_printf(Perl_debug_log, "anchored");
5851 if (r->reganch & ROPT_ANCH_BOL)
5852 PerlIO_printf(Perl_debug_log, "(BOL)");
c277df42
IZ
5853 if (r->reganch & ROPT_ANCH_MBOL)
5854 PerlIO_printf(Perl_debug_log, "(MBOL)");
cad2e5aa
JH
5855 if (r->reganch & ROPT_ANCH_SBOL)
5856 PerlIO_printf(Perl_debug_log, "(SBOL)");
774d564b 5857 if (r->reganch & ROPT_ANCH_GPOS)
5858 PerlIO_printf(Perl_debug_log, "(GPOS)");
5859 PerlIO_putc(Perl_debug_log, ' ');
5860 }
c277df42
IZ
5861 if (r->reganch & ROPT_GPOS_SEEN)
5862 PerlIO_printf(Perl_debug_log, "GPOS ");
a0d0e21e 5863 if (r->reganch & ROPT_SKIP)
760ac839 5864 PerlIO_printf(Perl_debug_log, "plus ");
a0d0e21e 5865 if (r->reganch & ROPT_IMPLICIT)
760ac839 5866 PerlIO_printf(Perl_debug_log, "implicit ");
760ac839 5867 PerlIO_printf(Perl_debug_log, "minlen %ld ", (long) r->minlen);
ce862d02
IZ
5868 if (r->reganch & ROPT_EVAL_SEEN)
5869 PerlIO_printf(Perl_debug_log, "with eval ");
760ac839 5870 PerlIO_printf(Perl_debug_log, "\n");
fac92740
MJD
5871 if (r->offsets) {
5872 U32 i;
5873 U32 len = r->offsets[0];
a3621e74
YO
5874 GET_RE_DEBUG_FLAGS_DECL;
5875 DEBUG_OFFSETS_r({
392fbf5d 5876 PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)r->offsets[0]);
fac92740 5877 for (i = 1; i <= len; i++)
392fbf5d
RB
5878 PerlIO_printf(Perl_debug_log, "%"UVuf"[%"UVuf"] ",
5879 (UV)r->offsets[i*2-1],
5880 (UV)r->offsets[i*2]);
fac92740 5881 PerlIO_printf(Perl_debug_log, "\n");
a3621e74 5882 });
fac92740 5883 }
17c3b450 5884#endif /* DEBUGGING */
a687059c
LW
5885}
5886
8fa7f367
JH
5887#ifdef DEBUGGING
5888
653099ff
GS
5889STATIC void
5890S_put_byte(pTHX_ SV *sv, int c)
5891{
7be5a6cf 5892 if (isCNTRL(c) || c == 255 || !isPRINT(c))
653099ff
GS
5893 Perl_sv_catpvf(aTHX_ sv, "\\%o", c);
5894 else if (c == '-' || c == ']' || c == '\\' || c == '^')
5895 Perl_sv_catpvf(aTHX_ sv, "\\%c", c);
5896 else
5897 Perl_sv_catpvf(aTHX_ sv, "%c", c);
5898}
5899
8fa7f367
JH
5900#endif /* DEBUGGING */
5901
a3621e74 5902
a687059c 5903/*
a0d0e21e
LW
5904- regprop - printable representation of opcode
5905*/
46fc3d4c 5906void
864dbfa3 5907Perl_regprop(pTHX_ SV *sv, regnode *o)
a687059c 5908{
35ff7856 5909#ifdef DEBUGGING
9b155405 5910 register int k;
a0d0e21e 5911
54dc92de 5912 sv_setpvn(sv, "", 0);
9b155405 5913 if (OP(o) >= reg_num) /* regnode.type is unsigned */
830247a4
IZ
5914 /* It would be nice to FAIL() here, but this may be called from
5915 regexec.c, and it would be hard to supply pRExC_state. */
5916 Perl_croak(aTHX_ "Corrupted regexp opcode");
bfed75c6 5917 sv_catpv(sv, reg_name[OP(o)]); /* Take off const! */
9b155405
IZ
5918
5919 k = PL_regkind[(U8)OP(o)];
5920
2a782b5b
JH
5921 if (k == EXACT) {
5922 SV *dsv = sv_2mortal(newSVpvn("", 0));
c728cb41
JH
5923 /* Using is_utf8_string() is a crude hack but it may
5924 * be the best for now since we have no flag "this EXACTish
5925 * node was UTF-8" --jhi */
5926 bool do_utf8 = is_utf8_string((U8*)STRING(o), STR_LEN(o));
8a989385 5927 char *s = do_utf8 ?
c728cb41
JH
5928 pv_uni_display(dsv, (U8*)STRING(o), STR_LEN(o), 60,
5929 UNI_DISPLAY_REGEX) :
2a782b5b 5930 STRING(o);
40eddc46 5931 int len = do_utf8 ?
2a782b5b
JH
5932 strlen(s) :
5933 STR_LEN(o);
5934 Perl_sv_catpvf(aTHX_ sv, " <%s%.*s%s>",
5935 PL_colors[0],
5936 len, s,
5937 PL_colors[1]);
a3621e74
YO
5938 } else if (k == TRIE) {/*
5939 this isn't always safe, as Pl_regdata may not be for this regex yet
5940 (depending on where its called from) so its being moved to dumpuntil
5941 I32 n = ARG(o);
5942 reg_trie_data *trie=(reg_trie_data*)PL_regdata->data[n];
5943 Perl_sv_catpvf(aTHX_ sv, " (W:%d L:%d C:%d S:%d)",
5944 trie->wordcount,
5945 trie->charcount,
5946 trie->uniquecharcount,
5947 trie->laststate);
5948 */
5949 } else if (k == CURLY) {
cb434fcc 5950 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
cea2e8a9
GS
5951 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
5952 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
a0d0e21e 5953 }
2c2d71f5
JH
5954 else if (k == WHILEM && o->flags) /* Ordinal/of */
5955 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
9b155405 5956 else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP )
894356b3 5957 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
9b155405 5958 else if (k == LOGICAL)
04ebc1ab 5959 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
653099ff
GS
5960 else if (k == ANYOF) {
5961 int i, rangestart = -1;
ffc61ed2 5962 U8 flags = ANYOF_FLAGS(o);
a6d05634 5963 const char * const anyofs[] = { /* Should be synchronized with
19860706 5964 * ANYOF_ #xdefines in regcomp.h */
653099ff
GS
5965 "\\w",
5966 "\\W",
5967 "\\s",
5968 "\\S",
5969 "\\d",
5970 "\\D",
5971 "[:alnum:]",
5972 "[:^alnum:]",
5973 "[:alpha:]",
5974 "[:^alpha:]",
5975 "[:ascii:]",
5976 "[:^ascii:]",
5977 "[:ctrl:]",
5978 "[:^ctrl:]",
5979 "[:graph:]",
5980 "[:^graph:]",
5981 "[:lower:]",
5982 "[:^lower:]",
5983 "[:print:]",
5984 "[:^print:]",
5985 "[:punct:]",
5986 "[:^punct:]",
5987 "[:upper:]",
aaa51d5e 5988 "[:^upper:]",
653099ff 5989 "[:xdigit:]",
aaa51d5e
JF
5990 "[:^xdigit:]",
5991 "[:space:]",
5992 "[:^space:]",
5993 "[:blank:]",
5994 "[:^blank:]"
653099ff
GS
5995 };
5996
19860706 5997 if (flags & ANYOF_LOCALE)
653099ff 5998 sv_catpv(sv, "{loc}");
19860706 5999 if (flags & ANYOF_FOLD)
653099ff
GS
6000 sv_catpv(sv, "{i}");
6001 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
19860706 6002 if (flags & ANYOF_INVERT)
653099ff 6003 sv_catpv(sv, "^");
ffc61ed2
JH
6004 for (i = 0; i <= 256; i++) {
6005 if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
6006 if (rangestart == -1)
6007 rangestart = i;
6008 } else if (rangestart != -1) {
6009 if (i <= rangestart + 3)
6010 for (; rangestart < i; rangestart++)
653099ff 6011 put_byte(sv, rangestart);
ffc61ed2
JH
6012 else {
6013 put_byte(sv, rangestart);
6014 sv_catpv(sv, "-");
6015 put_byte(sv, i - 1);
653099ff 6016 }
ffc61ed2 6017 rangestart = -1;
653099ff 6018 }
847a199f 6019 }
ffc61ed2
JH
6020
6021 if (o->flags & ANYOF_CLASS)
6022 for (i = 0; i < sizeof(anyofs)/sizeof(char*); i++)
6023 if (ANYOF_CLASS_TEST(o,i))
6024 sv_catpv(sv, anyofs[i]);
6025
6026 if (flags & ANYOF_UNICODE)
6027 sv_catpv(sv, "{unicode}");
1aa99e6b 6028 else if (flags & ANYOF_UNICODE_ALL)
2a782b5b 6029 sv_catpv(sv, "{unicode_all}");
ffc61ed2
JH
6030
6031 {
6032 SV *lv;
9e55ce06 6033 SV *sw = regclass_swash(o, FALSE, &lv, 0);
b81d288d 6034
ffc61ed2
JH
6035 if (lv) {
6036 if (sw) {
89ebb4a3 6037 U8 s[UTF8_MAXBYTES_CASE+1];
b81d288d 6038
ffc61ed2 6039 for (i = 0; i <= 256; i++) { /* just the first 256 */
2b9d42f0 6040 U8 *e = uvchr_to_utf8(s, i);
ffc61ed2 6041
3568d838 6042 if (i < 256 && swash_fetch(sw, s, TRUE)) {
ffc61ed2
JH
6043 if (rangestart == -1)
6044 rangestart = i;
6045 } else if (rangestart != -1) {
6046 U8 *p;
b81d288d 6047
ffc61ed2
JH
6048 if (i <= rangestart + 3)
6049 for (; rangestart < i; rangestart++) {
2b9d42f0 6050 for(e = uvchr_to_utf8(s, rangestart), p = s; p < e; p++)
ffc61ed2
JH
6051 put_byte(sv, *p);
6052 }
6053 else {
2b9d42f0 6054 for (e = uvchr_to_utf8(s, rangestart), p = s; p < e; p++)
ffc61ed2
JH
6055 put_byte(sv, *p);
6056 sv_catpv(sv, "-");
2b9d42f0 6057 for (e = uvchr_to_utf8(s, i - 1), p = s; p < e; p++)
ffc61ed2
JH
6058 put_byte(sv, *p);
6059 }
6060 rangestart = -1;
6061 }
19860706 6062 }
ffc61ed2
JH
6063
6064 sv_catpv(sv, "..."); /* et cetera */
19860706 6065 }
fde631ed 6066
ffc61ed2 6067 {
2e0de35c 6068 char *s = savesvpv(lv);
ffc61ed2 6069 char *origs = s;
b81d288d 6070
ffc61ed2 6071 while(*s && *s != '\n') s++;
b81d288d 6072
ffc61ed2
JH
6073 if (*s == '\n') {
6074 char *t = ++s;
6075
6076 while (*s) {
6077 if (*s == '\n')
6078 *s = ' ';
6079 s++;
6080 }
6081 if (s[-1] == ' ')
6082 s[-1] = 0;
6083
6084 sv_catpv(sv, t);
fde631ed 6085 }
b81d288d 6086
ffc61ed2 6087 Safefree(origs);
fde631ed
JH
6088 }
6089 }
653099ff 6090 }
ffc61ed2 6091
653099ff
GS
6092 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
6093 }
9b155405 6094 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
cea2e8a9 6095 Perl_sv_catpvf(aTHX_ sv, "[-%d]", o->flags);
17c3b450 6096#endif /* DEBUGGING */
35ff7856 6097}
a687059c 6098
cad2e5aa
JH
6099SV *
6100Perl_re_intuit_string(pTHX_ regexp *prog)
6101{ /* Assume that RE_INTUIT is set */
a3621e74
YO
6102 GET_RE_DEBUG_FLAGS_DECL;
6103 DEBUG_COMPILE_r(
cad2e5aa 6104 { STRLEN n_a;
33b8afdf
JH
6105 char *s = SvPV(prog->check_substr
6106 ? prog->check_substr : prog->check_utf8, n_a);
cad2e5aa
JH
6107
6108 if (!PL_colorset) reginitcolors();
6109 PerlIO_printf(Perl_debug_log,
33b8afdf
JH
6110 "%sUsing REx %ssubstr:%s `%s%.60s%s%s'\n",
6111 PL_colors[4],
6112 prog->check_substr ? "" : "utf8 ",
6113 PL_colors[5],PL_colors[0],
cad2e5aa
JH
6114 s,
6115 PL_colors[1],
6116 (strlen(s) > 60 ? "..." : ""));
6117 } );
6118
33b8afdf 6119 return prog->check_substr ? prog->check_substr : prog->check_utf8;
cad2e5aa
JH
6120}
6121
2b69d0c2 6122void
864dbfa3 6123Perl_pregfree(pTHX_ struct regexp *r)
a687059c 6124{
9e55ce06
JH
6125#ifdef DEBUGGING
6126 SV *dsv = PERL_DEBUG_PAD_ZERO(0);
a3621e74 6127 SV *re_debug_flags=get_sv(RE_DEBUG_FLAGS,0);
9e55ce06 6128#endif
7821416a 6129
a3621e74 6130
7821416a
IZ
6131 if (!r || (--r->refcnt > 0))
6132 return;
a3621e74 6133 DEBUG_r(if (re_debug_flags && (SvIV(re_debug_flags) & RE_DEBUG_COMPILE)) {
d103360b
HS
6134 int len;
6135 char *s;
6136
6137 s = (r->reganch & ROPT_UTF8) ? pv_uni_display(dsv, (U8*)r->precomp,
6138 r->prelen, 60, UNI_DISPLAY_REGEX)
9f369894 6139 : pv_display(dsv, r->precomp, r->prelen, 0, 60);
d103360b 6140 len = SvCUR(dsv);
9e55ce06
JH
6141 if (!PL_colorset)
6142 reginitcolors();
6143 PerlIO_printf(Perl_debug_log,
a3621e74 6144 "%sFreeing REx:%s %s%*.*s%s%s\n",
9e55ce06
JH
6145 PL_colors[4],PL_colors[5],PL_colors[0],
6146 len, len, s,
6147 PL_colors[1],
6148 len > 60 ? "..." : "");
6149 });
cad2e5aa 6150
c277df42 6151 if (r->precomp)
a0d0e21e 6152 Safefree(r->precomp);
fac92740
MJD
6153 if (r->offsets) /* 20010421 MJD */
6154 Safefree(r->offsets);
ed252734
NC
6155 RX_MATCH_COPY_FREE(r);
6156#ifdef PERL_COPY_ON_WRITE
6157 if (r->saved_copy)
6158 SvREFCNT_dec(r->saved_copy);
6159#endif
a193d654
GS
6160 if (r->substrs) {
6161 if (r->anchored_substr)
6162 SvREFCNT_dec(r->anchored_substr);
33b8afdf
JH
6163 if (r->anchored_utf8)
6164 SvREFCNT_dec(r->anchored_utf8);
a193d654
GS
6165 if (r->float_substr)
6166 SvREFCNT_dec(r->float_substr);
33b8afdf
JH
6167 if (r->float_utf8)
6168 SvREFCNT_dec(r->float_utf8);
2779dcf1 6169 Safefree(r->substrs);
a193d654 6170 }
c277df42
IZ
6171 if (r->data) {
6172 int n = r->data->count;
f3548bdc
DM
6173 PAD* new_comppad = NULL;
6174 PAD* old_comppad;
4026c95a 6175 PADOFFSET refcnt;
dfad63ad 6176
c277df42 6177 while (--n >= 0) {
261faec3 6178 /* If you add a ->what type here, update the comment in regcomp.h */
c277df42
IZ
6179 switch (r->data->what[n]) {
6180 case 's':
6181 SvREFCNT_dec((SV*)r->data->data[n]);
6182 break;
653099ff
GS
6183 case 'f':
6184 Safefree(r->data->data[n]);
6185 break;
dfad63ad
HS
6186 case 'p':
6187 new_comppad = (AV*)r->data->data[n];
6188 break;
c277df42 6189 case 'o':
dfad63ad 6190 if (new_comppad == NULL)
cea2e8a9 6191 Perl_croak(aTHX_ "panic: pregfree comppad");
f3548bdc
DM
6192 PAD_SAVE_LOCAL(old_comppad,
6193 /* Watch out for global destruction's random ordering. */
6194 (SvTYPE(new_comppad) == SVt_PVAV) ?
6195 new_comppad : Null(PAD *)
6196 );
b34c0dd4 6197 OP_REFCNT_LOCK;
4026c95a
SH
6198 refcnt = OpREFCNT_dec((OP_4tree*)r->data->data[n]);
6199 OP_REFCNT_UNLOCK;
6200 if (!refcnt)
9b978d73 6201 op_free((OP_4tree*)r->data->data[n]);
9b978d73 6202
f3548bdc 6203 PAD_RESTORE_LOCAL(old_comppad);
dfad63ad
HS
6204 SvREFCNT_dec((SV*)new_comppad);
6205 new_comppad = NULL;
c277df42
IZ
6206 break;
6207 case 'n':
9e55ce06 6208 break;
a3621e74
YO
6209 case 't':
6210 {
6211 reg_trie_data *trie=(reg_trie_data*)r->data->data[n];
6212 U32 refcount;
6213 OP_REFCNT_LOCK;
6214 refcount = trie->refcount--;
6215 OP_REFCNT_UNLOCK;
6216 if ( !refcount ) {
6217 if (trie->charmap)
6218 Safefree(trie->charmap);
6219 if (trie->widecharmap)
6220 SvREFCNT_dec((SV*)trie->widecharmap);
6221 if (trie->states)
6222 Safefree(trie->states);
6223 if (trie->trans)
6224 Safefree(trie->trans);
6225#ifdef DEBUGGING
6226 if (trie->words)
6227 SvREFCNT_dec((SV*)trie->words);
6228 if (trie->revcharmap)
6229 SvREFCNT_dec((SV*)trie->revcharmap);
6230#endif
6231 Safefree(r->data->data[n]); /* do this last!!!! */
6232 }
6233 break;
6234 }
c277df42 6235 default:
830247a4 6236 Perl_croak(aTHX_ "panic: regfree data code '%c'", r->data->what[n]);
c277df42
IZ
6237 }
6238 }
6239 Safefree(r->data->what);
6240 Safefree(r->data);
a0d0e21e
LW
6241 }
6242 Safefree(r->startp);
6243 Safefree(r->endp);
6244 Safefree(r);
a687059c 6245}
c277df42
IZ
6246
6247/*
6248 - regnext - dig the "next" pointer out of a node
c277df42
IZ
6249 */
6250regnode *
864dbfa3 6251Perl_regnext(pTHX_ register regnode *p)
c277df42
IZ
6252{
6253 register I32 offset;
6254
3280af22 6255 if (p == &PL_regdummy)
c277df42
IZ
6256 return(NULL);
6257
6258 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
6259 if (offset == 0)
6260 return(NULL);
6261
c277df42 6262 return(p+offset);
c277df42
IZ
6263}
6264
01f988be 6265STATIC void
cea2e8a9 6266S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
c277df42
IZ
6267{
6268 va_list args;
6269 STRLEN l1 = strlen(pat1);
6270 STRLEN l2 = strlen(pat2);
6271 char buf[512];
06bf62c7 6272 SV *msv;
c277df42
IZ
6273 char *message;
6274
6275 if (l1 > 510)
6276 l1 = 510;
6277 if (l1 + l2 > 510)
6278 l2 = 510 - l1;
6279 Copy(pat1, buf, l1 , char);
6280 Copy(pat2, buf + l1, l2 , char);
3b818b81
GS
6281 buf[l1 + l2] = '\n';
6282 buf[l1 + l2 + 1] = '\0';
8736538c
AS
6283#ifdef I_STDARG
6284 /* ANSI variant takes additional second argument */
c277df42 6285 va_start(args, pat2);
8736538c
AS
6286#else
6287 va_start(args);
6288#endif
5a844595 6289 msv = vmess(buf, &args);
c277df42 6290 va_end(args);
06bf62c7 6291 message = SvPV(msv,l1);
c277df42
IZ
6292 if (l1 > 512)
6293 l1 = 512;
6294 Copy(message, buf, l1 , char);
197cf9b9 6295 buf[l1-1] = '\0'; /* Overwrite \n */
cea2e8a9 6296 Perl_croak(aTHX_ "%s", buf);
c277df42 6297}
a0ed51b3
LW
6298
6299/* XXX Here's a total kludge. But we need to re-enter for swash routines. */
6300
6301void
864dbfa3 6302Perl_save_re_context(pTHX)
b81d288d 6303{
830247a4 6304 SAVEI32(PL_reg_flags); /* from regexec.c */
a0ed51b3 6305 SAVEPPTR(PL_bostr);
a0ed51b3
LW
6306 SAVEPPTR(PL_reginput); /* String-input pointer. */
6307 SAVEPPTR(PL_regbol); /* Beginning of input, for ^ check. */
6308 SAVEPPTR(PL_regeol); /* End of input, for $ check. */
7766f137
GS
6309 SAVEVPTR(PL_regstartp); /* Pointer to startp array. */
6310 SAVEVPTR(PL_regendp); /* Ditto for endp. */
6311 SAVEVPTR(PL_reglastparen); /* Similarly for lastparen. */
a5db57d6 6312 SAVEVPTR(PL_reglastcloseparen); /* Similarly for lastcloseparen. */
a0ed51b3 6313 SAVEPPTR(PL_regtill); /* How far we are required to go. */
b81d288d 6314 SAVEGENERICPV(PL_reg_start_tmp); /* from regexec.c */
a0ed51b3 6315 PL_reg_start_tmp = 0;
a0ed51b3
LW
6316 SAVEI32(PL_reg_start_tmpl); /* from regexec.c */
6317 PL_reg_start_tmpl = 0;
7766f137 6318 SAVEVPTR(PL_regdata);
a0ed51b3
LW
6319 SAVEI32(PL_reg_eval_set); /* from regexec.c */
6320 SAVEI32(PL_regnarrate); /* from regexec.c */
7766f137 6321 SAVEVPTR(PL_regprogram); /* from regexec.c */
a0ed51b3 6322 SAVEINT(PL_regindent); /* from regexec.c */
7766f137
GS
6323 SAVEVPTR(PL_regcc); /* from regexec.c */
6324 SAVEVPTR(PL_curcop);
7766f137
GS
6325 SAVEVPTR(PL_reg_call_cc); /* from regexec.c */
6326 SAVEVPTR(PL_reg_re); /* from regexec.c */
54b6e2fa
IZ
6327 SAVEPPTR(PL_reg_ganch); /* from regexec.c */
6328 SAVESPTR(PL_reg_sv); /* from regexec.c */
9febdf04 6329 SAVEBOOL(PL_reg_match_utf8); /* from regexec.c */
7766f137 6330 SAVEVPTR(PL_reg_magic); /* from regexec.c */
54b6e2fa 6331 SAVEI32(PL_reg_oldpos); /* from regexec.c */
7766f137
GS
6332 SAVEVPTR(PL_reg_oldcurpm); /* from regexec.c */
6333 SAVEVPTR(PL_reg_curpm); /* from regexec.c */
a5db57d6
GS
6334 SAVEPPTR(PL_reg_oldsaved); /* old saved substr during match */
6335 PL_reg_oldsaved = Nullch;
6336 SAVEI32(PL_reg_oldsavedlen); /* old length of saved substr during match */
6337 PL_reg_oldsavedlen = 0;
ed252734
NC
6338#ifdef PERL_COPY_ON_WRITE
6339 SAVESPTR(PL_nrs);
6340 PL_nrs = Nullsv;
6341#endif
a5db57d6
GS
6342 SAVEI32(PL_reg_maxiter); /* max wait until caching pos */
6343 PL_reg_maxiter = 0;
6344 SAVEI32(PL_reg_leftiter); /* wait until caching pos */
6345 PL_reg_leftiter = 0;
6346 SAVEGENERICPV(PL_reg_poscache); /* cache of pos of WHILEM */
6347 PL_reg_poscache = Nullch;
6348 SAVEI32(PL_reg_poscache_size); /* size of pos cache of WHILEM */
6349 PL_reg_poscache_size = 0;
6350 SAVEPPTR(PL_regprecomp); /* uncompiled string. */
5fb7366e 6351 SAVEI32(PL_regnpar); /* () count. */
e49a9654 6352 SAVEI32(PL_regsize); /* from regexec.c */
ada6e8a9
AMS
6353
6354 {
6355 /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
a8514157 6356 U32 i;
ada6e8a9
AMS
6357 GV *mgv;
6358 REGEXP *rx;
cf28e18a 6359 char digits[TYPE_CHARS(long)];
ada6e8a9
AMS
6360
6361 if (PL_curpm && (rx = PM_GETRE(PL_curpm))) {
6362 for (i = 1; i <= rx->nparens; i++) {
d994d9a1 6363 sprintf(digits, "%lu", (long)i);
ada6e8a9
AMS
6364 if ((mgv = gv_fetchpv(digits, FALSE, SVt_PV)))
6365 save_scalar(mgv);
6366 }
6367 }
6368 }
6369
54b6e2fa 6370#ifdef DEBUGGING
b81d288d 6371 SAVEPPTR(PL_reg_starttry); /* from regexec.c */
54b6e2fa 6372#endif
a0ed51b3 6373}
51371543 6374
51371543 6375static void
acfe0abc 6376clear_re(pTHX_ void *r)
51371543
GS
6377{
6378 ReREFCNT_dec((regexp *)r);
6379}
ffbc6a93 6380
241d1a3b
NC
6381/*
6382 * Local variables:
6383 * c-indentation-style: bsd
6384 * c-basic-offset: 4
6385 * indent-tabs-mode: t
6386 * End:
6387 *
c49a72a9 6388 * vim: shiftwidth=4:
241d1a3b 6389*/