This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Upgrade to Math::BigInt 1.75, by Tels
[perl5.git] / regcomp.c
CommitLineData
a0d0e21e
LW
1/* regcomp.c
2 */
3
4/*
5 * "A fair jaw-cracker dwarf-language must be." --Samwise Gamgee
6 */
7
61296642
DM
8/* This file contains functions for compiling a regular expression. See
9 * also regexec.c which funnily enough, contains functions for executing
166f8a29 10 * a regular expression.
e4a054ea
DM
11 *
12 * This file is also copied at build time to ext/re/re_comp.c, where
13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14 * This causes the main functions to be compiled under new names and with
15 * debugging support added, which makes "use re 'debug'" work.
166f8a29
DM
16 */
17
a687059c
LW
18/* NOTE: this is derived from Henry Spencer's regexp code, and should not
19 * confused with the original package (see point 3 below). Thanks, Henry!
20 */
21
22/* Additional note: this code is very heavily munged from Henry's version
23 * in places. In some spots I've traded clarity for efficiency, so don't
24 * blame Henry for some of the lack of readability.
25 */
26
e50aee73
AD
27/* The names of the functions have been changed from regcomp and
28 * regexec to pregcomp and pregexec in order to avoid conflicts
29 * with the POSIX routines of the same names.
30*/
31
b9d5759e
AD
32#ifdef PERL_EXT_RE_BUILD
33/* need to replace pregcomp et al, so enable that */
34# ifndef PERL_IN_XSUB_RE
35# define PERL_IN_XSUB_RE
36# endif
37/* need access to debugger hooks */
cad2e5aa 38# if defined(PERL_EXT_RE_DEBUG) && !defined(DEBUGGING)
b9d5759e
AD
39# define DEBUGGING
40# endif
41#endif
42
43#ifdef PERL_IN_XSUB_RE
d06ea78c 44/* We *really* need to overwrite these symbols: */
56953603
IZ
45# define Perl_pregcomp my_regcomp
46# define Perl_regdump my_regdump
47# define Perl_regprop my_regprop
d06ea78c 48# define Perl_pregfree my_regfree
cad2e5aa
JH
49# define Perl_re_intuit_string my_re_intuit_string
50/* *These* symbols are masked to allow static link. */
d06ea78c 51# define Perl_regnext my_regnext
f0b8d043 52# define Perl_save_re_context my_save_re_context
b81d288d 53# define Perl_reginitcolors my_reginitcolors
c5be433b
GS
54
55# define PERL_NO_GET_CONTEXT
b81d288d 56#endif
56953603 57
f0fcb552 58/*SUPPRESS 112*/
a687059c 59/*
e50aee73 60 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
61 *
62 * Copyright (c) 1986 by University of Toronto.
63 * Written by Henry Spencer. Not derived from licensed software.
64 *
65 * Permission is granted to anyone to use this software for any
66 * purpose on any computer system, and to redistribute it freely,
67 * subject to the following restrictions:
68 *
69 * 1. The author is not responsible for the consequences of use of
70 * this software, no matter how awful, even if they arise
71 * from defects in it.
72 *
73 * 2. The origin of this software must not be misrepresented, either
74 * by explicit claim or by omission.
75 *
76 * 3. Altered versions must be plainly marked as such, and must not
77 * be misrepresented as being the original software.
78 *
79 *
80 **** Alterations to Henry's code are...
81 ****
4bb101f2 82 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
241d1a3b 83 **** 2000, 2001, 2002, 2003, 2004, 2005, by Larry Wall and others
a687059c 84 ****
9ef589d8
LW
85 **** You may distribute under the terms of either the GNU General Public
86 **** License or the Artistic License, as specified in the README file.
87
a687059c
LW
88 *
89 * Beware that some of this code is subtly aware of the way operator
90 * precedence is structured in regular expressions. Serious changes in
91 * regular-expression syntax might require a total rethink.
92 */
93#include "EXTERN.h"
864dbfa3 94#define PERL_IN_REGCOMP_C
a687059c 95#include "perl.h"
d06ea78c 96
acfe0abc 97#ifndef PERL_IN_XSUB_RE
d06ea78c
GS
98# include "INTERN.h"
99#endif
c277df42
IZ
100
101#define REG_COMP_C
a687059c
LW
102#include "regcomp.h"
103
d4cce5f1 104#ifdef op
11343788 105#undef op
d4cce5f1 106#endif /* op */
11343788 107
fe14fcc3 108#ifdef MSDOS
7e4e8c89 109# if defined(BUGGY_MSC6)
fe14fcc3 110 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
7e4e8c89 111# pragma optimize("a",off)
fe14fcc3 112 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
7e4e8c89
NC
113# pragma optimize("w",on )
114# endif /* BUGGY_MSC6 */
fe14fcc3
LW
115#endif /* MSDOS */
116
a687059c
LW
117#ifndef STATIC
118#define STATIC static
119#endif
120
830247a4 121typedef struct RExC_state_t {
e2509266 122 U32 flags; /* are we folding, multilining? */
830247a4
IZ
123 char *precomp; /* uncompiled string. */
124 regexp *rx;
fac92740 125 char *start; /* Start of input for compile */
830247a4
IZ
126 char *end; /* End of input for compile */
127 char *parse; /* Input-scan pointer. */
128 I32 whilem_seen; /* number of WHILEM in this expr */
fac92740 129 regnode *emit_start; /* Start of emitted-code area */
ffc61ed2 130 regnode *emit; /* Code-emit pointer; &regdummy = don't = compiling */
830247a4
IZ
131 I32 naughty; /* How bad is this pattern? */
132 I32 sawback; /* Did we see \1, ...? */
133 U32 seen;
134 I32 size; /* Code size. */
135 I32 npar; /* () count. */
136 I32 extralen;
137 I32 seen_zerolen;
138 I32 seen_evals;
1aa99e6b 139 I32 utf8;
830247a4
IZ
140#if ADD_TO_REGEXEC
141 char *starttry; /* -Dr: where regtry was called. */
142#define RExC_starttry (pRExC_state->starttry)
143#endif
144} RExC_state_t;
145
e2509266 146#define RExC_flags (pRExC_state->flags)
830247a4
IZ
147#define RExC_precomp (pRExC_state->precomp)
148#define RExC_rx (pRExC_state->rx)
fac92740 149#define RExC_start (pRExC_state->start)
830247a4
IZ
150#define RExC_end (pRExC_state->end)
151#define RExC_parse (pRExC_state->parse)
152#define RExC_whilem_seen (pRExC_state->whilem_seen)
fac92740 153#define RExC_offsets (pRExC_state->rx->offsets) /* I am not like the others */
830247a4 154#define RExC_emit (pRExC_state->emit)
fac92740 155#define RExC_emit_start (pRExC_state->emit_start)
830247a4
IZ
156#define RExC_naughty (pRExC_state->naughty)
157#define RExC_sawback (pRExC_state->sawback)
158#define RExC_seen (pRExC_state->seen)
159#define RExC_size (pRExC_state->size)
160#define RExC_npar (pRExC_state->npar)
161#define RExC_extralen (pRExC_state->extralen)
162#define RExC_seen_zerolen (pRExC_state->seen_zerolen)
163#define RExC_seen_evals (pRExC_state->seen_evals)
1aa99e6b 164#define RExC_utf8 (pRExC_state->utf8)
830247a4 165
a687059c
LW
166#define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
167#define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
168 ((*s) == '{' && regcurly(s)))
a687059c 169
35c8bce7
LW
170#ifdef SPSTART
171#undef SPSTART /* dratted cpp namespace... */
172#endif
a687059c
LW
173/*
174 * Flags to be passed up and down.
175 */
a687059c 176#define WORST 0 /* Worst case. */
821b33a5 177#define HASWIDTH 0x1 /* Known to match non-null strings. */
a0d0e21e
LW
178#define SIMPLE 0x2 /* Simple enough to be STAR/PLUS operand. */
179#define SPSTART 0x4 /* Starts with * or +. */
180#define TRYAGAIN 0x8 /* Weeded out a declaration. */
a687059c 181
2c2d71f5
JH
182/* Length of a variant. */
183
184typedef struct scan_data_t {
185 I32 len_min;
186 I32 len_delta;
187 I32 pos_min;
188 I32 pos_delta;
189 SV *last_found;
190 I32 last_end; /* min value, <0 unless valid. */
191 I32 last_start_min;
192 I32 last_start_max;
193 SV **longest; /* Either &l_fixed, or &l_float. */
194 SV *longest_fixed;
195 I32 offset_fixed;
196 SV *longest_float;
197 I32 offset_float_min;
198 I32 offset_float_max;
199 I32 flags;
200 I32 whilem_c;
cb434fcc 201 I32 *last_closep;
653099ff 202 struct regnode_charclass_class *start_class;
2c2d71f5
JH
203} scan_data_t;
204
a687059c 205/*
e50aee73 206 * Forward declarations for pregcomp()'s friends.
a687059c 207 */
a0d0e21e 208
b81d288d 209static scan_data_t zero_scan_data = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
cb434fcc 210 0, 0, 0, 0, 0, 0};
c277df42
IZ
211
212#define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
213#define SF_BEFORE_SEOL 0x1
214#define SF_BEFORE_MEOL 0x2
215#define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
216#define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
217
09b7f37c
CB
218#ifdef NO_UNARY_PLUS
219# define SF_FIX_SHIFT_EOL (0+2)
220# define SF_FL_SHIFT_EOL (0+4)
221#else
222# define SF_FIX_SHIFT_EOL (+2)
223# define SF_FL_SHIFT_EOL (+4)
224#endif
c277df42
IZ
225
226#define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
227#define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
228
229#define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
230#define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
231#define SF_IS_INF 0x40
232#define SF_HAS_PAR 0x80
233#define SF_IN_PAR 0x100
234#define SF_HAS_EVAL 0x200
4bfe0158 235#define SCF_DO_SUBSTR 0x400
653099ff
GS
236#define SCF_DO_STCLASS_AND 0x0800
237#define SCF_DO_STCLASS_OR 0x1000
238#define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
e1901655 239#define SCF_WHILEM_VISITED_POS 0x2000
c277df42 240
eb160463 241#define UTF (RExC_utf8 != 0)
e2509266
JH
242#define LOC ((RExC_flags & PMf_LOCALE) != 0)
243#define FOLD ((RExC_flags & PMf_FOLD) != 0)
a0ed51b3 244
ffc61ed2 245#define OOB_UNICODE 12345678
93733859 246#define OOB_NAMEDCLASS -1
b8c5462f 247
a0ed51b3
LW
248#define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
249#define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
250
8615cb43 251
b45f050a
JF
252/* length of regex to show in messages that don't mark a position within */
253#define RegexLengthToShowInErrorMessages 127
254
255/*
256 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
257 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
258 * op/pragma/warn/regcomp.
259 */
7253e4e3
RK
260#define MARKER1 "<-- HERE" /* marker as it appears in the description */
261#define MARKER2 " <-- HERE " /* marker as it appears within the regex */
b81d288d 262
7253e4e3 263#define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
b45f050a
JF
264
265/*
266 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
267 * arg. Show regex, up to a maximum length. If it's too long, chop and add
268 * "...".
269 */
ccb2c380
MP
270#define FAIL(msg) STMT_START { \
271 char *ellipses = ""; \
272 IV len = RExC_end - RExC_precomp; \
273 \
274 if (!SIZE_ONLY) \
275 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
276 if (len > RegexLengthToShowInErrorMessages) { \
277 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
278 len = RegexLengthToShowInErrorMessages - 10; \
279 ellipses = "..."; \
280 } \
281 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
282 msg, (int)len, RExC_precomp, ellipses); \
283} STMT_END
8615cb43 284
b45f050a
JF
285/*
286 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
287 * args. Show regex, up to a maximum length. If it's too long, chop and add
288 * "...".
289 */
ccb2c380
MP
290#define FAIL2(pat,msg) STMT_START { \
291 char *ellipses = ""; \
292 IV len = RExC_end - RExC_precomp; \
293 \
294 if (!SIZE_ONLY) \
295 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
296 if (len > RegexLengthToShowInErrorMessages) { \
297 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
298 len = RegexLengthToShowInErrorMessages - 10; \
299 ellipses = "..."; \
300 } \
301 S_re_croak2(aTHX_ pat, " in regex m/%.*s%s/", \
302 msg, (int)len, RExC_precomp, ellipses); \
303} STMT_END
b45f050a
JF
304
305
306/*
307 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
308 */
ccb2c380
MP
309#define Simple_vFAIL(m) STMT_START { \
310 IV offset = RExC_parse - RExC_precomp; \
311 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
312 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
313} STMT_END
b45f050a
JF
314
315/*
316 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
317 */
ccb2c380
MP
318#define vFAIL(m) STMT_START { \
319 if (!SIZE_ONLY) \
320 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
321 Simple_vFAIL(m); \
322} STMT_END
b45f050a
JF
323
324/*
325 * Like Simple_vFAIL(), but accepts two arguments.
326 */
ccb2c380
MP
327#define Simple_vFAIL2(m,a1) STMT_START { \
328 IV offset = RExC_parse - RExC_precomp; \
329 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
330 (int)offset, RExC_precomp, RExC_precomp + offset); \
331} STMT_END
b45f050a
JF
332
333/*
334 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
335 */
ccb2c380
MP
336#define vFAIL2(m,a1) STMT_START { \
337 if (!SIZE_ONLY) \
338 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
339 Simple_vFAIL2(m, a1); \
340} STMT_END
b45f050a
JF
341
342
343/*
344 * Like Simple_vFAIL(), but accepts three arguments.
345 */
ccb2c380
MP
346#define Simple_vFAIL3(m, a1, a2) STMT_START { \
347 IV offset = RExC_parse - RExC_precomp; \
348 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
349 (int)offset, RExC_precomp, RExC_precomp + offset); \
350} STMT_END
b45f050a
JF
351
352/*
353 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
354 */
ccb2c380
MP
355#define vFAIL3(m,a1,a2) STMT_START { \
356 if (!SIZE_ONLY) \
357 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
358 Simple_vFAIL3(m, a1, a2); \
359} STMT_END
b45f050a
JF
360
361/*
362 * Like Simple_vFAIL(), but accepts four arguments.
363 */
ccb2c380
MP
364#define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
365 IV offset = RExC_parse - RExC_precomp; \
366 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
367 (int)offset, RExC_precomp, RExC_precomp + offset); \
368} STMT_END
b45f050a
JF
369
370/*
371 * Like Simple_vFAIL(), but accepts five arguments.
372 */
ccb2c380
MP
373#define Simple_vFAIL5(m, a1, a2, a3, a4) STMT_START { \
374 IV offset = RExC_parse - RExC_precomp; \
375 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, a4, \
376 (int)offset, RExC_precomp, RExC_precomp + offset); \
377} STMT_END
378
379
380#define vWARN(loc,m) STMT_START { \
381 IV offset = loc - RExC_precomp; \
382 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \
383 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
384} STMT_END
385
386#define vWARNdep(loc,m) STMT_START { \
387 IV offset = loc - RExC_precomp; \
388 Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
389 "%s" REPORT_LOCATION, \
390 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
391} STMT_END
392
393
394#define vWARN2(loc, m, a1) STMT_START { \
395 IV offset = loc - RExC_precomp; \
396 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
397 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
398} STMT_END
399
400#define vWARN3(loc, m, a1, a2) STMT_START { \
401 IV offset = loc - RExC_precomp; \
402 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
403 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
404} STMT_END
405
406#define vWARN4(loc, m, a1, a2, a3) STMT_START { \
407 IV offset = loc - RExC_precomp; \
408 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
409 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
410} STMT_END
411
412#define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
413 IV offset = loc - RExC_precomp; \
414 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
415 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
416} STMT_END
9d1d55b5 417
8615cb43 418
cd439c50 419/* Allow for side effects in s */
ccb2c380
MP
420#define REGC(c,s) STMT_START { \
421 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
422} STMT_END
cd439c50 423
fac92740
MJD
424/* Macros for recording node offsets. 20001227 mjd@plover.com
425 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
426 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
427 * Element 0 holds the number n.
428 */
429
430#define MJD_OFFSET_DEBUG(x)
a3621e74 431/* #define MJD_OFFSET_DEBUG(x) DEBUG_r(Perl_warn_nocontext x) */
ccb2c380
MP
432
433
434#define Set_Node_Offset_To_R(node,byte) STMT_START { \
435 if (! SIZE_ONLY) { \
436 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
437 __LINE__, (node), (byte))); \
438 if((node) < 0) { \
439 Perl_croak(aTHX_ "value of node is %d in Offset macro", node); \
440 } else { \
441 RExC_offsets[2*(node)-1] = (byte); \
442 } \
443 } \
444} STMT_END
445
446#define Set_Node_Offset(node,byte) \
447 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
448#define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
449
450#define Set_Node_Length_To_R(node,len) STMT_START { \
451 if (! SIZE_ONLY) { \
452 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
453 __LINE__, (node), (len))); \
454 if((node) < 0) { \
455 Perl_croak(aTHX_ "value of node is %d in Length macro", node); \
456 } else { \
457 RExC_offsets[2*(node)] = (len); \
458 } \
459 } \
460} STMT_END
461
462#define Set_Node_Length(node,len) \
463 Set_Node_Length_To_R((node)-RExC_emit_start, len)
464#define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
465#define Set_Node_Cur_Length(node) \
466 Set_Node_Length(node, RExC_parse - parse_start)
fac92740
MJD
467
468/* Get offsets and lengths */
469#define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
470#define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
471
acfe0abc 472static void clear_re(pTHX_ void *r);
4327152a 473
653099ff
GS
474/* Mark that we cannot extend a found fixed substring at this point.
475 Updata the longest found anchored substring and the longest found
476 floating substrings if needed. */
477
4327152a 478STATIC void
830247a4 479S_scan_commit(pTHX_ RExC_state_t *pRExC_state, scan_data_t *data)
c277df42 480{
a0ed51b3
LW
481 STRLEN l = CHR_SVLEN(data->last_found);
482 STRLEN old_l = CHR_SVLEN(*data->longest);
b81d288d 483
c277df42 484 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
6b43b216 485 SvSetMagicSV(*data->longest, data->last_found);
c277df42
IZ
486 if (*data->longest == data->longest_fixed) {
487 data->offset_fixed = l ? data->last_start_min : data->pos_min;
488 if (data->flags & SF_BEFORE_EOL)
b81d288d 489 data->flags
c277df42
IZ
490 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
491 else
492 data->flags &= ~SF_FIX_BEFORE_EOL;
a0ed51b3
LW
493 }
494 else {
c277df42 495 data->offset_float_min = l ? data->last_start_min : data->pos_min;
b81d288d
AB
496 data->offset_float_max = (l
497 ? data->last_start_max
c277df42 498 : data->pos_min + data->pos_delta);
9051bda5
HS
499 if ((U32)data->offset_float_max > (U32)I32_MAX)
500 data->offset_float_max = I32_MAX;
c277df42 501 if (data->flags & SF_BEFORE_EOL)
b81d288d 502 data->flags
c277df42
IZ
503 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
504 else
505 data->flags &= ~SF_FL_BEFORE_EOL;
506 }
507 }
508 SvCUR_set(data->last_found, 0);
0eda9292
JH
509 {
510 SV * sv = data->last_found;
511 MAGIC *mg =
512 SvUTF8(sv) && SvMAGICAL(sv) ? mg_find(sv, PERL_MAGIC_utf8) : NULL;
513 if (mg && mg->mg_len > 0)
514 mg->mg_len = 0;
515 }
c277df42
IZ
516 data->last_end = -1;
517 data->flags &= ~SF_BEFORE_EOL;
518}
519
653099ff
GS
520/* Can match anything (initialization) */
521STATIC void
830247a4 522S_cl_anything(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 523{
653099ff 524 ANYOF_CLASS_ZERO(cl);
f8bef550 525 ANYOF_BITMAP_SETALL(cl);
1aa99e6b 526 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
653099ff
GS
527 if (LOC)
528 cl->flags |= ANYOF_LOCALE;
529}
530
531/* Can match anything (initialization) */
532STATIC int
533S_cl_is_anything(pTHX_ struct regnode_charclass_class *cl)
534{
535 int value;
536
aaa51d5e 537 for (value = 0; value <= ANYOF_MAX; value += 2)
653099ff
GS
538 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
539 return 1;
1aa99e6b
IH
540 if (!(cl->flags & ANYOF_UNICODE_ALL))
541 return 0;
f8bef550
NC
542 if (!ANYOF_BITMAP_TESTALLSET(cl))
543 return 0;
653099ff
GS
544 return 1;
545}
546
547/* Can match anything (initialization) */
548STATIC void
830247a4 549S_cl_init(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 550{
8ecf7187 551 Zero(cl, 1, struct regnode_charclass_class);
653099ff 552 cl->type = ANYOF;
830247a4 553 cl_anything(pRExC_state, cl);
653099ff
GS
554}
555
556STATIC void
830247a4 557S_cl_init_zero(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 558{
8ecf7187 559 Zero(cl, 1, struct regnode_charclass_class);
653099ff 560 cl->type = ANYOF;
830247a4 561 cl_anything(pRExC_state, cl);
653099ff
GS
562 if (LOC)
563 cl->flags |= ANYOF_LOCALE;
564}
565
566/* 'And' a given class with another one. Can create false positives */
567/* We assume that cl is not inverted */
568STATIC void
569S_cl_and(pTHX_ struct regnode_charclass_class *cl,
570 struct regnode_charclass_class *and_with)
571{
653099ff
GS
572 if (!(and_with->flags & ANYOF_CLASS)
573 && !(cl->flags & ANYOF_CLASS)
574 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
575 && !(and_with->flags & ANYOF_FOLD)
576 && !(cl->flags & ANYOF_FOLD)) {
577 int i;
578
579 if (and_with->flags & ANYOF_INVERT)
580 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
581 cl->bitmap[i] &= ~and_with->bitmap[i];
582 else
583 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
584 cl->bitmap[i] &= and_with->bitmap[i];
585 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
586 if (!(and_with->flags & ANYOF_EOS))
587 cl->flags &= ~ANYOF_EOS;
1aa99e6b 588
14ebb1a2
JH
589 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE &&
590 !(and_with->flags & ANYOF_INVERT)) {
1aa99e6b
IH
591 cl->flags &= ~ANYOF_UNICODE_ALL;
592 cl->flags |= ANYOF_UNICODE;
593 ARG_SET(cl, ARG(and_with));
594 }
14ebb1a2
JH
595 if (!(and_with->flags & ANYOF_UNICODE_ALL) &&
596 !(and_with->flags & ANYOF_INVERT))
1aa99e6b 597 cl->flags &= ~ANYOF_UNICODE_ALL;
14ebb1a2
JH
598 if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)) &&
599 !(and_with->flags & ANYOF_INVERT))
1aa99e6b 600 cl->flags &= ~ANYOF_UNICODE;
653099ff
GS
601}
602
603/* 'OR' a given class with another one. Can create false positives */
604/* We assume that cl is not inverted */
605STATIC void
830247a4 606S_cl_or(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, struct regnode_charclass_class *or_with)
653099ff 607{
653099ff
GS
608 if (or_with->flags & ANYOF_INVERT) {
609 /* We do not use
610 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
611 * <= (B1 | !B2) | (CL1 | !CL2)
612 * which is wasteful if CL2 is small, but we ignore CL2:
613 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
614 * XXXX Can we handle case-fold? Unclear:
615 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
616 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
617 */
618 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
619 && !(or_with->flags & ANYOF_FOLD)
620 && !(cl->flags & ANYOF_FOLD) ) {
621 int i;
622
623 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
624 cl->bitmap[i] |= ~or_with->bitmap[i];
625 } /* XXXX: logic is complicated otherwise */
626 else {
830247a4 627 cl_anything(pRExC_state, cl);
653099ff
GS
628 }
629 } else {
630 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
631 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
b81d288d 632 && (!(or_with->flags & ANYOF_FOLD)
653099ff
GS
633 || (cl->flags & ANYOF_FOLD)) ) {
634 int i;
635
636 /* OR char bitmap and class bitmap separately */
637 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
638 cl->bitmap[i] |= or_with->bitmap[i];
639 if (or_with->flags & ANYOF_CLASS) {
640 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
641 cl->classflags[i] |= or_with->classflags[i];
642 cl->flags |= ANYOF_CLASS;
643 }
644 }
645 else { /* XXXX: logic is complicated, leave it along for a moment. */
830247a4 646 cl_anything(pRExC_state, cl);
653099ff
GS
647 }
648 }
649 if (or_with->flags & ANYOF_EOS)
650 cl->flags |= ANYOF_EOS;
1aa99e6b
IH
651
652 if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
653 ARG(cl) != ARG(or_with)) {
654 cl->flags |= ANYOF_UNICODE_ALL;
655 cl->flags &= ~ANYOF_UNICODE;
656 }
657 if (or_with->flags & ANYOF_UNICODE_ALL) {
658 cl->flags |= ANYOF_UNICODE_ALL;
659 cl->flags &= ~ANYOF_UNICODE;
660 }
653099ff
GS
661}
662
5d1c421c 663/*
a3621e74
YO
664
665 make_trie(startbranch,first,last,tail,flags)
666 startbranch: the first branch in the whole branch sequence
667 first : start branch of sequence of branch-exact nodes.
668 May be the same as startbranch
669 last : Thing following the last branch.
670 May be the same as tail.
671 tail : item following the branch sequence
672 flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/
673
674Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
675
676A trie is an N'ary tree where the branches are determined by digital
677decomposition of the key. IE, at the root node you look up the 1st character and
678follow that branch repeat until you find the end of the branches. Nodes can be
679marked as "accepting" meaning they represent a complete word. Eg:
680
681 /he|she|his|hers/
682
683would convert into the following structure. Numbers represent states, letters
684following numbers represent valid transitions on the letter from that state, if
685the number is in square brackets it represents an accepting state, otherwise it
686will be in parenthesis.
687
688 +-h->+-e->[3]-+-r->(8)-+-s->[9]
689 | |
690 | (2)
691 | |
692 (1) +-i->(6)-+-s->[7]
693 |
694 +-s->(3)-+-h->(4)-+-e->[5]
695
696 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
697
698This shows that when matching against the string 'hers' we will begin at state 1
699read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
700then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
701is also accepting. Thus we know that we can match both 'he' and 'hers' with a
702single traverse. We store a mapping from accepting to state to which word was
703matched, and then when we have multiple possibilities we try to complete the
704rest of the regex in the order in which they occured in the alternation.
705
706The only prior NFA like behaviour that would be changed by the TRIE support is
707the silent ignoring of duplicate alternations which are of the form:
708
709 / (DUPE|DUPE) X? (?{ ... }) Y /x
710
711Thus EVAL blocks follwing a trie may be called a different number of times with
712and without the optimisation. With the optimisations dupes will be silently
713ignored. This inconsistant behaviour of EVAL type nodes is well established as
714the following demonstrates:
715
716 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
717
718which prints out 'word' three times, but
719
720 'words'=~/(word|word|word)(?{ print $1 })S/
721
722which doesnt print it out at all. This is due to other optimisations kicking in.
723
724Example of what happens on a structural level:
725
726The regexp /(ac|ad|ab)+/ will produce the folowing debug output:
727
728 1: CURLYM[1] {1,32767}(18)
729 5: BRANCH(8)
730 6: EXACT <ac>(16)
731 8: BRANCH(11)
732 9: EXACT <ad>(16)
733 11: BRANCH(14)
734 12: EXACT <ab>(16)
735 16: SUCCEED(0)
736 17: NOTHING(18)
737 18: END(0)
738
739This would be optimizable with startbranch=5, first=5, last=16, tail=16
740and should turn into:
741
742 1: CURLYM[1] {1,32767}(18)
743 5: TRIE(16)
744 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
745 <ac>
746 <ad>
747 <ab>
748 16: SUCCEED(0)
749 17: NOTHING(18)
750 18: END(0)
751
752Cases where tail != last would be like /(?foo|bar)baz/:
753
754 1: BRANCH(4)
755 2: EXACT <foo>(8)
756 4: BRANCH(7)
757 5: EXACT <bar>(8)
758 7: TAIL(8)
759 8: EXACT <baz>(10)
760 10: END(0)
761
762which would be optimizable with startbranch=1, first=1, last=7, tail=8
763and would end up looking like:
764
765 1: TRIE(8)
766 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
767 <foo>
768 <bar>
769 7: TAIL(8)
770 8: EXACT <baz>(10)
771 10: END(0)
772
773*/
774
775#define TRIE_DEBUG_CHAR \
776 DEBUG_TRIE_COMPILE_r({ \
777 SV *tmp; \
778 if ( UTF ) { \
779 tmp = newSVpv( "", 0 ); \
780 pv_uni_display( tmp, uc, len, 60, UNI_DISPLAY_REGEX ); \
781 } else { \
782 tmp = Perl_newSVpvf_nocontext( "%c", uvc ); \
783 } \
784 av_push( trie->revcharmap, tmp ); \
785 })
786
787#define TRIE_READ_CHAR STMT_START { \
788 if ( UTF ) { \
789 if ( folder ) { \
790 if ( foldlen > 0 ) { \
791 uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags ); \
792 foldlen -= len; \
793 scan += len; \
794 len = 0; \
795 } else { \
796 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags); \
797 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
798 foldlen -= UNISKIP( uvc ); \
799 scan = foldbuf + UNISKIP( uvc ); \
800 } \
801 } else { \
802 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags); \
803 } \
804 } else { \
805 uvc = (U32)*uc; \
806 len = 1; \
807 } \
808} STMT_END
809
810
811#define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
812#define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
813#define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
814#define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
815
816#define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
817 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
818 TRIE_LIST_LEN( state ) *= 2; \
819 Renew( trie->states[ state ].trans.list, \
820 TRIE_LIST_LEN( state ), reg_trie_trans_le ); \
821 } \
822 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
823 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
824 TRIE_LIST_CUR( state )++; \
825} STMT_END
826
827#define TRIE_LIST_NEW(state) STMT_START { \
828 Newz( 1023, trie->states[ state ].trans.list, \
829 4, reg_trie_trans_le ); \
830 TRIE_LIST_CUR( state ) = 1; \
831 TRIE_LIST_LEN( state ) = 4; \
832} STMT_END
833
834STATIC I32
835S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 flags)
836{
837 /* first pass, loop through and scan words */
838 reg_trie_data *trie;
839 regnode *cur;
840 U32 uniflags = ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY;
841 STRLEN len = 0;
842 UV uvc = 0;
843 U16 curword = 0;
844 U32 next_alloc = 0;
845 /* we just use folder as a flag in utf8 */
846 const U8 *folder=( flags == EXACTF
847 ? PL_fold
848 : ( flags == EXACTFL
849 ? PL_fold_locale
850 : NULL
851 )
852 );
853
854 U32 data_slot = add_data( pRExC_state, 1, "t" );
855 SV *re_trie_maxbuff;
856
857 GET_RE_DEBUG_FLAGS_DECL;
858
859 Newz( 848200, trie, 1, reg_trie_data );
860 trie->refcount = 1;
861 RExC_rx->data->data[ data_slot ] = (void*)trie;
862 Newz( 848201, trie->charmap, 256, U16 );
863 DEBUG_r({
864 trie->words = newAV();
865 trie->revcharmap = newAV();
866 });
867
868
869 re_trie_maxbuff=get_sv(RE_TRIE_MAXBUFF, 1);
870 if (!SvIOK(re_trie_maxbuff)) {
871 sv_setiv(re_trie_maxbuff, TRIE_SIMPLE_MAX_BUFF);
872 }
873
874 /* -- First loop and Setup --
875
876 We first traverse the branches and scan each word to determine if it
877 contains widechars, and how many unique chars there are, this is
878 important as we have to build a table with at least as many columns as we
879 have unique chars.
880
881 We use an array of integers to represent the character codes 0..255
882 (trie->charmap) and we use a an HV* to store unicode characters. We use the
883 native representation of the character value as the key and IV's for the
884 coded index.
885
886 *TODO* If we keep track of how many times each character is used we can
887 remap the columns so that the table compression later on is more
888 efficient in terms of memory by ensuring most common value is in the
889 middle and the least common are on the outside. IMO this would be better
890 than a most to least common mapping as theres a decent chance the most
891 common letter will share a node with the least common, meaning the node
892 will not be compressable. With a middle is most common approach the worst
893 case is when we have the least common nodes twice.
894
895 */
896
897
898 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
899 regnode *noper = NEXTOPER( cur );
900 U8 *uc = (U8*)STRING( noper );
901 U8 *e = uc + STR_LEN( noper );
902 STRLEN foldlen = 0;
903 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
904 U8 *scan;
905
906 for ( ; uc < e ; uc += len ) {
907 trie->charcount++;
908 TRIE_READ_CHAR;
909 if ( uvc < 256 ) {
910 if ( !trie->charmap[ uvc ] ) {
911 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
912 if ( folder )
913 trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
914 TRIE_DEBUG_CHAR;
915 }
916 } else {
917 SV** svpp;
918 if ( !trie->widecharmap )
919 trie->widecharmap = newHV();
920
921 svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 1 );
922
923 if ( !svpp )
924 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%X", uvc );
925
926 if ( !SvTRUE( *svpp ) ) {
927 sv_setiv( *svpp, ++trie->uniquecharcount );
928 TRIE_DEBUG_CHAR;
929 }
930 }
931 }
932 trie->wordcount++;
933 } /* end first pass */
934 DEBUG_TRIE_COMPILE_r(
935 PerlIO_printf( Perl_debug_log, "TRIE(%s): W:%d C:%d Uq:%d \n",
936 ( trie->widecharmap ? "UTF8" : "NATIVE" ), trie->wordcount,
937 trie->charcount, trie->uniquecharcount )
938 );
939
940
941 /*
942 We now know what we are dealing with in terms of unique chars and
943 string sizes so we can calculate how much memory a naive
944 representation using a flat table will take. If its over a reasonable
945 limit (as specified by $^RE_TRIE_MAXBUFF) we use a more memory
946 conservative but potentially much slower representation using an array
947 of lists.
948
949 At the end we convert both representations into the same compressed
950 form that will be used in regexec.c for matching with. The latter
951 is a form that cannot be used to construct with but has memory
952 properties similar to the list form and access properties similar
953 to the table form making it both suitable for fast searches and
954 small enough that its feasable to store for the duration of a program.
955
956 See the comment in the code where the compressed table is produced
957 inplace from the flat tabe representation for an explanation of how
958 the compression works.
959
960 */
961
962
963 if ( (IV)( ( trie->charcount + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
964 /*
965 Second Pass -- Array Of Lists Representation
966
967 Each state will be represented by a list of charid:state records
968 (reg_trie_trans_le) the first such element holds the CUR and LEN
969 points of the allocated array. (See defines above).
970
971 We build the initial structure using the lists, and then convert
972 it into the compressed table form which allows faster lookups
973 (but cant be modified once converted).
974
975
976 */
977
978
979 STRLEN transcount = 1;
980
981 Newz( 848204, trie->states, trie->charcount + 2, reg_trie_state );
982 TRIE_LIST_NEW(1);
983 next_alloc = 2;
984
985 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
986
987 regnode *noper = NEXTOPER( cur );
988 U8 *uc = (U8*)STRING( noper );
989 U8 *e = uc + STR_LEN( noper );
990 U32 state = 1; /* required init */
991 U16 charid = 0; /* sanity init */
992 U8 *scan = (U8*)NULL; /* sanity init */
993 STRLEN foldlen = 0; /* required init */
994 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
995
996
997 for ( ; uc < e ; uc += len ) {
998
999 TRIE_READ_CHAR;
1000
1001 if ( uvc < 256 ) {
1002 charid = trie->charmap[ uvc ];
1003 } else {
1004 SV** svpp=(SV**)NULL;
1005 svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 0);
1006 if ( !svpp ) {
1007 charid = 0;
1008 } else {
1009 charid=(U16)SvIV( *svpp );
1010 }
1011 }
1012 if ( charid ) {
1013
1014 U16 check;
1015 U32 newstate = 0;
1016
1017 charid--;
1018 if ( !trie->states[ state ].trans.list ) {
1019 TRIE_LIST_NEW( state );
1020 }
1021 for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1022 if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1023 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1024 break;
1025 }
1026 }
1027 if ( ! newstate ) {
1028 newstate = next_alloc++;
1029 TRIE_LIST_PUSH( state, charid, newstate );
1030 transcount++;
1031 }
1032 state = newstate;
1033
1034 } else {
1035 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %d", uvc );
1036 }
1037 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1038 }
1039
1040 if ( !trie->states[ state ].wordnum ) {
1041 /* we havent inserted this word into the structure yet. */
1042 trie->states[ state ].wordnum = ++curword;
1043
1044 DEBUG_r({
1045 /* store the word for dumping */
1046 SV* tmp = newSVpvn( STRING( noper ), STR_LEN( noper ) );
1047 if ( UTF ) SvUTF8_on( tmp );
1048 av_push( trie->words, tmp );
1049 });
1050
1051 } else {
1052 /* Its a dupe. So ignore it. */
1053 }
1054
1055 } /* end second pass */
1056
1057 trie->laststate = next_alloc;
1058 Renew( trie->states, next_alloc, reg_trie_state );
1059
1060 DEBUG_TRIE_COMPILE_MORE_r({
1061 U32 state;
1062 U16 charid;
1063
1064 /*
1065 print out the table precompression.
1066 */
1067
1068 PerlIO_printf( Perl_debug_log, "\nState :Word | Transition Data\n" );
1069 PerlIO_printf( Perl_debug_log, "------:-----+-----------------" );
1070
1071 for( state=1 ; state < next_alloc ; state ++ ) {
1072
1073 PerlIO_printf( Perl_debug_log, "\n %04X :", state );
1074 if ( ! trie->states[ state ].wordnum ) {
1075 PerlIO_printf( Perl_debug_log, "%5s| ","");
1076 } else {
1077 PerlIO_printf( Perl_debug_log, "W%04X| ",
1078 trie->states[ state ].wordnum
1079 );
1080 }
1081 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
1082 SV **tmp = av_fetch( trie->revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
1083 PerlIO_printf( Perl_debug_log, "%s:%3X=%04X | ",
1084 SvPV_nolen( *tmp ),
1085 TRIE_LIST_ITEM(state,charid).forid,
1086 TRIE_LIST_ITEM(state,charid).newstate
1087 );
1088 }
1089
1090 }
1091 PerlIO_printf( Perl_debug_log, "\n\n" );
1092 });
1093
1094 Newz( 848203, trie->trans, transcount ,reg_trie_trans );
1095 {
1096 U32 state;
1097 U16 idx;
1098 U32 tp = 0;
1099 U32 zp = 0;
1100
1101
1102 for( state=1 ; state < next_alloc ; state ++ ) {
1103 U32 base=0;
1104
1105 /*
1106 DEBUG_TRIE_COMPILE_MORE_r(
1107 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1108 );
1109 */
1110
1111 if (trie->states[state].trans.list) {
1112 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1113 U16 maxid=minid;
1114
1115
1116 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1117 if ( TRIE_LIST_ITEM( state, idx).forid < minid ) {
1118 minid=TRIE_LIST_ITEM( state, idx).forid;
1119 } else if ( TRIE_LIST_ITEM( state, idx).forid > maxid ) {
1120 maxid=TRIE_LIST_ITEM( state, idx).forid;
1121 }
1122 }
1123 if ( transcount < tp + maxid - minid + 1) {
1124 transcount *= 2;
1125 Renew( trie->trans, transcount, reg_trie_trans );
1126 Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1127 }
1128 base = trie->uniquecharcount + tp - minid;
1129 if ( maxid == minid ) {
1130 U32 set = 0;
1131 for ( ; zp < tp ; zp++ ) {
1132 if ( ! trie->trans[ zp ].next ) {
1133 base = trie->uniquecharcount + zp - minid;
1134 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1135 trie->trans[ zp ].check = state;
1136 set = 1;
1137 break;
1138 }
1139 }
1140 if ( !set ) {
1141 trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1142 trie->trans[ tp ].check = state;
1143 tp++;
1144 zp = tp;
1145 }
1146 } else {
1147 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1148 U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
1149 trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1150 trie->trans[ tid ].check = state;
1151 }
1152 tp += ( maxid - minid + 1 );
1153 }
1154 Safefree(trie->states[ state ].trans.list);
1155 }
1156 /*
1157 DEBUG_TRIE_COMPILE_MORE_r(
1158 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1159 );
1160 */
1161 trie->states[ state ].trans.base=base;
1162 }
1163 Renew( trie->trans, tp + 1, reg_trie_trans );
1164
1165 }
1166 } else {
1167 /*
1168 Second Pass -- Flat Table Representation.
1169
1170 we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1171 We know that we will need Charcount+1 trans at most to store the data
1172 (one row per char at worst case) So we preallocate both structures
1173 assuming worst case.
1174
1175 We then construct the trie using only the .next slots of the entry
1176 structs.
1177
1178 We use the .check field of the first entry of the node temporarily to
1179 make compression both faster and easier by keeping track of how many non
1180 zero fields are in the node.
1181
1182 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1183 transition.
1184
1185 There are two terms at use here: state as a TRIE_NODEIDX() which is a
1186 number representing the first entry of the node, and state as a
1187 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1188 TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1189 are 2 entrys per node. eg:
1190
1191 A B A B
1192 1. 2 4 1. 3 7
1193 2. 0 3 3. 0 5
1194 3. 0 0 5. 0 0
1195 4. 0 0 7. 0 0
1196
1197 The table is internally in the right hand, idx form. However as we also
1198 have to deal with the states array which is indexed by nodenum we have to
1199 use TRIE_NODENUM() to convert.
1200
1201 */
1202
1203 Newz( 848203, trie->trans, ( trie->charcount + 1 ) * trie->uniquecharcount + 1,
1204 reg_trie_trans );
1205 Newz( 848204, trie->states, trie->charcount + 2, reg_trie_state );
1206 next_alloc = trie->uniquecharcount + 1;
1207
1208 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1209
1210 regnode *noper = NEXTOPER( cur );
1211 U8 *uc = (U8*)STRING( noper );
1212 U8 *e = uc + STR_LEN( noper );
1213
1214 U32 state = 1; /* required init */
1215
1216 U16 charid = 0; /* sanity init */
1217 U32 accept_state = 0; /* sanity init */
1218 U8 *scan = (U8*)NULL; /* sanity init */
1219
1220 STRLEN foldlen = 0; /* required init */
1221 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1222
1223
1224 for ( ; uc < e ; uc += len ) {
1225
1226 TRIE_READ_CHAR;
1227
1228 if ( uvc < 256 ) {
1229 charid = trie->charmap[ uvc ];
1230 } else {
1231 SV** svpp=(SV**)NULL;
1232 svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 0);
1233 if ( !svpp ) {
1234 charid = 0;
1235 } else {
1236 charid=(U16)SvIV( *svpp );
1237 }
1238 }
1239 if ( charid ) {
1240 charid--;
1241 if ( !trie->trans[ state + charid ].next ) {
1242 trie->trans[ state + charid ].next = next_alloc;
1243 trie->trans[ state ].check++;
1244 next_alloc += trie->uniquecharcount;
1245 }
1246 state = trie->trans[ state + charid ].next;
1247 } else {
1248 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %d", uvc );
1249 }
1250 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1251 }
1252
1253 accept_state = TRIE_NODENUM( state );
1254 if ( !trie->states[ accept_state ].wordnum ) {
1255 /* we havent inserted this word into the structure yet. */
1256 trie->states[ accept_state ].wordnum = ++curword;
1257
1258 DEBUG_r({
1259 /* store the word for dumping */
1260 SV* tmp = newSVpvn( STRING( noper ), STR_LEN( noper ) );
1261 if ( UTF ) SvUTF8_on( tmp );
1262 av_push( trie->words, tmp );
1263 });
1264
1265 } else {
1266 /* Its a dupe. So ignore it. */
1267 }
1268
1269 } /* end second pass */
1270
1271 DEBUG_TRIE_COMPILE_MORE_r({
1272 /*
1273 print out the table precompression so that we can do a visual check
1274 that they are identical.
1275 */
1276 U32 state;
1277 U16 charid;
1278 PerlIO_printf( Perl_debug_log, "\nChar : " );
1279
1280 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1281 SV **tmp = av_fetch( trie->revcharmap, charid, 0);
1282 if ( tmp ) {
1283 PerlIO_printf( Perl_debug_log, "%4.4s ", SvPV_nolen( *tmp ) );
1284 }
1285 }
1286
1287 PerlIO_printf( Perl_debug_log, "\nState+-" );
1288
1289 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
1290 PerlIO_printf( Perl_debug_log, "%4s-", "----" );
1291 }
1292
1293 PerlIO_printf( Perl_debug_log, "\n" );
1294
1295 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1296
1297 PerlIO_printf( Perl_debug_log, "%04X : ", TRIE_NODENUM( state ) );
1298
1299 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1300 PerlIO_printf( Perl_debug_log, "%04X ",
1301 SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next ) );
1302 }
1303 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1304 PerlIO_printf( Perl_debug_log, " (%04X)\n", trie->trans[ state ].check );
1305 } else {
1306 PerlIO_printf( Perl_debug_log, " (%04X) W%04X\n", trie->trans[ state ].check,
1307 trie->states[ TRIE_NODENUM( state ) ].wordnum );
1308 }
1309 }
1310 PerlIO_printf( Perl_debug_log, "\n\n" );
1311 });
1312 {
1313 /*
1314 * Inplace compress the table.*
1315
1316 For sparse data sets the table constructed by the trie algorithm will
1317 be mostly 0/FAIL transitions or to put it another way mostly empty.
1318 (Note that leaf nodes will not contain any transitions.)
1319
1320 This algorithm compresses the tables by eliminating most such
1321 transitions, at the cost of a modest bit of extra work during lookup:
1322
1323 - Each states[] entry contains a .base field which indicates the
1324 index in the state[] array wheres its transition data is stored.
1325
1326 - If .base is 0 there are no valid transitions from that node.
1327
1328 - If .base is nonzero then charid is added to it to find an entry in
1329 the trans array.
1330
1331 -If trans[states[state].base+charid].check!=state then the
1332 transition is taken to be a 0/Fail transition. Thus if there are fail
1333 transitions at the front of the node then the .base offset will point
1334 somewhere inside the previous nodes data (or maybe even into a node
1335 even earlier), but the .check field determines if the transition is
1336 valid.
1337
1338 The following process inplace converts the table to the compressed
1339 table: We first do not compress the root node 1,and mark its all its
1340 .check pointers as 1 and set its .base pointer as 1 as well. This
1341 allows to do a DFA construction from the compressed table later, and
1342 ensures that any .base pointers we calculate later are greater than
1343 0.
1344
1345 - We set 'pos' to indicate the first entry of the second node.
1346
1347 - We then iterate over the columns of the node, finding the first and
1348 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
1349 and set the .check pointers accordingly, and advance pos
1350 appropriately and repreat for the next node. Note that when we copy
1351 the next pointers we have to convert them from the original
1352 NODEIDX form to NODENUM form as the former is not valid post
1353 compression.
1354
1355 - If a node has no transitions used we mark its base as 0 and do not
1356 advance the pos pointer.
1357
1358 - If a node only has one transition we use a second pointer into the
1359 structure to fill in allocated fail transitions from other states.
1360 This pointer is independent of the main pointer and scans forward
1361 looking for null transitions that are allocated to a state. When it
1362 finds one it writes the single transition into the "hole". If the
1363 pointer doesnt find one the single transition is appeneded as normal.
1364
1365 - Once compressed we can Renew/realloc the structures to release the
1366 excess space.
1367
1368 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
1369 specifically Fig 3.47 and the associated pseudocode.
1370
1371 demq
1372 */
1373 U32 laststate = TRIE_NODENUM( next_alloc );
1374 U32 used , state, charid;
1375 U32 pos = 0, zp=0;
1376 trie->laststate = laststate;
1377
1378 for ( state = 1 ; state < laststate ; state++ ) {
1379 U8 flag = 0;
1380 U32 stateidx = TRIE_NODEIDX( state );
1381 U32 o_used=trie->trans[ stateidx ].check;
1382 used = trie->trans[ stateidx ].check;
1383 trie->trans[ stateidx ].check = 0;
1384
1385 for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
1386 if ( flag || trie->trans[ stateidx + charid ].next ) {
1387 if ( trie->trans[ stateidx + charid ].next ) {
1388 if (o_used == 1) {
1389 for ( ; zp < pos ; zp++ ) {
1390 if ( ! trie->trans[ zp ].next ) {
1391 break;
1392 }
1393 }
1394 trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
1395 trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1396 trie->trans[ zp ].check = state;
1397 if ( ++zp > pos ) pos = zp;
1398 break;
1399 }
1400 used--;
1401 }
1402 if ( !flag ) {
1403 flag = 1;
1404 trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
1405 }
1406 trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1407 trie->trans[ pos ].check = state;
1408 pos++;
1409 }
1410 }
1411 }
1412 Renew( trie->trans, pos + 1, reg_trie_trans);
1413 Renew( trie->states, laststate + 1, reg_trie_state);
1414 DEBUG_TRIE_COMPILE_MORE_r(
1415 PerlIO_printf( Perl_debug_log, " Alloc: %d Orig: %d elements, Final:%d. Savings of %%%5.2f\n",
1416 ( ( trie->charcount + 1 ) * trie->uniquecharcount + 1 ), next_alloc, pos,
1417 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
1418 );
1419
1420 } /* end table compress */
1421 }
1422
1423 DEBUG_TRIE_COMPILE_r({
1424 U32 state;
1425 /*
1426 Now we print it out again, in a slightly different form as there is additional
1427 info we want to be able to see when its compressed. They are close enough for
1428 visual comparison though.
1429 */
1430 PerlIO_printf( Perl_debug_log, "\nChar : %-6s%-6s%-4s ","Match","Base","Ofs" );
1431
1432 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
1433 SV **tmp = av_fetch( trie->revcharmap, state, 0);
1434 if ( tmp ) {
1435 PerlIO_printf( Perl_debug_log, "%4.4s ", SvPV_nolen( *tmp ) );
1436 }
1437 }
1438 PerlIO_printf( Perl_debug_log, "\n-----:-----------------------");
1439 for( state = 0 ; state < trie->uniquecharcount ; state++ )
1440 PerlIO_printf( Perl_debug_log, "-----");
1441 PerlIO_printf( Perl_debug_log, "\n");
1442 for( state = 1 ; state < trie->laststate ; state++ ) {
1443 U32 base = trie->states[ state ].trans.base;
1444
1445 PerlIO_printf( Perl_debug_log, "#%04X ", state);
1446
1447 if ( trie->states[ state ].wordnum ) {
1448 PerlIO_printf( Perl_debug_log, " W%04X", trie->states[ state ].wordnum );
1449 } else {
1450 PerlIO_printf( Perl_debug_log, "%6s", "" );
1451 }
1452
1453 PerlIO_printf( Perl_debug_log, " @%04X ", base );
1454
1455 if ( base ) {
1456 U32 ofs = 0;
1457
1458 while( ( base + ofs - trie->uniquecharcount ) >=0 &&
1459 trie->trans[ base + ofs - trie->uniquecharcount ].check != state )
1460 ofs++;
1461
1462 PerlIO_printf( Perl_debug_log, "+%02X[ ", ofs);
1463
1464 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
1465 if ( ( base + ofs - trie->uniquecharcount>=0) &&
1466 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1467 {
1468 PerlIO_printf( Perl_debug_log, "%04X ",
1469 trie->trans[ base + ofs - trie->uniquecharcount ].next );
1470 } else {
1471 PerlIO_printf( Perl_debug_log, "%4s "," 0" );
1472 }
1473 }
1474
1475 PerlIO_printf( Perl_debug_log, "]", ofs);
1476
1477 }
1478 PerlIO_printf( Perl_debug_log, "\n" );
1479 }
1480 });
1481
1482 {
1483 /* now finally we "stitch in" the new TRIE node
1484 This means we convert either the first branch or the first Exact,
1485 depending on whether the thing following (in 'last') is a branch
1486 or not and whther first is the startbranch (ie is it a sub part of
1487 the alternation or is it the whole thing.)
1488 Assuming its a sub part we conver the EXACT otherwise we convert
1489 the whole branch sequence, including the first.
1490 */
1491 regnode *convert;
1492
1493
1494
1495
1496 if ( first == startbranch && OP( last ) != BRANCH ) {
1497 convert = first;
1498 } else {
1499 convert = NEXTOPER( first );
1500 NEXT_OFF( first ) = (U16)(last - first);
1501 }
1502
1503 OP( convert ) = TRIE + (U8)( flags - EXACT );
1504 NEXT_OFF( convert ) = (U16)(tail - convert);
1505 ARG_SET( convert, data_slot );
1506
1507 /* tells us if we need to handle accept buffers specially */
1508 convert->flags = ( RExC_seen_evals ? 1 : 0 );
1509
1510
1511 /* needed for dumping*/
1512 DEBUG_r({
1513 regnode *optimize = convert + NODE_STEP_REGNODE + regarglen[ TRIE ];
1514 /* We now need to mark all of the space originally used by the
1515 branches as optimized away. This keeps the dumpuntil from
1516 throwing a wobbly as it doesnt use regnext() to traverse the
1517 opcodes.
1518 */
1519 while( optimize < last ) {
1520 OP( optimize ) = OPTIMIZED;
1521 optimize++;
1522 }
1523 });
1524 } /* end node insert */
1525 return 1;
1526}
1527
1528
1529
1530/*
5d1c421c
JH
1531 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
1532 * These need to be revisited when a newer toolchain becomes available.
1533 */
1534#if defined(__sparc64__) && defined(__GNUC__)
1535# if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
1536# undef SPARC64_GCC_WORKAROUND
1537# define SPARC64_GCC_WORKAROUND 1
1538# endif
1539#endif
1540
653099ff
GS
1541/* REx optimizer. Converts nodes into quickier variants "in place".
1542 Finds fixed substrings. */
1543
c277df42
IZ
1544/* Stops at toplevel WHILEM as well as at `last'. At end *scanp is set
1545 to the position after last scanned or to NULL. */
1546
a3621e74 1547
76e3520e 1548STATIC I32
a3621e74 1549S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, regnode *last, scan_data_t *data, U32 flags, U32 depth)
c277df42
IZ
1550 /* scanp: Start here (read-write). */
1551 /* deltap: Write maxlen-minlen here. */
1552 /* last: Stop before this one. */
1553{
1554 I32 min = 0, pars = 0, code;
1555 regnode *scan = *scanp, *next;
1556 I32 delta = 0;
1557 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
aca2d497 1558 int is_inf_internal = 0; /* The studied chunk is infinite */
c277df42
IZ
1559 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
1560 scan_data_t data_fake;
653099ff 1561 struct regnode_charclass_class and_with; /* Valid if flags & SCF_DO_STCLASS_OR */
a3621e74
YO
1562 SV *re_trie_maxbuff = NULL;
1563
1564 GET_RE_DEBUG_FLAGS_DECL;
b81d288d 1565
c277df42
IZ
1566 while (scan && OP(scan) != END && scan < last) {
1567 /* Peephole optimizer: */
a3621e74
YO
1568 DEBUG_OPTIMISE_r({
1569 SV *mysv=sv_newmortal();
1570 regprop( mysv, scan);
1571 PerlIO_printf(Perl_debug_log, "%*speep: %s (0x%08X)\n",depth*2,"",SvPV_nolen(mysv),scan);
1572 });
c277df42 1573
22c35a8c 1574 if (PL_regkind[(U8)OP(scan)] == EXACT) {
653099ff 1575 /* Merge several consecutive EXACTish nodes into one. */
c277df42
IZ
1576 regnode *n = regnext(scan);
1577 U32 stringok = 1;
1578#ifdef DEBUGGING
1579 regnode *stop = scan;
b81d288d 1580#endif
c277df42 1581
cd439c50 1582 next = scan + NODE_SZ_STR(scan);
c277df42
IZ
1583 /* Skip NOTHING, merge EXACT*. */
1584 while (n &&
b81d288d 1585 ( PL_regkind[(U8)OP(n)] == NOTHING ||
c277df42
IZ
1586 (stringok && (OP(n) == OP(scan))))
1587 && NEXT_OFF(n)
1588 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
1589 if (OP(n) == TAIL || n > next)
1590 stringok = 0;
22c35a8c 1591 if (PL_regkind[(U8)OP(n)] == NOTHING) {
c277df42
IZ
1592 NEXT_OFF(scan) += NEXT_OFF(n);
1593 next = n + NODE_STEP_REGNODE;
1594#ifdef DEBUGGING
1595 if (stringok)
1596 stop = n;
b81d288d 1597#endif
c277df42 1598 n = regnext(n);
a0ed51b3 1599 }
f49d4d0f 1600 else if (stringok) {
cd439c50 1601 int oldl = STR_LEN(scan);
c277df42 1602 regnode *nnext = regnext(n);
f49d4d0f 1603
b81d288d 1604 if (oldl + STR_LEN(n) > U8_MAX)
c277df42
IZ
1605 break;
1606 NEXT_OFF(scan) += NEXT_OFF(n);
cd439c50
IZ
1607 STR_LEN(scan) += STR_LEN(n);
1608 next = n + NODE_SZ_STR(n);
c277df42 1609 /* Now we can overwrite *n : */
f49d4d0f 1610 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
c277df42 1611#ifdef DEBUGGING
f49d4d0f 1612 stop = next - 1;
b81d288d 1613#endif
c277df42
IZ
1614 n = nnext;
1615 }
1616 }
61a36c01 1617
a3621e74 1618 if (UTF && ( OP(scan) == EXACTF ) && ( STR_LEN(scan) >= 6 ) ) {
61a36c01
JH
1619/*
1620 Two problematic code points in Unicode casefolding of EXACT nodes:
1621
1622 U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
1623 U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
1624
1625 which casefold to
1626
1627 Unicode UTF-8
1628
1629 U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
1630 U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
1631
1632 This means that in case-insensitive matching (or "loose matching",
1633 as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
1634 length of the above casefolded versions) can match a target string
1635 of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
1636 This would rather mess up the minimum length computation.
1637
1638 What we'll do is to look for the tail four bytes, and then peek
1639 at the preceding two bytes to see whether we need to decrease
1640 the minimum length by four (six minus two).
1641
1642 Thanks to the design of UTF-8, there cannot be false matches:
1643 A sequence of valid UTF-8 bytes cannot be a subsequence of
1644 another valid sequence of UTF-8 bytes.
1645
1646*/
1647 char *s0 = STRING(scan), *s, *t;
1648 char *s1 = s0 + STR_LEN(scan) - 1, *s2 = s1 - 4;
1649 char *t0 = "\xcc\x88\xcc\x81";
1650 char *t1 = t0 + 3;
1651
1652 for (s = s0 + 2;
1653 s < s2 && (t = ninstr(s, s1, t0, t1));
1654 s = t + 4) {
1655 if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
1656 ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
1657 min -= 4;
1658 }
1659 }
1660
c277df42
IZ
1661#ifdef DEBUGGING
1662 /* Allow dumping */
cd439c50 1663 n = scan + NODE_SZ_STR(scan);
c277df42 1664 while (n <= stop) {
22c35a8c 1665 if (PL_regkind[(U8)OP(n)] != NOTHING || OP(n) == NOTHING) {
c277df42
IZ
1666 OP(n) = OPTIMIZED;
1667 NEXT_OFF(n) = 0;
1668 }
1669 n++;
1670 }
653099ff 1671#endif
c277df42 1672 }
a3621e74
YO
1673
1674
1675
653099ff
GS
1676 /* Follow the next-chain of the current node and optimize
1677 away all the NOTHINGs from it. */
c277df42 1678 if (OP(scan) != CURLYX) {
048cfca1
GS
1679 int max = (reg_off_by_arg[OP(scan)]
1680 ? I32_MAX
1681 /* I32 may be smaller than U16 on CRAYs! */
1682 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
c277df42
IZ
1683 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
1684 int noff;
1685 regnode *n = scan;
b81d288d 1686
c277df42
IZ
1687 /* Skip NOTHING and LONGJMP. */
1688 while ((n = regnext(n))
22c35a8c 1689 && ((PL_regkind[(U8)OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
c277df42
IZ
1690 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
1691 && off + noff < max)
1692 off += noff;
1693 if (reg_off_by_arg[OP(scan)])
1694 ARG(scan) = off;
b81d288d 1695 else
c277df42
IZ
1696 NEXT_OFF(scan) = off;
1697 }
a3621e74 1698
653099ff
GS
1699 /* The principal pseudo-switch. Cannot be a switch, since we
1700 look into several different things. */
b81d288d 1701 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
c277df42
IZ
1702 || OP(scan) == IFTHEN || OP(scan) == SUSPEND) {
1703 next = regnext(scan);
1704 code = OP(scan);
a3621e74 1705 /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
b81d288d
AB
1706
1707 if (OP(next) == code || code == IFTHEN || code == SUSPEND) {
c277df42 1708 I32 max1 = 0, min1 = I32_MAX, num = 0;
653099ff 1709 struct regnode_charclass_class accum;
a3621e74 1710 regnode *startbranch=scan;
c277df42 1711
653099ff 1712 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
830247a4 1713 scan_commit(pRExC_state, data); /* Cannot merge strings after this. */
653099ff 1714 if (flags & SCF_DO_STCLASS)
830247a4 1715 cl_init_zero(pRExC_state, &accum);
a3621e74 1716
c277df42 1717 while (OP(scan) == code) {
830247a4 1718 I32 deltanext, minnext, f = 0, fake;
653099ff 1719 struct regnode_charclass_class this_class;
c277df42
IZ
1720
1721 num++;
1722 data_fake.flags = 0;
b81d288d 1723 if (data) {
2c2d71f5 1724 data_fake.whilem_c = data->whilem_c;
cb434fcc
IZ
1725 data_fake.last_closep = data->last_closep;
1726 }
1727 else
1728 data_fake.last_closep = &fake;
c277df42
IZ
1729 next = regnext(scan);
1730 scan = NEXTOPER(scan);
1731 if (code != BRANCH)
1732 scan = NEXTOPER(scan);
653099ff 1733 if (flags & SCF_DO_STCLASS) {
830247a4 1734 cl_init(pRExC_state, &this_class);
653099ff
GS
1735 data_fake.start_class = &this_class;
1736 f = SCF_DO_STCLASS_AND;
b81d288d 1737 }
e1901655
IZ
1738 if (flags & SCF_WHILEM_VISITED_POS)
1739 f |= SCF_WHILEM_VISITED_POS;
a3621e74 1740
653099ff 1741 /* we suppose the run is continuous, last=next...*/
830247a4 1742 minnext = study_chunk(pRExC_state, &scan, &deltanext,
a3621e74 1743 next, &data_fake, f,depth+1);
b81d288d 1744 if (min1 > minnext)
c277df42
IZ
1745 min1 = minnext;
1746 if (max1 < minnext + deltanext)
1747 max1 = minnext + deltanext;
1748 if (deltanext == I32_MAX)
aca2d497 1749 is_inf = is_inf_internal = 1;
c277df42
IZ
1750 scan = next;
1751 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
1752 pars++;
405ff068 1753 if (data && (data_fake.flags & SF_HAS_EVAL))
c277df42 1754 data->flags |= SF_HAS_EVAL;
2c2d71f5
JH
1755 if (data)
1756 data->whilem_c = data_fake.whilem_c;
653099ff 1757 if (flags & SCF_DO_STCLASS)
830247a4 1758 cl_or(pRExC_state, &accum, &this_class);
b81d288d 1759 if (code == SUSPEND)
c277df42
IZ
1760 break;
1761 }
1762 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
1763 min1 = 0;
1764 if (flags & SCF_DO_SUBSTR) {
1765 data->pos_min += min1;
1766 data->pos_delta += max1 - min1;
1767 if (max1 != min1 || is_inf)
1768 data->longest = &(data->longest_float);
1769 }
1770 min += min1;
1771 delta += max1 - min1;
653099ff 1772 if (flags & SCF_DO_STCLASS_OR) {
830247a4 1773 cl_or(pRExC_state, data->start_class, &accum);
653099ff
GS
1774 if (min1) {
1775 cl_and(data->start_class, &and_with);
1776 flags &= ~SCF_DO_STCLASS;
1777 }
1778 }
1779 else if (flags & SCF_DO_STCLASS_AND) {
de0c8cb8
GS
1780 if (min1) {
1781 cl_and(data->start_class, &accum);
653099ff 1782 flags &= ~SCF_DO_STCLASS;
de0c8cb8
GS
1783 }
1784 else {
b81d288d 1785 /* Switch to OR mode: cache the old value of
de0c8cb8
GS
1786 * data->start_class */
1787 StructCopy(data->start_class, &and_with,
1788 struct regnode_charclass_class);
1789 flags &= ~SCF_DO_STCLASS_AND;
1790 StructCopy(&accum, data->start_class,
1791 struct regnode_charclass_class);
1792 flags |= SCF_DO_STCLASS_OR;
1793 data->start_class->flags |= ANYOF_EOS;
1794 }
653099ff 1795 }
a3621e74
YO
1796
1797 /* demq.
1798
1799 Assuming this was/is a branch we are dealing with: 'scan' now
1800 points at the item that follows the branch sequence, whatever
1801 it is. We now start at the beginning of the sequence and look
1802 for subsequences of
1803
1804 BRANCH->EXACT=>X
1805 BRANCH->EXACT=>X
1806
1807 which would be constructed from a pattern like /A|LIST|OF|WORDS/
1808
1809 If we can find such a subseqence we need to turn the first
1810 element into a trie and then add the subsequent branch exact
1811 strings to the trie.
1812
1813 We have two cases
1814
1815 1. patterns where the whole set of branch can be converted to a trie,
1816
1817 2. patterns where only a subset of the alternations can be
1818 converted to a trie.
1819
1820 In case 1 we can replace the whole set with a single regop
1821 for the trie. In case 2 we need to keep the start and end
1822 branchs so
1823
1824 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
1825 becomes BRANCH TRIE; BRANCH X;
1826
1827 Hypthetically when we know the regex isnt anchored we can
1828 turn a case 1 into a DFA and let it rip... Every time it finds a match
1829 it would just call its tail, no WHILEM/CURLY needed.
1830
1831 */
1832 if (DO_TRIE) {
1833 if (!re_trie_maxbuff) {
1834 re_trie_maxbuff=get_sv(RE_TRIE_MAXBUFF, 1);
1835 if (!SvIOK(re_trie_maxbuff))
1836 sv_setiv(re_trie_maxbuff, TRIE_SIMPLE_MAX_BUFF);
1837
1838 }
1839 if ( SvIV(re_trie_maxbuff)>=0 && OP( startbranch )==BRANCH ) {
1840 regnode *cur;
1841 regnode *first = (regnode *)NULL;
1842 regnode *last = (regnode *)NULL;
1843 regnode *tail = scan;
1844 U8 optype = 0;
1845 U32 count=0;
1846
1847#ifdef DEBUGGING
1848 SV *mysv = sv_newmortal(); /* for dumping */
1849#endif
1850 /* var tail is used because there may be a TAIL
1851 regop in the way. Ie, the exacts will point to the
1852 thing following the TAIL, but the last branch will
1853 point at the TAIL. So we advance tail. If we
1854 have nested (?:) we may have to move through several
1855 tails.
1856 */
1857
1858 while ( OP( tail ) == TAIL ) {
1859 /* this is the TAIL generated by (?:) */
1860 tail = regnext( tail );
1861 }
1862
1863 DEBUG_OPTIMISE_r({
1864 regprop( mysv, tail );
1865 PerlIO_printf( Perl_debug_log, "%*s%s%s%s\n",
1866 depth * 2 + 2, "", "Tail node is:", SvPV_nolen( mysv ),
1867 (RExC_seen_evals) ? "[EVAL]" : ""
1868 );
1869 });
1870 /*
1871
1872 step through the branches, cur represents each
1873 branch, noper is the first thing to be matched
1874 as part of that branch and noper_next is the
1875 regnext() of that node. if noper is an EXACT
1876 and noper_next is the same as scan (our current
1877 position in the regex) then the EXACT branch is
1878 a possible optimization target. Once we have
1879 two or more consequetive such branches we can
1880 create a trie of the EXACT's contents and stich
1881 it in place. If the sequence represents all of
1882 the branches we eliminate the whole thing and
1883 replace it with a single TRIE. If it is a
1884 subsequence then we need to stitch it in. This
1885 means the first branch has to remain, and needs
1886 to be repointed at the item on the branch chain
1887 following the last branch optimized. This could
1888 be either a BRANCH, in which case the
1889 subsequence is internal, or it could be the
1890 item following the branch sequence in which
1891 case the subsequence is at the end.
1892
1893 */
1894
1895 /* dont use tail as the end marker for this traverse */
1896 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
1897 regnode *noper = NEXTOPER( cur );
1898 regnode *noper_next = regnext( noper );
1899
1900
1901 DEBUG_OPTIMISE_r({
1902 regprop( mysv, cur);
1903 PerlIO_printf( Perl_debug_log, "%*s%s",
1904 depth * 2 + 2," ", SvPV_nolen( mysv ) );
1905
1906 regprop( mysv, noper);
1907 PerlIO_printf( Perl_debug_log, " -> %s",
1908 SvPV_nolen(mysv));
1909
1910 if ( noper_next ) {
1911 regprop( mysv, noper_next );
1912 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
1913 SvPV_nolen(mysv));
1914 }
1915 PerlIO_printf( Perl_debug_log, "0x%p,0x%p,0x%p)\n",
1916 first, last, cur );
1917 });
1918 if ( ( first ? OP( noper ) == optype
1919 : PL_regkind[ (U8)OP( noper ) ] == EXACT )
1920 && noper_next == tail && count<U16_MAX)
1921 {
1922 count++;
1923 if ( !first ) {
1924 first = cur;
1925 optype = OP( noper );
1926 } else {
1927 DEBUG_OPTIMISE_r(
1928 if (!last ) {
1929 regprop( mysv, first);
1930 PerlIO_printf( Perl_debug_log, "%*s%s",
1931 depth * 2 + 2, "F:", SvPV_nolen( mysv ) );
1932 regprop( mysv, NEXTOPER(first) );
1933 PerlIO_printf( Perl_debug_log, " -> %s\n",
1934 SvPV_nolen( mysv ) );
1935 }
1936 );
1937 last = cur;
1938 DEBUG_OPTIMISE_r({
1939 regprop( mysv, cur);
1940 PerlIO_printf( Perl_debug_log, "%*s%s",
1941 depth * 2 + 2, "N:", SvPV_nolen( mysv ) );
1942 regprop( mysv, noper );
1943 PerlIO_printf( Perl_debug_log, " -> %s\n",
1944 SvPV_nolen( mysv ) );
1945 });
1946 }
1947 } else {
1948 if ( last ) {
1949 DEBUG_OPTIMISE_r(
1950 PerlIO_printf( Perl_debug_log, "%*s%s\n",
1951 depth * 2 + 2, "E:", "**END**" );
1952 );
1953 make_trie( pRExC_state, startbranch, first, cur, tail, optype );
1954 }
1955 if ( PL_regkind[ (U8)OP( noper ) ] == EXACT
1956 && noper_next == tail )
1957 {
1958 count = 1;
1959 first = cur;
1960 optype = OP( noper );
1961 } else {
1962 count = 0;
1963 first = NULL;
1964 optype = 0;
1965 }
1966 last = NULL;
1967 }
1968 }
1969 DEBUG_OPTIMISE_r({
1970 regprop( mysv, cur);
1971 PerlIO_printf( Perl_debug_log,
1972 "%*s%s\t(0x%p,0x%p,0x%p)\n", depth * 2 + 2,
1973 " ", SvPV_nolen( mysv ), first, last, cur);
1974
1975 });
1976 if ( last ) {
1977 DEBUG_OPTIMISE_r(
1978 PerlIO_printf( Perl_debug_log, "%*s%s\n",
1979 depth * 2 + 2, "E:", "==END==" );
1980 );
1981 make_trie( pRExC_state, startbranch, first, scan, tail, optype );
1982 }
1983 }
1984 }
a0ed51b3 1985 }
a3621e74 1986 else if ( code == BRANCHJ ) { /* single branch is optimized. */
c277df42 1987 scan = NEXTOPER(NEXTOPER(scan));
a3621e74 1988 } else /* single branch is optimized. */
c277df42
IZ
1989 scan = NEXTOPER(scan);
1990 continue;
a0ed51b3
LW
1991 }
1992 else if (OP(scan) == EXACT) {
cd439c50 1993 I32 l = STR_LEN(scan);
1aa99e6b 1994 UV uc = *((U8*)STRING(scan));
a0ed51b3 1995 if (UTF) {
1aa99e6b
IH
1996 U8 *s = (U8*)STRING(scan);
1997 l = utf8_length(s, s + l);
9041c2e3 1998 uc = utf8_to_uvchr(s, NULL);
a0ed51b3
LW
1999 }
2000 min += l;
c277df42 2001 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
c277df42
IZ
2002 /* The code below prefers earlier match for fixed
2003 offset, later match for variable offset. */
2004 if (data->last_end == -1) { /* Update the start info. */
2005 data->last_start_min = data->pos_min;
2006 data->last_start_max = is_inf
b81d288d 2007 ? I32_MAX : data->pos_min + data->pos_delta;
c277df42 2008 }
cd439c50 2009 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
0eda9292
JH
2010 {
2011 SV * sv = data->last_found;
2012 MAGIC *mg = SvUTF8(sv) && SvMAGICAL(sv) ?
2013 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2014 if (mg && mg->mg_len >= 0)
5e43f467
JH
2015 mg->mg_len += utf8_length((U8*)STRING(scan),
2016 (U8*)STRING(scan)+STR_LEN(scan));
0eda9292 2017 }
33b8afdf
JH
2018 if (UTF)
2019 SvUTF8_on(data->last_found);
c277df42
IZ
2020 data->last_end = data->pos_min + l;
2021 data->pos_min += l; /* As in the first entry. */
2022 data->flags &= ~SF_BEFORE_EOL;
2023 }
653099ff
GS
2024 if (flags & SCF_DO_STCLASS_AND) {
2025 /* Check whether it is compatible with what we know already! */
2026 int compat = 1;
2027
1aa99e6b 2028 if (uc >= 0x100 ||
516a5887 2029 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
1aa99e6b 2030 && !ANYOF_BITMAP_TEST(data->start_class, uc)
653099ff 2031 && (!(data->start_class->flags & ANYOF_FOLD)
1aa99e6b 2032 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
516a5887 2033 )
653099ff
GS
2034 compat = 0;
2035 ANYOF_CLASS_ZERO(data->start_class);
2036 ANYOF_BITMAP_ZERO(data->start_class);
2037 if (compat)
1aa99e6b 2038 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff 2039 data->start_class->flags &= ~ANYOF_EOS;
9b877dbb
IH
2040 if (uc < 0x100)
2041 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
653099ff
GS
2042 }
2043 else if (flags & SCF_DO_STCLASS_OR) {
2044 /* false positive possible if the class is case-folded */
1aa99e6b 2045 if (uc < 0x100)
9b877dbb
IH
2046 ANYOF_BITMAP_SET(data->start_class, uc);
2047 else
2048 data->start_class->flags |= ANYOF_UNICODE_ALL;
653099ff
GS
2049 data->start_class->flags &= ~ANYOF_EOS;
2050 cl_and(data->start_class, &and_with);
2051 }
2052 flags &= ~SCF_DO_STCLASS;
a0ed51b3 2053 }
653099ff 2054 else if (PL_regkind[(U8)OP(scan)] == EXACT) { /* But OP != EXACT! */
cd439c50 2055 I32 l = STR_LEN(scan);
1aa99e6b 2056 UV uc = *((U8*)STRING(scan));
653099ff
GS
2057
2058 /* Search for fixed substrings supports EXACT only. */
b81d288d 2059 if (flags & SCF_DO_SUBSTR)
830247a4 2060 scan_commit(pRExC_state, data);
a0ed51b3 2061 if (UTF) {
1aa99e6b
IH
2062 U8 *s = (U8 *)STRING(scan);
2063 l = utf8_length(s, s + l);
9041c2e3 2064 uc = utf8_to_uvchr(s, NULL);
a0ed51b3
LW
2065 }
2066 min += l;
c277df42 2067 if (data && (flags & SCF_DO_SUBSTR))
a0ed51b3 2068 data->pos_min += l;
653099ff
GS
2069 if (flags & SCF_DO_STCLASS_AND) {
2070 /* Check whether it is compatible with what we know already! */
2071 int compat = 1;
2072
1aa99e6b 2073 if (uc >= 0x100 ||
516a5887 2074 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
1aa99e6b 2075 && !ANYOF_BITMAP_TEST(data->start_class, uc)
516a5887 2076 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
653099ff
GS
2077 compat = 0;
2078 ANYOF_CLASS_ZERO(data->start_class);
2079 ANYOF_BITMAP_ZERO(data->start_class);
2080 if (compat) {
1aa99e6b 2081 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff
GS
2082 data->start_class->flags &= ~ANYOF_EOS;
2083 data->start_class->flags |= ANYOF_FOLD;
2084 if (OP(scan) == EXACTFL)
2085 data->start_class->flags |= ANYOF_LOCALE;
2086 }
2087 }
2088 else if (flags & SCF_DO_STCLASS_OR) {
2089 if (data->start_class->flags & ANYOF_FOLD) {
2090 /* false positive possible if the class is case-folded.
2091 Assume that the locale settings are the same... */
1aa99e6b
IH
2092 if (uc < 0x100)
2093 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff
GS
2094 data->start_class->flags &= ~ANYOF_EOS;
2095 }
2096 cl_and(data->start_class, &and_with);
2097 }
2098 flags &= ~SCF_DO_STCLASS;
a0ed51b3 2099 }
4d61ec05 2100 else if (strchr((char*)PL_varies,OP(scan))) {
9c5ffd7c 2101 I32 mincount, maxcount, minnext, deltanext, fl = 0;
aa7a4b56 2102 I32 f = flags, pos_before = 0;
c277df42 2103 regnode *oscan = scan;
653099ff
GS
2104 struct regnode_charclass_class this_class;
2105 struct regnode_charclass_class *oclass = NULL;
727f22e3 2106 I32 next_is_eval = 0;
653099ff 2107
22c35a8c 2108 switch (PL_regkind[(U8)OP(scan)]) {
653099ff 2109 case WHILEM: /* End of (?:...)* . */
c277df42
IZ
2110 scan = NEXTOPER(scan);
2111 goto finish;
2112 case PLUS:
653099ff 2113 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
c277df42 2114 next = NEXTOPER(scan);
653099ff 2115 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
b81d288d
AB
2116 mincount = 1;
2117 maxcount = REG_INFTY;
c277df42
IZ
2118 next = regnext(scan);
2119 scan = NEXTOPER(scan);
2120 goto do_curly;
2121 }
2122 }
2123 if (flags & SCF_DO_SUBSTR)
2124 data->pos_min++;
2125 min++;
2126 /* Fall through. */
2127 case STAR:
653099ff
GS
2128 if (flags & SCF_DO_STCLASS) {
2129 mincount = 0;
b81d288d 2130 maxcount = REG_INFTY;
653099ff
GS
2131 next = regnext(scan);
2132 scan = NEXTOPER(scan);
2133 goto do_curly;
2134 }
b81d288d 2135 is_inf = is_inf_internal = 1;
c277df42
IZ
2136 scan = regnext(scan);
2137 if (flags & SCF_DO_SUBSTR) {
830247a4 2138 scan_commit(pRExC_state, data); /* Cannot extend fixed substrings */
c277df42
IZ
2139 data->longest = &(data->longest_float);
2140 }
2141 goto optimize_curly_tail;
2142 case CURLY:
b81d288d 2143 mincount = ARG1(scan);
c277df42
IZ
2144 maxcount = ARG2(scan);
2145 next = regnext(scan);
cb434fcc
IZ
2146 if (OP(scan) == CURLYX) {
2147 I32 lp = (data ? *(data->last_closep) : 0);
a3621e74 2148 scan->flags = ((lp <= U8_MAX) ? (U8)lp : U8_MAX);
cb434fcc 2149 }
c277df42 2150 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
727f22e3 2151 next_is_eval = (OP(scan) == EVAL);
c277df42
IZ
2152 do_curly:
2153 if (flags & SCF_DO_SUBSTR) {
830247a4 2154 if (mincount == 0) scan_commit(pRExC_state,data); /* Cannot extend fixed substrings */
c277df42
IZ
2155 pos_before = data->pos_min;
2156 }
2157 if (data) {
2158 fl = data->flags;
2159 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
2160 if (is_inf)
2161 data->flags |= SF_IS_INF;
2162 }
653099ff 2163 if (flags & SCF_DO_STCLASS) {
830247a4 2164 cl_init(pRExC_state, &this_class);
653099ff
GS
2165 oclass = data->start_class;
2166 data->start_class = &this_class;
2167 f |= SCF_DO_STCLASS_AND;
2168 f &= ~SCF_DO_STCLASS_OR;
2169 }
e1901655
IZ
2170 /* These are the cases when once a subexpression
2171 fails at a particular position, it cannot succeed
2172 even after backtracking at the enclosing scope.
b81d288d 2173
e1901655
IZ
2174 XXXX what if minimal match and we are at the
2175 initial run of {n,m}? */
2176 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
2177 f &= ~SCF_WHILEM_VISITED_POS;
653099ff 2178
c277df42 2179 /* This will finish on WHILEM, setting scan, or on NULL: */
b81d288d 2180 minnext = study_chunk(pRExC_state, &scan, &deltanext, last, data,
a3621e74
YO
2181 (mincount == 0
2182 ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
653099ff
GS
2183
2184 if (flags & SCF_DO_STCLASS)
2185 data->start_class = oclass;
2186 if (mincount == 0 || minnext == 0) {
2187 if (flags & SCF_DO_STCLASS_OR) {
830247a4 2188 cl_or(pRExC_state, data->start_class, &this_class);
653099ff
GS
2189 }
2190 else if (flags & SCF_DO_STCLASS_AND) {
b81d288d 2191 /* Switch to OR mode: cache the old value of
653099ff
GS
2192 * data->start_class */
2193 StructCopy(data->start_class, &and_with,
2194 struct regnode_charclass_class);
2195 flags &= ~SCF_DO_STCLASS_AND;
2196 StructCopy(&this_class, data->start_class,
2197 struct regnode_charclass_class);
2198 flags |= SCF_DO_STCLASS_OR;
2199 data->start_class->flags |= ANYOF_EOS;
2200 }
2201 } else { /* Non-zero len */
2202 if (flags & SCF_DO_STCLASS_OR) {
830247a4 2203 cl_or(pRExC_state, data->start_class, &this_class);
653099ff
GS
2204 cl_and(data->start_class, &and_with);
2205 }
2206 else if (flags & SCF_DO_STCLASS_AND)
2207 cl_and(data->start_class, &this_class);
2208 flags &= ~SCF_DO_STCLASS;
2209 }
c277df42
IZ
2210 if (!scan) /* It was not CURLYX, but CURLY. */
2211 scan = next;
84037bb0 2212 if (ckWARN(WARN_REGEXP)
727f22e3
JP
2213 /* ? quantifier ok, except for (?{ ... }) */
2214 && (next_is_eval || !(mincount == 0 && maxcount == 1))
84037bb0 2215 && (minnext == 0) && (deltanext == 0)
99799961 2216 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
17feb5d5 2217 && maxcount <= REG_INFTY/3) /* Complement check for big count */
b45f050a 2218 {
830247a4 2219 vWARN(RExC_parse,
b45f050a
JF
2220 "Quantifier unexpected on zero-length expression");
2221 }
2222
c277df42 2223 min += minnext * mincount;
b81d288d 2224 is_inf_internal |= ((maxcount == REG_INFTY
155aba94
GS
2225 && (minnext + deltanext) > 0)
2226 || deltanext == I32_MAX);
aca2d497 2227 is_inf |= is_inf_internal;
c277df42
IZ
2228 delta += (minnext + deltanext) * maxcount - minnext * mincount;
2229
2230 /* Try powerful optimization CURLYX => CURLYN. */
b81d288d 2231 if ( OP(oscan) == CURLYX && data
c277df42
IZ
2232 && data->flags & SF_IN_PAR
2233 && !(data->flags & SF_HAS_EVAL)
2234 && !deltanext && minnext == 1 ) {
2235 /* Try to optimize to CURLYN. */
2236 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
497b47a8
JH
2237 regnode *nxt1 = nxt;
2238#ifdef DEBUGGING
2239 regnode *nxt2;
2240#endif
c277df42
IZ
2241
2242 /* Skip open. */
2243 nxt = regnext(nxt);
4d61ec05 2244 if (!strchr((char*)PL_simple,OP(nxt))
22c35a8c 2245 && !(PL_regkind[(U8)OP(nxt)] == EXACT
b81d288d 2246 && STR_LEN(nxt) == 1))
c277df42 2247 goto nogo;
497b47a8 2248#ifdef DEBUGGING
c277df42 2249 nxt2 = nxt;
497b47a8 2250#endif
c277df42 2251 nxt = regnext(nxt);
b81d288d 2252 if (OP(nxt) != CLOSE)
c277df42
IZ
2253 goto nogo;
2254 /* Now we know that nxt2 is the only contents: */
eb160463 2255 oscan->flags = (U8)ARG(nxt);
c277df42
IZ
2256 OP(oscan) = CURLYN;
2257 OP(nxt1) = NOTHING; /* was OPEN. */
2258#ifdef DEBUGGING
2259 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
2260 NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
2261 NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
2262 OP(nxt) = OPTIMIZED; /* was CLOSE. */
2263 OP(nxt + 1) = OPTIMIZED; /* was count. */
2264 NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
b81d288d 2265#endif
c277df42 2266 }
c277df42
IZ
2267 nogo:
2268
2269 /* Try optimization CURLYX => CURLYM. */
b81d288d 2270 if ( OP(oscan) == CURLYX && data
c277df42 2271 && !(data->flags & SF_HAS_PAR)
c277df42 2272 && !(data->flags & SF_HAS_EVAL)
0e788c72
HS
2273 && !deltanext /* atom is fixed width */
2274 && minnext != 0 /* CURLYM can't handle zero width */
2275 ) {
c277df42
IZ
2276 /* XXXX How to optimize if data == 0? */
2277 /* Optimize to a simpler form. */
2278 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
2279 regnode *nxt2;
2280
2281 OP(oscan) = CURLYM;
2282 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
b81d288d 2283 && (OP(nxt2) != WHILEM))
c277df42
IZ
2284 nxt = nxt2;
2285 OP(nxt2) = SUCCEED; /* Whas WHILEM */
c277df42
IZ
2286 /* Need to optimize away parenths. */
2287 if (data->flags & SF_IN_PAR) {
2288 /* Set the parenth number. */
2289 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
2290
b81d288d 2291 if (OP(nxt) != CLOSE)
b45f050a 2292 FAIL("Panic opt close");
eb160463 2293 oscan->flags = (U8)ARG(nxt);
c277df42
IZ
2294 OP(nxt1) = OPTIMIZED; /* was OPEN. */
2295 OP(nxt) = OPTIMIZED; /* was CLOSE. */
2296#ifdef DEBUGGING
2297 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
2298 OP(nxt + 1) = OPTIMIZED; /* was count. */
2299 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
2300 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
b81d288d 2301#endif
c277df42
IZ
2302#if 0
2303 while ( nxt1 && (OP(nxt1) != WHILEM)) {
2304 regnode *nnxt = regnext(nxt1);
b81d288d 2305
c277df42
IZ
2306 if (nnxt == nxt) {
2307 if (reg_off_by_arg[OP(nxt1)])
2308 ARG_SET(nxt1, nxt2 - nxt1);
2309 else if (nxt2 - nxt1 < U16_MAX)
2310 NEXT_OFF(nxt1) = nxt2 - nxt1;
2311 else
2312 OP(nxt) = NOTHING; /* Cannot beautify */
2313 }
2314 nxt1 = nnxt;
2315 }
2316#endif
2317 /* Optimize again: */
b81d288d 2318 study_chunk(pRExC_state, &nxt1, &deltanext, nxt,
a3621e74 2319 NULL, 0,depth+1);
a0ed51b3
LW
2320 }
2321 else
c277df42 2322 oscan->flags = 0;
c277df42 2323 }
e1901655
IZ
2324 else if ((OP(oscan) == CURLYX)
2325 && (flags & SCF_WHILEM_VISITED_POS)
2326 /* See the comment on a similar expression above.
2327 However, this time it not a subexpression
2328 we care about, but the expression itself. */
2329 && (maxcount == REG_INFTY)
2330 && data && ++data->whilem_c < 16) {
2331 /* This stays as CURLYX, we can put the count/of pair. */
2c2d71f5
JH
2332 /* Find WHILEM (as in regexec.c) */
2333 regnode *nxt = oscan + NEXT_OFF(oscan);
2334
2335 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
2336 nxt += ARG(nxt);
eb160463
GS
2337 PREVOPER(nxt)->flags = (U8)(data->whilem_c
2338 | (RExC_whilem_seen << 4)); /* On WHILEM */
2c2d71f5 2339 }
b81d288d 2340 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
c277df42
IZ
2341 pars++;
2342 if (flags & SCF_DO_SUBSTR) {
2343 SV *last_str = Nullsv;
2344 int counted = mincount != 0;
2345
2346 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
5d1c421c
JH
2347#if defined(SPARC64_GCC_WORKAROUND)
2348 I32 b = 0;
2349 STRLEN l = 0;
2350 char *s = NULL;
2351 I32 old = 0;
2352
2353 if (pos_before >= data->last_start_min)
2354 b = pos_before;
2355 else
2356 b = data->last_start_min;
2357
2358 l = 0;
2359 s = SvPV(data->last_found, l);
2360 old = b - data->last_start_min;
2361
2362#else
b81d288d 2363 I32 b = pos_before >= data->last_start_min
c277df42
IZ
2364 ? pos_before : data->last_start_min;
2365 STRLEN l;
2366 char *s = SvPV(data->last_found, l);
a0ed51b3 2367 I32 old = b - data->last_start_min;
5d1c421c 2368#endif
a0ed51b3
LW
2369
2370 if (UTF)
2371 old = utf8_hop((U8*)s, old) - (U8*)s;
c277df42 2372
a0ed51b3 2373 l -= old;
c277df42 2374 /* Get the added string: */
79cb57f6 2375 last_str = newSVpvn(s + old, l);
0e933229
IH
2376 if (UTF)
2377 SvUTF8_on(last_str);
c277df42
IZ
2378 if (deltanext == 0 && pos_before == b) {
2379 /* What was added is a constant string */
2380 if (mincount > 1) {
2381 SvGROW(last_str, (mincount * l) + 1);
b81d288d 2382 repeatcpy(SvPVX(last_str) + l,
c277df42
IZ
2383 SvPVX(last_str), l, mincount - 1);
2384 SvCUR(last_str) *= mincount;
2385 /* Add additional parts. */
b81d288d 2386 SvCUR_set(data->last_found,
c277df42
IZ
2387 SvCUR(data->last_found) - l);
2388 sv_catsv(data->last_found, last_str);
0eda9292
JH
2389 {
2390 SV * sv = data->last_found;
2391 MAGIC *mg =
2392 SvUTF8(sv) && SvMAGICAL(sv) ?
2393 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2394 if (mg && mg->mg_len >= 0)
2395 mg->mg_len += CHR_SVLEN(last_str);
2396 }
c277df42
IZ
2397 data->last_end += l * (mincount - 1);
2398 }
2a8d9689
HS
2399 } else {
2400 /* start offset must point into the last copy */
2401 data->last_start_min += minnext * (mincount - 1);
c152dc43
HS
2402 data->last_start_max += is_inf ? I32_MAX
2403 : (maxcount - 1) * (minnext + data->pos_delta);
c277df42
IZ
2404 }
2405 }
2406 /* It is counted once already... */
2407 data->pos_min += minnext * (mincount - counted);
2408 data->pos_delta += - counted * deltanext +
2409 (minnext + deltanext) * maxcount - minnext * mincount;
2410 if (mincount != maxcount) {
653099ff
GS
2411 /* Cannot extend fixed substrings found inside
2412 the group. */
830247a4 2413 scan_commit(pRExC_state,data);
c277df42
IZ
2414 if (mincount && last_str) {
2415 sv_setsv(data->last_found, last_str);
2416 data->last_end = data->pos_min;
b81d288d 2417 data->last_start_min =
a0ed51b3 2418 data->pos_min - CHR_SVLEN(last_str);
b81d288d
AB
2419 data->last_start_max = is_inf
2420 ? I32_MAX
c277df42 2421 : data->pos_min + data->pos_delta
a0ed51b3 2422 - CHR_SVLEN(last_str);
c277df42
IZ
2423 }
2424 data->longest = &(data->longest_float);
2425 }
aca2d497 2426 SvREFCNT_dec(last_str);
c277df42 2427 }
405ff068 2428 if (data && (fl & SF_HAS_EVAL))
c277df42
IZ
2429 data->flags |= SF_HAS_EVAL;
2430 optimize_curly_tail:
c277df42 2431 if (OP(oscan) != CURLYX) {
22c35a8c 2432 while (PL_regkind[(U8)OP(next = regnext(oscan))] == NOTHING
c277df42
IZ
2433 && NEXT_OFF(next))
2434 NEXT_OFF(oscan) += NEXT_OFF(next);
2435 }
c277df42 2436 continue;
653099ff 2437 default: /* REF and CLUMP only? */
c277df42 2438 if (flags & SCF_DO_SUBSTR) {
830247a4 2439 scan_commit(pRExC_state,data); /* Cannot expect anything... */
c277df42
IZ
2440 data->longest = &(data->longest_float);
2441 }
aca2d497 2442 is_inf = is_inf_internal = 1;
653099ff 2443 if (flags & SCF_DO_STCLASS_OR)
830247a4 2444 cl_anything(pRExC_state, data->start_class);
653099ff 2445 flags &= ~SCF_DO_STCLASS;
c277df42
IZ
2446 break;
2447 }
a0ed51b3 2448 }
ffc61ed2 2449 else if (strchr((char*)PL_simple,OP(scan))) {
9c5ffd7c 2450 int value = 0;
653099ff 2451
c277df42 2452 if (flags & SCF_DO_SUBSTR) {
830247a4 2453 scan_commit(pRExC_state,data);
c277df42
IZ
2454 data->pos_min++;
2455 }
2456 min++;
653099ff
GS
2457 if (flags & SCF_DO_STCLASS) {
2458 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
2459
2460 /* Some of the logic below assumes that switching
2461 locale on will only add false positives. */
2462 switch (PL_regkind[(U8)OP(scan)]) {
653099ff 2463 case SANY:
653099ff
GS
2464 default:
2465 do_default:
2466 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
2467 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
830247a4 2468 cl_anything(pRExC_state, data->start_class);
653099ff
GS
2469 break;
2470 case REG_ANY:
2471 if (OP(scan) == SANY)
2472 goto do_default;
2473 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
2474 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
2475 || (data->start_class->flags & ANYOF_CLASS));
830247a4 2476 cl_anything(pRExC_state, data->start_class);
653099ff
GS
2477 }
2478 if (flags & SCF_DO_STCLASS_AND || !value)
2479 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
2480 break;
2481 case ANYOF:
2482 if (flags & SCF_DO_STCLASS_AND)
2483 cl_and(data->start_class,
2484 (struct regnode_charclass_class*)scan);
2485 else
830247a4 2486 cl_or(pRExC_state, data->start_class,
653099ff
GS
2487 (struct regnode_charclass_class*)scan);
2488 break;
2489 case ALNUM:
2490 if (flags & SCF_DO_STCLASS_AND) {
2491 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2492 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
2493 for (value = 0; value < 256; value++)
2494 if (!isALNUM(value))
2495 ANYOF_BITMAP_CLEAR(data->start_class, value);
2496 }
2497 }
2498 else {
2499 if (data->start_class->flags & ANYOF_LOCALE)
2500 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
2501 else {
2502 for (value = 0; value < 256; value++)
2503 if (isALNUM(value))
b81d288d 2504 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
2505 }
2506 }
2507 break;
2508 case ALNUML:
2509 if (flags & SCF_DO_STCLASS_AND) {
2510 if (data->start_class->flags & ANYOF_LOCALE)
2511 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
2512 }
2513 else {
2514 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
2515 data->start_class->flags |= ANYOF_LOCALE;
2516 }
2517 break;
2518 case NALNUM:
2519 if (flags & SCF_DO_STCLASS_AND) {
2520 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2521 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
2522 for (value = 0; value < 256; value++)
2523 if (isALNUM(value))
2524 ANYOF_BITMAP_CLEAR(data->start_class, value);
2525 }
2526 }
2527 else {
2528 if (data->start_class->flags & ANYOF_LOCALE)
2529 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
2530 else {
2531 for (value = 0; value < 256; value++)
2532 if (!isALNUM(value))
b81d288d 2533 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
2534 }
2535 }
2536 break;
2537 case NALNUML:
2538 if (flags & SCF_DO_STCLASS_AND) {
2539 if (data->start_class->flags & ANYOF_LOCALE)
2540 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
2541 }
2542 else {
2543 data->start_class->flags |= ANYOF_LOCALE;
2544 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
2545 }
2546 break;
2547 case SPACE:
2548 if (flags & SCF_DO_STCLASS_AND) {
2549 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2550 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
2551 for (value = 0; value < 256; value++)
2552 if (!isSPACE(value))
2553 ANYOF_BITMAP_CLEAR(data->start_class, value);
2554 }
2555 }
2556 else {
2557 if (data->start_class->flags & ANYOF_LOCALE)
2558 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
2559 else {
2560 for (value = 0; value < 256; value++)
2561 if (isSPACE(value))
b81d288d 2562 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
2563 }
2564 }
2565 break;
2566 case SPACEL:
2567 if (flags & SCF_DO_STCLASS_AND) {
2568 if (data->start_class->flags & ANYOF_LOCALE)
2569 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
2570 }
2571 else {
2572 data->start_class->flags |= ANYOF_LOCALE;
2573 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
2574 }
2575 break;
2576 case NSPACE:
2577 if (flags & SCF_DO_STCLASS_AND) {
2578 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2579 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
2580 for (value = 0; value < 256; value++)
2581 if (isSPACE(value))
2582 ANYOF_BITMAP_CLEAR(data->start_class, value);
2583 }
2584 }
2585 else {
2586 if (data->start_class->flags & ANYOF_LOCALE)
2587 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
2588 else {
2589 for (value = 0; value < 256; value++)
2590 if (!isSPACE(value))
b81d288d 2591 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
2592 }
2593 }
2594 break;
2595 case NSPACEL:
2596 if (flags & SCF_DO_STCLASS_AND) {
2597 if (data->start_class->flags & ANYOF_LOCALE) {
2598 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
2599 for (value = 0; value < 256; value++)
2600 if (!isSPACE(value))
2601 ANYOF_BITMAP_CLEAR(data->start_class, value);
2602 }
2603 }
2604 else {
2605 data->start_class->flags |= ANYOF_LOCALE;
2606 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
2607 }
2608 break;
2609 case DIGIT:
2610 if (flags & SCF_DO_STCLASS_AND) {
2611 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
2612 for (value = 0; value < 256; value++)
2613 if (!isDIGIT(value))
2614 ANYOF_BITMAP_CLEAR(data->start_class, value);
2615 }
2616 else {
2617 if (data->start_class->flags & ANYOF_LOCALE)
2618 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
2619 else {
2620 for (value = 0; value < 256; value++)
2621 if (isDIGIT(value))
b81d288d 2622 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
2623 }
2624 }
2625 break;
2626 case NDIGIT:
2627 if (flags & SCF_DO_STCLASS_AND) {
2628 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
2629 for (value = 0; value < 256; value++)
2630 if (isDIGIT(value))
2631 ANYOF_BITMAP_CLEAR(data->start_class, value);
2632 }
2633 else {
2634 if (data->start_class->flags & ANYOF_LOCALE)
2635 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
2636 else {
2637 for (value = 0; value < 256; value++)
2638 if (!isDIGIT(value))
b81d288d 2639 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
2640 }
2641 }
2642 break;
2643 }
2644 if (flags & SCF_DO_STCLASS_OR)
2645 cl_and(data->start_class, &and_with);
2646 flags &= ~SCF_DO_STCLASS;
2647 }
a0ed51b3 2648 }
22c35a8c 2649 else if (PL_regkind[(U8)OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
c277df42
IZ
2650 data->flags |= (OP(scan) == MEOL
2651 ? SF_BEFORE_MEOL
2652 : SF_BEFORE_SEOL);
a0ed51b3 2653 }
653099ff
GS
2654 else if ( PL_regkind[(U8)OP(scan)] == BRANCHJ
2655 /* Lookbehind, or need to calculate parens/evals/stclass: */
2656 && (scan->flags || data || (flags & SCF_DO_STCLASS))
c277df42 2657 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
653099ff 2658 /* Lookahead/lookbehind */
cb434fcc 2659 I32 deltanext, minnext, fake = 0;
c277df42 2660 regnode *nscan;
653099ff
GS
2661 struct regnode_charclass_class intrnl;
2662 int f = 0;
c277df42
IZ
2663
2664 data_fake.flags = 0;
b81d288d 2665 if (data) {
2c2d71f5 2666 data_fake.whilem_c = data->whilem_c;
cb434fcc
IZ
2667 data_fake.last_closep = data->last_closep;
2668 }
2669 else
2670 data_fake.last_closep = &fake;
653099ff
GS
2671 if ( flags & SCF_DO_STCLASS && !scan->flags
2672 && OP(scan) == IFMATCH ) { /* Lookahead */
830247a4 2673 cl_init(pRExC_state, &intrnl);
653099ff 2674 data_fake.start_class = &intrnl;
e1901655 2675 f |= SCF_DO_STCLASS_AND;
653099ff 2676 }
e1901655
IZ
2677 if (flags & SCF_WHILEM_VISITED_POS)
2678 f |= SCF_WHILEM_VISITED_POS;
c277df42
IZ
2679 next = regnext(scan);
2680 nscan = NEXTOPER(NEXTOPER(scan));
a3621e74 2681 minnext = study_chunk(pRExC_state, &nscan, &deltanext, last, &data_fake, f,depth+1);
c277df42
IZ
2682 if (scan->flags) {
2683 if (deltanext) {
9baa0206 2684 vFAIL("Variable length lookbehind not implemented");
a0ed51b3
LW
2685 }
2686 else if (minnext > U8_MAX) {
9baa0206 2687 vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
c277df42 2688 }
eb160463 2689 scan->flags = (U8)minnext;
c277df42
IZ
2690 }
2691 if (data && data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
2692 pars++;
405ff068 2693 if (data && (data_fake.flags & SF_HAS_EVAL))
c277df42 2694 data->flags |= SF_HAS_EVAL;
2c2d71f5
JH
2695 if (data)
2696 data->whilem_c = data_fake.whilem_c;
e1901655 2697 if (f & SCF_DO_STCLASS_AND) {
653099ff
GS
2698 int was = (data->start_class->flags & ANYOF_EOS);
2699
2700 cl_and(data->start_class, &intrnl);
2701 if (was)
2702 data->start_class->flags |= ANYOF_EOS;
2703 }
a0ed51b3
LW
2704 }
2705 else if (OP(scan) == OPEN) {
c277df42 2706 pars++;
a0ed51b3 2707 }
cb434fcc 2708 else if (OP(scan) == CLOSE) {
eb160463 2709 if ((I32)ARG(scan) == is_par) {
cb434fcc 2710 next = regnext(scan);
c277df42 2711
cb434fcc
IZ
2712 if ( next && (OP(next) != WHILEM) && next < last)
2713 is_par = 0; /* Disable optimization */
2714 }
2715 if (data)
2716 *(data->last_closep) = ARG(scan);
a0ed51b3
LW
2717 }
2718 else if (OP(scan) == EVAL) {
c277df42
IZ
2719 if (data)
2720 data->flags |= SF_HAS_EVAL;
2721 }
96776eda 2722 else if (OP(scan) == LOGICAL && scan->flags == 2) { /* Embedded follows */
0f5d15d6 2723 if (flags & SCF_DO_SUBSTR) {
830247a4 2724 scan_commit(pRExC_state,data);
0f5d15d6
IZ
2725 data->longest = &(data->longest_float);
2726 }
2727 is_inf = is_inf_internal = 1;
653099ff 2728 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
830247a4 2729 cl_anything(pRExC_state, data->start_class);
96776eda 2730 flags &= ~SCF_DO_STCLASS;
0f5d15d6 2731 }
c277df42
IZ
2732 /* Else: zero-length, ignore. */
2733 scan = regnext(scan);
2734 }
2735
2736 finish:
2737 *scanp = scan;
aca2d497 2738 *deltap = is_inf_internal ? I32_MAX : delta;
b81d288d 2739 if (flags & SCF_DO_SUBSTR && is_inf)
c277df42
IZ
2740 data->pos_delta = I32_MAX - data->pos_min;
2741 if (is_par > U8_MAX)
2742 is_par = 0;
2743 if (is_par && pars==1 && data) {
2744 data->flags |= SF_IN_PAR;
2745 data->flags &= ~SF_HAS_PAR;
a0ed51b3
LW
2746 }
2747 else if (pars && data) {
c277df42
IZ
2748 data->flags |= SF_HAS_PAR;
2749 data->flags &= ~SF_IN_PAR;
2750 }
653099ff
GS
2751 if (flags & SCF_DO_STCLASS_OR)
2752 cl_and(data->start_class, &and_with);
c277df42
IZ
2753 return min;
2754}
2755
76e3520e 2756STATIC I32
830247a4 2757S_add_data(pTHX_ RExC_state_t *pRExC_state, I32 n, char *s)
c277df42 2758{
830247a4 2759 if (RExC_rx->data) {
b81d288d
AB
2760 Renewc(RExC_rx->data,
2761 sizeof(*RExC_rx->data) + sizeof(void*) * (RExC_rx->data->count + n - 1),
c277df42 2762 char, struct reg_data);
830247a4
IZ
2763 Renew(RExC_rx->data->what, RExC_rx->data->count + n, U8);
2764 RExC_rx->data->count += n;
a0ed51b3
LW
2765 }
2766 else {
830247a4 2767 Newc(1207, RExC_rx->data, sizeof(*RExC_rx->data) + sizeof(void*) * (n - 1),
c277df42 2768 char, struct reg_data);
830247a4
IZ
2769 New(1208, RExC_rx->data->what, n, U8);
2770 RExC_rx->data->count = n;
c277df42 2771 }
830247a4
IZ
2772 Copy(s, RExC_rx->data->what + RExC_rx->data->count - n, n, U8);
2773 return RExC_rx->data->count - n;
c277df42
IZ
2774}
2775
d88dccdf 2776void
864dbfa3 2777Perl_reginitcolors(pTHX)
d88dccdf 2778{
d88dccdf
IZ
2779 int i = 0;
2780 char *s = PerlEnv_getenv("PERL_RE_COLORS");
b81d288d 2781
d88dccdf
IZ
2782 if (s) {
2783 PL_colors[0] = s = savepv(s);
2784 while (++i < 6) {
2785 s = strchr(s, '\t');
2786 if (s) {
2787 *s = '\0';
2788 PL_colors[i] = ++s;
2789 }
2790 else
c712d376 2791 PL_colors[i] = s = "";
d88dccdf
IZ
2792 }
2793 } else {
b81d288d 2794 while (i < 6)
d88dccdf
IZ
2795 PL_colors[i++] = "";
2796 }
2797 PL_colorset = 1;
2798}
2799
8615cb43 2800
a687059c 2801/*
e50aee73 2802 - pregcomp - compile a regular expression into internal code
a687059c
LW
2803 *
2804 * We can't allocate space until we know how big the compiled form will be,
2805 * but we can't compile it (and thus know how big it is) until we've got a
2806 * place to put the code. So we cheat: we compile it twice, once with code
2807 * generation turned off and size counting turned on, and once "for real".
2808 * This also means that we don't allocate space until we are sure that the
2809 * thing really will compile successfully, and we never have to move the
2810 * code and thus invalidate pointers into it. (Note that it has to be in
2811 * one piece because free() must be able to free it all.) [NB: not true in perl]
2812 *
2813 * Beware that the optimization-preparation code in here knows about some
2814 * of the structure of the compiled regexp. [I'll say.]
2815 */
2816regexp *
864dbfa3 2817Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
a687059c 2818{
a0d0e21e 2819 register regexp *r;
c277df42 2820 regnode *scan;
c277df42 2821 regnode *first;
a0d0e21e 2822 I32 flags;
a0d0e21e
LW
2823 I32 minlen = 0;
2824 I32 sawplus = 0;
2825 I32 sawopen = 0;
2c2d71f5 2826 scan_data_t data;
830247a4
IZ
2827 RExC_state_t RExC_state;
2828 RExC_state_t *pRExC_state = &RExC_state;
a0d0e21e 2829
a3621e74
YO
2830 GET_RE_DEBUG_FLAGS_DECL;
2831
a0d0e21e 2832 if (exp == NULL)
c277df42 2833 FAIL("NULL regexp argument");
a0d0e21e 2834
a5961de5 2835 RExC_utf8 = pm->op_pmdynflags & PMdf_CMP_UTF8;
a0ed51b3 2836
5cfc7842 2837 RExC_precomp = exp;
a3621e74
YO
2838 DEBUG_r(if (!PL_colorset) reginitcolors());
2839 DEBUG_COMPILE_r({
2840 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s \"%s%*s%s\"\n",
a5961de5
JH
2841 PL_colors[4],PL_colors[5],PL_colors[0],
2842 (int)(xend - exp), RExC_precomp, PL_colors[1]);
2843 });
e2509266 2844 RExC_flags = pm->op_pmflags;
830247a4 2845 RExC_sawback = 0;
bbce6d69 2846
830247a4
IZ
2847 RExC_seen = 0;
2848 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
2849 RExC_seen_evals = 0;
2850 RExC_extralen = 0;
c277df42 2851
bbce6d69 2852 /* First pass: determine size, legality. */
830247a4 2853 RExC_parse = exp;
fac92740 2854 RExC_start = exp;
830247a4
IZ
2855 RExC_end = xend;
2856 RExC_naughty = 0;
2857 RExC_npar = 1;
2858 RExC_size = 0L;
2859 RExC_emit = &PL_regdummy;
2860 RExC_whilem_seen = 0;
85ddcde9
JH
2861#if 0 /* REGC() is (currently) a NOP at the first pass.
2862 * Clever compilers notice this and complain. --jhi */
830247a4 2863 REGC((U8)REG_MAGIC, (char*)RExC_emit);
85ddcde9 2864#endif
830247a4 2865 if (reg(pRExC_state, 0, &flags) == NULL) {
830247a4 2866 RExC_precomp = Nullch;
a0d0e21e
LW
2867 return(NULL);
2868 }
a3621e74 2869 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "size %"IVdf" ", (IV)RExC_size));
c277df42 2870
c277df42
IZ
2871 /* Small enough for pointer-storage convention?
2872 If extralen==0, this means that we will not need long jumps. */
830247a4
IZ
2873 if (RExC_size >= 0x10000L && RExC_extralen)
2874 RExC_size += RExC_extralen;
c277df42 2875 else
830247a4
IZ
2876 RExC_extralen = 0;
2877 if (RExC_whilem_seen > 15)
2878 RExC_whilem_seen = 15;
a0d0e21e 2879
bbce6d69 2880 /* Allocate space and initialize. */
830247a4 2881 Newc(1001, r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode),
c277df42 2882 char, regexp);
a0d0e21e 2883 if (r == NULL)
b45f050a
JF
2884 FAIL("Regexp out of space");
2885
0f79a09d
GS
2886#ifdef DEBUGGING
2887 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
830247a4 2888 Zero(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode), char);
0f79a09d 2889#endif
c277df42 2890 r->refcnt = 1;
bbce6d69 2891 r->prelen = xend - exp;
5cfc7842 2892 r->precomp = savepvn(RExC_precomp, r->prelen);
cf93c79d 2893 r->subbeg = NULL;
ed252734
NC
2894#ifdef PERL_COPY_ON_WRITE
2895 r->saved_copy = Nullsv;
2896#endif
cf93c79d 2897 r->reganch = pm->op_pmflags & PMf_COMPILETIME;
830247a4 2898 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
4327152a
IZ
2899
2900 r->substrs = 0; /* Useful during FAIL. */
2901 r->startp = 0; /* Useful during FAIL. */
2902 r->endp = 0; /* Useful during FAIL. */
2903
fac92740
MJD
2904 Newz(1304, r->offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
2905 if (r->offsets) {
2906 r->offsets[0] = RExC_size;
2907 }
a3621e74 2908 DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
392fbf5d 2909 "%s %"UVuf" bytes for offset annotations.\n",
fac92740 2910 r->offsets ? "Got" : "Couldn't get",
392fbf5d 2911 (UV)((2*RExC_size+1) * sizeof(U32))));
fac92740 2912
830247a4 2913 RExC_rx = r;
bbce6d69 2914
2915 /* Second pass: emit code. */
e2509266 2916 RExC_flags = pm->op_pmflags; /* don't let top level (?i) bleed */
830247a4
IZ
2917 RExC_parse = exp;
2918 RExC_end = xend;
2919 RExC_naughty = 0;
2920 RExC_npar = 1;
fac92740 2921 RExC_emit_start = r->program;
830247a4 2922 RExC_emit = r->program;
2cd61cdb 2923 /* Store the count of eval-groups for security checks: */
eb160463 2924 RExC_emit->next_off = (U16)((RExC_seen_evals > U16_MAX) ? U16_MAX : RExC_seen_evals);
830247a4 2925 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
c277df42 2926 r->data = 0;
830247a4 2927 if (reg(pRExC_state, 0, &flags) == NULL)
a0d0e21e
LW
2928 return(NULL);
2929
a3621e74 2930
a0d0e21e 2931 /* Dig out information for optimizations. */
cf93c79d 2932 r->reganch = pm->op_pmflags & PMf_COMPILETIME; /* Again? */
e2509266 2933 pm->op_pmflags = RExC_flags;
a0ed51b3 2934 if (UTF)
5ff6fc6d 2935 r->reganch |= ROPT_UTF8; /* Unicode in it? */
c277df42 2936 r->regstclass = NULL;
830247a4 2937 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
a0ed51b3 2938 r->reganch |= ROPT_NAUGHTY;
c277df42 2939 scan = r->program + 1; /* First BRANCH. */
2779dcf1
IZ
2940
2941 /* XXXX To minimize changes to RE engine we always allocate
2942 3-units-long substrs field. */
2943 Newz(1004, r->substrs, 1, struct reg_substr_data);
2944
2c2d71f5 2945 StructCopy(&zero_scan_data, &data, scan_data_t);
653099ff 2946 /* XXXX Should not we check for something else? Usually it is OPEN1... */
c277df42 2947 if (OP(scan) != BRANCH) { /* Only one top-level choice. */
c277df42 2948 I32 fake;
c5254dd6 2949 STRLEN longest_float_length, longest_fixed_length;
653099ff
GS
2950 struct regnode_charclass_class ch_class;
2951 int stclass_flag;
cb434fcc 2952 I32 last_close = 0;
a0d0e21e
LW
2953
2954 first = scan;
c277df42 2955 /* Skip introductions and multiplicators >= 1. */
a0d0e21e 2956 while ((OP(first) == OPEN && (sawopen = 1)) ||
653099ff 2957 /* An OR of *one* alternative - should not happen now. */
a0d0e21e
LW
2958 (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) ||
2959 (OP(first) == PLUS) ||
2960 (OP(first) == MINMOD) ||
653099ff 2961 /* An {n,m} with n>0 */
22c35a8c 2962 (PL_regkind[(U8)OP(first)] == CURLY && ARG1(first) > 0) ) {
a0d0e21e
LW
2963 if (OP(first) == PLUS)
2964 sawplus = 1;
2965 else
2966 first += regarglen[(U8)OP(first)];
2967 first = NEXTOPER(first);
a687059c
LW
2968 }
2969
a0d0e21e
LW
2970 /* Starting-point info. */
2971 again:
653099ff 2972 if (PL_regkind[(U8)OP(first)] == EXACT) {
1aa99e6b
IH
2973 if (OP(first) == EXACT)
2974 ; /* Empty, get anchored substr later. */
2975 else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
b3c9acc1
IZ
2976 r->regstclass = first;
2977 }
653099ff 2978 else if (strchr((char*)PL_simple,OP(first)))
a0d0e21e 2979 r->regstclass = first;
22c35a8c
GS
2980 else if (PL_regkind[(U8)OP(first)] == BOUND ||
2981 PL_regkind[(U8)OP(first)] == NBOUND)
a0d0e21e 2982 r->regstclass = first;
22c35a8c 2983 else if (PL_regkind[(U8)OP(first)] == BOL) {
cad2e5aa
JH
2984 r->reganch |= (OP(first) == MBOL
2985 ? ROPT_ANCH_MBOL
2986 : (OP(first) == SBOL
2987 ? ROPT_ANCH_SBOL
2988 : ROPT_ANCH_BOL));
a0d0e21e 2989 first = NEXTOPER(first);
774d564b 2990 goto again;
2991 }
2992 else if (OP(first) == GPOS) {
2993 r->reganch |= ROPT_ANCH_GPOS;
2994 first = NEXTOPER(first);
2995 goto again;
a0d0e21e 2996 }
e09294f4 2997 else if (!sawopen && (OP(first) == STAR &&
22c35a8c 2998 PL_regkind[(U8)OP(NEXTOPER(first))] == REG_ANY) &&
a0d0e21e
LW
2999 !(r->reganch & ROPT_ANCH) )
3000 {
3001 /* turn .* into ^.* with an implied $*=1 */
cad2e5aa
JH
3002 int type = OP(NEXTOPER(first));
3003
ffc61ed2 3004 if (type == REG_ANY)
cad2e5aa
JH
3005 type = ROPT_ANCH_MBOL;
3006 else
3007 type = ROPT_ANCH_SBOL;
3008
3009 r->reganch |= type | ROPT_IMPLICIT;
a0d0e21e 3010 first = NEXTOPER(first);
774d564b 3011 goto again;
a0d0e21e 3012 }
b81d288d 3013 if (sawplus && (!sawopen || !RExC_sawback)
830247a4 3014 && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
cad2e5aa
JH
3015 /* x+ must match at the 1st pos of run of x's */
3016 r->reganch |= ROPT_SKIP;
a0d0e21e 3017
c277df42 3018 /* Scan is after the zeroth branch, first is atomic matcher. */
a3621e74 3019 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
b900a521 3020 (IV)(first - scan + 1)));
a0d0e21e
LW
3021 /*
3022 * If there's something expensive in the r.e., find the
3023 * longest literal string that must appear and make it the
3024 * regmust. Resolve ties in favor of later strings, since
3025 * the regstart check works with the beginning of the r.e.
3026 * and avoiding duplication strengthens checking. Not a
3027 * strong reason, but sufficient in the absence of others.
3028 * [Now we resolve ties in favor of the earlier string if
c277df42 3029 * it happens that c_offset_min has been invalidated, since the
a0d0e21e
LW
3030 * earlier string may buy us something the later one won't.]
3031 */
a0d0e21e 3032 minlen = 0;
a687059c 3033
79cb57f6
GS
3034 data.longest_fixed = newSVpvn("",0);
3035 data.longest_float = newSVpvn("",0);
3036 data.last_found = newSVpvn("",0);
c277df42
IZ
3037 data.longest = &(data.longest_fixed);
3038 first = scan;
653099ff 3039 if (!r->regstclass) {
830247a4 3040 cl_init(pRExC_state, &ch_class);
653099ff
GS
3041 data.start_class = &ch_class;
3042 stclass_flag = SCF_DO_STCLASS_AND;
3043 } else /* XXXX Check for BOUND? */
3044 stclass_flag = 0;
cb434fcc 3045 data.last_closep = &last_close;
653099ff 3046
830247a4 3047 minlen = study_chunk(pRExC_state, &first, &fake, scan + RExC_size, /* Up to end */
a3621e74 3048 &data, SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
830247a4 3049 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
b81d288d 3050 && data.last_start_min == 0 && data.last_end > 0
830247a4
IZ
3051 && !RExC_seen_zerolen
3052 && (!(RExC_seen & REG_SEEN_GPOS) || (r->reganch & ROPT_ANCH_GPOS)))
c277df42 3053 r->reganch |= ROPT_CHECK_ALL;
830247a4 3054 scan_commit(pRExC_state, &data);
c277df42
IZ
3055 SvREFCNT_dec(data.last_found);
3056
a0ed51b3 3057 longest_float_length = CHR_SVLEN(data.longest_float);
c5254dd6 3058 if (longest_float_length
c277df42
IZ
3059 || (data.flags & SF_FL_BEFORE_EOL
3060 && (!(data.flags & SF_FL_BEFORE_MEOL)
e2509266 3061 || (RExC_flags & PMf_MULTILINE)))) {
cf93c79d
IZ
3062 int t;
3063
a0ed51b3 3064 if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
aca2d497
IZ
3065 && data.offset_fixed == data.offset_float_min
3066 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
3067 goto remove_float; /* As in (a)+. */
3068
33b8afdf
JH
3069 if (SvUTF8(data.longest_float)) {
3070 r->float_utf8 = data.longest_float;
3071 r->float_substr = Nullsv;
3072 } else {
3073 r->float_substr = data.longest_float;
3074 r->float_utf8 = Nullsv;
3075 }
c277df42
IZ
3076 r->float_min_offset = data.offset_float_min;
3077 r->float_max_offset = data.offset_float_max;
cf93c79d
IZ
3078 t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
3079 && (!(data.flags & SF_FL_BEFORE_MEOL)
e2509266 3080 || (RExC_flags & PMf_MULTILINE)));
33b8afdf 3081 fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
a0ed51b3
LW
3082 }
3083 else {
aca2d497 3084 remove_float:
33b8afdf 3085 r->float_substr = r->float_utf8 = Nullsv;
c277df42 3086 SvREFCNT_dec(data.longest_float);
c5254dd6 3087 longest_float_length = 0;
a0d0e21e 3088 }
c277df42 3089
a0ed51b3 3090 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
c5254dd6 3091 if (longest_fixed_length
c277df42
IZ
3092 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
3093 && (!(data.flags & SF_FIX_BEFORE_MEOL)
e2509266 3094 || (RExC_flags & PMf_MULTILINE)))) {
cf93c79d
IZ
3095 int t;
3096
33b8afdf
JH
3097 if (SvUTF8(data.longest_fixed)) {
3098 r->anchored_utf8 = data.longest_fixed;
3099 r->anchored_substr = Nullsv;
3100 } else {
3101 r->anchored_substr = data.longest_fixed;
3102 r->anchored_utf8 = Nullsv;
3103 }
c277df42 3104 r->anchored_offset = data.offset_fixed;
cf93c79d
IZ
3105 t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
3106 && (!(data.flags & SF_FIX_BEFORE_MEOL)
e2509266 3107 || (RExC_flags & PMf_MULTILINE)));
33b8afdf 3108 fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
a0ed51b3
LW
3109 }
3110 else {
33b8afdf 3111 r->anchored_substr = r->anchored_utf8 = Nullsv;
c277df42 3112 SvREFCNT_dec(data.longest_fixed);
c5254dd6 3113 longest_fixed_length = 0;
a0d0e21e 3114 }
b81d288d 3115 if (r->regstclass
ffc61ed2 3116 && (OP(r->regstclass) == REG_ANY || OP(r->regstclass) == SANY))
653099ff 3117 r->regstclass = NULL;
33b8afdf
JH
3118 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
3119 && stclass_flag
653099ff 3120 && !(data.start_class->flags & ANYOF_EOS)
eb160463
GS
3121 && !cl_is_anything(data.start_class))
3122 {
830247a4 3123 I32 n = add_data(pRExC_state, 1, "f");
653099ff 3124
b81d288d 3125 New(1006, RExC_rx->data->data[n], 1,
653099ff
GS
3126 struct regnode_charclass_class);
3127 StructCopy(data.start_class,
830247a4 3128 (struct regnode_charclass_class*)RExC_rx->data->data[n],
653099ff 3129 struct regnode_charclass_class);
830247a4 3130 r->regstclass = (regnode*)RExC_rx->data->data[n];
653099ff 3131 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
ffc61ed2 3132 PL_regdata = r->data; /* for regprop() */
a3621e74 3133 DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
9c5ffd7c
JH
3134 regprop(sv, (regnode*)data.start_class);
3135 PerlIO_printf(Perl_debug_log,
3136 "synthetic stclass `%s'.\n",
3137 SvPVX(sv));});
653099ff 3138 }
c277df42
IZ
3139
3140 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
c5254dd6 3141 if (longest_fixed_length > longest_float_length) {
c277df42 3142 r->check_substr = r->anchored_substr;
33b8afdf 3143 r->check_utf8 = r->anchored_utf8;
c277df42
IZ
3144 r->check_offset_min = r->check_offset_max = r->anchored_offset;
3145 if (r->reganch & ROPT_ANCH_SINGLE)
3146 r->reganch |= ROPT_NOSCAN;
a0ed51b3
LW
3147 }
3148 else {
c277df42 3149 r->check_substr = r->float_substr;
33b8afdf 3150 r->check_utf8 = r->float_utf8;
c277df42
IZ
3151 r->check_offset_min = data.offset_float_min;
3152 r->check_offset_max = data.offset_float_max;
a0d0e21e 3153 }
30382c73
IZ
3154 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
3155 This should be changed ASAP! */
33b8afdf 3156 if ((r->check_substr || r->check_utf8) && !(r->reganch & ROPT_ANCH_GPOS)) {
cad2e5aa 3157 r->reganch |= RE_USE_INTUIT;
33b8afdf 3158 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
cad2e5aa
JH
3159 r->reganch |= RE_INTUIT_TAIL;
3160 }
a0ed51b3
LW
3161 }
3162 else {
c277df42
IZ
3163 /* Several toplevels. Best we can is to set minlen. */
3164 I32 fake;
653099ff 3165 struct regnode_charclass_class ch_class;
cb434fcc 3166 I32 last_close = 0;
c277df42 3167
a3621e74 3168 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "\n"));
c277df42 3169 scan = r->program + 1;
830247a4 3170 cl_init(pRExC_state, &ch_class);
653099ff 3171 data.start_class = &ch_class;
cb434fcc 3172 data.last_closep = &last_close;
a3621e74 3173 minlen = study_chunk(pRExC_state, &scan, &fake, scan + RExC_size, &data, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
33b8afdf
JH
3174 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
3175 = r->float_substr = r->float_utf8 = Nullsv;
653099ff 3176 if (!(data.start_class->flags & ANYOF_EOS)
eb160463
GS
3177 && !cl_is_anything(data.start_class))
3178 {
830247a4 3179 I32 n = add_data(pRExC_state, 1, "f");
653099ff 3180
b81d288d 3181 New(1006, RExC_rx->data->data[n], 1,
653099ff
GS
3182 struct regnode_charclass_class);
3183 StructCopy(data.start_class,
830247a4 3184 (struct regnode_charclass_class*)RExC_rx->data->data[n],
653099ff 3185 struct regnode_charclass_class);
830247a4 3186 r->regstclass = (regnode*)RExC_rx->data->data[n];
653099ff 3187 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
a3621e74 3188 DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
9c5ffd7c
JH
3189 regprop(sv, (regnode*)data.start_class);
3190 PerlIO_printf(Perl_debug_log,
3191 "synthetic stclass `%s'.\n",
3192 SvPVX(sv));});
653099ff 3193 }
a0d0e21e
LW
3194 }
3195
a0d0e21e 3196 r->minlen = minlen;
b81d288d 3197 if (RExC_seen & REG_SEEN_GPOS)
c277df42 3198 r->reganch |= ROPT_GPOS_SEEN;
830247a4 3199 if (RExC_seen & REG_SEEN_LOOKBEHIND)
c277df42 3200 r->reganch |= ROPT_LOOKBEHIND_SEEN;
830247a4 3201 if (RExC_seen & REG_SEEN_EVAL)
ce862d02 3202 r->reganch |= ROPT_EVAL_SEEN;
f33976b4
DB
3203 if (RExC_seen & REG_SEEN_CANY)
3204 r->reganch |= ROPT_CANY_SEEN;
830247a4
IZ
3205 Newz(1002, r->startp, RExC_npar, I32);
3206 Newz(1002, r->endp, RExC_npar, I32);
ffc61ed2 3207 PL_regdata = r->data; /* for regprop() */
a3621e74 3208 DEBUG_COMPILE_r(regdump(r));
a0d0e21e 3209 return(r);
a687059c
LW
3210}
3211
3212/*
3213 - reg - regular expression, i.e. main body or parenthesized thing
3214 *
3215 * Caller must absorb opening parenthesis.
3216 *
3217 * Combining parenthesis handling with the base level of regular expression
3218 * is a trifle forced, but the need to tie the tails of the branches to what
3219 * follows makes it hard to avoid.
3220 */
76e3520e 3221STATIC regnode *
830247a4 3222S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp)
c277df42 3223 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
a687059c 3224{
c277df42
IZ
3225 register regnode *ret; /* Will be the head of the group. */
3226 register regnode *br;
3227 register regnode *lastbr;
3228 register regnode *ender = 0;
a0d0e21e 3229 register I32 parno = 0;
e2509266 3230 I32 flags, oregflags = RExC_flags, have_branch = 0, open = 0;
9d1d55b5
JP
3231
3232 /* for (?g), (?gc), and (?o) warnings; warning
3233 about (?c) will warn about (?g) -- japhy */
3234
3235 I32 wastedflags = 0x00,
3236 wasted_o = 0x01,
3237 wasted_g = 0x02,
3238 wasted_gc = 0x02 | 0x04,
3239 wasted_c = 0x04;
3240
fac92740 3241 char * parse_start = RExC_parse; /* MJD */
830247a4 3242 char *oregcomp_parse = RExC_parse;
c277df42 3243 char c;
a0d0e21e 3244
821b33a5 3245 *flagp = 0; /* Tentatively. */
a0d0e21e 3246
9d1d55b5 3247
a0d0e21e
LW
3248 /* Make an OPEN node, if parenthesized. */
3249 if (paren) {
fac92740 3250 if (*RExC_parse == '?') { /* (?...) */
2b36a5a0
JH
3251 U32 posflags = 0, negflags = 0;
3252 U32 *flagsp = &posflags;
0f5d15d6 3253 int logical = 0;
830247a4 3254 char *seqstart = RExC_parse;
ca9dfc88 3255
830247a4
IZ
3256 RExC_parse++;
3257 paren = *RExC_parse++;
c277df42 3258 ret = NULL; /* For look-ahead/behind. */
a0d0e21e 3259 switch (paren) {
fac92740 3260 case '<': /* (?<...) */
830247a4 3261 RExC_seen |= REG_SEEN_LOOKBEHIND;
b81d288d 3262 if (*RExC_parse == '!')
c277df42 3263 paren = ',';
b81d288d 3264 if (*RExC_parse != '=' && *RExC_parse != '!')
c277df42 3265 goto unknown;
830247a4 3266 RExC_parse++;
fac92740
MJD
3267 case '=': /* (?=...) */
3268 case '!': /* (?!...) */
830247a4 3269 RExC_seen_zerolen++;
fac92740
MJD
3270 case ':': /* (?:...) */
3271 case '>': /* (?>...) */
a0d0e21e 3272 break;
fac92740
MJD
3273 case '$': /* (?$...) */
3274 case '@': /* (?@...) */
8615cb43 3275 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
a0d0e21e 3276 break;
fac92740 3277 case '#': /* (?#...) */
830247a4
IZ
3278 while (*RExC_parse && *RExC_parse != ')')
3279 RExC_parse++;
3280 if (*RExC_parse != ')')
c277df42 3281 FAIL("Sequence (?#... not terminated");
830247a4 3282 nextchar(pRExC_state);
a0d0e21e
LW
3283 *flagp = TRYAGAIN;
3284 return NULL;
fac92740 3285 case 'p': /* (?p...) */
9014280d 3286 if (SIZE_ONLY && ckWARN2(WARN_DEPRECATED, WARN_REGEXP))
f9373011 3287 vWARNdep(RExC_parse, "(?p{}) is deprecated - use (??{})");
8c8ad484 3288 /* FALL THROUGH*/
fac92740 3289 case '?': /* (??...) */
0f5d15d6 3290 logical = 1;
438a3801
YST
3291 if (*RExC_parse != '{')
3292 goto unknown;
830247a4 3293 paren = *RExC_parse++;
0f5d15d6 3294 /* FALL THROUGH */
fac92740 3295 case '{': /* (?{...}) */
c277df42 3296 {
c277df42
IZ
3297 I32 count = 1, n = 0;
3298 char c;
830247a4 3299 char *s = RExC_parse;
c277df42
IZ
3300 SV *sv;
3301 OP_4tree *sop, *rop;
3302
830247a4
IZ
3303 RExC_seen_zerolen++;
3304 RExC_seen |= REG_SEEN_EVAL;
3305 while (count && (c = *RExC_parse)) {
3306 if (c == '\\' && RExC_parse[1])
3307 RExC_parse++;
b81d288d 3308 else if (c == '{')
c277df42 3309 count++;
b81d288d 3310 else if (c == '}')
c277df42 3311 count--;
830247a4 3312 RExC_parse++;
c277df42 3313 }
830247a4 3314 if (*RExC_parse != ')')
b45f050a 3315 {
b81d288d 3316 RExC_parse = s;
b45f050a
JF
3317 vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
3318 }
c277df42 3319 if (!SIZE_ONLY) {
f3548bdc 3320 PAD *pad;
b81d288d
AB
3321
3322 if (RExC_parse - 1 - s)
830247a4 3323 sv = newSVpvn(s, RExC_parse - 1 - s);
c277df42 3324 else
79cb57f6 3325 sv = newSVpvn("", 0);
c277df42 3326
569233ed
SB
3327 ENTER;
3328 Perl_save_re_context(aTHX);
f3548bdc 3329 rop = sv_compile_2op(sv, &sop, "re", &pad);
9b978d73
DM
3330 sop->op_private |= OPpREFCOUNTED;
3331 /* re_dup will OpREFCNT_inc */
3332 OpREFCNT_set(sop, 1);
569233ed 3333 LEAVE;
c277df42 3334
830247a4
IZ
3335 n = add_data(pRExC_state, 3, "nop");
3336 RExC_rx->data->data[n] = (void*)rop;
3337 RExC_rx->data->data[n+1] = (void*)sop;
f3548bdc 3338 RExC_rx->data->data[n+2] = (void*)pad;
c277df42 3339 SvREFCNT_dec(sv);
a0ed51b3 3340 }
e24b16f9 3341 else { /* First pass */
830247a4 3342 if (PL_reginterp_cnt < ++RExC_seen_evals
923e4eb5 3343 && IN_PERL_RUNTIME)
2cd61cdb
IZ
3344 /* No compiled RE interpolated, has runtime
3345 components ===> unsafe. */
3346 FAIL("Eval-group not allowed at runtime, use re 'eval'");
5b61d3f7 3347 if (PL_tainting && PL_tainted)
cc6b7395 3348 FAIL("Eval-group in insecure regular expression");
923e4eb5 3349 if (IN_PERL_COMPILETIME)
b5c19bd7 3350 PL_cv_has_eval = 1;
c277df42 3351 }
b5c19bd7 3352
830247a4 3353 nextchar(pRExC_state);
0f5d15d6 3354 if (logical) {
830247a4 3355 ret = reg_node(pRExC_state, LOGICAL);
0f5d15d6
IZ
3356 if (!SIZE_ONLY)
3357 ret->flags = 2;
830247a4 3358 regtail(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
fac92740 3359 /* deal with the length of this later - MJD */
0f5d15d6
IZ
3360 return ret;
3361 }
ccb2c380
MP
3362 ret = reganode(pRExC_state, EVAL, n);
3363 Set_Node_Length(ret, RExC_parse - parse_start + 1);
3364 Set_Node_Offset(ret, parse_start);
3365 return ret;
c277df42 3366 }
fac92740 3367 case '(': /* (?(?{...})...) and (?(?=...)...) */
c277df42 3368 {
fac92740 3369 if (RExC_parse[0] == '?') { /* (?(?...)) */
b81d288d
AB
3370 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
3371 || RExC_parse[1] == '<'
830247a4 3372 || RExC_parse[1] == '{') { /* Lookahead or eval. */
c277df42
IZ
3373 I32 flag;
3374
830247a4 3375 ret = reg_node(pRExC_state, LOGICAL);
0f5d15d6
IZ
3376 if (!SIZE_ONLY)
3377 ret->flags = 1;
830247a4 3378 regtail(pRExC_state, ret, reg(pRExC_state, 1, &flag));
c277df42 3379 goto insert_if;
b81d288d 3380 }
a0ed51b3 3381 }
830247a4 3382 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
fac92740 3383 /* (?(1)...) */
830247a4 3384 parno = atoi(RExC_parse++);
c277df42 3385
830247a4
IZ
3386 while (isDIGIT(*RExC_parse))
3387 RExC_parse++;
fac92740
MJD
3388 ret = reganode(pRExC_state, GROUPP, parno);
3389
830247a4 3390 if ((c = *nextchar(pRExC_state)) != ')')
b45f050a 3391 vFAIL("Switch condition not recognized");
c277df42 3392 insert_if:
830247a4
IZ
3393 regtail(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
3394 br = regbranch(pRExC_state, &flags, 1);
c277df42 3395 if (br == NULL)
830247a4 3396 br = reganode(pRExC_state, LONGJMP, 0);
c277df42 3397 else
830247a4
IZ
3398 regtail(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
3399 c = *nextchar(pRExC_state);
d1b80229
IZ
3400 if (flags&HASWIDTH)
3401 *flagp |= HASWIDTH;
c277df42 3402 if (c == '|') {
830247a4
IZ
3403 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
3404 regbranch(pRExC_state, &flags, 1);
3405 regtail(pRExC_state, ret, lastbr);
d1b80229
IZ
3406 if (flags&HASWIDTH)
3407 *flagp |= HASWIDTH;
830247a4 3408 c = *nextchar(pRExC_state);
a0ed51b3
LW
3409 }
3410 else
c277df42
IZ
3411 lastbr = NULL;
3412 if (c != ')')
8615cb43 3413 vFAIL("Switch (?(condition)... contains too many branches");
830247a4
IZ
3414 ender = reg_node(pRExC_state, TAIL);
3415 regtail(pRExC_state, br, ender);
c277df42 3416 if (lastbr) {
830247a4
IZ
3417 regtail(pRExC_state, lastbr, ender);
3418 regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
a0ed51b3
LW
3419 }
3420 else
830247a4 3421 regtail(pRExC_state, ret, ender);
c277df42 3422 return ret;
a0ed51b3
LW
3423 }
3424 else {
830247a4 3425 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
c277df42
IZ
3426 }
3427 }
1b1626e4 3428 case 0:
830247a4 3429 RExC_parse--; /* for vFAIL to print correctly */
8615cb43 3430 vFAIL("Sequence (? incomplete");
1b1626e4 3431 break;
a0d0e21e 3432 default:
830247a4 3433 --RExC_parse;
fac92740 3434 parse_flags: /* (?i) */
830247a4 3435 while (*RExC_parse && strchr("iogcmsx", *RExC_parse)) {
9d1d55b5
JP
3436 /* (?g), (?gc) and (?o) are useless here
3437 and must be globally applied -- japhy */
3438
3439 if (*RExC_parse == 'o' || *RExC_parse == 'g') {
3440 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
3441 I32 wflagbit = *RExC_parse == 'o' ? wasted_o : wasted_g;
3442 if (! (wastedflags & wflagbit) ) {
3443 wastedflags |= wflagbit;
3444 vWARN5(
3445 RExC_parse + 1,
3446 "Useless (%s%c) - %suse /%c modifier",
3447 flagsp == &negflags ? "?-" : "?",
3448 *RExC_parse,
3449 flagsp == &negflags ? "don't " : "",
3450 *RExC_parse
3451 );
3452 }
3453 }
3454 }
3455 else if (*RExC_parse == 'c') {
3456 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
3457 if (! (wastedflags & wasted_c) ) {
3458 wastedflags |= wasted_gc;
3459 vWARN3(
3460 RExC_parse + 1,
3461 "Useless (%sc) - %suse /gc modifier",
3462 flagsp == &negflags ? "?-" : "?",
3463 flagsp == &negflags ? "don't " : ""
3464 );
3465 }
3466 }
3467 }
3468 else { pmflag(flagsp, *RExC_parse); }
3469
830247a4 3470 ++RExC_parse;
ca9dfc88 3471 }
830247a4 3472 if (*RExC_parse == '-') {
ca9dfc88 3473 flagsp = &negflags;
9d1d55b5 3474 wastedflags = 0; /* reset so (?g-c) warns twice */
830247a4 3475 ++RExC_parse;
ca9dfc88 3476 goto parse_flags;
48c036b1 3477 }
e2509266
JH
3478 RExC_flags |= posflags;
3479 RExC_flags &= ~negflags;
830247a4
IZ
3480 if (*RExC_parse == ':') {
3481 RExC_parse++;
ca9dfc88
IZ
3482 paren = ':';
3483 break;
3484 }
c277df42 3485 unknown:
830247a4
IZ
3486 if (*RExC_parse != ')') {
3487 RExC_parse++;
3488 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
b45f050a 3489 }
830247a4 3490 nextchar(pRExC_state);
a0d0e21e
LW
3491 *flagp = TRYAGAIN;
3492 return NULL;
3493 }
3494 }
fac92740 3495 else { /* (...) */
830247a4
IZ
3496 parno = RExC_npar;
3497 RExC_npar++;
3498 ret = reganode(pRExC_state, OPEN, parno);
fac92740
MJD
3499 Set_Node_Length(ret, 1); /* MJD */
3500 Set_Node_Offset(ret, RExC_parse); /* MJD */
c277df42 3501 open = 1;
a0d0e21e 3502 }
a0ed51b3 3503 }
fac92740 3504 else /* ! paren */
a0d0e21e
LW
3505 ret = NULL;
3506
3507 /* Pick up the branches, linking them together. */
fac92740 3508 parse_start = RExC_parse; /* MJD */
830247a4 3509 br = regbranch(pRExC_state, &flags, 1);
fac92740
MJD
3510 /* branch_len = (paren != 0); */
3511
a0d0e21e
LW
3512 if (br == NULL)
3513 return(NULL);
830247a4
IZ
3514 if (*RExC_parse == '|') {
3515 if (!SIZE_ONLY && RExC_extralen) {
3516 reginsert(pRExC_state, BRANCHJ, br);
a0ed51b3 3517 }
fac92740 3518 else { /* MJD */
830247a4 3519 reginsert(pRExC_state, BRANCH, br);
fac92740
MJD
3520 Set_Node_Length(br, paren != 0);
3521 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
3522 }
c277df42
IZ
3523 have_branch = 1;
3524 if (SIZE_ONLY)
830247a4 3525 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
a0ed51b3
LW
3526 }
3527 else if (paren == ':') {
c277df42
IZ
3528 *flagp |= flags&SIMPLE;
3529 }
3530 if (open) { /* Starts with OPEN. */
830247a4 3531 regtail(pRExC_state, ret, br); /* OPEN -> first. */
a0ed51b3
LW
3532 }
3533 else if (paren != '?') /* Not Conditional */
a0d0e21e 3534 ret = br;
32a0ca98 3535 *flagp |= flags & (SPSTART | HASWIDTH);
c277df42 3536 lastbr = br;
830247a4
IZ
3537 while (*RExC_parse == '|') {
3538 if (!SIZE_ONLY && RExC_extralen) {
3539 ender = reganode(pRExC_state, LONGJMP,0);
3540 regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
c277df42
IZ
3541 }
3542 if (SIZE_ONLY)
830247a4
IZ
3543 RExC_extralen += 2; /* Account for LONGJMP. */
3544 nextchar(pRExC_state);
3545 br = regbranch(pRExC_state, &flags, 0);
fac92740 3546
a687059c 3547 if (br == NULL)
a0d0e21e 3548 return(NULL);
830247a4 3549 regtail(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
c277df42 3550 lastbr = br;
821b33a5
IZ
3551 if (flags&HASWIDTH)
3552 *flagp |= HASWIDTH;
a687059c 3553 *flagp |= flags&SPSTART;
a0d0e21e
LW
3554 }
3555
c277df42
IZ
3556 if (have_branch || paren != ':') {
3557 /* Make a closing node, and hook it on the end. */
3558 switch (paren) {
3559 case ':':
830247a4 3560 ender = reg_node(pRExC_state, TAIL);
c277df42
IZ
3561 break;
3562 case 1:
830247a4 3563 ender = reganode(pRExC_state, CLOSE, parno);
fac92740
MJD
3564 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
3565 Set_Node_Length(ender,1); /* MJD */
c277df42
IZ
3566 break;
3567 case '<':
c277df42
IZ
3568 case ',':
3569 case '=':
3570 case '!':
c277df42 3571 *flagp &= ~HASWIDTH;
821b33a5
IZ
3572 /* FALL THROUGH */
3573 case '>':
830247a4 3574 ender = reg_node(pRExC_state, SUCCEED);
c277df42
IZ
3575 break;
3576 case 0:
830247a4 3577 ender = reg_node(pRExC_state, END);
c277df42
IZ
3578 break;
3579 }
830247a4 3580 regtail(pRExC_state, lastbr, ender);
a0d0e21e 3581
c277df42
IZ
3582 if (have_branch) {
3583 /* Hook the tails of the branches to the closing node. */
3584 for (br = ret; br != NULL; br = regnext(br)) {
830247a4 3585 regoptail(pRExC_state, br, ender);
c277df42
IZ
3586 }
3587 }
a0d0e21e 3588 }
c277df42
IZ
3589
3590 {
3591 char *p;
3592 static char parens[] = "=!<,>";
3593
3594 if (paren && (p = strchr(parens, paren))) {
eb160463 3595 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
c277df42
IZ
3596 int flag = (p - parens) > 1;
3597
3598 if (paren == '>')
3599 node = SUSPEND, flag = 0;
830247a4 3600 reginsert(pRExC_state, node,ret);
45948336
EP
3601 Set_Node_Cur_Length(ret);
3602 Set_Node_Offset(ret, parse_start + 1);
c277df42 3603 ret->flags = flag;
830247a4 3604 regtail(pRExC_state, ret, reg_node(pRExC_state, TAIL));
c277df42 3605 }
a0d0e21e
LW
3606 }
3607
3608 /* Check for proper termination. */
ce3e6498 3609 if (paren) {
e2509266 3610 RExC_flags = oregflags;
830247a4
IZ
3611 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
3612 RExC_parse = oregcomp_parse;
380a0633 3613 vFAIL("Unmatched (");
ce3e6498 3614 }
a0ed51b3 3615 }
830247a4
IZ
3616 else if (!paren && RExC_parse < RExC_end) {
3617 if (*RExC_parse == ')') {
3618 RExC_parse++;
380a0633 3619 vFAIL("Unmatched )");
a0ed51b3
LW
3620 }
3621 else
b45f050a 3622 FAIL("Junk on end of regexp"); /* "Can't happen". */
a0d0e21e
LW
3623 /* NOTREACHED */
3624 }
a687059c 3625
a0d0e21e 3626 return(ret);
a687059c
LW
3627}
3628
3629/*
3630 - regbranch - one alternative of an | operator
3631 *
3632 * Implements the concatenation operator.
3633 */
76e3520e 3634STATIC regnode *
830247a4 3635S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first)
a687059c 3636{
c277df42
IZ
3637 register regnode *ret;
3638 register regnode *chain = NULL;
3639 register regnode *latest;
3640 I32 flags = 0, c = 0;
a0d0e21e 3641
b81d288d 3642 if (first)
c277df42
IZ
3643 ret = NULL;
3644 else {
b81d288d 3645 if (!SIZE_ONLY && RExC_extralen)
830247a4 3646 ret = reganode(pRExC_state, BRANCHJ,0);
fac92740 3647 else {
830247a4 3648 ret = reg_node(pRExC_state, BRANCH);
fac92740
MJD
3649 Set_Node_Length(ret, 1);
3650 }
c277df42
IZ
3651 }
3652
b81d288d 3653 if (!first && SIZE_ONLY)
830247a4 3654 RExC_extralen += 1; /* BRANCHJ */
b81d288d 3655
c277df42 3656 *flagp = WORST; /* Tentatively. */
a0d0e21e 3657
830247a4
IZ
3658 RExC_parse--;
3659 nextchar(pRExC_state);
3660 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
a0d0e21e 3661 flags &= ~TRYAGAIN;
830247a4 3662 latest = regpiece(pRExC_state, &flags);
a0d0e21e
LW
3663 if (latest == NULL) {
3664 if (flags & TRYAGAIN)
3665 continue;
3666 return(NULL);
a0ed51b3
LW
3667 }
3668 else if (ret == NULL)
c277df42 3669 ret = latest;
a0d0e21e 3670 *flagp |= flags&HASWIDTH;
c277df42 3671 if (chain == NULL) /* First piece. */
a0d0e21e
LW
3672 *flagp |= flags&SPSTART;
3673 else {
830247a4
IZ
3674 RExC_naughty++;
3675 regtail(pRExC_state, chain, latest);
a687059c 3676 }
a0d0e21e 3677 chain = latest;
c277df42
IZ
3678 c++;
3679 }
3680 if (chain == NULL) { /* Loop ran zero times. */
830247a4 3681 chain = reg_node(pRExC_state, NOTHING);
c277df42
IZ
3682 if (ret == NULL)
3683 ret = chain;
3684 }
3685 if (c == 1) {
3686 *flagp |= flags&SIMPLE;
a0d0e21e 3687 }
a687059c 3688
a0d0e21e 3689 return(ret);
a687059c
LW
3690}
3691
3692/*
3693 - regpiece - something followed by possible [*+?]
3694 *
3695 * Note that the branching code sequences used for ? and the general cases
3696 * of * and + are somewhat optimized: they use the same NOTHING node as
3697 * both the endmarker for their branch list and the body of the last branch.
3698 * It might seem that this node could be dispensed with entirely, but the
3699 * endmarker role is not redundant.
3700 */
76e3520e 3701STATIC regnode *
830247a4 3702S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
a687059c 3703{
c277df42 3704 register regnode *ret;
a0d0e21e
LW
3705 register char op;
3706 register char *next;
3707 I32 flags;
830247a4 3708 char *origparse = RExC_parse;
a0d0e21e
LW
3709 char *maxpos;
3710 I32 min;
c277df42 3711 I32 max = REG_INFTY;
fac92740 3712 char *parse_start;
a0d0e21e 3713
830247a4 3714 ret = regatom(pRExC_state, &flags);
a0d0e21e
LW
3715 if (ret == NULL) {
3716 if (flags & TRYAGAIN)
3717 *flagp |= TRYAGAIN;
3718 return(NULL);
3719 }
3720
830247a4 3721 op = *RExC_parse;
a0d0e21e 3722
830247a4 3723 if (op == '{' && regcurly(RExC_parse)) {
fac92740 3724 parse_start = RExC_parse; /* MJD */
830247a4 3725 next = RExC_parse + 1;
a0d0e21e
LW
3726 maxpos = Nullch;
3727 while (isDIGIT(*next) || *next == ',') {
3728 if (*next == ',') {
3729 if (maxpos)
3730 break;
3731 else
3732 maxpos = next;
a687059c 3733 }
a0d0e21e
LW
3734 next++;
3735 }
3736 if (*next == '}') { /* got one */
3737 if (!maxpos)
3738 maxpos = next;
830247a4
IZ
3739 RExC_parse++;
3740 min = atoi(RExC_parse);
a0d0e21e
LW
3741 if (*maxpos == ',')
3742 maxpos++;
3743 else
830247a4 3744 maxpos = RExC_parse;
a0d0e21e
LW
3745 max = atoi(maxpos);
3746 if (!max && *maxpos != '0')
c277df42
IZ
3747 max = REG_INFTY; /* meaning "infinity" */
3748 else if (max >= REG_INFTY)
8615cb43 3749 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
830247a4
IZ
3750 RExC_parse = next;
3751 nextchar(pRExC_state);
a0d0e21e
LW
3752
3753 do_curly:
3754 if ((flags&SIMPLE)) {
830247a4
IZ
3755 RExC_naughty += 2 + RExC_naughty / 2;
3756 reginsert(pRExC_state, CURLY, ret);
fac92740
MJD
3757 Set_Node_Offset(ret, parse_start+1); /* MJD */
3758 Set_Node_Cur_Length(ret);
a0d0e21e
LW
3759 }
3760 else {
830247a4 3761 regnode *w = reg_node(pRExC_state, WHILEM);
2c2d71f5
JH
3762
3763 w->flags = 0;
830247a4
IZ
3764 regtail(pRExC_state, ret, w);
3765 if (!SIZE_ONLY && RExC_extralen) {
3766 reginsert(pRExC_state, LONGJMP,ret);
3767 reginsert(pRExC_state, NOTHING,ret);
c277df42
IZ
3768 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
3769 }
830247a4 3770 reginsert(pRExC_state, CURLYX,ret);
fac92740
MJD
3771 /* MJD hk */
3772 Set_Node_Offset(ret, parse_start+1);
3773 Set_Node_Length(ret,
3774 op == '{' ? (RExC_parse - parse_start) : 1);
3775
830247a4 3776 if (!SIZE_ONLY && RExC_extralen)
c277df42 3777 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
830247a4 3778 regtail(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
c277df42 3779 if (SIZE_ONLY)
830247a4
IZ
3780 RExC_whilem_seen++, RExC_extralen += 3;
3781 RExC_naughty += 4 + RExC_naughty; /* compound interest */
a0d0e21e 3782 }
c277df42 3783 ret->flags = 0;
a0d0e21e
LW
3784
3785 if (min > 0)
821b33a5
IZ
3786 *flagp = WORST;
3787 if (max > 0)
3788 *flagp |= HASWIDTH;
a0d0e21e 3789 if (max && max < min)
8615cb43 3790 vFAIL("Can't do {n,m} with n > m");
c277df42 3791 if (!SIZE_ONLY) {
eb160463
GS
3792 ARG1_SET(ret, (U16)min);
3793 ARG2_SET(ret, (U16)max);
a687059c 3794 }
a687059c 3795
a0d0e21e 3796 goto nest_check;
a687059c 3797 }
a0d0e21e 3798 }
a687059c 3799
a0d0e21e
LW
3800 if (!ISMULT1(op)) {
3801 *flagp = flags;
a687059c 3802 return(ret);
a0d0e21e 3803 }
bb20fd44 3804
c277df42 3805#if 0 /* Now runtime fix should be reliable. */
b45f050a
JF
3806
3807 /* if this is reinstated, don't forget to put this back into perldiag:
3808
3809 =item Regexp *+ operand could be empty at {#} in regex m/%s/
3810
3811 (F) The part of the regexp subject to either the * or + quantifier
3812 could match an empty string. The {#} shows in the regular
3813 expression about where the problem was discovered.
3814
3815 */
3816
bb20fd44 3817 if (!(flags&HASWIDTH) && op != '?')
b45f050a 3818 vFAIL("Regexp *+ operand could be empty");
b81d288d 3819#endif
bb20fd44 3820
fac92740 3821 parse_start = RExC_parse;
830247a4 3822 nextchar(pRExC_state);
a0d0e21e 3823
821b33a5 3824 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
a0d0e21e
LW
3825
3826 if (op == '*' && (flags&SIMPLE)) {
830247a4 3827 reginsert(pRExC_state, STAR, ret);
c277df42 3828 ret->flags = 0;
830247a4 3829 RExC_naughty += 4;
a0d0e21e
LW
3830 }
3831 else if (op == '*') {
3832 min = 0;
3833 goto do_curly;
a0ed51b3
LW
3834 }
3835 else if (op == '+' && (flags&SIMPLE)) {
830247a4 3836 reginsert(pRExC_state, PLUS, ret);
c277df42 3837 ret->flags = 0;
830247a4 3838 RExC_naughty += 3;
a0d0e21e
LW
3839 }
3840 else if (op == '+') {
3841 min = 1;
3842 goto do_curly;
a0ed51b3
LW
3843 }
3844 else if (op == '?') {
a0d0e21e
LW
3845 min = 0; max = 1;
3846 goto do_curly;
3847 }
3848 nest_check:
e476b1b5 3849 if (ckWARN(WARN_REGEXP) && !SIZE_ONLY && !(flags&HASWIDTH) && max > REG_INFTY/3) {
830247a4 3850 vWARN3(RExC_parse,
b45f050a 3851 "%.*s matches null string many times",
830247a4 3852 RExC_parse - origparse,
b45f050a 3853 origparse);
a0d0e21e
LW
3854 }
3855
830247a4
IZ
3856 if (*RExC_parse == '?') {
3857 nextchar(pRExC_state);
3858 reginsert(pRExC_state, MINMOD, ret);
3859 regtail(pRExC_state, ret, ret + NODE_STEP_REGNODE);
a0d0e21e 3860 }
830247a4
IZ
3861 if (ISMULT2(RExC_parse)) {
3862 RExC_parse++;
b45f050a
JF
3863 vFAIL("Nested quantifiers");
3864 }
a0d0e21e
LW
3865
3866 return(ret);
a687059c
LW
3867}
3868
3869/*
3870 - regatom - the lowest level
3871 *
3872 * Optimization: gobbles an entire sequence of ordinary characters so that
3873 * it can turn them into a single node, which is smaller to store and
3874 * faster to run. Backslashed characters are exceptions, each becoming a
3875 * separate node; the code is simpler that way and it's not worth fixing.
3876 *
b45f050a 3877 * [Yes, it is worth fixing, some scripts can run twice the speed.] */
76e3520e 3878STATIC regnode *
830247a4 3879S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
a687059c 3880{
c277df42 3881 register regnode *ret = 0;
a0d0e21e 3882 I32 flags;
45948336 3883 char *parse_start = RExC_parse;
a0d0e21e
LW
3884
3885 *flagp = WORST; /* Tentatively. */
3886
3887tryagain:
830247a4 3888 switch (*RExC_parse) {
a0d0e21e 3889 case '^':
830247a4
IZ
3890 RExC_seen_zerolen++;
3891 nextchar(pRExC_state);
e2509266 3892 if (RExC_flags & PMf_MULTILINE)
830247a4 3893 ret = reg_node(pRExC_state, MBOL);
e2509266 3894 else if (RExC_flags & PMf_SINGLELINE)
830247a4 3895 ret = reg_node(pRExC_state, SBOL);
a0d0e21e 3896 else
830247a4 3897 ret = reg_node(pRExC_state, BOL);
fac92740 3898 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
3899 break;
3900 case '$':
830247a4 3901 nextchar(pRExC_state);
b81d288d 3902 if (*RExC_parse)
830247a4 3903 RExC_seen_zerolen++;
e2509266 3904 if (RExC_flags & PMf_MULTILINE)
830247a4 3905 ret = reg_node(pRExC_state, MEOL);
e2509266 3906 else if (RExC_flags & PMf_SINGLELINE)
830247a4 3907 ret = reg_node(pRExC_state, SEOL);
a0d0e21e 3908 else
830247a4 3909 ret = reg_node(pRExC_state, EOL);
fac92740 3910 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
3911 break;
3912 case '.':
830247a4 3913 nextchar(pRExC_state);
e2509266 3914 if (RExC_flags & PMf_SINGLELINE)
ffc61ed2
JH
3915 ret = reg_node(pRExC_state, SANY);
3916 else
3917 ret = reg_node(pRExC_state, REG_ANY);
3918 *flagp |= HASWIDTH|SIMPLE;
830247a4 3919 RExC_naughty++;
fac92740 3920 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
3921 break;
3922 case '[':
b45f050a 3923 {
830247a4 3924 char *oregcomp_parse = ++RExC_parse;
ffc61ed2 3925 ret = regclass(pRExC_state);
830247a4
IZ
3926 if (*RExC_parse != ']') {
3927 RExC_parse = oregcomp_parse;
b45f050a
JF
3928 vFAIL("Unmatched [");
3929 }
830247a4 3930 nextchar(pRExC_state);
a0d0e21e 3931 *flagp |= HASWIDTH|SIMPLE;
fac92740 3932 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
a0d0e21e 3933 break;
b45f050a 3934 }
a0d0e21e 3935 case '(':
830247a4
IZ
3936 nextchar(pRExC_state);
3937 ret = reg(pRExC_state, 1, &flags);
a0d0e21e 3938 if (ret == NULL) {
bf93d4cc 3939 if (flags & TRYAGAIN) {
830247a4 3940 if (RExC_parse == RExC_end) {
bf93d4cc
GS
3941 /* Make parent create an empty node if needed. */
3942 *flagp |= TRYAGAIN;
3943 return(NULL);
3944 }
a0d0e21e 3945 goto tryagain;
bf93d4cc 3946 }
a0d0e21e
LW
3947 return(NULL);
3948 }
c277df42 3949 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE);
a0d0e21e
LW
3950 break;
3951 case '|':
3952 case ')':
3953 if (flags & TRYAGAIN) {
3954 *flagp |= TRYAGAIN;
3955 return NULL;
3956 }
b45f050a 3957 vFAIL("Internal urp");
a0d0e21e
LW
3958 /* Supposed to be caught earlier. */
3959 break;
85afd4ae 3960 case '{':
830247a4
IZ
3961 if (!regcurly(RExC_parse)) {
3962 RExC_parse++;
85afd4ae
CS
3963 goto defchar;
3964 }
3965 /* FALL THROUGH */
a0d0e21e
LW
3966 case '?':
3967 case '+':
3968 case '*':
830247a4 3969 RExC_parse++;
b45f050a 3970 vFAIL("Quantifier follows nothing");
a0d0e21e
LW
3971 break;
3972 case '\\':
830247a4 3973 switch (*++RExC_parse) {
a0d0e21e 3974 case 'A':
830247a4
IZ
3975 RExC_seen_zerolen++;
3976 ret = reg_node(pRExC_state, SBOL);
a0d0e21e 3977 *flagp |= SIMPLE;
830247a4 3978 nextchar(pRExC_state);
fac92740 3979 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
3980 break;
3981 case 'G':
830247a4
IZ
3982 ret = reg_node(pRExC_state, GPOS);
3983 RExC_seen |= REG_SEEN_GPOS;
a0d0e21e 3984 *flagp |= SIMPLE;
830247a4 3985 nextchar(pRExC_state);
fac92740 3986 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
3987 break;
3988 case 'Z':
830247a4 3989 ret = reg_node(pRExC_state, SEOL);
a0d0e21e 3990 *flagp |= SIMPLE;
a1917ab9 3991 RExC_seen_zerolen++; /* Do not optimize RE away */
830247a4 3992 nextchar(pRExC_state);
a0d0e21e 3993 break;
b85d18e9 3994 case 'z':
830247a4 3995 ret = reg_node(pRExC_state, EOS);
b85d18e9 3996 *flagp |= SIMPLE;
830247a4
IZ
3997 RExC_seen_zerolen++; /* Do not optimize RE away */
3998 nextchar(pRExC_state);
fac92740 3999 Set_Node_Length(ret, 2); /* MJD */
b85d18e9 4000 break;
4a2d328f 4001 case 'C':
f33976b4
DB
4002 ret = reg_node(pRExC_state, CANY);
4003 RExC_seen |= REG_SEEN_CANY;
a0ed51b3 4004 *flagp |= HASWIDTH|SIMPLE;
830247a4 4005 nextchar(pRExC_state);
fac92740 4006 Set_Node_Length(ret, 2); /* MJD */
a0ed51b3
LW
4007 break;
4008 case 'X':
830247a4 4009 ret = reg_node(pRExC_state, CLUMP);
a0ed51b3 4010 *flagp |= HASWIDTH;
830247a4 4011 nextchar(pRExC_state);
fac92740 4012 Set_Node_Length(ret, 2); /* MJD */
a0ed51b3 4013 break;
a0d0e21e 4014 case 'w':
eb160463 4015 ret = reg_node(pRExC_state, (U8)(LOC ? ALNUML : ALNUM));
a0d0e21e 4016 *flagp |= HASWIDTH|SIMPLE;
830247a4 4017 nextchar(pRExC_state);
fac92740 4018 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
4019 break;
4020 case 'W':
eb160463 4021 ret = reg_node(pRExC_state, (U8)(LOC ? NALNUML : NALNUM));
a0d0e21e 4022 *flagp |= HASWIDTH|SIMPLE;
830247a4 4023 nextchar(pRExC_state);
fac92740 4024 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
4025 break;
4026 case 'b':
830247a4
IZ
4027 RExC_seen_zerolen++;
4028 RExC_seen |= REG_SEEN_LOOKBEHIND;
eb160463 4029 ret = reg_node(pRExC_state, (U8)(LOC ? BOUNDL : BOUND));
a0d0e21e 4030 *flagp |= SIMPLE;
830247a4 4031 nextchar(pRExC_state);
fac92740 4032 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
4033 break;
4034 case 'B':
830247a4
IZ
4035 RExC_seen_zerolen++;
4036 RExC_seen |= REG_SEEN_LOOKBEHIND;
eb160463 4037 ret = reg_node(pRExC_state, (U8)(LOC ? NBOUNDL : NBOUND));
a0d0e21e 4038 *flagp |= SIMPLE;
830247a4 4039 nextchar(pRExC_state);
fac92740 4040 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
4041 break;
4042 case 's':
eb160463 4043 ret = reg_node(pRExC_state, (U8)(LOC ? SPACEL : SPACE));
a0d0e21e 4044 *flagp |= HASWIDTH|SIMPLE;
830247a4 4045 nextchar(pRExC_state);
fac92740 4046 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
4047 break;
4048 case 'S':
eb160463 4049 ret = reg_node(pRExC_state, (U8)(LOC ? NSPACEL : NSPACE));
a0d0e21e 4050 *flagp |= HASWIDTH|SIMPLE;
830247a4 4051 nextchar(pRExC_state);
fac92740 4052 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
4053 break;
4054 case 'd':
ffc61ed2 4055 ret = reg_node(pRExC_state, DIGIT);
a0d0e21e 4056 *flagp |= HASWIDTH|SIMPLE;
830247a4 4057 nextchar(pRExC_state);
fac92740 4058 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
4059 break;
4060 case 'D':
ffc61ed2 4061 ret = reg_node(pRExC_state, NDIGIT);
a0d0e21e 4062 *flagp |= HASWIDTH|SIMPLE;
830247a4 4063 nextchar(pRExC_state);
fac92740 4064 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e 4065 break;
a14b48bc
LW
4066 case 'p':
4067 case 'P':
3568d838 4068 {
830247a4 4069 char* oldregxend = RExC_end;
ccb2c380 4070 char* parse_start = RExC_parse - 2;
a14b48bc 4071
830247a4 4072 if (RExC_parse[1] == '{') {
3568d838 4073 /* a lovely hack--pretend we saw [\pX] instead */
830247a4
IZ
4074 RExC_end = strchr(RExC_parse, '}');
4075 if (!RExC_end) {
0da60cf5 4076 U8 c = (U8)*RExC_parse;
830247a4
IZ
4077 RExC_parse += 2;
4078 RExC_end = oldregxend;
0da60cf5 4079 vFAIL2("Missing right brace on \\%c{}", c);
b45f050a 4080 }
830247a4 4081 RExC_end++;
a14b48bc 4082 }
af6f566e 4083 else {
830247a4 4084 RExC_end = RExC_parse + 2;
af6f566e
HS
4085 if (RExC_end > oldregxend)
4086 RExC_end = oldregxend;
4087 }
830247a4 4088 RExC_parse--;
a14b48bc 4089
ffc61ed2 4090 ret = regclass(pRExC_state);
a14b48bc 4091
830247a4
IZ
4092 RExC_end = oldregxend;
4093 RExC_parse--;
ccb2c380
MP
4094
4095 Set_Node_Offset(ret, parse_start + 2);
4096 Set_Node_Cur_Length(ret);
830247a4 4097 nextchar(pRExC_state);
a14b48bc
LW
4098 *flagp |= HASWIDTH|SIMPLE;
4099 }
4100 break;
a0d0e21e
LW
4101 case 'n':
4102 case 'r':
4103 case 't':
4104 case 'f':
4105 case 'e':
4106 case 'a':
4107 case 'x':
4108 case 'c':
4109 case '0':
4110 goto defchar;
4111 case '1': case '2': case '3': case '4':
4112 case '5': case '6': case '7': case '8': case '9':
4113 {
830247a4 4114 I32 num = atoi(RExC_parse);
a0d0e21e 4115
830247a4 4116 if (num > 9 && num >= RExC_npar)
a0d0e21e
LW
4117 goto defchar;
4118 else {
fac92740 4119 char * parse_start = RExC_parse - 1; /* MJD */
830247a4
IZ
4120 while (isDIGIT(*RExC_parse))
4121 RExC_parse++;
b45f050a 4122
eb160463 4123 if (!SIZE_ONLY && num > (I32)RExC_rx->nparens)
9baa0206 4124 vFAIL("Reference to nonexistent group");
830247a4 4125 RExC_sawback = 1;
eb160463
GS
4126 ret = reganode(pRExC_state,
4127 (U8)(FOLD ? (LOC ? REFFL : REFF) : REF),
4128 num);
a0d0e21e 4129 *flagp |= HASWIDTH;
fac92740
MJD
4130
4131 /* override incorrect value set in reganode MJD */
4132 Set_Node_Offset(ret, parse_start+1);
4133 Set_Node_Cur_Length(ret); /* MJD */
830247a4
IZ
4134 RExC_parse--;
4135 nextchar(pRExC_state);
a0d0e21e
LW
4136 }
4137 }
4138 break;
4139 case '\0':
830247a4 4140 if (RExC_parse >= RExC_end)
b45f050a 4141 FAIL("Trailing \\");
a0d0e21e
LW
4142 /* FALL THROUGH */
4143 default:
c9f97d15
IZ
4144 /* Do not generate `unrecognized' warnings here, we fall
4145 back into the quick-grab loop below */
45948336 4146 parse_start--;
a0d0e21e
LW
4147 goto defchar;
4148 }
4149 break;
4633a7c4
LW
4150
4151 case '#':
e2509266 4152 if (RExC_flags & PMf_EXTENDED) {
830247a4
IZ
4153 while (RExC_parse < RExC_end && *RExC_parse != '\n') RExC_parse++;
4154 if (RExC_parse < RExC_end)
4633a7c4
LW
4155 goto tryagain;
4156 }
4157 /* FALL THROUGH */
4158
a0d0e21e 4159 default: {
ba210ebe 4160 register STRLEN len;
58ae7d3f 4161 register UV ender;
a0d0e21e 4162 register char *p;
c277df42 4163 char *oldp, *s;
ba210ebe 4164 STRLEN numlen;
80aecb99 4165 STRLEN foldlen;
89ebb4a3 4166 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
f06dbbb7
JH
4167
4168 parse_start = RExC_parse - 1;
a0d0e21e 4169
830247a4 4170 RExC_parse++;
a0d0e21e
LW
4171
4172 defchar:
58ae7d3f 4173 ender = 0;
eb160463
GS
4174 ret = reg_node(pRExC_state,
4175 (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
cd439c50 4176 s = STRING(ret);
830247a4
IZ
4177 for (len = 0, p = RExC_parse - 1;
4178 len < 127 && p < RExC_end;
a0d0e21e
LW
4179 len++)
4180 {
4181 oldp = p;
5b5a24f7 4182
e2509266 4183 if (RExC_flags & PMf_EXTENDED)
830247a4 4184 p = regwhite(p, RExC_end);
a0d0e21e
LW
4185 switch (*p) {
4186 case '^':
4187 case '$':
4188 case '.':
4189 case '[':
4190 case '(':
4191 case ')':
4192 case '|':
4193 goto loopdone;
4194 case '\\':
4195 switch (*++p) {
4196 case 'A':
1ed8eac0
JF
4197 case 'C':
4198 case 'X':
a0d0e21e
LW
4199 case 'G':
4200 case 'Z':
b85d18e9 4201 case 'z':
a0d0e21e
LW
4202 case 'w':
4203 case 'W':
4204 case 'b':
4205 case 'B':
4206 case 's':
4207 case 'S':
4208 case 'd':
4209 case 'D':
a14b48bc
LW
4210 case 'p':
4211 case 'P':
a0d0e21e
LW
4212 --p;
4213 goto loopdone;
4214 case 'n':
4215 ender = '\n';
4216 p++;
a687059c 4217 break;
a0d0e21e
LW
4218 case 'r':
4219 ender = '\r';
4220 p++;
a687059c 4221 break;
a0d0e21e
LW
4222 case 't':
4223 ender = '\t';
4224 p++;
a687059c 4225 break;
a0d0e21e
LW
4226 case 'f':
4227 ender = '\f';
4228 p++;
a687059c 4229 break;
a0d0e21e 4230 case 'e':
c7f1f016 4231 ender = ASCII_TO_NATIVE('\033');
a0d0e21e 4232 p++;
a687059c 4233 break;
a0d0e21e 4234 case 'a':
c7f1f016 4235 ender = ASCII_TO_NATIVE('\007');
a0d0e21e 4236 p++;
a687059c 4237 break;
a0d0e21e 4238 case 'x':
a0ed51b3
LW
4239 if (*++p == '{') {
4240 char* e = strchr(p, '}');
b81d288d 4241
b45f050a 4242 if (!e) {
830247a4 4243 RExC_parse = p + 1;
b45f050a
JF
4244 vFAIL("Missing right brace on \\x{}");
4245 }
de5f0749 4246 else {
a4c04bdc
NC
4247 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
4248 | PERL_SCAN_DISALLOW_PREFIX;
53305cf1
NC
4249 numlen = e - p - 1;
4250 ender = grok_hex(p + 1, &numlen, &flags, NULL);
aaa80028
JH
4251 if (ender > 0xff)
4252 RExC_utf8 = 1;
a0ed51b3
LW
4253 p = e + 1;
4254 }
a0ed51b3
LW
4255 }
4256 else {
a4c04bdc 4257 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
53305cf1
NC
4258 numlen = 2;
4259 ender = grok_hex(p, &numlen, &flags, NULL);
a0ed51b3
LW
4260 p += numlen;
4261 }
a687059c 4262 break;
a0d0e21e
LW
4263 case 'c':
4264 p++;
bbce6d69 4265 ender = UCHARAT(p++);
4266 ender = toCTRL(ender);
a687059c 4267 break;
a0d0e21e
LW
4268 case '0': case '1': case '2': case '3':case '4':
4269 case '5': case '6': case '7': case '8':case '9':
4270 if (*p == '0' ||
830247a4 4271 (isDIGIT(p[1]) && atoi(p) >= RExC_npar) ) {
53305cf1
NC
4272 I32 flags = 0;
4273 numlen = 3;
4274 ender = grok_oct(p, &numlen, &flags, NULL);
a0d0e21e
LW
4275 p += numlen;
4276 }
4277 else {
4278 --p;
4279 goto loopdone;
a687059c
LW
4280 }
4281 break;
a0d0e21e 4282 case '\0':
830247a4 4283 if (p >= RExC_end)
b45f050a 4284 FAIL("Trailing \\");
a687059c 4285 /* FALL THROUGH */
a0d0e21e 4286 default:
e476b1b5 4287 if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(*p))
4193bef7 4288 vWARN2(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
a0ed51b3 4289 goto normal_default;
a0d0e21e
LW
4290 }
4291 break;
a687059c 4292 default:
a0ed51b3 4293 normal_default:
fd400ab9 4294 if (UTF8_IS_START(*p) && UTF) {
5e12f4fb 4295 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
ba210ebe 4296 &numlen, 0);
a0ed51b3
LW
4297 p += numlen;
4298 }
4299 else
4300 ender = *p++;
a0d0e21e 4301 break;
a687059c 4302 }
e2509266 4303 if (RExC_flags & PMf_EXTENDED)
830247a4 4304 p = regwhite(p, RExC_end);
60a8b682
JH
4305 if (UTF && FOLD) {
4306 /* Prime the casefolded buffer. */
ac7e0132 4307 ender = toFOLD_uni(ender, tmpbuf, &foldlen);
60a8b682 4308 }
a0d0e21e
LW
4309 if (ISMULT2(p)) { /* Back off on ?+*. */
4310 if (len)
4311 p = oldp;
16ea2a2e 4312 else if (UTF) {
0ebc6274
JH
4313 STRLEN unilen;
4314
80aecb99 4315 if (FOLD) {
60a8b682 4316 /* Emit all the Unicode characters. */
80aecb99
JH
4317 for (foldbuf = tmpbuf;
4318 foldlen;
4319 foldlen -= numlen) {
4320 ender = utf8_to_uvchr(foldbuf, &numlen);
9dc45d57 4321 if (numlen > 0) {
0ebc6274
JH
4322 reguni(pRExC_state, ender, s, &unilen);
4323 s += unilen;
4324 len += unilen;
4325 /* In EBCDIC the numlen
4326 * and unilen can differ. */
9dc45d57 4327 foldbuf += numlen;
47654450
JH
4328 if (numlen >= foldlen)
4329 break;
9dc45d57
JH
4330 }
4331 else
4332 break; /* "Can't happen." */
80aecb99
JH
4333 }
4334 }
4335 else {
0ebc6274 4336 reguni(pRExC_state, ender, s, &unilen);
9ede7db1 4337 if (unilen > 0) {
0ebc6274
JH
4338 s += unilen;
4339 len += unilen;
9dc45d57 4340 }
80aecb99 4341 }
a0ed51b3 4342 }
a0d0e21e
LW
4343 else {
4344 len++;
eb160463 4345 REGC((char)ender, s++);
a0d0e21e
LW
4346 }
4347 break;
a687059c 4348 }
16ea2a2e 4349 if (UTF) {
0ebc6274
JH
4350 STRLEN unilen;
4351
80aecb99 4352 if (FOLD) {
60a8b682 4353 /* Emit all the Unicode characters. */
80aecb99
JH
4354 for (foldbuf = tmpbuf;
4355 foldlen;
4356 foldlen -= numlen) {
4357 ender = utf8_to_uvchr(foldbuf, &numlen);
9dc45d57 4358 if (numlen > 0) {
0ebc6274
JH
4359 reguni(pRExC_state, ender, s, &unilen);
4360 len += unilen;
4361 s += unilen;
4362 /* In EBCDIC the numlen
4363 * and unilen can differ. */
9dc45d57 4364 foldbuf += numlen;
47654450
JH
4365 if (numlen >= foldlen)
4366 break;
9dc45d57
JH
4367 }
4368 else
4369 break;
80aecb99
JH
4370 }
4371 }
4372 else {
0ebc6274 4373 reguni(pRExC_state, ender, s, &unilen);
9ede7db1 4374 if (unilen > 0) {
0ebc6274
JH
4375 s += unilen;
4376 len += unilen;
9dc45d57 4377 }
80aecb99
JH
4378 }
4379 len--;
a0ed51b3
LW
4380 }
4381 else
eb160463 4382 REGC((char)ender, s++);
a0d0e21e
LW
4383 }
4384 loopdone:
830247a4 4385 RExC_parse = p - 1;
fac92740 4386 Set_Node_Cur_Length(ret); /* MJD */
830247a4 4387 nextchar(pRExC_state);
793db0cb
JH
4388 {
4389 /* len is STRLEN which is unsigned, need to copy to signed */
4390 IV iv = len;
4391 if (iv < 0)
4392 vFAIL("Internal disaster");
4393 }
a0d0e21e
LW
4394 if (len > 0)
4395 *flagp |= HASWIDTH;
090f7165 4396 if (len == 1 && UNI_IS_INVARIANT(ender))
a0d0e21e 4397 *flagp |= SIMPLE;
c277df42 4398 if (!SIZE_ONLY)
cd439c50
IZ
4399 STR_LEN(ret) = len;
4400 if (SIZE_ONLY)
830247a4 4401 RExC_size += STR_SZ(len);
cd439c50 4402 else
830247a4 4403 RExC_emit += STR_SZ(len);
a687059c 4404 }
a0d0e21e
LW
4405 break;
4406 }
a687059c 4407
60a8b682
JH
4408 /* If the encoding pragma is in effect recode the text of
4409 * any EXACT-kind nodes. */
22c54be3 4410 if (PL_encoding && PL_regkind[(U8)OP(ret)] == EXACT) {
d0063567
DK
4411 STRLEN oldlen = STR_LEN(ret);
4412 SV *sv = sv_2mortal(newSVpvn(STRING(ret), oldlen));
4413
4414 if (RExC_utf8)
4415 SvUTF8_on(sv);
4416 if (sv_utf8_downgrade(sv, TRUE)) {
4417 char *s = sv_recode_to_utf8(sv, PL_encoding);
4418 STRLEN newlen = SvCUR(sv);
4419
4420 if (SvUTF8(sv))
4421 RExC_utf8 = 1;
4422 if (!SIZE_ONLY) {
a3621e74
YO
4423 GET_RE_DEBUG_FLAGS_DECL;
4424 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "recode %*s to %*s\n",
d0063567
DK
4425 (int)oldlen, STRING(ret),
4426 (int)newlen, s));
4427 Copy(s, STRING(ret), newlen, char);
4428 STR_LEN(ret) += newlen - oldlen;
4429 RExC_emit += STR_SZ(newlen) - STR_SZ(oldlen);
4430 } else
4431 RExC_size += STR_SZ(newlen) - STR_SZ(oldlen);
4432 }
a72c7584
JH
4433 }
4434
a0d0e21e 4435 return(ret);
a687059c
LW
4436}
4437
873ef191 4438STATIC char *
cea2e8a9 4439S_regwhite(pTHX_ char *p, char *e)
5b5a24f7
CS
4440{
4441 while (p < e) {
4442 if (isSPACE(*p))
4443 ++p;
4444 else if (*p == '#') {
4445 do {
4446 p++;
4447 } while (p < e && *p != '\n');
4448 }
4449 else
4450 break;
4451 }
4452 return p;
4453}
4454
b8c5462f
JH
4455/* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
4456 Character classes ([:foo:]) can also be negated ([:^foo:]).
4457 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
4458 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
beeb77fc 4459 but trigger failures because they are currently unimplemented. */
9a86a77b
JH
4460
4461#define POSIXCC_DONE(c) ((c) == ':')
4462#define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
4463#define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
4464
b8c5462f 4465STATIC I32
830247a4 4466S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
620e46c5
JH
4467{
4468 char *posixcc = 0;
936ed897 4469 I32 namedclass = OOB_NAMEDCLASS;
620e46c5 4470
830247a4 4471 if (value == '[' && RExC_parse + 1 < RExC_end &&
620e46c5 4472 /* I smell either [: or [= or [. -- POSIX has been here, right? */
9a86a77b
JH
4473 POSIXCC(UCHARAT(RExC_parse))) {
4474 char c = UCHARAT(RExC_parse);
830247a4 4475 char* s = RExC_parse++;
b81d288d 4476
9a86a77b 4477 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
830247a4
IZ
4478 RExC_parse++;
4479 if (RExC_parse == RExC_end)
620e46c5 4480 /* Grandfather lone [:, [=, [. */
830247a4 4481 RExC_parse = s;
620e46c5 4482 else {
830247a4 4483 char* t = RExC_parse++; /* skip over the c */
b8c5462f 4484
80916619
NC
4485 assert(*t == c);
4486
9a86a77b 4487 if (UCHARAT(RExC_parse) == ']') {
830247a4 4488 RExC_parse++; /* skip over the ending ] */
b8c5462f
JH
4489 posixcc = s + 1;
4490 if (*s == ':') {
4491 I32 complement = *posixcc == '^' ? *posixcc++ : 0;
80916619
NC
4492 I32 skip = t - posixcc;
4493
4494 /* Initially switch on the length of the name. */
4495 switch (skip) {
4496 case 4:
4497 if (memEQ(posixcc, "word", 4)) {
4498 /* this is not POSIX, this is the Perl \w */;
4499 namedclass
4500 = complement ? ANYOF_NALNUM : ANYOF_ALNUM;
4501 }
cc4319de 4502 break;
80916619
NC
4503 case 5:
4504 /* Names all of length 5. */
4505 /* alnum alpha ascii blank cntrl digit graph lower
4506 print punct space upper */
4507 /* Offset 4 gives the best switch position. */
4508 switch (posixcc[4]) {
4509 case 'a':
4510 if (memEQ(posixcc, "alph", 4)) {
4511 /* a */
4512 namedclass
4513 = complement ? ANYOF_NALPHA : ANYOF_ALPHA;
4514 }
4515 break;
4516 case 'e':
4517 if (memEQ(posixcc, "spac", 4)) {
4518 /* e */
4519 namedclass
4520 = complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
4521 }
4522 break;
4523 case 'h':
4524 if (memEQ(posixcc, "grap", 4)) {
4525 /* h */
4526 namedclass
4527 = complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
4528 }
4529 break;
4530 case 'i':
4531 if (memEQ(posixcc, "asci", 4)) {
4532 /* i */
4533 namedclass
4534 = complement ? ANYOF_NASCII : ANYOF_ASCII;
4535 }
4536 break;
4537 case 'k':
4538 if (memEQ(posixcc, "blan", 4)) {
4539 /* k */
4540 namedclass
4541 = complement ? ANYOF_NBLANK : ANYOF_BLANK;
4542 }
4543 break;
4544 case 'l':
4545 if (memEQ(posixcc, "cntr", 4)) {
4546 /* l */
4547 namedclass
4548 = complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
4549 }
4550 break;
4551 case 'm':
4552 if (memEQ(posixcc, "alnu", 4)) {
4553 /* m */
4554 namedclass
4555 = complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
4556 }
4557 break;
4558 case 'r':
4559 if (memEQ(posixcc, "lowe", 4)) {
4560 /* r */
4561 namedclass
4562 = complement ? ANYOF_NLOWER : ANYOF_LOWER;
4563 }
4564 if (memEQ(posixcc, "uppe", 4)) {
8fdec511 4565 /* r */
80916619
NC
4566 namedclass
4567 = complement ? ANYOF_NUPPER : ANYOF_UPPER;
4568 }
4569 break;
4570 case 't':
4571 if (memEQ(posixcc, "digi", 4)) {
4572 /* t */
4573 namedclass
4574 = complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
4575 }
4576 if (memEQ(posixcc, "prin", 4)) {
8fdec511 4577 /* t */
80916619
NC
4578 namedclass
4579 = complement ? ANYOF_NPRINT : ANYOF_PRINT;
4580 }
4581 if (memEQ(posixcc, "punc", 4)) {
4582 /* t */
4583 namedclass
4584 = complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
4585 }
4586 break;
b8c5462f
JH
4587 }
4588 break;
80916619
NC
4589 case 6:
4590 if (memEQ(posixcc, "xdigit", 6)) {
4591 namedclass
4592 = complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
b8c5462f
JH
4593 }
4594 break;
4595 }
80916619
NC
4596
4597 if (namedclass == OOB_NAMEDCLASS)
b45f050a
JF
4598 {
4599 Simple_vFAIL3("POSIX class [:%.*s:] unknown",
4600 t - s - 1, s + 1);
4601 }
80916619
NC
4602 assert (posixcc[skip] == ':');
4603 assert (posixcc[skip+1] == ']');
b45f050a 4604 } else if (!SIZE_ONLY) {
b8c5462f 4605 /* [[=foo=]] and [[.foo.]] are still future. */
b45f050a 4606
830247a4 4607 /* adjust RExC_parse so the warning shows after
b45f050a 4608 the class closes */
9a86a77b 4609 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
830247a4 4610 RExC_parse++;
b45f050a
JF
4611 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
4612 }
b8c5462f
JH
4613 } else {
4614 /* Maternal grandfather:
4615 * "[:" ending in ":" but not in ":]" */
830247a4 4616 RExC_parse = s;
767d463e 4617 }
620e46c5
JH
4618 }
4619 }
4620
b8c5462f
JH
4621 return namedclass;
4622}
4623
4624STATIC void
830247a4 4625S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
b8c5462f 4626{
b938889d 4627 if (!SIZE_ONLY && POSIXCC(UCHARAT(RExC_parse))) {
830247a4 4628 char *s = RExC_parse;
93733859 4629 char c = *s++;
b8c5462f
JH
4630
4631 while(*s && isALNUM(*s))
4632 s++;
4633 if (*s && c == *s && s[1] == ']') {
cd84f5b2
RGS
4634 if (ckWARN(WARN_REGEXP))
4635 vWARN3(s+2,
4636 "POSIX syntax [%c %c] belongs inside character classes",
4637 c, c);
b45f050a
JF
4638
4639 /* [[=foo=]] and [[.foo.]] are still future. */
9a86a77b 4640 if (POSIXCC_NOTYET(c)) {
830247a4 4641 /* adjust RExC_parse so the error shows after
b45f050a 4642 the class closes */
9a86a77b 4643 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
b45f050a
JF
4644 ;
4645 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
4646 }
b8c5462f
JH
4647 }
4648 }
620e46c5
JH
4649}
4650
76e3520e 4651STATIC regnode *
830247a4 4652S_regclass(pTHX_ RExC_state_t *pRExC_state)
a687059c 4653{
ffc61ed2 4654 register UV value;
9a86a77b 4655 register UV nextvalue;
3568d838 4656 register IV prevvalue = OOB_UNICODE;
ffc61ed2 4657 register IV range = 0;
c277df42 4658 register regnode *ret;
ba210ebe 4659 STRLEN numlen;
ffc61ed2 4660 IV namedclass;
9c5ffd7c 4661 char *rangebegin = 0;
936ed897 4662 bool need_class = 0;
9c5ffd7c 4663 SV *listsv = Nullsv;
ffc61ed2
JH
4664 register char *e;
4665 UV n;
9e55ce06
JH
4666 bool optimize_invert = TRUE;
4667 AV* unicode_alternate = 0;
1b2d223b
JH
4668#ifdef EBCDIC
4669 UV literal_endpoint = 0;
4670#endif
ffc61ed2
JH
4671
4672 ret = reganode(pRExC_state, ANYOF, 0);
4673
4674 if (!SIZE_ONLY)
4675 ANYOF_FLAGS(ret) = 0;
4676
9a86a77b 4677 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
ffc61ed2
JH
4678 RExC_naughty++;
4679 RExC_parse++;
4680 if (!SIZE_ONLY)
4681 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
4682 }
a0d0e21e 4683
936ed897 4684 if (SIZE_ONLY)
830247a4 4685 RExC_size += ANYOF_SKIP;
936ed897 4686 else {
830247a4 4687 RExC_emit += ANYOF_SKIP;
936ed897
IZ
4688 if (FOLD)
4689 ANYOF_FLAGS(ret) |= ANYOF_FOLD;
4690 if (LOC)
4691 ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
ffc61ed2
JH
4692 ANYOF_BITMAP_ZERO(ret);
4693 listsv = newSVpvn("# comment\n", 10);
a0d0e21e 4694 }
b8c5462f 4695
9a86a77b
JH
4696 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
4697
b938889d 4698 if (!SIZE_ONLY && POSIXCC(nextvalue))
830247a4 4699 checkposixcc(pRExC_state);
b8c5462f 4700
f064b6ad
HS
4701 /* allow 1st char to be ] (allowing it to be - is dealt with later) */
4702 if (UCHARAT(RExC_parse) == ']')
4703 goto charclassloop;
ffc61ed2 4704
9a86a77b 4705 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
ffc61ed2
JH
4706
4707 charclassloop:
4708
4709 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
4710
73b437c8 4711 if (!range)
830247a4 4712 rangebegin = RExC_parse;
ffc61ed2 4713 if (UTF) {
5e12f4fb 4714 value = utf8n_to_uvchr((U8*)RExC_parse,
3568d838
JH
4715 RExC_end - RExC_parse,
4716 &numlen, 0);
ffc61ed2
JH
4717 RExC_parse += numlen;
4718 }
4719 else
4720 value = UCHARAT(RExC_parse++);
9a86a77b
JH
4721 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
4722 if (value == '[' && POSIXCC(nextvalue))
830247a4 4723 namedclass = regpposixcc(pRExC_state, value);
620e46c5 4724 else if (value == '\\') {
ffc61ed2 4725 if (UTF) {
5e12f4fb 4726 value = utf8n_to_uvchr((U8*)RExC_parse,
ffc61ed2
JH
4727 RExC_end - RExC_parse,
4728 &numlen, 0);
4729 RExC_parse += numlen;
4730 }
4731 else
4732 value = UCHARAT(RExC_parse++);
470c3474 4733 /* Some compilers cannot handle switching on 64-bit integer
ffc61ed2 4734 * values, therefore value cannot be an UV. Yes, this will
e2962f66
JH
4735 * be a problem later if we want switch on Unicode.
4736 * A similar issue a little bit later when switching on
4737 * namedclass. --jhi */
ffc61ed2 4738 switch ((I32)value) {
b8c5462f
JH
4739 case 'w': namedclass = ANYOF_ALNUM; break;
4740 case 'W': namedclass = ANYOF_NALNUM; break;
4741 case 's': namedclass = ANYOF_SPACE; break;
4742 case 'S': namedclass = ANYOF_NSPACE; break;
4743 case 'd': namedclass = ANYOF_DIGIT; break;
4744 case 'D': namedclass = ANYOF_NDIGIT; break;
ffc61ed2
JH
4745 case 'p':
4746 case 'P':
af6f566e 4747 if (RExC_parse >= RExC_end)
2a4859cd 4748 vFAIL2("Empty \\%c{}", (U8)value);
ffc61ed2 4749 if (*RExC_parse == '{') {
0da60cf5 4750 U8 c = (U8)value;
ffc61ed2
JH
4751 e = strchr(RExC_parse++, '}');
4752 if (!e)
0da60cf5 4753 vFAIL2("Missing right brace on \\%c{}", c);
ab13f0c7
JH
4754 while (isSPACE(UCHARAT(RExC_parse)))
4755 RExC_parse++;
4756 if (e == RExC_parse)
0da60cf5 4757 vFAIL2("Empty \\%c{}", c);
ffc61ed2 4758 n = e - RExC_parse;
ab13f0c7
JH
4759 while (isSPACE(UCHARAT(RExC_parse + n - 1)))
4760 n--;
ffc61ed2
JH
4761 }
4762 else {
4763 e = RExC_parse;
4764 n = 1;
4765 }
4766 if (!SIZE_ONLY) {
ab13f0c7
JH
4767 if (UCHARAT(RExC_parse) == '^') {
4768 RExC_parse++;
4769 n--;
4770 value = value == 'p' ? 'P' : 'p'; /* toggle */
4771 while (isSPACE(UCHARAT(RExC_parse))) {
4772 RExC_parse++;
4773 n--;
4774 }
4775 }
ffc61ed2 4776 if (value == 'p')
ab13f0c7
JH
4777 Perl_sv_catpvf(aTHX_ listsv,
4778 "+utf8::%.*s\n", (int)n, RExC_parse);
ffc61ed2 4779 else
ab13f0c7
JH
4780 Perl_sv_catpvf(aTHX_ listsv,
4781 "!utf8::%.*s\n", (int)n, RExC_parse);
ffc61ed2
JH
4782 }
4783 RExC_parse = e + 1;
4784 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
f81125e2
JP
4785 namedclass = ANYOF_MAX; /* no official name, but it's named */
4786 break;
b8c5462f
JH
4787 case 'n': value = '\n'; break;
4788 case 'r': value = '\r'; break;
4789 case 't': value = '\t'; break;
4790 case 'f': value = '\f'; break;
4791 case 'b': value = '\b'; break;
c7f1f016
NIS
4792 case 'e': value = ASCII_TO_NATIVE('\033');break;
4793 case 'a': value = ASCII_TO_NATIVE('\007');break;
b8c5462f 4794 case 'x':
ffc61ed2 4795 if (*RExC_parse == '{') {
a4c04bdc
NC
4796 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
4797 | PERL_SCAN_DISALLOW_PREFIX;
ffc61ed2 4798 e = strchr(RExC_parse++, '}');
b81d288d 4799 if (!e)
ffc61ed2 4800 vFAIL("Missing right brace on \\x{}");
53305cf1
NC
4801
4802 numlen = e - RExC_parse;
4803 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
ffc61ed2
JH
4804 RExC_parse = e + 1;
4805 }
4806 else {
a4c04bdc 4807 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
53305cf1
NC
4808 numlen = 2;
4809 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
ffc61ed2
JH
4810 RExC_parse += numlen;
4811 }
b8c5462f
JH
4812 break;
4813 case 'c':
830247a4 4814 value = UCHARAT(RExC_parse++);
b8c5462f
JH
4815 value = toCTRL(value);
4816 break;
4817 case '0': case '1': case '2': case '3': case '4':
4818 case '5': case '6': case '7': case '8': case '9':
53305cf1
NC
4819 {
4820 I32 flags = 0;
4821 numlen = 3;
4822 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
830247a4 4823 RExC_parse += numlen;
b8c5462f 4824 break;
53305cf1 4825 }
1028017a 4826 default:
e476b1b5 4827 if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(value))
ffc61ed2
JH
4828 vWARN2(RExC_parse,
4829 "Unrecognized escape \\%c in character class passed through",
4830 (int)value);
1028017a 4831 break;
b8c5462f 4832 }
ffc61ed2 4833 } /* end of \blah */
1b2d223b
JH
4834#ifdef EBCDIC
4835 else
4836 literal_endpoint++;
4837#endif
ffc61ed2
JH
4838
4839 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
4840
4841 if (!SIZE_ONLY && !need_class)
936ed897 4842 ANYOF_CLASS_ZERO(ret);
ffc61ed2 4843
936ed897 4844 need_class = 1;
ffc61ed2
JH
4845
4846 /* a bad range like a-\d, a-[:digit:] ? */
4847 if (range) {
73b437c8 4848 if (!SIZE_ONLY) {
e476b1b5 4849 if (ckWARN(WARN_REGEXP))
830247a4 4850 vWARN4(RExC_parse,
b45f050a 4851 "False [] range \"%*.*s\"",
830247a4
IZ
4852 RExC_parse - rangebegin,
4853 RExC_parse - rangebegin,
b45f050a 4854 rangebegin);
3568d838
JH
4855 if (prevvalue < 256) {
4856 ANYOF_BITMAP_SET(ret, prevvalue);
ffc61ed2
JH
4857 ANYOF_BITMAP_SET(ret, '-');
4858 }
4859 else {
4860 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
4861 Perl_sv_catpvf(aTHX_ listsv,
3568d838 4862 "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
ffc61ed2 4863 }
b8c5462f 4864 }
ffc61ed2
JH
4865
4866 range = 0; /* this was not a true range */
73b437c8 4867 }
ffc61ed2 4868
73b437c8 4869 if (!SIZE_ONLY) {
c49a72a9
NC
4870 const char *what = NULL;
4871 char yesno = 0;
4872
3568d838
JH
4873 if (namedclass > OOB_NAMEDCLASS)
4874 optimize_invert = FALSE;
e2962f66
JH
4875 /* Possible truncation here but in some 64-bit environments
4876 * the compiler gets heartburn about switch on 64-bit values.
4877 * A similar issue a little earlier when switching on value.
98f323fa 4878 * --jhi */
e2962f66 4879 switch ((I32)namedclass) {
73b437c8
JH
4880 case ANYOF_ALNUM:
4881 if (LOC)
936ed897 4882 ANYOF_CLASS_SET(ret, ANYOF_ALNUM);
73b437c8
JH
4883 else {
4884 for (value = 0; value < 256; value++)
4885 if (isALNUM(value))
936ed897 4886 ANYOF_BITMAP_SET(ret, value);
73b437c8 4887 }
c49a72a9
NC
4888 yesno = '+';
4889 what = "Word";
73b437c8
JH
4890 break;
4891 case ANYOF_NALNUM:
4892 if (LOC)
936ed897 4893 ANYOF_CLASS_SET(ret, ANYOF_NALNUM);
73b437c8
JH
4894 else {
4895 for (value = 0; value < 256; value++)
4896 if (!isALNUM(value))
936ed897 4897 ANYOF_BITMAP_SET(ret, value);
73b437c8 4898 }
c49a72a9
NC
4899 yesno = '!';
4900 what = "Word";
73b437c8 4901 break;
ffc61ed2 4902 case ANYOF_ALNUMC:
73b437c8 4903 if (LOC)
ffc61ed2 4904 ANYOF_CLASS_SET(ret, ANYOF_ALNUMC);
73b437c8
JH
4905 else {
4906 for (value = 0; value < 256; value++)
ffc61ed2 4907 if (isALNUMC(value))
936ed897 4908 ANYOF_BITMAP_SET(ret, value);
73b437c8 4909 }
c49a72a9
NC
4910 yesno = '+';
4911 what = "Alnum";
73b437c8
JH
4912 break;
4913 case ANYOF_NALNUMC:
4914 if (LOC)
936ed897 4915 ANYOF_CLASS_SET(ret, ANYOF_NALNUMC);
73b437c8
JH
4916 else {
4917 for (value = 0; value < 256; value++)
4918 if (!isALNUMC(value))
936ed897 4919 ANYOF_BITMAP_SET(ret, value);
73b437c8 4920 }
c49a72a9
NC
4921 yesno = '!';
4922 what = "Alnum";
73b437c8
JH
4923 break;
4924 case ANYOF_ALPHA:
4925 if (LOC)
936ed897 4926 ANYOF_CLASS_SET(ret, ANYOF_ALPHA);
73b437c8
JH
4927 else {
4928 for (value = 0; value < 256; value++)
4929 if (isALPHA(value))
936ed897 4930 ANYOF_BITMAP_SET(ret, value);
73b437c8 4931 }
c49a72a9
NC
4932 yesno = '+';
4933 what = "Alpha";
73b437c8
JH
4934 break;
4935 case ANYOF_NALPHA:
4936 if (LOC)
936ed897 4937 ANYOF_CLASS_SET(ret, ANYOF_NALPHA);
73b437c8
JH
4938 else {
4939 for (value = 0; value < 256; value++)
4940 if (!isALPHA(value))
936ed897 4941 ANYOF_BITMAP_SET(ret, value);
73b437c8 4942 }
c49a72a9
NC
4943 yesno = '!';
4944 what = "Alpha";
73b437c8
JH
4945 break;
4946 case ANYOF_ASCII:
4947 if (LOC)
936ed897 4948 ANYOF_CLASS_SET(ret, ANYOF_ASCII);
73b437c8 4949 else {
c7f1f016 4950#ifndef EBCDIC
1ba5c669
JH
4951 for (value = 0; value < 128; value++)
4952 ANYOF_BITMAP_SET(ret, value);
4953#else /* EBCDIC */
ffbc6a93 4954 for (value = 0; value < 256; value++) {
3a3c4447
JH
4955 if (isASCII(value))
4956 ANYOF_BITMAP_SET(ret, value);
ffbc6a93 4957 }
1ba5c669 4958#endif /* EBCDIC */
73b437c8 4959 }
c49a72a9
NC
4960 yesno = '+';
4961 what = "ASCII";
73b437c8
JH
4962 break;
4963 case ANYOF_NASCII:
4964 if (LOC)
936ed897 4965 ANYOF_CLASS_SET(ret, ANYOF_NASCII);
73b437c8 4966 else {
c7f1f016 4967#ifndef EBCDIC
1ba5c669
JH
4968 for (value = 128; value < 256; value++)
4969 ANYOF_BITMAP_SET(ret, value);
4970#else /* EBCDIC */
ffbc6a93 4971 for (value = 0; value < 256; value++) {
3a3c4447
JH
4972 if (!isASCII(value))
4973 ANYOF_BITMAP_SET(ret, value);
ffbc6a93 4974 }
1ba5c669 4975#endif /* EBCDIC */
73b437c8 4976 }
c49a72a9
NC
4977 yesno = '!';
4978 what = "ASCII";
73b437c8 4979 break;
aaa51d5e
JF
4980 case ANYOF_BLANK:
4981 if (LOC)
4982 ANYOF_CLASS_SET(ret, ANYOF_BLANK);
4983 else {
4984 for (value = 0; value < 256; value++)
4985 if (isBLANK(value))
4986 ANYOF_BITMAP_SET(ret, value);
4987 }
c49a72a9
NC
4988 yesno = '+';
4989 what = "Blank";
aaa51d5e
JF
4990 break;
4991 case ANYOF_NBLANK:
4992 if (LOC)
4993 ANYOF_CLASS_SET(ret, ANYOF_NBLANK);
4994 else {
4995 for (value = 0; value < 256; value++)
4996 if (!isBLANK(value))
4997 ANYOF_BITMAP_SET(ret, value);
4998 }
c49a72a9
NC
4999 yesno = '!';
5000 what = "Blank";
aaa51d5e 5001 break;
73b437c8
JH
5002 case ANYOF_CNTRL:
5003 if (LOC)
936ed897 5004 ANYOF_CLASS_SET(ret, ANYOF_CNTRL);
73b437c8
JH
5005 else {
5006 for (value = 0; value < 256; value++)
5007 if (isCNTRL(value))
936ed897 5008 ANYOF_BITMAP_SET(ret, value);
73b437c8 5009 }
c49a72a9
NC
5010 yesno = '+';
5011 what = "Cntrl";
73b437c8
JH
5012 break;
5013 case ANYOF_NCNTRL:
5014 if (LOC)
936ed897 5015 ANYOF_CLASS_SET(ret, ANYOF_NCNTRL);
73b437c8
JH
5016 else {
5017 for (value = 0; value < 256; value++)
5018 if (!isCNTRL(value))
936ed897 5019 ANYOF_BITMAP_SET(ret, value);
73b437c8 5020 }
c49a72a9
NC
5021 yesno = '!';
5022 what = "Cntrl";
ffc61ed2
JH
5023 break;
5024 case ANYOF_DIGIT:
5025 if (LOC)
5026 ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
5027 else {
5028 /* consecutive digits assumed */
5029 for (value = '0'; value <= '9'; value++)
5030 ANYOF_BITMAP_SET(ret, value);
5031 }
c49a72a9
NC
5032 yesno = '+';
5033 what = "Digit";
ffc61ed2
JH
5034 break;
5035 case ANYOF_NDIGIT:
5036 if (LOC)
5037 ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
5038 else {
5039 /* consecutive digits assumed */
5040 for (value = 0; value < '0'; value++)
5041 ANYOF_BITMAP_SET(ret, value);
5042 for (value = '9' + 1; value < 256; value++)
5043 ANYOF_BITMAP_SET(ret, value);
5044 }
c49a72a9
NC
5045 yesno = '!';
5046 what = "Digit";
73b437c8
JH
5047 break;
5048 case ANYOF_GRAPH:
5049 if (LOC)
936ed897 5050 ANYOF_CLASS_SET(ret, ANYOF_GRAPH);
73b437c8
JH
5051 else {
5052 for (value = 0; value < 256; value++)
5053 if (isGRAPH(value))
936ed897 5054 ANYOF_BITMAP_SET(ret, value);
73b437c8 5055 }
c49a72a9
NC
5056 yesno = '+';
5057 what = "Graph";
73b437c8
JH
5058 break;
5059 case ANYOF_NGRAPH:
5060 if (LOC)
936ed897 5061 ANYOF_CLASS_SET(ret, ANYOF_NGRAPH);
73b437c8
JH
5062 else {
5063 for (value = 0; value < 256; value++)
5064 if (!isGRAPH(value))
936ed897 5065 ANYOF_BITMAP_SET(ret, value);
73b437c8 5066 }
c49a72a9
NC
5067 yesno = '!';
5068 what = "Graph";
73b437c8
JH
5069 break;
5070 case ANYOF_LOWER:
5071 if (LOC)
936ed897 5072 ANYOF_CLASS_SET(ret, ANYOF_LOWER);
73b437c8
JH
5073 else {
5074 for (value = 0; value < 256; value++)
5075 if (isLOWER(value))
936ed897 5076 ANYOF_BITMAP_SET(ret, value);
73b437c8 5077 }
c49a72a9
NC
5078 yesno = '+';
5079 what = "Lower";
73b437c8
JH
5080 break;
5081 case ANYOF_NLOWER:
5082 if (LOC)
936ed897 5083 ANYOF_CLASS_SET(ret, ANYOF_NLOWER);
73b437c8
JH
5084 else {
5085 for (value = 0; value < 256; value++)
5086 if (!isLOWER(value))
936ed897 5087 ANYOF_BITMAP_SET(ret, value);
73b437c8 5088 }
c49a72a9
NC
5089 yesno = '!';
5090 what = "Lower";
73b437c8
JH
5091 break;
5092 case ANYOF_PRINT:
5093 if (LOC)
936ed897 5094 ANYOF_CLASS_SET(ret, ANYOF_PRINT);
73b437c8
JH
5095 else {
5096 for (value = 0; value < 256; value++)
5097 if (isPRINT(value))
936ed897 5098 ANYOF_BITMAP_SET(ret, value);
73b437c8 5099 }
c49a72a9
NC
5100 yesno = '+';
5101 what = "Print";
73b437c8
JH
5102 break;
5103 case ANYOF_NPRINT:
5104 if (LOC)
936ed897 5105 ANYOF_CLASS_SET(ret, ANYOF_NPRINT);
73b437c8
JH
5106 else {
5107 for (value = 0; value < 256; value++)
5108 if (!isPRINT(value))
936ed897 5109 ANYOF_BITMAP_SET(ret, value);
73b437c8 5110 }
c49a72a9
NC
5111 yesno = '!';
5112 what = "Print";
73b437c8 5113 break;
aaa51d5e
JF
5114 case ANYOF_PSXSPC:
5115 if (LOC)
5116 ANYOF_CLASS_SET(ret, ANYOF_PSXSPC);
5117 else {
5118 for (value = 0; value < 256; value++)
5119 if (isPSXSPC(value))
5120 ANYOF_BITMAP_SET(ret, value);
5121 }
c49a72a9
NC
5122 yesno = '+';
5123 what = "Space";
aaa51d5e
JF
5124 break;
5125 case ANYOF_NPSXSPC:
5126 if (LOC)
5127 ANYOF_CLASS_SET(ret, ANYOF_NPSXSPC);
5128 else {
5129 for (value = 0; value < 256; value++)
5130 if (!isPSXSPC(value))
5131 ANYOF_BITMAP_SET(ret, value);
5132 }
c49a72a9
NC
5133 yesno = '!';
5134 what = "Space";
aaa51d5e 5135 break;
73b437c8
JH
5136 case ANYOF_PUNCT:
5137 if (LOC)
936ed897 5138 ANYOF_CLASS_SET(ret, ANYOF_PUNCT);
73b437c8
JH
5139 else {
5140 for (value = 0; value < 256; value++)
5141 if (isPUNCT(value))
936ed897 5142 ANYOF_BITMAP_SET(ret, value);
73b437c8 5143 }
c49a72a9
NC
5144 yesno = '+';
5145 what = "Punct";
73b437c8
JH
5146 break;
5147 case ANYOF_NPUNCT:
5148 if (LOC)
936ed897 5149 ANYOF_CLASS_SET(ret, ANYOF_NPUNCT);
73b437c8
JH
5150 else {
5151 for (value = 0; value < 256; value++)
5152 if (!isPUNCT(value))
936ed897 5153 ANYOF_BITMAP_SET(ret, value);
73b437c8 5154 }
c49a72a9
NC
5155 yesno = '!';
5156 what = "Punct";
ffc61ed2
JH
5157 break;
5158 case ANYOF_SPACE:
5159 if (LOC)
5160 ANYOF_CLASS_SET(ret, ANYOF_SPACE);
5161 else {
5162 for (value = 0; value < 256; value++)
5163 if (isSPACE(value))
5164 ANYOF_BITMAP_SET(ret, value);
5165 }
c49a72a9
NC
5166 yesno = '+';
5167 what = "SpacePerl";
ffc61ed2
JH
5168 break;
5169 case ANYOF_NSPACE:
5170 if (LOC)
5171 ANYOF_CLASS_SET(ret, ANYOF_NSPACE);
5172 else {
5173 for (value = 0; value < 256; value++)
5174 if (!isSPACE(value))
5175 ANYOF_BITMAP_SET(ret, value);
5176 }
c49a72a9
NC
5177 yesno = '!';
5178 what = "SpacePerl";
73b437c8
JH
5179 break;
5180 case ANYOF_UPPER:
5181 if (LOC)
936ed897 5182 ANYOF_CLASS_SET(ret, ANYOF_UPPER);
73b437c8
JH
5183 else {
5184 for (value = 0; value < 256; value++)
5185 if (isUPPER(value))
936ed897 5186 ANYOF_BITMAP_SET(ret, value);
73b437c8 5187 }
c49a72a9
NC
5188 yesno = '+';
5189 what = "Upper";
73b437c8
JH
5190 break;
5191 case ANYOF_NUPPER:
5192 if (LOC)
936ed897 5193 ANYOF_CLASS_SET(ret, ANYOF_NUPPER);
73b437c8
JH
5194 else {
5195 for (value = 0; value < 256; value++)
5196 if (!isUPPER(value))
936ed897 5197 ANYOF_BITMAP_SET(ret, value);
73b437c8 5198 }
c49a72a9
NC
5199 yesno = '!';
5200 what = "Upper";
73b437c8
JH
5201 break;
5202 case ANYOF_XDIGIT:
5203 if (LOC)
936ed897 5204 ANYOF_CLASS_SET(ret, ANYOF_XDIGIT);
73b437c8
JH
5205 else {
5206 for (value = 0; value < 256; value++)
5207 if (isXDIGIT(value))
936ed897 5208 ANYOF_BITMAP_SET(ret, value);
73b437c8 5209 }
c49a72a9
NC
5210 yesno = '+';
5211 what = "XDigit";
73b437c8
JH
5212 break;
5213 case ANYOF_NXDIGIT:
5214 if (LOC)
936ed897 5215 ANYOF_CLASS_SET(ret, ANYOF_NXDIGIT);
73b437c8
JH
5216 else {
5217 for (value = 0; value < 256; value++)
5218 if (!isXDIGIT(value))
936ed897 5219 ANYOF_BITMAP_SET(ret, value);
73b437c8 5220 }
c49a72a9
NC
5221 yesno = '!';
5222 what = "XDigit";
73b437c8 5223 break;
f81125e2
JP
5224 case ANYOF_MAX:
5225 /* this is to handle \p and \P */
5226 break;
73b437c8 5227 default:
b45f050a 5228 vFAIL("Invalid [::] class");
73b437c8 5229 break;
b8c5462f 5230 }
c49a72a9
NC
5231 if (what) {
5232 /* Strings such as "+utf8::isWord\n" */
5233 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::Is%s\n", yesno, what);
5234 }
b8c5462f 5235 if (LOC)
936ed897 5236 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
73b437c8 5237 continue;
a0d0e21e 5238 }
ffc61ed2
JH
5239 } /* end of namedclass \blah */
5240
a0d0e21e 5241 if (range) {
eb160463 5242 if (prevvalue > (IV)value) /* b-a */ {
b45f050a 5243 Simple_vFAIL4("Invalid [] range \"%*.*s\"",
830247a4
IZ
5244 RExC_parse - rangebegin,
5245 RExC_parse - rangebegin,
b45f050a 5246 rangebegin);
3568d838 5247 range = 0; /* not a valid range */
73b437c8 5248 }
a0d0e21e
LW
5249 }
5250 else {
3568d838 5251 prevvalue = value; /* save the beginning of the range */
830247a4
IZ
5252 if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
5253 RExC_parse[1] != ']') {
5254 RExC_parse++;
ffc61ed2
JH
5255
5256 /* a bad range like \w-, [:word:]- ? */
5257 if (namedclass > OOB_NAMEDCLASS) {
e476b1b5 5258 if (ckWARN(WARN_REGEXP))
830247a4 5259 vWARN4(RExC_parse,
b45f050a 5260 "False [] range \"%*.*s\"",
830247a4
IZ
5261 RExC_parse - rangebegin,
5262 RExC_parse - rangebegin,
b45f050a 5263 rangebegin);
73b437c8 5264 if (!SIZE_ONLY)
936ed897 5265 ANYOF_BITMAP_SET(ret, '-');
73b437c8 5266 } else
ffc61ed2
JH
5267 range = 1; /* yeah, it's a range! */
5268 continue; /* but do it the next time */
a0d0e21e 5269 }
a687059c 5270 }
ffc61ed2 5271
93733859 5272 /* now is the next time */
ae5c130c 5273 if (!SIZE_ONLY) {
3568d838
JH
5274 IV i;
5275
5276 if (prevvalue < 256) {
5277 IV ceilvalue = value < 256 ? value : 255;
5278
5279#ifdef EBCDIC
1b2d223b
JH
5280 /* In EBCDIC [\x89-\x91] should include
5281 * the \x8e but [i-j] should not. */
5282 if (literal_endpoint == 2 &&
5283 ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
5284 (isUPPER(prevvalue) && isUPPER(ceilvalue))))
ffc61ed2 5285 {
3568d838
JH
5286 if (isLOWER(prevvalue)) {
5287 for (i = prevvalue; i <= ceilvalue; i++)
ffc61ed2
JH
5288 if (isLOWER(i))
5289 ANYOF_BITMAP_SET(ret, i);
5290 } else {
3568d838 5291 for (i = prevvalue; i <= ceilvalue; i++)
ffc61ed2
JH
5292 if (isUPPER(i))
5293 ANYOF_BITMAP_SET(ret, i);
5294 }
8ada0baa 5295 }
ffc61ed2 5296 else
8ada0baa 5297#endif
a5961de5
JH
5298 for (i = prevvalue; i <= ceilvalue; i++)
5299 ANYOF_BITMAP_SET(ret, i);
3568d838 5300 }
a5961de5 5301 if (value > 255 || UTF) {
b08decb7
JH
5302 UV prevnatvalue = NATIVE_TO_UNI(prevvalue);
5303 UV natvalue = NATIVE_TO_UNI(value);
5304
ffc61ed2 5305 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
b08decb7 5306 if (prevnatvalue < natvalue) { /* what about > ? */
ffc61ed2 5307 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
b08decb7
JH
5308 prevnatvalue, natvalue);
5309 }
5310 else if (prevnatvalue == natvalue) {
5311 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue);
09091399 5312 if (FOLD) {
89ebb4a3 5313 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
254ba52a 5314 STRLEN foldlen;
2f3bf011 5315 UV f = to_uni_fold(natvalue, foldbuf, &foldlen);
254ba52a 5316
c840d2a2
JH
5317 /* If folding and foldable and a single
5318 * character, insert also the folded version
5319 * to the charclass. */
9e55ce06 5320 if (f != value) {
eb160463 5321 if (foldlen == (STRLEN)UNISKIP(f))
9e55ce06
JH
5322 Perl_sv_catpvf(aTHX_ listsv,
5323 "%04"UVxf"\n", f);
5324 else {
5325 /* Any multicharacter foldings
5326 * require the following transform:
5327 * [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst)
5328 * where E folds into "pq" and F folds
5329 * into "rst", all other characters
5330 * fold to single characters. We save
5331 * away these multicharacter foldings,
5332 * to be later saved as part of the
5333 * additional "s" data. */
5334 SV *sv;
5335
5336 if (!unicode_alternate)
5337 unicode_alternate = newAV();
5338 sv = newSVpvn((char*)foldbuf, foldlen);
5339 SvUTF8_on(sv);
5340 av_push(unicode_alternate, sv);
5341 }
5342 }
254ba52a 5343
60a8b682
JH
5344 /* If folding and the value is one of the Greek
5345 * sigmas insert a few more sigmas to make the
5346 * folding rules of the sigmas to work right.
5347 * Note that not all the possible combinations
5348 * are handled here: some of them are handled
9e55ce06
JH
5349 * by the standard folding rules, and some of
5350 * them (literal or EXACTF cases) are handled
5351 * during runtime in regexec.c:S_find_byclass(). */
09091399
JH
5352 if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) {
5353 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 5354 (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA);
09091399 5355 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 5356 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
09091399
JH
5357 }
5358 else if (value == UNICODE_GREEK_CAPITAL_LETTER_SIGMA)
5359 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 5360 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
09091399
JH
5361 }
5362 }
ffc61ed2 5363 }
1b2d223b
JH
5364#ifdef EBCDIC
5365 literal_endpoint = 0;
5366#endif
8ada0baa 5367 }
ffc61ed2
JH
5368
5369 range = 0; /* this range (if it was one) is done now */
a0d0e21e 5370 }
ffc61ed2 5371
936ed897 5372 if (need_class) {
4f66b38d 5373 ANYOF_FLAGS(ret) |= ANYOF_LARGE;
936ed897 5374 if (SIZE_ONLY)
830247a4 5375 RExC_size += ANYOF_CLASS_ADD_SKIP;
936ed897 5376 else
830247a4 5377 RExC_emit += ANYOF_CLASS_ADD_SKIP;
936ed897 5378 }
ffc61ed2 5379
ae5c130c 5380 /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
b8c5462f 5381 if (!SIZE_ONLY &&
ffc61ed2 5382 /* If the only flag is folding (plus possibly inversion). */
516a5887
JH
5383 ((ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD)
5384 ) {
a0ed51b3 5385 for (value = 0; value < 256; ++value) {
936ed897 5386 if (ANYOF_BITMAP_TEST(ret, value)) {
eb160463 5387 UV fold = PL_fold[value];
ffc61ed2
JH
5388
5389 if (fold != value)
5390 ANYOF_BITMAP_SET(ret, fold);
ae5c130c
GS
5391 }
5392 }
936ed897 5393 ANYOF_FLAGS(ret) &= ~ANYOF_FOLD;
ae5c130c 5394 }
ffc61ed2 5395
ae5c130c 5396 /* optimize inverted simple patterns (e.g. [^a-z]) */
3568d838 5397 if (!SIZE_ONLY && optimize_invert &&
ffc61ed2
JH
5398 /* If the only flag is inversion. */
5399 (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
b8c5462f 5400 for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
936ed897 5401 ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL;
1aa99e6b 5402 ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
ae5c130c 5403 }
a0d0e21e 5404
b81d288d 5405 if (!SIZE_ONLY) {
fde631ed 5406 AV *av = newAV();
ffc61ed2
JH
5407 SV *rv;
5408
9e55ce06 5409 /* The 0th element stores the character class description
6a0407ee 5410 * in its textual form: used later (regexec.c:Perl_regclass_swash())
9e55ce06
JH
5411 * to initialize the appropriate swash (which gets stored in
5412 * the 1st element), and also useful for dumping the regnode.
5413 * The 2nd element stores the multicharacter foldings,
6a0407ee 5414 * used later (regexec.c:S_reginclass()). */
ffc61ed2
JH
5415 av_store(av, 0, listsv);
5416 av_store(av, 1, NULL);
9e55ce06 5417 av_store(av, 2, (SV*)unicode_alternate);
ffc61ed2 5418 rv = newRV_noinc((SV*)av);
19860706 5419 n = add_data(pRExC_state, 1, "s");
830247a4 5420 RExC_rx->data->data[n] = (void*)rv;
ffc61ed2 5421 ARG_SET(ret, n);
a0ed51b3
LW
5422 }
5423
5424 return ret;
5425}
5426
76e3520e 5427STATIC char*
830247a4 5428S_nextchar(pTHX_ RExC_state_t *pRExC_state)
a0d0e21e 5429{
830247a4 5430 char* retval = RExC_parse++;
a0d0e21e 5431
4633a7c4 5432 for (;;) {
830247a4
IZ
5433 if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
5434 RExC_parse[2] == '#') {
e994fd66
AE
5435 while (*RExC_parse != ')') {
5436 if (RExC_parse == RExC_end)
5437 FAIL("Sequence (?#... not terminated");
830247a4 5438 RExC_parse++;
e994fd66 5439 }
830247a4 5440 RExC_parse++;
4633a7c4
LW
5441 continue;
5442 }
e2509266 5443 if (RExC_flags & PMf_EXTENDED) {
830247a4
IZ
5444 if (isSPACE(*RExC_parse)) {
5445 RExC_parse++;
748a9306
LW
5446 continue;
5447 }
830247a4 5448 else if (*RExC_parse == '#') {
e994fd66
AE
5449 while (RExC_parse < RExC_end)
5450 if (*RExC_parse++ == '\n') break;
748a9306
LW
5451 continue;
5452 }
748a9306 5453 }
4633a7c4 5454 return retval;
a0d0e21e 5455 }
a687059c
LW
5456}
5457
5458/*
c277df42 5459- reg_node - emit a node
a0d0e21e 5460*/
76e3520e 5461STATIC regnode * /* Location. */
830247a4 5462S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
a687059c 5463{
c277df42
IZ
5464 register regnode *ret;
5465 register regnode *ptr;
a687059c 5466
830247a4 5467 ret = RExC_emit;
c277df42 5468 if (SIZE_ONLY) {
830247a4
IZ
5469 SIZE_ALIGN(RExC_size);
5470 RExC_size += 1;
a0d0e21e
LW
5471 return(ret);
5472 }
a687059c 5473
c277df42 5474 NODE_ALIGN_FILL(ret);
a0d0e21e 5475 ptr = ret;
c277df42 5476 FILL_ADVANCE_NODE(ptr, op);
fac92740 5477 if (RExC_offsets) { /* MJD */
ccb2c380 5478 MJD_OFFSET_DEBUG(("%s:%u: (op %s) %s %u <- %u (len %u) (max %u).\n",
fac92740
MJD
5479 "reg_node", __LINE__,
5480 reg_name[op],
5481 RExC_emit - RExC_emit_start > RExC_offsets[0]
5482 ? "Overwriting end of array!\n" : "OK",
5483 RExC_emit - RExC_emit_start,
5484 RExC_parse - RExC_start,
5485 RExC_offsets[0]));
ccb2c380 5486 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
fac92740
MJD
5487 }
5488
830247a4 5489 RExC_emit = ptr;
a687059c 5490
a0d0e21e 5491 return(ret);
a687059c
LW
5492}
5493
5494/*
a0d0e21e
LW
5495- reganode - emit a node with an argument
5496*/
76e3520e 5497STATIC regnode * /* Location. */
830247a4 5498S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
fe14fcc3 5499{
c277df42
IZ
5500 register regnode *ret;
5501 register regnode *ptr;
fe14fcc3 5502
830247a4 5503 ret = RExC_emit;
c277df42 5504 if (SIZE_ONLY) {
830247a4
IZ
5505 SIZE_ALIGN(RExC_size);
5506 RExC_size += 2;
a0d0e21e
LW
5507 return(ret);
5508 }
fe14fcc3 5509
c277df42 5510 NODE_ALIGN_FILL(ret);
a0d0e21e 5511 ptr = ret;
c277df42 5512 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
fac92740 5513 if (RExC_offsets) { /* MJD */
ccb2c380 5514 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %u <- %u (max %u).\n",
fac92740 5515 "reganode",
ccb2c380
MP
5516 __LINE__,
5517 reg_name[op],
fac92740
MJD
5518 RExC_emit - RExC_emit_start > RExC_offsets[0] ?
5519 "Overwriting end of array!\n" : "OK",
5520 RExC_emit - RExC_emit_start,
5521 RExC_parse - RExC_start,
5522 RExC_offsets[0]));
ccb2c380 5523 Set_Cur_Node_Offset;
fac92740
MJD
5524 }
5525
830247a4 5526 RExC_emit = ptr;
fe14fcc3 5527
a0d0e21e 5528 return(ret);
fe14fcc3
LW
5529}
5530
5531/*
cd439c50 5532- reguni - emit (if appropriate) a Unicode character
a0ed51b3
LW
5533*/
5534STATIC void
830247a4 5535S_reguni(pTHX_ RExC_state_t *pRExC_state, UV uv, char* s, STRLEN* lenp)
a0ed51b3 5536{
5e12f4fb 5537 *lenp = SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
a0ed51b3
LW
5538}
5539
5540/*
a0d0e21e
LW
5541- reginsert - insert an operator in front of already-emitted operand
5542*
5543* Means relocating the operand.
5544*/
76e3520e 5545STATIC void
830247a4 5546S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd)
a687059c 5547{
c277df42
IZ
5548 register regnode *src;
5549 register regnode *dst;
5550 register regnode *place;
5551 register int offset = regarglen[(U8)op];
b81d288d 5552
22c35a8c 5553/* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
c277df42
IZ
5554
5555 if (SIZE_ONLY) {
830247a4 5556 RExC_size += NODE_STEP_REGNODE + offset;
a0d0e21e
LW
5557 return;
5558 }
a687059c 5559
830247a4
IZ
5560 src = RExC_emit;
5561 RExC_emit += NODE_STEP_REGNODE + offset;
5562 dst = RExC_emit;
fac92740 5563 while (src > opnd) {
c277df42 5564 StructCopy(--src, --dst, regnode);
fac92740 5565 if (RExC_offsets) { /* MJD 20010112 */
ccb2c380 5566 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %u -> %u (max %u).\n",
fac92740 5567 "reg_insert",
ccb2c380
MP
5568 __LINE__,
5569 reg_name[op],
fac92740
MJD
5570 dst - RExC_emit_start > RExC_offsets[0]
5571 ? "Overwriting end of array!\n" : "OK",
5572 src - RExC_emit_start,
5573 dst - RExC_emit_start,
5574 RExC_offsets[0]));
ccb2c380
MP
5575 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
5576 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
fac92740
MJD
5577 }
5578 }
5579
a0d0e21e
LW
5580
5581 place = opnd; /* Op node, where operand used to be. */
fac92740 5582 if (RExC_offsets) { /* MJD */
ccb2c380 5583 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %u <- %u (max %u).\n",
fac92740 5584 "reginsert",
ccb2c380
MP
5585 __LINE__,
5586 reg_name[op],
fac92740
MJD
5587 place - RExC_emit_start > RExC_offsets[0]
5588 ? "Overwriting end of array!\n" : "OK",
5589 place - RExC_emit_start,
5590 RExC_parse - RExC_start,
5591 RExC_offsets[0]));
ccb2c380 5592 Set_Node_Offset(place, RExC_parse);
45948336 5593 Set_Node_Length(place, 1);
fac92740 5594 }
c277df42
IZ
5595 src = NEXTOPER(place);
5596 FILL_ADVANCE_NODE(place, op);
5597 Zero(src, offset, regnode);
a687059c
LW
5598}
5599
5600/*
c277df42 5601- regtail - set the next-pointer at the end of a node chain of p to val.
a0d0e21e 5602*/
76e3520e 5603STATIC void
830247a4 5604S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, regnode *val)
a687059c 5605{
c277df42
IZ
5606 register regnode *scan;
5607 register regnode *temp;
a0d0e21e 5608
c277df42 5609 if (SIZE_ONLY)
a0d0e21e
LW
5610 return;
5611
5612 /* Find last node. */
5613 scan = p;
5614 for (;;) {
5615 temp = regnext(scan);
5616 if (temp == NULL)
5617 break;
5618 scan = temp;
5619 }
a687059c 5620
c277df42
IZ
5621 if (reg_off_by_arg[OP(scan)]) {
5622 ARG_SET(scan, val - scan);
a0ed51b3
LW
5623 }
5624 else {
c277df42
IZ
5625 NEXT_OFF(scan) = val - scan;
5626 }
a687059c
LW
5627}
5628
5629/*
a0d0e21e
LW
5630- regoptail - regtail on operand of first argument; nop if operandless
5631*/
76e3520e 5632STATIC void
830247a4 5633S_regoptail(pTHX_ RExC_state_t *pRExC_state, regnode *p, regnode *val)
a687059c 5634{
a0d0e21e 5635 /* "Operandless" and "op != BRANCH" are synonymous in practice. */
c277df42
IZ
5636 if (p == NULL || SIZE_ONLY)
5637 return;
22c35a8c 5638 if (PL_regkind[(U8)OP(p)] == BRANCH) {
830247a4 5639 regtail(pRExC_state, NEXTOPER(p), val);
a0ed51b3 5640 }
22c35a8c 5641 else if ( PL_regkind[(U8)OP(p)] == BRANCHJ) {
830247a4 5642 regtail(pRExC_state, NEXTOPER(NEXTOPER(p)), val);
a0ed51b3
LW
5643 }
5644 else
a0d0e21e 5645 return;
a687059c
LW
5646}
5647
5648/*
5649 - regcurly - a little FSA that accepts {\d+,?\d*}
5650 */
79072805 5651STATIC I32
cea2e8a9 5652S_regcurly(pTHX_ register char *s)
a687059c
LW
5653{
5654 if (*s++ != '{')
5655 return FALSE;
f0fcb552 5656 if (!isDIGIT(*s))
a687059c 5657 return FALSE;
f0fcb552 5658 while (isDIGIT(*s))
a687059c
LW
5659 s++;
5660 if (*s == ',')
5661 s++;
f0fcb552 5662 while (isDIGIT(*s))
a687059c
LW
5663 s++;
5664 if (*s != '}')
5665 return FALSE;
5666 return TRUE;
5667}
5668
a687059c 5669
8fa7f367
JH
5670#ifdef DEBUGGING
5671
76e3520e 5672STATIC regnode *
cea2e8a9 5673S_dumpuntil(pTHX_ regnode *start, regnode *node, regnode *last, SV* sv, I32 l)
c277df42 5674{
f248d071 5675 register U8 op = EXACT; /* Arbitrary non-END op. */
155aba94 5676 register regnode *next;
c277df42
IZ
5677
5678 while (op != END && (!last || node < last)) {
5679 /* While that wasn't END last time... */
5680
5681 NODE_ALIGN(node);
5682 op = OP(node);
5683 if (op == CLOSE)
5684 l--;
5685 next = regnext(node);
5686 /* Where, what. */
5687 if (OP(node) == OPTIMIZED)
5688 goto after_print;
5689 regprop(sv, node);
b900a521 5690 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
f1dbda3d 5691 (int)(2*l + 1), "", SvPVX(sv));
c277df42
IZ
5692 if (next == NULL) /* Next ptr. */
5693 PerlIO_printf(Perl_debug_log, "(0)");
b81d288d 5694 else
b900a521 5695 PerlIO_printf(Perl_debug_log, "(%"IVdf")", (IV)(next - start));
c277df42
IZ
5696 (void)PerlIO_putc(Perl_debug_log, '\n');
5697 after_print:
22c35a8c 5698 if (PL_regkind[(U8)op] == BRANCHJ) {
b81d288d
AB
5699 register regnode *nnode = (OP(next) == LONGJMP
5700 ? regnext(next)
c277df42
IZ
5701 : next);
5702 if (last && nnode > last)
5703 nnode = last;
5704 node = dumpuntil(start, NEXTOPER(NEXTOPER(node)), nnode, sv, l + 1);
a0ed51b3 5705 }
22c35a8c 5706 else if (PL_regkind[(U8)op] == BRANCH) {
c277df42 5707 node = dumpuntil(start, NEXTOPER(node), next, sv, l + 1);
a0ed51b3 5708 }
a3621e74
YO
5709 else if ( PL_regkind[(U8)op] == TRIE ) {
5710 I32 n = ARG(node);
5711 reg_trie_data *trie=(reg_trie_data*)PL_regdata->data[n];
5712 I32 word_idx;
5713 I32 arry_len=av_len(trie->words)+1;
5714 PerlIO_printf(Perl_debug_log,
5715 "%*s[Words:%d Chars Stored:%d Unique Chars:%d States:%d%s]\n",
5716 (int)(2*(l+3)), "",
5717 trie->wordcount,
5718 trie->charcount,
5719 trie->uniquecharcount,
5720 trie->laststate-1,
5721 node->flags ? " EVAL mode" : "");
5722
5723 for (word_idx=0; word_idx < arry_len; word_idx++) {
5724 SV **elem_ptr=av_fetch(trie->words,word_idx,0);
5725 if (elem_ptr) {
5726 PerlIO_printf(Perl_debug_log, "%*s<%s%s%s>\n",
5727 (int)(2*(l+4)), "",
5728 PL_colors[0],
5729 SvPV_nolen(*elem_ptr),
5730 PL_colors[1]
5731 );
5732 /*
5733 if (next == NULL)
5734 PerlIO_printf(Perl_debug_log, "(0)\n");
5735 else
5736 PerlIO_printf(Perl_debug_log, "(%"IVdf")\n", (IV)(next - start));
5737 */
5738 }
5739
5740 }
5741
5742 node = NEXTOPER(node);
5743 node += regarglen[(U8)op];
5744
5745 }
a0ed51b3 5746 else if ( op == CURLY) { /* `next' might be very big: optimizer */
c277df42
IZ
5747 node = dumpuntil(start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
5748 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1, sv, l + 1);
a0ed51b3 5749 }
22c35a8c 5750 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
c277df42
IZ
5751 node = dumpuntil(start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
5752 next, sv, l + 1);
a0ed51b3
LW
5753 }
5754 else if ( op == PLUS || op == STAR) {
c277df42 5755 node = dumpuntil(start, NEXTOPER(node), NEXTOPER(node) + 1, sv, l + 1);
a0ed51b3
LW
5756 }
5757 else if (op == ANYOF) {
4f66b38d
HS
5758 /* arglen 1 + class block */
5759 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_LARGE)
5760 ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
5761 node = NEXTOPER(node);
a0ed51b3 5762 }
22c35a8c 5763 else if (PL_regkind[(U8)op] == EXACT) {
c277df42 5764 /* Literal string, where present. */
cd439c50 5765 node += NODE_SZ_STR(node) - 1;
c277df42 5766 node = NEXTOPER(node);
a0ed51b3
LW
5767 }
5768 else {
c277df42
IZ
5769 node = NEXTOPER(node);
5770 node += regarglen[(U8)op];
5771 }
5772 if (op == CURLYX || op == OPEN)
5773 l++;
5774 else if (op == WHILEM)
5775 l--;
5776 }
5777 return node;
5778}
5779
8fa7f367
JH
5780#endif /* DEBUGGING */
5781
a687059c 5782/*
fd181c75 5783 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
a687059c
LW
5784 */
5785void
864dbfa3 5786Perl_regdump(pTHX_ regexp *r)
a687059c 5787{
35ff7856 5788#ifdef DEBUGGING
46fc3d4c 5789 SV *sv = sv_newmortal();
a687059c 5790
c277df42 5791 (void)dumpuntil(r->program, r->program + 1, NULL, sv, 0);
a0d0e21e
LW
5792
5793 /* Header fields of interest. */
c277df42 5794 if (r->anchored_substr)
7b0972df 5795 PerlIO_printf(Perl_debug_log,
b81d288d 5796 "anchored `%s%.*s%s'%s at %"IVdf" ",
3280af22 5797 PL_colors[0],
7b0972df 5798 (int)(SvCUR(r->anchored_substr) - (SvTAIL(r->anchored_substr)!=0)),
b81d288d 5799 SvPVX(r->anchored_substr),
3280af22 5800 PL_colors[1],
c277df42 5801 SvTAIL(r->anchored_substr) ? "$" : "",
7b0972df 5802 (IV)r->anchored_offset);
33b8afdf
JH
5803 else if (r->anchored_utf8)
5804 PerlIO_printf(Perl_debug_log,
5805 "anchored utf8 `%s%.*s%s'%s at %"IVdf" ",
5806 PL_colors[0],
5807 (int)(SvCUR(r->anchored_utf8) - (SvTAIL(r->anchored_utf8)!=0)),
5808 SvPVX(r->anchored_utf8),
5809 PL_colors[1],
5810 SvTAIL(r->anchored_utf8) ? "$" : "",
5811 (IV)r->anchored_offset);
c277df42 5812 if (r->float_substr)
7b0972df 5813 PerlIO_printf(Perl_debug_log,
b81d288d 5814 "floating `%s%.*s%s'%s at %"IVdf"..%"UVuf" ",
3280af22 5815 PL_colors[0],
b81d288d 5816 (int)(SvCUR(r->float_substr) - (SvTAIL(r->float_substr)!=0)),
2c2d71f5 5817 SvPVX(r->float_substr),
3280af22 5818 PL_colors[1],
c277df42 5819 SvTAIL(r->float_substr) ? "$" : "",
7b0972df 5820 (IV)r->float_min_offset, (UV)r->float_max_offset);
33b8afdf
JH
5821 else if (r->float_utf8)
5822 PerlIO_printf(Perl_debug_log,
5823 "floating utf8 `%s%.*s%s'%s at %"IVdf"..%"UVuf" ",
5824 PL_colors[0],
5825 (int)(SvCUR(r->float_utf8) - (SvTAIL(r->float_utf8)!=0)),
5826 SvPVX(r->float_utf8),
5827 PL_colors[1],
5828 SvTAIL(r->float_utf8) ? "$" : "",
5829 (IV)r->float_min_offset, (UV)r->float_max_offset);
5830 if (r->check_substr || r->check_utf8)
b81d288d
AB
5831 PerlIO_printf(Perl_debug_log,
5832 r->check_substr == r->float_substr
33b8afdf 5833 && r->check_utf8 == r->float_utf8
c277df42
IZ
5834 ? "(checking floating" : "(checking anchored");
5835 if (r->reganch & ROPT_NOSCAN)
5836 PerlIO_printf(Perl_debug_log, " noscan");
5837 if (r->reganch & ROPT_CHECK_ALL)
5838 PerlIO_printf(Perl_debug_log, " isall");
33b8afdf 5839 if (r->check_substr || r->check_utf8)
c277df42
IZ
5840 PerlIO_printf(Perl_debug_log, ") ");
5841
46fc3d4c 5842 if (r->regstclass) {
5843 regprop(sv, r->regstclass);
5844 PerlIO_printf(Perl_debug_log, "stclass `%s' ", SvPVX(sv));
5845 }
774d564b 5846 if (r->reganch & ROPT_ANCH) {
5847 PerlIO_printf(Perl_debug_log, "anchored");
5848 if (r->reganch & ROPT_ANCH_BOL)
5849 PerlIO_printf(Perl_debug_log, "(BOL)");
c277df42
IZ
5850 if (r->reganch & ROPT_ANCH_MBOL)
5851 PerlIO_printf(Perl_debug_log, "(MBOL)");
cad2e5aa
JH
5852 if (r->reganch & ROPT_ANCH_SBOL)
5853 PerlIO_printf(Perl_debug_log, "(SBOL)");
774d564b 5854 if (r->reganch & ROPT_ANCH_GPOS)
5855 PerlIO_printf(Perl_debug_log, "(GPOS)");
5856 PerlIO_putc(Perl_debug_log, ' ');
5857 }
c277df42
IZ
5858 if (r->reganch & ROPT_GPOS_SEEN)
5859 PerlIO_printf(Perl_debug_log, "GPOS ");
a0d0e21e 5860 if (r->reganch & ROPT_SKIP)
760ac839 5861 PerlIO_printf(Perl_debug_log, "plus ");
a0d0e21e 5862 if (r->reganch & ROPT_IMPLICIT)
760ac839 5863 PerlIO_printf(Perl_debug_log, "implicit ");
760ac839 5864 PerlIO_printf(Perl_debug_log, "minlen %ld ", (long) r->minlen);
ce862d02
IZ
5865 if (r->reganch & ROPT_EVAL_SEEN)
5866 PerlIO_printf(Perl_debug_log, "with eval ");
760ac839 5867 PerlIO_printf(Perl_debug_log, "\n");
fac92740
MJD
5868 if (r->offsets) {
5869 U32 i;
5870 U32 len = r->offsets[0];
a3621e74
YO
5871 GET_RE_DEBUG_FLAGS_DECL;
5872 DEBUG_OFFSETS_r({
392fbf5d 5873 PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)r->offsets[0]);
fac92740 5874 for (i = 1; i <= len; i++)
392fbf5d
RB
5875 PerlIO_printf(Perl_debug_log, "%"UVuf"[%"UVuf"] ",
5876 (UV)r->offsets[i*2-1],
5877 (UV)r->offsets[i*2]);
fac92740 5878 PerlIO_printf(Perl_debug_log, "\n");
a3621e74 5879 });
fac92740 5880 }
17c3b450 5881#endif /* DEBUGGING */
a687059c
LW
5882}
5883
8fa7f367
JH
5884#ifdef DEBUGGING
5885
653099ff
GS
5886STATIC void
5887S_put_byte(pTHX_ SV *sv, int c)
5888{
7be5a6cf 5889 if (isCNTRL(c) || c == 255 || !isPRINT(c))
653099ff
GS
5890 Perl_sv_catpvf(aTHX_ sv, "\\%o", c);
5891 else if (c == '-' || c == ']' || c == '\\' || c == '^')
5892 Perl_sv_catpvf(aTHX_ sv, "\\%c", c);
5893 else
5894 Perl_sv_catpvf(aTHX_ sv, "%c", c);
5895}
5896
8fa7f367
JH
5897#endif /* DEBUGGING */
5898
a3621e74 5899
a687059c 5900/*
a0d0e21e
LW
5901- regprop - printable representation of opcode
5902*/
46fc3d4c 5903void
864dbfa3 5904Perl_regprop(pTHX_ SV *sv, regnode *o)
a687059c 5905{
35ff7856 5906#ifdef DEBUGGING
9b155405 5907 register int k;
a0d0e21e 5908
54dc92de 5909 sv_setpvn(sv, "", 0);
9b155405 5910 if (OP(o) >= reg_num) /* regnode.type is unsigned */
830247a4
IZ
5911 /* It would be nice to FAIL() here, but this may be called from
5912 regexec.c, and it would be hard to supply pRExC_state. */
5913 Perl_croak(aTHX_ "Corrupted regexp opcode");
9b155405
IZ
5914 sv_catpv(sv, (char*)reg_name[OP(o)]); /* Take off const! */
5915
5916 k = PL_regkind[(U8)OP(o)];
5917
2a782b5b
JH
5918 if (k == EXACT) {
5919 SV *dsv = sv_2mortal(newSVpvn("", 0));
c728cb41
JH
5920 /* Using is_utf8_string() is a crude hack but it may
5921 * be the best for now since we have no flag "this EXACTish
5922 * node was UTF-8" --jhi */
5923 bool do_utf8 = is_utf8_string((U8*)STRING(o), STR_LEN(o));
8a989385 5924 char *s = do_utf8 ?
c728cb41
JH
5925 pv_uni_display(dsv, (U8*)STRING(o), STR_LEN(o), 60,
5926 UNI_DISPLAY_REGEX) :
2a782b5b 5927 STRING(o);
40eddc46 5928 int len = do_utf8 ?
2a782b5b
JH
5929 strlen(s) :
5930 STR_LEN(o);
5931 Perl_sv_catpvf(aTHX_ sv, " <%s%.*s%s>",
5932 PL_colors[0],
5933 len, s,
5934 PL_colors[1]);
a3621e74
YO
5935 } else if (k == TRIE) {/*
5936 this isn't always safe, as Pl_regdata may not be for this regex yet
5937 (depending on where its called from) so its being moved to dumpuntil
5938 I32 n = ARG(o);
5939 reg_trie_data *trie=(reg_trie_data*)PL_regdata->data[n];
5940 Perl_sv_catpvf(aTHX_ sv, " (W:%d L:%d C:%d S:%d)",
5941 trie->wordcount,
5942 trie->charcount,
5943 trie->uniquecharcount,
5944 trie->laststate);
5945 */
5946 } else if (k == CURLY) {
cb434fcc 5947 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
cea2e8a9
GS
5948 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
5949 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
a0d0e21e 5950 }
2c2d71f5
JH
5951 else if (k == WHILEM && o->flags) /* Ordinal/of */
5952 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
9b155405 5953 else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP )
894356b3 5954 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
9b155405 5955 else if (k == LOGICAL)
04ebc1ab 5956 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
653099ff
GS
5957 else if (k == ANYOF) {
5958 int i, rangestart = -1;
ffc61ed2 5959 U8 flags = ANYOF_FLAGS(o);
a6d05634 5960 const char * const anyofs[] = { /* Should be synchronized with
19860706 5961 * ANYOF_ #xdefines in regcomp.h */
653099ff
GS
5962 "\\w",
5963 "\\W",
5964 "\\s",
5965 "\\S",
5966 "\\d",
5967 "\\D",
5968 "[:alnum:]",
5969 "[:^alnum:]",
5970 "[:alpha:]",
5971 "[:^alpha:]",
5972 "[:ascii:]",
5973 "[:^ascii:]",
5974 "[:ctrl:]",
5975 "[:^ctrl:]",
5976 "[:graph:]",
5977 "[:^graph:]",
5978 "[:lower:]",
5979 "[:^lower:]",
5980 "[:print:]",
5981 "[:^print:]",
5982 "[:punct:]",
5983 "[:^punct:]",
5984 "[:upper:]",
aaa51d5e 5985 "[:^upper:]",
653099ff 5986 "[:xdigit:]",
aaa51d5e
JF
5987 "[:^xdigit:]",
5988 "[:space:]",
5989 "[:^space:]",
5990 "[:blank:]",
5991 "[:^blank:]"
653099ff
GS
5992 };
5993
19860706 5994 if (flags & ANYOF_LOCALE)
653099ff 5995 sv_catpv(sv, "{loc}");
19860706 5996 if (flags & ANYOF_FOLD)
653099ff
GS
5997 sv_catpv(sv, "{i}");
5998 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
19860706 5999 if (flags & ANYOF_INVERT)
653099ff 6000 sv_catpv(sv, "^");
ffc61ed2
JH
6001 for (i = 0; i <= 256; i++) {
6002 if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
6003 if (rangestart == -1)
6004 rangestart = i;
6005 } else if (rangestart != -1) {
6006 if (i <= rangestart + 3)
6007 for (; rangestart < i; rangestart++)
653099ff 6008 put_byte(sv, rangestart);
ffc61ed2
JH
6009 else {
6010 put_byte(sv, rangestart);
6011 sv_catpv(sv, "-");
6012 put_byte(sv, i - 1);
653099ff 6013 }
ffc61ed2 6014 rangestart = -1;
653099ff 6015 }
847a199f 6016 }
ffc61ed2
JH
6017
6018 if (o->flags & ANYOF_CLASS)
6019 for (i = 0; i < sizeof(anyofs)/sizeof(char*); i++)
6020 if (ANYOF_CLASS_TEST(o,i))
6021 sv_catpv(sv, anyofs[i]);
6022
6023 if (flags & ANYOF_UNICODE)
6024 sv_catpv(sv, "{unicode}");
1aa99e6b 6025 else if (flags & ANYOF_UNICODE_ALL)
2a782b5b 6026 sv_catpv(sv, "{unicode_all}");
ffc61ed2
JH
6027
6028 {
6029 SV *lv;
9e55ce06 6030 SV *sw = regclass_swash(o, FALSE, &lv, 0);
b81d288d 6031
ffc61ed2
JH
6032 if (lv) {
6033 if (sw) {
89ebb4a3 6034 U8 s[UTF8_MAXBYTES_CASE+1];
b81d288d 6035
ffc61ed2 6036 for (i = 0; i <= 256; i++) { /* just the first 256 */
2b9d42f0 6037 U8 *e = uvchr_to_utf8(s, i);
ffc61ed2 6038
3568d838 6039 if (i < 256 && swash_fetch(sw, s, TRUE)) {
ffc61ed2
JH
6040 if (rangestart == -1)
6041 rangestart = i;
6042 } else if (rangestart != -1) {
6043 U8 *p;
b81d288d 6044
ffc61ed2
JH
6045 if (i <= rangestart + 3)
6046 for (; rangestart < i; rangestart++) {
2b9d42f0 6047 for(e = uvchr_to_utf8(s, rangestart), p = s; p < e; p++)
ffc61ed2
JH
6048 put_byte(sv, *p);
6049 }
6050 else {
2b9d42f0 6051 for (e = uvchr_to_utf8(s, rangestart), p = s; p < e; p++)
ffc61ed2
JH
6052 put_byte(sv, *p);
6053 sv_catpv(sv, "-");
2b9d42f0 6054 for (e = uvchr_to_utf8(s, i - 1), p = s; p < e; p++)
ffc61ed2
JH
6055 put_byte(sv, *p);
6056 }
6057 rangestart = -1;
6058 }
19860706 6059 }
ffc61ed2
JH
6060
6061 sv_catpv(sv, "..."); /* et cetera */
19860706 6062 }
fde631ed 6063
ffc61ed2 6064 {
2e0de35c 6065 char *s = savesvpv(lv);
ffc61ed2 6066 char *origs = s;
b81d288d 6067
ffc61ed2 6068 while(*s && *s != '\n') s++;
b81d288d 6069
ffc61ed2
JH
6070 if (*s == '\n') {
6071 char *t = ++s;
6072
6073 while (*s) {
6074 if (*s == '\n')
6075 *s = ' ';
6076 s++;
6077 }
6078 if (s[-1] == ' ')
6079 s[-1] = 0;
6080
6081 sv_catpv(sv, t);
fde631ed 6082 }
b81d288d 6083
ffc61ed2 6084 Safefree(origs);
fde631ed
JH
6085 }
6086 }
653099ff 6087 }
ffc61ed2 6088
653099ff
GS
6089 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
6090 }
9b155405 6091 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
cea2e8a9 6092 Perl_sv_catpvf(aTHX_ sv, "[-%d]", o->flags);
17c3b450 6093#endif /* DEBUGGING */
35ff7856 6094}
a687059c 6095
cad2e5aa
JH
6096SV *
6097Perl_re_intuit_string(pTHX_ regexp *prog)
6098{ /* Assume that RE_INTUIT is set */
a3621e74
YO
6099 GET_RE_DEBUG_FLAGS_DECL;
6100 DEBUG_COMPILE_r(
cad2e5aa 6101 { STRLEN n_a;
33b8afdf
JH
6102 char *s = SvPV(prog->check_substr
6103 ? prog->check_substr : prog->check_utf8, n_a);
cad2e5aa
JH
6104
6105 if (!PL_colorset) reginitcolors();
6106 PerlIO_printf(Perl_debug_log,
33b8afdf
JH
6107 "%sUsing REx %ssubstr:%s `%s%.60s%s%s'\n",
6108 PL_colors[4],
6109 prog->check_substr ? "" : "utf8 ",
6110 PL_colors[5],PL_colors[0],
cad2e5aa
JH
6111 s,
6112 PL_colors[1],
6113 (strlen(s) > 60 ? "..." : ""));
6114 } );
6115
33b8afdf 6116 return prog->check_substr ? prog->check_substr : prog->check_utf8;
cad2e5aa
JH
6117}
6118
2b69d0c2 6119void
864dbfa3 6120Perl_pregfree(pTHX_ struct regexp *r)
a687059c 6121{
9e55ce06
JH
6122#ifdef DEBUGGING
6123 SV *dsv = PERL_DEBUG_PAD_ZERO(0);
a3621e74 6124 SV *re_debug_flags=get_sv(RE_DEBUG_FLAGS,0);
9e55ce06 6125#endif
7821416a 6126
a3621e74 6127
7821416a
IZ
6128 if (!r || (--r->refcnt > 0))
6129 return;
a3621e74 6130 DEBUG_r(if (re_debug_flags && (SvIV(re_debug_flags) & RE_DEBUG_COMPILE)) {
d103360b
HS
6131 int len;
6132 char *s;
6133
6134 s = (r->reganch & ROPT_UTF8) ? pv_uni_display(dsv, (U8*)r->precomp,
6135 r->prelen, 60, UNI_DISPLAY_REGEX)
9f369894 6136 : pv_display(dsv, r->precomp, r->prelen, 0, 60);
d103360b 6137 len = SvCUR(dsv);
9e55ce06
JH
6138 if (!PL_colorset)
6139 reginitcolors();
6140 PerlIO_printf(Perl_debug_log,
a3621e74 6141 "%sFreeing REx:%s %s%*.*s%s%s\n",
9e55ce06
JH
6142 PL_colors[4],PL_colors[5],PL_colors[0],
6143 len, len, s,
6144 PL_colors[1],
6145 len > 60 ? "..." : "");
6146 });
cad2e5aa 6147
c277df42 6148 if (r->precomp)
a0d0e21e 6149 Safefree(r->precomp);
fac92740
MJD
6150 if (r->offsets) /* 20010421 MJD */
6151 Safefree(r->offsets);
ed252734
NC
6152 RX_MATCH_COPY_FREE(r);
6153#ifdef PERL_COPY_ON_WRITE
6154 if (r->saved_copy)
6155 SvREFCNT_dec(r->saved_copy);
6156#endif
a193d654
GS
6157 if (r->substrs) {
6158 if (r->anchored_substr)
6159 SvREFCNT_dec(r->anchored_substr);
33b8afdf
JH
6160 if (r->anchored_utf8)
6161 SvREFCNT_dec(r->anchored_utf8);
a193d654
GS
6162 if (r->float_substr)
6163 SvREFCNT_dec(r->float_substr);
33b8afdf
JH
6164 if (r->float_utf8)
6165 SvREFCNT_dec(r->float_utf8);
2779dcf1 6166 Safefree(r->substrs);
a193d654 6167 }
c277df42
IZ
6168 if (r->data) {
6169 int n = r->data->count;
f3548bdc
DM
6170 PAD* new_comppad = NULL;
6171 PAD* old_comppad;
4026c95a 6172 PADOFFSET refcnt;
dfad63ad 6173
c277df42 6174 while (--n >= 0) {
261faec3 6175 /* If you add a ->what type here, update the comment in regcomp.h */
c277df42
IZ
6176 switch (r->data->what[n]) {
6177 case 's':
6178 SvREFCNT_dec((SV*)r->data->data[n]);
6179 break;
653099ff
GS
6180 case 'f':
6181 Safefree(r->data->data[n]);
6182 break;
dfad63ad
HS
6183 case 'p':
6184 new_comppad = (AV*)r->data->data[n];
6185 break;
c277df42 6186 case 'o':
dfad63ad 6187 if (new_comppad == NULL)
cea2e8a9 6188 Perl_croak(aTHX_ "panic: pregfree comppad");
f3548bdc
DM
6189 PAD_SAVE_LOCAL(old_comppad,
6190 /* Watch out for global destruction's random ordering. */
6191 (SvTYPE(new_comppad) == SVt_PVAV) ?
6192 new_comppad : Null(PAD *)
6193 );
b34c0dd4 6194 OP_REFCNT_LOCK;
4026c95a
SH
6195 refcnt = OpREFCNT_dec((OP_4tree*)r->data->data[n]);
6196 OP_REFCNT_UNLOCK;
6197 if (!refcnt)
9b978d73 6198 op_free((OP_4tree*)r->data->data[n]);
9b978d73 6199
f3548bdc 6200 PAD_RESTORE_LOCAL(old_comppad);
dfad63ad
HS
6201 SvREFCNT_dec((SV*)new_comppad);
6202 new_comppad = NULL;
c277df42
IZ
6203 break;
6204 case 'n':
9e55ce06 6205 break;
a3621e74
YO
6206 case 't':
6207 {
6208 reg_trie_data *trie=(reg_trie_data*)r->data->data[n];
6209 U32 refcount;
6210 OP_REFCNT_LOCK;
6211 refcount = trie->refcount--;
6212 OP_REFCNT_UNLOCK;
6213 if ( !refcount ) {
6214 if (trie->charmap)
6215 Safefree(trie->charmap);
6216 if (trie->widecharmap)
6217 SvREFCNT_dec((SV*)trie->widecharmap);
6218 if (trie->states)
6219 Safefree(trie->states);
6220 if (trie->trans)
6221 Safefree(trie->trans);
6222#ifdef DEBUGGING
6223 if (trie->words)
6224 SvREFCNT_dec((SV*)trie->words);
6225 if (trie->revcharmap)
6226 SvREFCNT_dec((SV*)trie->revcharmap);
6227#endif
6228 Safefree(r->data->data[n]); /* do this last!!!! */
6229 }
6230 break;
6231 }
c277df42 6232 default:
830247a4 6233 Perl_croak(aTHX_ "panic: regfree data code '%c'", r->data->what[n]);
c277df42
IZ
6234 }
6235 }
6236 Safefree(r->data->what);
6237 Safefree(r->data);
a0d0e21e
LW
6238 }
6239 Safefree(r->startp);
6240 Safefree(r->endp);
6241 Safefree(r);
a687059c 6242}
c277df42
IZ
6243
6244/*
6245 - regnext - dig the "next" pointer out of a node
c277df42
IZ
6246 */
6247regnode *
864dbfa3 6248Perl_regnext(pTHX_ register regnode *p)
c277df42
IZ
6249{
6250 register I32 offset;
6251
3280af22 6252 if (p == &PL_regdummy)
c277df42
IZ
6253 return(NULL);
6254
6255 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
6256 if (offset == 0)
6257 return(NULL);
6258
c277df42 6259 return(p+offset);
c277df42
IZ
6260}
6261
01f988be 6262STATIC void
cea2e8a9 6263S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
c277df42
IZ
6264{
6265 va_list args;
6266 STRLEN l1 = strlen(pat1);
6267 STRLEN l2 = strlen(pat2);
6268 char buf[512];
06bf62c7 6269 SV *msv;
c277df42
IZ
6270 char *message;
6271
6272 if (l1 > 510)
6273 l1 = 510;
6274 if (l1 + l2 > 510)
6275 l2 = 510 - l1;
6276 Copy(pat1, buf, l1 , char);
6277 Copy(pat2, buf + l1, l2 , char);
3b818b81
GS
6278 buf[l1 + l2] = '\n';
6279 buf[l1 + l2 + 1] = '\0';
8736538c
AS
6280#ifdef I_STDARG
6281 /* ANSI variant takes additional second argument */
c277df42 6282 va_start(args, pat2);
8736538c
AS
6283#else
6284 va_start(args);
6285#endif
5a844595 6286 msv = vmess(buf, &args);
c277df42 6287 va_end(args);
06bf62c7 6288 message = SvPV(msv,l1);
c277df42
IZ
6289 if (l1 > 512)
6290 l1 = 512;
6291 Copy(message, buf, l1 , char);
197cf9b9 6292 buf[l1-1] = '\0'; /* Overwrite \n */
cea2e8a9 6293 Perl_croak(aTHX_ "%s", buf);
c277df42 6294}
a0ed51b3
LW
6295
6296/* XXX Here's a total kludge. But we need to re-enter for swash routines. */
6297
6298void
864dbfa3 6299Perl_save_re_context(pTHX)
b81d288d 6300{
830247a4 6301 SAVEI32(PL_reg_flags); /* from regexec.c */
a0ed51b3 6302 SAVEPPTR(PL_bostr);
a0ed51b3
LW
6303 SAVEPPTR(PL_reginput); /* String-input pointer. */
6304 SAVEPPTR(PL_regbol); /* Beginning of input, for ^ check. */
6305 SAVEPPTR(PL_regeol); /* End of input, for $ check. */
7766f137
GS
6306 SAVEVPTR(PL_regstartp); /* Pointer to startp array. */
6307 SAVEVPTR(PL_regendp); /* Ditto for endp. */
6308 SAVEVPTR(PL_reglastparen); /* Similarly for lastparen. */
a5db57d6 6309 SAVEVPTR(PL_reglastcloseparen); /* Similarly for lastcloseparen. */
a0ed51b3 6310 SAVEPPTR(PL_regtill); /* How far we are required to go. */
b81d288d 6311 SAVEGENERICPV(PL_reg_start_tmp); /* from regexec.c */
a0ed51b3 6312 PL_reg_start_tmp = 0;
a0ed51b3
LW
6313 SAVEI32(PL_reg_start_tmpl); /* from regexec.c */
6314 PL_reg_start_tmpl = 0;
7766f137 6315 SAVEVPTR(PL_regdata);
a0ed51b3
LW
6316 SAVEI32(PL_reg_eval_set); /* from regexec.c */
6317 SAVEI32(PL_regnarrate); /* from regexec.c */
7766f137 6318 SAVEVPTR(PL_regprogram); /* from regexec.c */
a0ed51b3 6319 SAVEINT(PL_regindent); /* from regexec.c */
7766f137
GS
6320 SAVEVPTR(PL_regcc); /* from regexec.c */
6321 SAVEVPTR(PL_curcop);
7766f137
GS
6322 SAVEVPTR(PL_reg_call_cc); /* from regexec.c */
6323 SAVEVPTR(PL_reg_re); /* from regexec.c */
54b6e2fa
IZ
6324 SAVEPPTR(PL_reg_ganch); /* from regexec.c */
6325 SAVESPTR(PL_reg_sv); /* from regexec.c */
9febdf04 6326 SAVEBOOL(PL_reg_match_utf8); /* from regexec.c */
7766f137 6327 SAVEVPTR(PL_reg_magic); /* from regexec.c */
54b6e2fa 6328 SAVEI32(PL_reg_oldpos); /* from regexec.c */
7766f137
GS
6329 SAVEVPTR(PL_reg_oldcurpm); /* from regexec.c */
6330 SAVEVPTR(PL_reg_curpm); /* from regexec.c */
a5db57d6
GS
6331 SAVEPPTR(PL_reg_oldsaved); /* old saved substr during match */
6332 PL_reg_oldsaved = Nullch;
6333 SAVEI32(PL_reg_oldsavedlen); /* old length of saved substr during match */
6334 PL_reg_oldsavedlen = 0;
ed252734
NC
6335#ifdef PERL_COPY_ON_WRITE
6336 SAVESPTR(PL_nrs);
6337 PL_nrs = Nullsv;
6338#endif
a5db57d6
GS
6339 SAVEI32(PL_reg_maxiter); /* max wait until caching pos */
6340 PL_reg_maxiter = 0;
6341 SAVEI32(PL_reg_leftiter); /* wait until caching pos */
6342 PL_reg_leftiter = 0;
6343 SAVEGENERICPV(PL_reg_poscache); /* cache of pos of WHILEM */
6344 PL_reg_poscache = Nullch;
6345 SAVEI32(PL_reg_poscache_size); /* size of pos cache of WHILEM */
6346 PL_reg_poscache_size = 0;
6347 SAVEPPTR(PL_regprecomp); /* uncompiled string. */
5fb7366e 6348 SAVEI32(PL_regnpar); /* () count. */
e49a9654 6349 SAVEI32(PL_regsize); /* from regexec.c */
ada6e8a9
AMS
6350
6351 {
6352 /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
a8514157 6353 U32 i;
ada6e8a9
AMS
6354 GV *mgv;
6355 REGEXP *rx;
cf28e18a 6356 char digits[TYPE_CHARS(long)];
ada6e8a9
AMS
6357
6358 if (PL_curpm && (rx = PM_GETRE(PL_curpm))) {
6359 for (i = 1; i <= rx->nparens; i++) {
d994d9a1 6360 sprintf(digits, "%lu", (long)i);
ada6e8a9
AMS
6361 if ((mgv = gv_fetchpv(digits, FALSE, SVt_PV)))
6362 save_scalar(mgv);
6363 }
6364 }
6365 }
6366
54b6e2fa 6367#ifdef DEBUGGING
b81d288d 6368 SAVEPPTR(PL_reg_starttry); /* from regexec.c */
54b6e2fa 6369#endif
a0ed51b3 6370}
51371543 6371
51371543 6372static void
acfe0abc 6373clear_re(pTHX_ void *r)
51371543
GS
6374{
6375 ReREFCNT_dec((regexp *)r);
6376}
ffbc6a93 6377
241d1a3b
NC
6378/*
6379 * Local variables:
6380 * c-indentation-style: bsd
6381 * c-basic-offset: 4
6382 * indent-tabs-mode: t
6383 * End:
6384 *
c49a72a9 6385 * vim: shiftwidth=4:
241d1a3b 6386*/