This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Fix typo in description.
[perl5.git] / regcomp.c
CommitLineData
a0d0e21e
LW
1/* regcomp.c
2 */
3
4/*
5 * "A fair jaw-cracker dwarf-language must be." --Samwise Gamgee
6 */
7
61296642
DM
8/* This file contains functions for compiling a regular expression. See
9 * also regexec.c which funnily enough, contains functions for executing
166f8a29 10 * a regular expression.
e4a054ea
DM
11 *
12 * This file is also copied at build time to ext/re/re_comp.c, where
13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14 * This causes the main functions to be compiled under new names and with
15 * debugging support added, which makes "use re 'debug'" work.
166f8a29
DM
16 */
17
a687059c
LW
18/* NOTE: this is derived from Henry Spencer's regexp code, and should not
19 * confused with the original package (see point 3 below). Thanks, Henry!
20 */
21
22/* Additional note: this code is very heavily munged from Henry's version
23 * in places. In some spots I've traded clarity for efficiency, so don't
24 * blame Henry for some of the lack of readability.
25 */
26
e50aee73
AD
27/* The names of the functions have been changed from regcomp and
28 * regexec to pregcomp and pregexec in order to avoid conflicts
29 * with the POSIX routines of the same names.
30*/
31
b9d5759e
AD
32#ifdef PERL_EXT_RE_BUILD
33/* need to replace pregcomp et al, so enable that */
34# ifndef PERL_IN_XSUB_RE
35# define PERL_IN_XSUB_RE
36# endif
37/* need access to debugger hooks */
cad2e5aa 38# if defined(PERL_EXT_RE_DEBUG) && !defined(DEBUGGING)
b9d5759e
AD
39# define DEBUGGING
40# endif
41#endif
42
43#ifdef PERL_IN_XSUB_RE
d06ea78c 44/* We *really* need to overwrite these symbols: */
56953603
IZ
45# define Perl_pregcomp my_regcomp
46# define Perl_regdump my_regdump
47# define Perl_regprop my_regprop
d06ea78c 48# define Perl_pregfree my_regfree
cad2e5aa
JH
49# define Perl_re_intuit_string my_re_intuit_string
50/* *These* symbols are masked to allow static link. */
d06ea78c 51# define Perl_regnext my_regnext
f0b8d043 52# define Perl_save_re_context my_save_re_context
b81d288d 53# define Perl_reginitcolors my_reginitcolors
c5be433b
GS
54
55# define PERL_NO_GET_CONTEXT
b81d288d 56#endif
56953603 57
a687059c 58/*
e50aee73 59 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
60 *
61 * Copyright (c) 1986 by University of Toronto.
62 * Written by Henry Spencer. Not derived from licensed software.
63 *
64 * Permission is granted to anyone to use this software for any
65 * purpose on any computer system, and to redistribute it freely,
66 * subject to the following restrictions:
67 *
68 * 1. The author is not responsible for the consequences of use of
69 * this software, no matter how awful, even if they arise
70 * from defects in it.
71 *
72 * 2. The origin of this software must not be misrepresented, either
73 * by explicit claim or by omission.
74 *
75 * 3. Altered versions must be plainly marked as such, and must not
76 * be misrepresented as being the original software.
77 *
78 *
79 **** Alterations to Henry's code are...
80 ****
4bb101f2 81 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
241d1a3b 82 **** 2000, 2001, 2002, 2003, 2004, 2005, by Larry Wall and others
a687059c 83 ****
9ef589d8
LW
84 **** You may distribute under the terms of either the GNU General Public
85 **** License or the Artistic License, as specified in the README file.
86
a687059c
LW
87 *
88 * Beware that some of this code is subtly aware of the way operator
89 * precedence is structured in regular expressions. Serious changes in
90 * regular-expression syntax might require a total rethink.
91 */
92#include "EXTERN.h"
864dbfa3 93#define PERL_IN_REGCOMP_C
a687059c 94#include "perl.h"
d06ea78c 95
acfe0abc 96#ifndef PERL_IN_XSUB_RE
d06ea78c
GS
97# include "INTERN.h"
98#endif
c277df42
IZ
99
100#define REG_COMP_C
a687059c
LW
101#include "regcomp.h"
102
d4cce5f1 103#ifdef op
11343788 104#undef op
d4cce5f1 105#endif /* op */
11343788 106
fe14fcc3 107#ifdef MSDOS
7e4e8c89 108# if defined(BUGGY_MSC6)
fe14fcc3 109 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
7e4e8c89 110# pragma optimize("a",off)
fe14fcc3 111 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
7e4e8c89
NC
112# pragma optimize("w",on )
113# endif /* BUGGY_MSC6 */
fe14fcc3
LW
114#endif /* MSDOS */
115
a687059c
LW
116#ifndef STATIC
117#define STATIC static
118#endif
119
830247a4 120typedef struct RExC_state_t {
e2509266 121 U32 flags; /* are we folding, multilining? */
830247a4
IZ
122 char *precomp; /* uncompiled string. */
123 regexp *rx;
fac92740 124 char *start; /* Start of input for compile */
830247a4
IZ
125 char *end; /* End of input for compile */
126 char *parse; /* Input-scan pointer. */
127 I32 whilem_seen; /* number of WHILEM in this expr */
fac92740 128 regnode *emit_start; /* Start of emitted-code area */
ffc61ed2 129 regnode *emit; /* Code-emit pointer; &regdummy = don't = compiling */
830247a4
IZ
130 I32 naughty; /* How bad is this pattern? */
131 I32 sawback; /* Did we see \1, ...? */
132 U32 seen;
133 I32 size; /* Code size. */
134 I32 npar; /* () count. */
135 I32 extralen;
136 I32 seen_zerolen;
137 I32 seen_evals;
1aa99e6b 138 I32 utf8;
830247a4
IZ
139#if ADD_TO_REGEXEC
140 char *starttry; /* -Dr: where regtry was called. */
141#define RExC_starttry (pRExC_state->starttry)
142#endif
143} RExC_state_t;
144
e2509266 145#define RExC_flags (pRExC_state->flags)
830247a4
IZ
146#define RExC_precomp (pRExC_state->precomp)
147#define RExC_rx (pRExC_state->rx)
fac92740 148#define RExC_start (pRExC_state->start)
830247a4
IZ
149#define RExC_end (pRExC_state->end)
150#define RExC_parse (pRExC_state->parse)
151#define RExC_whilem_seen (pRExC_state->whilem_seen)
fac92740 152#define RExC_offsets (pRExC_state->rx->offsets) /* I am not like the others */
830247a4 153#define RExC_emit (pRExC_state->emit)
fac92740 154#define RExC_emit_start (pRExC_state->emit_start)
830247a4
IZ
155#define RExC_naughty (pRExC_state->naughty)
156#define RExC_sawback (pRExC_state->sawback)
157#define RExC_seen (pRExC_state->seen)
158#define RExC_size (pRExC_state->size)
159#define RExC_npar (pRExC_state->npar)
160#define RExC_extralen (pRExC_state->extralen)
161#define RExC_seen_zerolen (pRExC_state->seen_zerolen)
162#define RExC_seen_evals (pRExC_state->seen_evals)
1aa99e6b 163#define RExC_utf8 (pRExC_state->utf8)
830247a4 164
a687059c
LW
165#define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
166#define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
167 ((*s) == '{' && regcurly(s)))
a687059c 168
35c8bce7
LW
169#ifdef SPSTART
170#undef SPSTART /* dratted cpp namespace... */
171#endif
a687059c
LW
172/*
173 * Flags to be passed up and down.
174 */
a687059c 175#define WORST 0 /* Worst case. */
821b33a5 176#define HASWIDTH 0x1 /* Known to match non-null strings. */
a0d0e21e
LW
177#define SIMPLE 0x2 /* Simple enough to be STAR/PLUS operand. */
178#define SPSTART 0x4 /* Starts with * or +. */
179#define TRYAGAIN 0x8 /* Weeded out a declaration. */
a687059c 180
2c2d71f5
JH
181/* Length of a variant. */
182
183typedef struct scan_data_t {
184 I32 len_min;
185 I32 len_delta;
186 I32 pos_min;
187 I32 pos_delta;
188 SV *last_found;
189 I32 last_end; /* min value, <0 unless valid. */
190 I32 last_start_min;
191 I32 last_start_max;
192 SV **longest; /* Either &l_fixed, or &l_float. */
193 SV *longest_fixed;
194 I32 offset_fixed;
195 SV *longest_float;
196 I32 offset_float_min;
197 I32 offset_float_max;
198 I32 flags;
199 I32 whilem_c;
cb434fcc 200 I32 *last_closep;
653099ff 201 struct regnode_charclass_class *start_class;
2c2d71f5
JH
202} scan_data_t;
203
a687059c 204/*
e50aee73 205 * Forward declarations for pregcomp()'s friends.
a687059c 206 */
a0d0e21e 207
27da23d5
JH
208static const scan_data_t zero_scan_data =
209 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
c277df42
IZ
210
211#define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
212#define SF_BEFORE_SEOL 0x1
213#define SF_BEFORE_MEOL 0x2
214#define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
215#define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
216
09b7f37c
CB
217#ifdef NO_UNARY_PLUS
218# define SF_FIX_SHIFT_EOL (0+2)
219# define SF_FL_SHIFT_EOL (0+4)
220#else
221# define SF_FIX_SHIFT_EOL (+2)
222# define SF_FL_SHIFT_EOL (+4)
223#endif
c277df42
IZ
224
225#define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
226#define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
227
228#define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
229#define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
230#define SF_IS_INF 0x40
231#define SF_HAS_PAR 0x80
232#define SF_IN_PAR 0x100
233#define SF_HAS_EVAL 0x200
4bfe0158 234#define SCF_DO_SUBSTR 0x400
653099ff
GS
235#define SCF_DO_STCLASS_AND 0x0800
236#define SCF_DO_STCLASS_OR 0x1000
237#define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
e1901655 238#define SCF_WHILEM_VISITED_POS 0x2000
c277df42 239
eb160463 240#define UTF (RExC_utf8 != 0)
e2509266
JH
241#define LOC ((RExC_flags & PMf_LOCALE) != 0)
242#define FOLD ((RExC_flags & PMf_FOLD) != 0)
a0ed51b3 243
ffc61ed2 244#define OOB_UNICODE 12345678
93733859 245#define OOB_NAMEDCLASS -1
b8c5462f 246
a0ed51b3
LW
247#define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
248#define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
249
8615cb43 250
b45f050a
JF
251/* length of regex to show in messages that don't mark a position within */
252#define RegexLengthToShowInErrorMessages 127
253
254/*
255 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
256 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
257 * op/pragma/warn/regcomp.
258 */
7253e4e3
RK
259#define MARKER1 "<-- HERE" /* marker as it appears in the description */
260#define MARKER2 " <-- HERE " /* marker as it appears within the regex */
b81d288d 261
7253e4e3 262#define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
b45f050a
JF
263
264/*
265 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
266 * arg. Show regex, up to a maximum length. If it's too long, chop and add
267 * "...".
268 */
ccb2c380 269#define FAIL(msg) STMT_START { \
bfed75c6 270 const char *ellipses = ""; \
ccb2c380
MP
271 IV len = RExC_end - RExC_precomp; \
272 \
273 if (!SIZE_ONLY) \
274 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
275 if (len > RegexLengthToShowInErrorMessages) { \
276 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
277 len = RegexLengthToShowInErrorMessages - 10; \
278 ellipses = "..."; \
279 } \
280 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
281 msg, (int)len, RExC_precomp, ellipses); \
282} STMT_END
8615cb43 283
b45f050a
JF
284/*
285 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
286 * args. Show regex, up to a maximum length. If it's too long, chop and add
287 * "...".
288 */
ccb2c380 289#define FAIL2(pat,msg) STMT_START { \
bfed75c6 290 const char *ellipses = ""; \
ccb2c380
MP
291 IV len = RExC_end - RExC_precomp; \
292 \
293 if (!SIZE_ONLY) \
294 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
295 if (len > RegexLengthToShowInErrorMessages) { \
296 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
297 len = RegexLengthToShowInErrorMessages - 10; \
298 ellipses = "..."; \
299 } \
300 S_re_croak2(aTHX_ pat, " in regex m/%.*s%s/", \
301 msg, (int)len, RExC_precomp, ellipses); \
302} STMT_END
b45f050a
JF
303
304
305/*
306 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
307 */
ccb2c380 308#define Simple_vFAIL(m) STMT_START { \
a28509cc 309 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
310 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
311 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
312} STMT_END
b45f050a
JF
313
314/*
315 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
316 */
ccb2c380
MP
317#define vFAIL(m) STMT_START { \
318 if (!SIZE_ONLY) \
319 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
320 Simple_vFAIL(m); \
321} STMT_END
b45f050a
JF
322
323/*
324 * Like Simple_vFAIL(), but accepts two arguments.
325 */
ccb2c380 326#define Simple_vFAIL2(m,a1) STMT_START { \
a28509cc 327 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
328 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
329 (int)offset, RExC_precomp, RExC_precomp + offset); \
330} STMT_END
b45f050a
JF
331
332/*
333 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
334 */
ccb2c380
MP
335#define vFAIL2(m,a1) STMT_START { \
336 if (!SIZE_ONLY) \
337 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
338 Simple_vFAIL2(m, a1); \
339} STMT_END
b45f050a
JF
340
341
342/*
343 * Like Simple_vFAIL(), but accepts three arguments.
344 */
ccb2c380 345#define Simple_vFAIL3(m, a1, a2) STMT_START { \
a28509cc 346 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
347 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
348 (int)offset, RExC_precomp, RExC_precomp + offset); \
349} STMT_END
b45f050a
JF
350
351/*
352 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
353 */
ccb2c380
MP
354#define vFAIL3(m,a1,a2) STMT_START { \
355 if (!SIZE_ONLY) \
356 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
357 Simple_vFAIL3(m, a1, a2); \
358} STMT_END
b45f050a
JF
359
360/*
361 * Like Simple_vFAIL(), but accepts four arguments.
362 */
ccb2c380 363#define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
a28509cc 364 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
365 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
366 (int)offset, RExC_precomp, RExC_precomp + offset); \
367} STMT_END
b45f050a 368
ccb2c380 369#define vWARN(loc,m) STMT_START { \
a28509cc 370 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
371 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \
372 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
373} STMT_END
374
375#define vWARNdep(loc,m) STMT_START { \
a28509cc 376 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
377 Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
378 "%s" REPORT_LOCATION, \
379 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
380} STMT_END
381
382
383#define vWARN2(loc, m, a1) STMT_START { \
a28509cc 384 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
385 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
386 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
387} STMT_END
388
389#define vWARN3(loc, m, a1, a2) STMT_START { \
a28509cc 390 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
391 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
392 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
393} STMT_END
394
395#define vWARN4(loc, m, a1, a2, a3) STMT_START { \
a28509cc 396 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
397 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
398 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
399} STMT_END
400
401#define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
a28509cc 402 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
403 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
404 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
405} STMT_END
9d1d55b5 406
8615cb43 407
cd439c50 408/* Allow for side effects in s */
ccb2c380
MP
409#define REGC(c,s) STMT_START { \
410 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
411} STMT_END
cd439c50 412
fac92740
MJD
413/* Macros for recording node offsets. 20001227 mjd@plover.com
414 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
415 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
416 * Element 0 holds the number n.
417 */
418
419#define MJD_OFFSET_DEBUG(x)
a3621e74 420/* #define MJD_OFFSET_DEBUG(x) DEBUG_r(Perl_warn_nocontext x) */
ccb2c380
MP
421
422
423#define Set_Node_Offset_To_R(node,byte) STMT_START { \
424 if (! SIZE_ONLY) { \
425 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
426 __LINE__, (node), (byte))); \
427 if((node) < 0) { \
551405c4 428 Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
ccb2c380
MP
429 } else { \
430 RExC_offsets[2*(node)-1] = (byte); \
431 } \
432 } \
433} STMT_END
434
435#define Set_Node_Offset(node,byte) \
436 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
437#define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
438
439#define Set_Node_Length_To_R(node,len) STMT_START { \
440 if (! SIZE_ONLY) { \
441 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
551405c4 442 __LINE__, (int)(node), (int)(len))); \
ccb2c380 443 if((node) < 0) { \
551405c4 444 Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
ccb2c380
MP
445 } else { \
446 RExC_offsets[2*(node)] = (len); \
447 } \
448 } \
449} STMT_END
450
451#define Set_Node_Length(node,len) \
452 Set_Node_Length_To_R((node)-RExC_emit_start, len)
453#define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
454#define Set_Node_Cur_Length(node) \
455 Set_Node_Length(node, RExC_parse - parse_start)
fac92740
MJD
456
457/* Get offsets and lengths */
458#define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
459#define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
460
acfe0abc 461static void clear_re(pTHX_ void *r);
4327152a 462
653099ff
GS
463/* Mark that we cannot extend a found fixed substring at this point.
464 Updata the longest found anchored substring and the longest found
465 floating substrings if needed. */
466
4327152a 467STATIC void
830247a4 468S_scan_commit(pTHX_ RExC_state_t *pRExC_state, scan_data_t *data)
c277df42 469{
e1ec3a88
AL
470 const STRLEN l = CHR_SVLEN(data->last_found);
471 const STRLEN old_l = CHR_SVLEN(*data->longest);
b81d288d 472
c277df42 473 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
6b43b216 474 SvSetMagicSV(*data->longest, data->last_found);
c277df42
IZ
475 if (*data->longest == data->longest_fixed) {
476 data->offset_fixed = l ? data->last_start_min : data->pos_min;
477 if (data->flags & SF_BEFORE_EOL)
b81d288d 478 data->flags
c277df42
IZ
479 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
480 else
481 data->flags &= ~SF_FIX_BEFORE_EOL;
a0ed51b3
LW
482 }
483 else {
c277df42 484 data->offset_float_min = l ? data->last_start_min : data->pos_min;
b81d288d
AB
485 data->offset_float_max = (l
486 ? data->last_start_max
c277df42 487 : data->pos_min + data->pos_delta);
9051bda5
HS
488 if ((U32)data->offset_float_max > (U32)I32_MAX)
489 data->offset_float_max = I32_MAX;
c277df42 490 if (data->flags & SF_BEFORE_EOL)
b81d288d 491 data->flags
c277df42
IZ
492 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
493 else
494 data->flags &= ~SF_FL_BEFORE_EOL;
495 }
496 }
497 SvCUR_set(data->last_found, 0);
0eda9292 498 {
a28509cc
AL
499 SV * const sv = data->last_found;
500 MAGIC * const mg =
0eda9292
JH
501 SvUTF8(sv) && SvMAGICAL(sv) ? mg_find(sv, PERL_MAGIC_utf8) : NULL;
502 if (mg && mg->mg_len > 0)
503 mg->mg_len = 0;
504 }
c277df42
IZ
505 data->last_end = -1;
506 data->flags &= ~SF_BEFORE_EOL;
507}
508
653099ff
GS
509/* Can match anything (initialization) */
510STATIC void
830247a4 511S_cl_anything(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 512{
653099ff 513 ANYOF_CLASS_ZERO(cl);
f8bef550 514 ANYOF_BITMAP_SETALL(cl);
1aa99e6b 515 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
653099ff
GS
516 if (LOC)
517 cl->flags |= ANYOF_LOCALE;
518}
519
520/* Can match anything (initialization) */
521STATIC int
a28509cc 522S_cl_is_anything(pTHX_ const struct regnode_charclass_class *cl)
653099ff
GS
523{
524 int value;
525
aaa51d5e 526 for (value = 0; value <= ANYOF_MAX; value += 2)
653099ff
GS
527 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
528 return 1;
1aa99e6b
IH
529 if (!(cl->flags & ANYOF_UNICODE_ALL))
530 return 0;
f8bef550
NC
531 if (!ANYOF_BITMAP_TESTALLSET(cl))
532 return 0;
653099ff
GS
533 return 1;
534}
535
536/* Can match anything (initialization) */
537STATIC void
830247a4 538S_cl_init(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 539{
8ecf7187 540 Zero(cl, 1, struct regnode_charclass_class);
653099ff 541 cl->type = ANYOF;
830247a4 542 cl_anything(pRExC_state, cl);
653099ff
GS
543}
544
545STATIC void
830247a4 546S_cl_init_zero(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 547{
8ecf7187 548 Zero(cl, 1, struct regnode_charclass_class);
653099ff 549 cl->type = ANYOF;
830247a4 550 cl_anything(pRExC_state, cl);
653099ff
GS
551 if (LOC)
552 cl->flags |= ANYOF_LOCALE;
553}
554
555/* 'And' a given class with another one. Can create false positives */
556/* We assume that cl is not inverted */
557STATIC void
558S_cl_and(pTHX_ struct regnode_charclass_class *cl,
a28509cc 559 const struct regnode_charclass_class *and_with)
653099ff 560{
653099ff
GS
561 if (!(and_with->flags & ANYOF_CLASS)
562 && !(cl->flags & ANYOF_CLASS)
563 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
564 && !(and_with->flags & ANYOF_FOLD)
565 && !(cl->flags & ANYOF_FOLD)) {
566 int i;
567
568 if (and_with->flags & ANYOF_INVERT)
569 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
570 cl->bitmap[i] &= ~and_with->bitmap[i];
571 else
572 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
573 cl->bitmap[i] &= and_with->bitmap[i];
574 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
575 if (!(and_with->flags & ANYOF_EOS))
576 cl->flags &= ~ANYOF_EOS;
1aa99e6b 577
14ebb1a2
JH
578 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE &&
579 !(and_with->flags & ANYOF_INVERT)) {
1aa99e6b
IH
580 cl->flags &= ~ANYOF_UNICODE_ALL;
581 cl->flags |= ANYOF_UNICODE;
582 ARG_SET(cl, ARG(and_with));
583 }
14ebb1a2
JH
584 if (!(and_with->flags & ANYOF_UNICODE_ALL) &&
585 !(and_with->flags & ANYOF_INVERT))
1aa99e6b 586 cl->flags &= ~ANYOF_UNICODE_ALL;
14ebb1a2
JH
587 if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)) &&
588 !(and_with->flags & ANYOF_INVERT))
1aa99e6b 589 cl->flags &= ~ANYOF_UNICODE;
653099ff
GS
590}
591
592/* 'OR' a given class with another one. Can create false positives */
593/* We assume that cl is not inverted */
594STATIC void
a28509cc 595S_cl_or(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
653099ff 596{
653099ff
GS
597 if (or_with->flags & ANYOF_INVERT) {
598 /* We do not use
599 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
600 * <= (B1 | !B2) | (CL1 | !CL2)
601 * which is wasteful if CL2 is small, but we ignore CL2:
602 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
603 * XXXX Can we handle case-fold? Unclear:
604 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
605 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
606 */
607 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
608 && !(or_with->flags & ANYOF_FOLD)
609 && !(cl->flags & ANYOF_FOLD) ) {
610 int i;
611
612 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
613 cl->bitmap[i] |= ~or_with->bitmap[i];
614 } /* XXXX: logic is complicated otherwise */
615 else {
830247a4 616 cl_anything(pRExC_state, cl);
653099ff
GS
617 }
618 } else {
619 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
620 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
b81d288d 621 && (!(or_with->flags & ANYOF_FOLD)
653099ff
GS
622 || (cl->flags & ANYOF_FOLD)) ) {
623 int i;
624
625 /* OR char bitmap and class bitmap separately */
626 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
627 cl->bitmap[i] |= or_with->bitmap[i];
628 if (or_with->flags & ANYOF_CLASS) {
629 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
630 cl->classflags[i] |= or_with->classflags[i];
631 cl->flags |= ANYOF_CLASS;
632 }
633 }
634 else { /* XXXX: logic is complicated, leave it along for a moment. */
830247a4 635 cl_anything(pRExC_state, cl);
653099ff
GS
636 }
637 }
638 if (or_with->flags & ANYOF_EOS)
639 cl->flags |= ANYOF_EOS;
1aa99e6b
IH
640
641 if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
642 ARG(cl) != ARG(or_with)) {
643 cl->flags |= ANYOF_UNICODE_ALL;
644 cl->flags &= ~ANYOF_UNICODE;
645 }
646 if (or_with->flags & ANYOF_UNICODE_ALL) {
647 cl->flags |= ANYOF_UNICODE_ALL;
648 cl->flags &= ~ANYOF_UNICODE;
649 }
653099ff
GS
650}
651
5d1c421c 652/*
a3621e74
YO
653
654 make_trie(startbranch,first,last,tail,flags)
655 startbranch: the first branch in the whole branch sequence
656 first : start branch of sequence of branch-exact nodes.
657 May be the same as startbranch
658 last : Thing following the last branch.
659 May be the same as tail.
660 tail : item following the branch sequence
661 flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/
662
663Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
664
665A trie is an N'ary tree where the branches are determined by digital
666decomposition of the key. IE, at the root node you look up the 1st character and
667follow that branch repeat until you find the end of the branches. Nodes can be
668marked as "accepting" meaning they represent a complete word. Eg:
669
670 /he|she|his|hers/
671
672would convert into the following structure. Numbers represent states, letters
673following numbers represent valid transitions on the letter from that state, if
674the number is in square brackets it represents an accepting state, otherwise it
675will be in parenthesis.
676
677 +-h->+-e->[3]-+-r->(8)-+-s->[9]
678 | |
679 | (2)
680 | |
681 (1) +-i->(6)-+-s->[7]
682 |
683 +-s->(3)-+-h->(4)-+-e->[5]
684
685 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
686
687This shows that when matching against the string 'hers' we will begin at state 1
688read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
689then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
690is also accepting. Thus we know that we can match both 'he' and 'hers' with a
691single traverse. We store a mapping from accepting to state to which word was
692matched, and then when we have multiple possibilities we try to complete the
693rest of the regex in the order in which they occured in the alternation.
694
695The only prior NFA like behaviour that would be changed by the TRIE support is
696the silent ignoring of duplicate alternations which are of the form:
697
698 / (DUPE|DUPE) X? (?{ ... }) Y /x
699
700Thus EVAL blocks follwing a trie may be called a different number of times with
701and without the optimisation. With the optimisations dupes will be silently
702ignored. This inconsistant behaviour of EVAL type nodes is well established as
703the following demonstrates:
704
705 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
706
707which prints out 'word' three times, but
708
709 'words'=~/(word|word|word)(?{ print $1 })S/
710
711which doesnt print it out at all. This is due to other optimisations kicking in.
712
713Example of what happens on a structural level:
714
715The regexp /(ac|ad|ab)+/ will produce the folowing debug output:
716
717 1: CURLYM[1] {1,32767}(18)
718 5: BRANCH(8)
719 6: EXACT <ac>(16)
720 8: BRANCH(11)
721 9: EXACT <ad>(16)
722 11: BRANCH(14)
723 12: EXACT <ab>(16)
724 16: SUCCEED(0)
725 17: NOTHING(18)
726 18: END(0)
727
728This would be optimizable with startbranch=5, first=5, last=16, tail=16
729and should turn into:
730
731 1: CURLYM[1] {1,32767}(18)
732 5: TRIE(16)
733 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
734 <ac>
735 <ad>
736 <ab>
737 16: SUCCEED(0)
738 17: NOTHING(18)
739 18: END(0)
740
741Cases where tail != last would be like /(?foo|bar)baz/:
742
743 1: BRANCH(4)
744 2: EXACT <foo>(8)
745 4: BRANCH(7)
746 5: EXACT <bar>(8)
747 7: TAIL(8)
748 8: EXACT <baz>(10)
749 10: END(0)
750
751which would be optimizable with startbranch=1, first=1, last=7, tail=8
752and would end up looking like:
753
754 1: TRIE(8)
755 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
756 <foo>
757 <bar>
758 7: TAIL(8)
759 8: EXACT <baz>(10)
760 10: END(0)
761
762*/
763
764#define TRIE_DEBUG_CHAR \
765 DEBUG_TRIE_COMPILE_r({ \
766 SV *tmp; \
767 if ( UTF ) { \
d0043bd1 768 tmp = newSVpvn( "", 0 ); \
a3621e74
YO
769 pv_uni_display( tmp, uc, len, 60, UNI_DISPLAY_REGEX ); \
770 } else { \
e4584336 771 tmp = Perl_newSVpvf_nocontext( "%c", (int)uvc ); \
a3621e74
YO
772 } \
773 av_push( trie->revcharmap, tmp ); \
774 })
775
776#define TRIE_READ_CHAR STMT_START { \
777 if ( UTF ) { \
778 if ( folder ) { \
779 if ( foldlen > 0 ) { \
780 uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags ); \
781 foldlen -= len; \
782 scan += len; \
783 len = 0; \
784 } else { \
e1ec3a88 785 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
a3621e74
YO
786 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
787 foldlen -= UNISKIP( uvc ); \
788 scan = foldbuf + UNISKIP( uvc ); \
789 } \
790 } else { \
e1ec3a88 791 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
a3621e74
YO
792 } \
793 } else { \
794 uvc = (U32)*uc; \
795 len = 1; \
796 } \
797} STMT_END
798
799
800#define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
801#define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
802#define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
803#define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
804
805#define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
806 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
807 TRIE_LIST_LEN( state ) *= 2; \
808 Renew( trie->states[ state ].trans.list, \
809 TRIE_LIST_LEN( state ), reg_trie_trans_le ); \
810 } \
811 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
812 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
813 TRIE_LIST_CUR( state )++; \
814} STMT_END
815
816#define TRIE_LIST_NEW(state) STMT_START { \
a02a5408 817 Newxz( trie->states[ state ].trans.list, \
a3621e74
YO
818 4, reg_trie_trans_le ); \
819 TRIE_LIST_CUR( state ) = 1; \
820 TRIE_LIST_LEN( state ) = 4; \
821} STMT_END
822
823STATIC I32
824S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 flags)
825{
27da23d5 826 dVAR;
a3621e74
YO
827 /* first pass, loop through and scan words */
828 reg_trie_data *trie;
829 regnode *cur;
e1ec3a88 830 const U32 uniflags = ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY;
a3621e74
YO
831 STRLEN len = 0;
832 UV uvc = 0;
833 U16 curword = 0;
834 U32 next_alloc = 0;
835 /* we just use folder as a flag in utf8 */
e1ec3a88 836 const U8 * const folder = ( flags == EXACTF
a3621e74
YO
837 ? PL_fold
838 : ( flags == EXACTFL
839 ? PL_fold_locale
840 : NULL
841 )
842 );
843
e1ec3a88 844 const U32 data_slot = add_data( pRExC_state, 1, "t" );
a3621e74
YO
845 SV *re_trie_maxbuff;
846
847 GET_RE_DEBUG_FLAGS_DECL;
848
a02a5408 849 Newxz( trie, 1, reg_trie_data );
a3621e74
YO
850 trie->refcount = 1;
851 RExC_rx->data->data[ data_slot ] = (void*)trie;
a02a5408 852 Newxz( trie->charmap, 256, U16 );
a3621e74
YO
853 DEBUG_r({
854 trie->words = newAV();
855 trie->revcharmap = newAV();
856 });
857
858
0111c4fd 859 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
a3621e74 860 if (!SvIOK(re_trie_maxbuff)) {
0111c4fd 861 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
a3621e74
YO
862 }
863
864 /* -- First loop and Setup --
865
866 We first traverse the branches and scan each word to determine if it
867 contains widechars, and how many unique chars there are, this is
868 important as we have to build a table with at least as many columns as we
869 have unique chars.
870
871 We use an array of integers to represent the character codes 0..255
872 (trie->charmap) and we use a an HV* to store unicode characters. We use the
873 native representation of the character value as the key and IV's for the
874 coded index.
875
876 *TODO* If we keep track of how many times each character is used we can
877 remap the columns so that the table compression later on is more
878 efficient in terms of memory by ensuring most common value is in the
879 middle and the least common are on the outside. IMO this would be better
880 than a most to least common mapping as theres a decent chance the most
881 common letter will share a node with the least common, meaning the node
882 will not be compressable. With a middle is most common approach the worst
883 case is when we have the least common nodes twice.
884
885 */
886
887
888 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
889 regnode *noper = NEXTOPER( cur );
e1ec3a88 890 const U8 *uc = (U8*)STRING( noper );
a28509cc 891 const U8 * const e = uc + STR_LEN( noper );
a3621e74
YO
892 STRLEN foldlen = 0;
893 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2af232bd 894 const U8 *scan = (U8*)NULL;
a3621e74
YO
895
896 for ( ; uc < e ; uc += len ) {
897 trie->charcount++;
898 TRIE_READ_CHAR;
899 if ( uvc < 256 ) {
900 if ( !trie->charmap[ uvc ] ) {
901 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
902 if ( folder )
903 trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
904 TRIE_DEBUG_CHAR;
905 }
906 } else {
907 SV** svpp;
908 if ( !trie->widecharmap )
909 trie->widecharmap = newHV();
910
911 svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 1 );
912
913 if ( !svpp )
e4584336 914 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
a3621e74
YO
915
916 if ( !SvTRUE( *svpp ) ) {
917 sv_setiv( *svpp, ++trie->uniquecharcount );
918 TRIE_DEBUG_CHAR;
919 }
920 }
921 }
922 trie->wordcount++;
923 } /* end first pass */
924 DEBUG_TRIE_COMPILE_r(
925 PerlIO_printf( Perl_debug_log, "TRIE(%s): W:%d C:%d Uq:%d \n",
926 ( trie->widecharmap ? "UTF8" : "NATIVE" ), trie->wordcount,
5d7488b2 927 (int)trie->charcount, trie->uniquecharcount )
a3621e74
YO
928 );
929
930
931 /*
932 We now know what we are dealing with in terms of unique chars and
933 string sizes so we can calculate how much memory a naive
0111c4fd
RGS
934 representation using a flat table will take. If it's over a reasonable
935 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
a3621e74
YO
936 conservative but potentially much slower representation using an array
937 of lists.
938
939 At the end we convert both representations into the same compressed
940 form that will be used in regexec.c for matching with. The latter
941 is a form that cannot be used to construct with but has memory
942 properties similar to the list form and access properties similar
943 to the table form making it both suitable for fast searches and
944 small enough that its feasable to store for the duration of a program.
945
946 See the comment in the code where the compressed table is produced
947 inplace from the flat tabe representation for an explanation of how
948 the compression works.
949
950 */
951
952
953 if ( (IV)( ( trie->charcount + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
954 /*
955 Second Pass -- Array Of Lists Representation
956
957 Each state will be represented by a list of charid:state records
958 (reg_trie_trans_le) the first such element holds the CUR and LEN
959 points of the allocated array. (See defines above).
960
961 We build the initial structure using the lists, and then convert
962 it into the compressed table form which allows faster lookups
963 (but cant be modified once converted).
964
965
966 */
967
968
969 STRLEN transcount = 1;
970
a02a5408 971 Newxz( trie->states, trie->charcount + 2, reg_trie_state );
a3621e74
YO
972 TRIE_LIST_NEW(1);
973 next_alloc = 2;
974
975 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
976
977 regnode *noper = NEXTOPER( cur );
978 U8 *uc = (U8*)STRING( noper );
a28509cc 979 const U8 * const e = uc + STR_LEN( noper );
a3621e74
YO
980 U32 state = 1; /* required init */
981 U16 charid = 0; /* sanity init */
982 U8 *scan = (U8*)NULL; /* sanity init */
983 STRLEN foldlen = 0; /* required init */
984 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
985
986
987 for ( ; uc < e ; uc += len ) {
988
989 TRIE_READ_CHAR;
990
991 if ( uvc < 256 ) {
992 charid = trie->charmap[ uvc ];
993 } else {
994 SV** svpp=(SV**)NULL;
995 svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 0);
996 if ( !svpp ) {
997 charid = 0;
998 } else {
999 charid=(U16)SvIV( *svpp );
1000 }
1001 }
1002 if ( charid ) {
1003
1004 U16 check;
1005 U32 newstate = 0;
1006
1007 charid--;
1008 if ( !trie->states[ state ].trans.list ) {
1009 TRIE_LIST_NEW( state );
1010 }
1011 for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1012 if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1013 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1014 break;
1015 }
a28509cc
AL
1016 }
1017 if ( ! newstate ) {
1018 newstate = next_alloc++;
1019 TRIE_LIST_PUSH( state, charid, newstate );
1020 transcount++;
1021 }
1022 state = newstate;
a3621e74 1023 } else {
e4584336 1024 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
a3621e74
YO
1025 }
1026 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1027 }
1028
1029 if ( !trie->states[ state ].wordnum ) {
1030 /* we havent inserted this word into the structure yet. */
1031 trie->states[ state ].wordnum = ++curword;
1032
1033 DEBUG_r({
1034 /* store the word for dumping */
1035 SV* tmp = newSVpvn( STRING( noper ), STR_LEN( noper ) );
1036 if ( UTF ) SvUTF8_on( tmp );
1037 av_push( trie->words, tmp );
1038 });
1039
1040 } else {
1041 /* Its a dupe. So ignore it. */
1042 }
1043
1044 } /* end second pass */
1045
1046 trie->laststate = next_alloc;
1047 Renew( trie->states, next_alloc, reg_trie_state );
1048
1049 DEBUG_TRIE_COMPILE_MORE_r({
1050 U32 state;
a3621e74 1051
a28509cc 1052 /* print out the table precompression. */
a3621e74
YO
1053
1054 PerlIO_printf( Perl_debug_log, "\nState :Word | Transition Data\n" );
1055 PerlIO_printf( Perl_debug_log, "------:-----+-----------------" );
1056
1057 for( state=1 ; state < next_alloc ; state ++ ) {
a28509cc 1058 U16 charid;
a3621e74 1059
e4584336 1060 PerlIO_printf( Perl_debug_log, "\n %04"UVXf" :", (UV)state );
a3621e74
YO
1061 if ( ! trie->states[ state ].wordnum ) {
1062 PerlIO_printf( Perl_debug_log, "%5s| ","");
1063 } else {
e4584336 1064 PerlIO_printf( Perl_debug_log, "W%04x| ",
a3621e74
YO
1065 trie->states[ state ].wordnum
1066 );
1067 }
1068 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
1069 SV **tmp = av_fetch( trie->revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
e4584336 1070 PerlIO_printf( Perl_debug_log, "%s:%3X=%04"UVXf" | ",
cfd0369c 1071 SvPV_nolen_const( *tmp ),
a3621e74 1072 TRIE_LIST_ITEM(state,charid).forid,
e4584336 1073 (UV)TRIE_LIST_ITEM(state,charid).newstate
a3621e74
YO
1074 );
1075 }
1076
1077 }
1078 PerlIO_printf( Perl_debug_log, "\n\n" );
1079 });
1080
a02a5408 1081 Newxz( trie->trans, transcount ,reg_trie_trans );
a3621e74
YO
1082 {
1083 U32 state;
a3621e74
YO
1084 U32 tp = 0;
1085 U32 zp = 0;
1086
1087
1088 for( state=1 ; state < next_alloc ; state ++ ) {
1089 U32 base=0;
1090
1091 /*
1092 DEBUG_TRIE_COMPILE_MORE_r(
1093 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1094 );
1095 */
1096
1097 if (trie->states[state].trans.list) {
1098 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1099 U16 maxid=minid;
a28509cc 1100 U16 idx;
a3621e74
YO
1101
1102 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1103 if ( TRIE_LIST_ITEM( state, idx).forid < minid ) {
1104 minid=TRIE_LIST_ITEM( state, idx).forid;
1105 } else if ( TRIE_LIST_ITEM( state, idx).forid > maxid ) {
1106 maxid=TRIE_LIST_ITEM( state, idx).forid;
1107 }
1108 }
1109 if ( transcount < tp + maxid - minid + 1) {
1110 transcount *= 2;
1111 Renew( trie->trans, transcount, reg_trie_trans );
1112 Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1113 }
1114 base = trie->uniquecharcount + tp - minid;
1115 if ( maxid == minid ) {
1116 U32 set = 0;
1117 for ( ; zp < tp ; zp++ ) {
1118 if ( ! trie->trans[ zp ].next ) {
1119 base = trie->uniquecharcount + zp - minid;
1120 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1121 trie->trans[ zp ].check = state;
1122 set = 1;
1123 break;
1124 }
1125 }
1126 if ( !set ) {
1127 trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1128 trie->trans[ tp ].check = state;
1129 tp++;
1130 zp = tp;
1131 }
1132 } else {
1133 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1134 U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
1135 trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1136 trie->trans[ tid ].check = state;
1137 }
1138 tp += ( maxid - minid + 1 );
1139 }
1140 Safefree(trie->states[ state ].trans.list);
1141 }
1142 /*
1143 DEBUG_TRIE_COMPILE_MORE_r(
1144 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1145 );
1146 */
1147 trie->states[ state ].trans.base=base;
1148 }
cc601c31 1149 trie->lasttrans = tp + 1;
a3621e74
YO
1150 }
1151 } else {
1152 /*
1153 Second Pass -- Flat Table Representation.
1154
1155 we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1156 We know that we will need Charcount+1 trans at most to store the data
1157 (one row per char at worst case) So we preallocate both structures
1158 assuming worst case.
1159
1160 We then construct the trie using only the .next slots of the entry
1161 structs.
1162
1163 We use the .check field of the first entry of the node temporarily to
1164 make compression both faster and easier by keeping track of how many non
1165 zero fields are in the node.
1166
1167 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1168 transition.
1169
1170 There are two terms at use here: state as a TRIE_NODEIDX() which is a
1171 number representing the first entry of the node, and state as a
1172 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1173 TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1174 are 2 entrys per node. eg:
1175
1176 A B A B
1177 1. 2 4 1. 3 7
1178 2. 0 3 3. 0 5
1179 3. 0 0 5. 0 0
1180 4. 0 0 7. 0 0
1181
1182 The table is internally in the right hand, idx form. However as we also
1183 have to deal with the states array which is indexed by nodenum we have to
1184 use TRIE_NODENUM() to convert.
1185
1186 */
1187
a02a5408 1188 Newxz( trie->trans, ( trie->charcount + 1 ) * trie->uniquecharcount + 1,
a3621e74 1189 reg_trie_trans );
a02a5408 1190 Newxz( trie->states, trie->charcount + 2, reg_trie_state );
a3621e74
YO
1191 next_alloc = trie->uniquecharcount + 1;
1192
1193 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1194
1195 regnode *noper = NEXTOPER( cur );
a28509cc
AL
1196 const U8 *uc = (U8*)STRING( noper );
1197 const U8 * const e = uc + STR_LEN( noper );
a3621e74
YO
1198
1199 U32 state = 1; /* required init */
1200
1201 U16 charid = 0; /* sanity init */
1202 U32 accept_state = 0; /* sanity init */
1203 U8 *scan = (U8*)NULL; /* sanity init */
1204
1205 STRLEN foldlen = 0; /* required init */
1206 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1207
1208
1209 for ( ; uc < e ; uc += len ) {
1210
1211 TRIE_READ_CHAR;
1212
1213 if ( uvc < 256 ) {
1214 charid = trie->charmap[ uvc ];
1215 } else {
1216 SV** svpp=(SV**)NULL;
1217 svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 0);
1218 if ( !svpp ) {
1219 charid = 0;
1220 } else {
1221 charid=(U16)SvIV( *svpp );
1222 }
1223 }
1224 if ( charid ) {
1225 charid--;
1226 if ( !trie->trans[ state + charid ].next ) {
1227 trie->trans[ state + charid ].next = next_alloc;
1228 trie->trans[ state ].check++;
1229 next_alloc += trie->uniquecharcount;
1230 }
1231 state = trie->trans[ state + charid ].next;
1232 } else {
e4584336 1233 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
a3621e74
YO
1234 }
1235 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1236 }
1237
1238 accept_state = TRIE_NODENUM( state );
1239 if ( !trie->states[ accept_state ].wordnum ) {
1240 /* we havent inserted this word into the structure yet. */
1241 trie->states[ accept_state ].wordnum = ++curword;
1242
1243 DEBUG_r({
1244 /* store the word for dumping */
1245 SV* tmp = newSVpvn( STRING( noper ), STR_LEN( noper ) );
1246 if ( UTF ) SvUTF8_on( tmp );
1247 av_push( trie->words, tmp );
1248 });
1249
1250 } else {
1251 /* Its a dupe. So ignore it. */
1252 }
1253
1254 } /* end second pass */
1255
1256 DEBUG_TRIE_COMPILE_MORE_r({
1257 /*
1258 print out the table precompression so that we can do a visual check
1259 that they are identical.
1260 */
1261 U32 state;
1262 U16 charid;
1263 PerlIO_printf( Perl_debug_log, "\nChar : " );
1264
1265 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1266 SV **tmp = av_fetch( trie->revcharmap, charid, 0);
1267 if ( tmp ) {
cfd0369c 1268 PerlIO_printf( Perl_debug_log, "%4.4s ", SvPV_nolen_const( *tmp ) );
a3621e74
YO
1269 }
1270 }
1271
1272 PerlIO_printf( Perl_debug_log, "\nState+-" );
1273
1274 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
1275 PerlIO_printf( Perl_debug_log, "%4s-", "----" );
1276 }
1277
1278 PerlIO_printf( Perl_debug_log, "\n" );
1279
1280 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1281
e4584336 1282 PerlIO_printf( Perl_debug_log, "%04"UVXf" : ", (UV)TRIE_NODENUM( state ) );
a3621e74
YO
1283
1284 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
e4584336
RB
1285 PerlIO_printf( Perl_debug_log, "%04"UVXf" ",
1286 (UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next ) );
a3621e74
YO
1287 }
1288 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
e4584336 1289 PerlIO_printf( Perl_debug_log, " (%04"UVXf")\n", (UV)trie->trans[ state ].check );
a3621e74 1290 } else {
e4584336 1291 PerlIO_printf( Perl_debug_log, " (%04"UVXf") W%04X\n", (UV)trie->trans[ state ].check,
a3621e74
YO
1292 trie->states[ TRIE_NODENUM( state ) ].wordnum );
1293 }
1294 }
1295 PerlIO_printf( Perl_debug_log, "\n\n" );
1296 });
1297 {
1298 /*
1299 * Inplace compress the table.*
1300
1301 For sparse data sets the table constructed by the trie algorithm will
1302 be mostly 0/FAIL transitions or to put it another way mostly empty.
1303 (Note that leaf nodes will not contain any transitions.)
1304
1305 This algorithm compresses the tables by eliminating most such
1306 transitions, at the cost of a modest bit of extra work during lookup:
1307
1308 - Each states[] entry contains a .base field which indicates the
1309 index in the state[] array wheres its transition data is stored.
1310
1311 - If .base is 0 there are no valid transitions from that node.
1312
1313 - If .base is nonzero then charid is added to it to find an entry in
1314 the trans array.
1315
1316 -If trans[states[state].base+charid].check!=state then the
1317 transition is taken to be a 0/Fail transition. Thus if there are fail
1318 transitions at the front of the node then the .base offset will point
1319 somewhere inside the previous nodes data (or maybe even into a node
1320 even earlier), but the .check field determines if the transition is
1321 valid.
1322
1323 The following process inplace converts the table to the compressed
1324 table: We first do not compress the root node 1,and mark its all its
1325 .check pointers as 1 and set its .base pointer as 1 as well. This
1326 allows to do a DFA construction from the compressed table later, and
1327 ensures that any .base pointers we calculate later are greater than
1328 0.
1329
1330 - We set 'pos' to indicate the first entry of the second node.
1331
1332 - We then iterate over the columns of the node, finding the first and
1333 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
1334 and set the .check pointers accordingly, and advance pos
1335 appropriately and repreat for the next node. Note that when we copy
1336 the next pointers we have to convert them from the original
1337 NODEIDX form to NODENUM form as the former is not valid post
1338 compression.
1339
1340 - If a node has no transitions used we mark its base as 0 and do not
1341 advance the pos pointer.
1342
1343 - If a node only has one transition we use a second pointer into the
1344 structure to fill in allocated fail transitions from other states.
1345 This pointer is independent of the main pointer and scans forward
1346 looking for null transitions that are allocated to a state. When it
1347 finds one it writes the single transition into the "hole". If the
1348 pointer doesnt find one the single transition is appeneded as normal.
1349
1350 - Once compressed we can Renew/realloc the structures to release the
1351 excess space.
1352
1353 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
1354 specifically Fig 3.47 and the associated pseudocode.
1355
1356 demq
1357 */
a3b680e6 1358 const U32 laststate = TRIE_NODENUM( next_alloc );
a28509cc 1359 U32 state, charid;
a3621e74
YO
1360 U32 pos = 0, zp=0;
1361 trie->laststate = laststate;
1362
1363 for ( state = 1 ; state < laststate ; state++ ) {
1364 U8 flag = 0;
a28509cc
AL
1365 const U32 stateidx = TRIE_NODEIDX( state );
1366 const U32 o_used = trie->trans[ stateidx ].check;
1367 U32 used = trie->trans[ stateidx ].check;
a3621e74
YO
1368 trie->trans[ stateidx ].check = 0;
1369
1370 for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
1371 if ( flag || trie->trans[ stateidx + charid ].next ) {
1372 if ( trie->trans[ stateidx + charid ].next ) {
1373 if (o_used == 1) {
1374 for ( ; zp < pos ; zp++ ) {
1375 if ( ! trie->trans[ zp ].next ) {
1376 break;
1377 }
1378 }
1379 trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
1380 trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1381 trie->trans[ zp ].check = state;
1382 if ( ++zp > pos ) pos = zp;
1383 break;
1384 }
1385 used--;
1386 }
1387 if ( !flag ) {
1388 flag = 1;
1389 trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
1390 }
1391 trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1392 trie->trans[ pos ].check = state;
1393 pos++;
1394 }
1395 }
1396 }
cc601c31 1397 trie->lasttrans = pos + 1;
a3621e74
YO
1398 Renew( trie->states, laststate + 1, reg_trie_state);
1399 DEBUG_TRIE_COMPILE_MORE_r(
e4584336
RB
1400 PerlIO_printf( Perl_debug_log,
1401 " Alloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
5d7488b2
AL
1402 (int)( ( trie->charcount + 1 ) * trie->uniquecharcount + 1 ),
1403 (IV)next_alloc,
1404 (IV)pos,
a3621e74
YO
1405 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
1406 );
1407
1408 } /* end table compress */
1409 }
cc601c31
YO
1410 /* resize the trans array to remove unused space */
1411 Renew( trie->trans, trie->lasttrans, reg_trie_trans);
a3621e74
YO
1412
1413 DEBUG_TRIE_COMPILE_r({
1414 U32 state;
1415 /*
1416 Now we print it out again, in a slightly different form as there is additional
1417 info we want to be able to see when its compressed. They are close enough for
1418 visual comparison though.
1419 */
1420 PerlIO_printf( Perl_debug_log, "\nChar : %-6s%-6s%-4s ","Match","Base","Ofs" );
1421
1422 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
1423 SV **tmp = av_fetch( trie->revcharmap, state, 0);
1424 if ( tmp ) {
cfd0369c 1425 PerlIO_printf( Perl_debug_log, "%4.4s ", SvPV_nolen_const( *tmp ) );
a3621e74
YO
1426 }
1427 }
1428 PerlIO_printf( Perl_debug_log, "\n-----:-----------------------");
cc601c31 1429
a3621e74
YO
1430 for( state = 0 ; state < trie->uniquecharcount ; state++ )
1431 PerlIO_printf( Perl_debug_log, "-----");
1432 PerlIO_printf( Perl_debug_log, "\n");
cc601c31 1433
a3621e74 1434 for( state = 1 ; state < trie->laststate ; state++ ) {
a28509cc 1435 const U32 base = trie->states[ state ].trans.base;
a3621e74 1436
e4584336 1437 PerlIO_printf( Perl_debug_log, "#%04"UVXf" ", (UV)state);
a3621e74
YO
1438
1439 if ( trie->states[ state ].wordnum ) {
1440 PerlIO_printf( Perl_debug_log, " W%04X", trie->states[ state ].wordnum );
1441 } else {
1442 PerlIO_printf( Perl_debug_log, "%6s", "" );
1443 }
1444
e4584336 1445 PerlIO_printf( Perl_debug_log, " @%04"UVXf" ", (UV)base );
a3621e74
YO
1446
1447 if ( base ) {
1448 U32 ofs = 0;
1449
cc601c31
YO
1450 while( ( base + ofs < trie->uniquecharcount ) ||
1451 ( base + ofs - trie->uniquecharcount < trie->lasttrans
1452 && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
a3621e74
YO
1453 ofs++;
1454
e4584336 1455 PerlIO_printf( Perl_debug_log, "+%02"UVXf"[ ", (UV)ofs);
a3621e74
YO
1456
1457 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
cc601c31
YO
1458 if ( ( base + ofs >= trie->uniquecharcount ) &&
1459 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
a3621e74
YO
1460 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1461 {
e4584336
RB
1462 PerlIO_printf( Perl_debug_log, "%04"UVXf" ",
1463 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
a3621e74
YO
1464 } else {
1465 PerlIO_printf( Perl_debug_log, "%4s "," 0" );
1466 }
1467 }
1468
e4584336 1469 PerlIO_printf( Perl_debug_log, "]");
a3621e74
YO
1470
1471 }
1472 PerlIO_printf( Perl_debug_log, "\n" );
1473 }
1474 });
1475
1476 {
1477 /* now finally we "stitch in" the new TRIE node
1478 This means we convert either the first branch or the first Exact,
1479 depending on whether the thing following (in 'last') is a branch
1480 or not and whther first is the startbranch (ie is it a sub part of
1481 the alternation or is it the whole thing.)
1482 Assuming its a sub part we conver the EXACT otherwise we convert
1483 the whole branch sequence, including the first.
1484 */
1485 regnode *convert;
1486
1487
1488
1489
1490 if ( first == startbranch && OP( last ) != BRANCH ) {
1491 convert = first;
1492 } else {
1493 convert = NEXTOPER( first );
1494 NEXT_OFF( first ) = (U16)(last - first);
1495 }
1496
1497 OP( convert ) = TRIE + (U8)( flags - EXACT );
1498 NEXT_OFF( convert ) = (U16)(tail - convert);
1499 ARG_SET( convert, data_slot );
1500
1501 /* tells us if we need to handle accept buffers specially */
1502 convert->flags = ( RExC_seen_evals ? 1 : 0 );
1503
1504
1505 /* needed for dumping*/
1506 DEBUG_r({
1507 regnode *optimize = convert + NODE_STEP_REGNODE + regarglen[ TRIE ];
1508 /* We now need to mark all of the space originally used by the
1509 branches as optimized away. This keeps the dumpuntil from
1510 throwing a wobbly as it doesnt use regnext() to traverse the
1511 opcodes.
1512 */
1513 while( optimize < last ) {
1514 OP( optimize ) = OPTIMIZED;
1515 optimize++;
1516 }
1517 });
1518 } /* end node insert */
1519 return 1;
1520}
1521
1522
1523
1524/*
5d1c421c
JH
1525 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
1526 * These need to be revisited when a newer toolchain becomes available.
1527 */
1528#if defined(__sparc64__) && defined(__GNUC__)
1529# if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
1530# undef SPARC64_GCC_WORKAROUND
1531# define SPARC64_GCC_WORKAROUND 1
1532# endif
1533#endif
1534
653099ff
GS
1535/* REx optimizer. Converts nodes into quickier variants "in place".
1536 Finds fixed substrings. */
1537
a0288114 1538/* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
c277df42
IZ
1539 to the position after last scanned or to NULL. */
1540
a3621e74 1541
76e3520e 1542STATIC I32
9a957fbc
AL
1543S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap,
1544 regnode *last, scan_data_t *data, U32 flags, U32 depth)
c277df42
IZ
1545 /* scanp: Start here (read-write). */
1546 /* deltap: Write maxlen-minlen here. */
1547 /* last: Stop before this one. */
1548{
1549 I32 min = 0, pars = 0, code;
1550 regnode *scan = *scanp, *next;
1551 I32 delta = 0;
1552 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
aca2d497 1553 int is_inf_internal = 0; /* The studied chunk is infinite */
c277df42
IZ
1554 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
1555 scan_data_t data_fake;
653099ff 1556 struct regnode_charclass_class and_with; /* Valid if flags & SCF_DO_STCLASS_OR */
a3621e74
YO
1557 SV *re_trie_maxbuff = NULL;
1558
1559 GET_RE_DEBUG_FLAGS_DECL;
b81d288d 1560
c277df42
IZ
1561 while (scan && OP(scan) != END && scan < last) {
1562 /* Peephole optimizer: */
a3621e74
YO
1563 DEBUG_OPTIMISE_r({
1564 SV *mysv=sv_newmortal();
1565 regprop( mysv, scan);
e4584336 1566 PerlIO_printf(Perl_debug_log, "%*speep: %s (0x%08"UVXf")\n",
cfd0369c 1567 (int)depth*2, "", SvPV_nolen_const(mysv), PTR2UV(scan));
a3621e74 1568 });
c277df42 1569
22c35a8c 1570 if (PL_regkind[(U8)OP(scan)] == EXACT) {
653099ff 1571 /* Merge several consecutive EXACTish nodes into one. */
c277df42
IZ
1572 regnode *n = regnext(scan);
1573 U32 stringok = 1;
1574#ifdef DEBUGGING
1575 regnode *stop = scan;
b81d288d 1576#endif
c277df42 1577
cd439c50 1578 next = scan + NODE_SZ_STR(scan);
c277df42
IZ
1579 /* Skip NOTHING, merge EXACT*. */
1580 while (n &&
b81d288d 1581 ( PL_regkind[(U8)OP(n)] == NOTHING ||
c277df42
IZ
1582 (stringok && (OP(n) == OP(scan))))
1583 && NEXT_OFF(n)
1584 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
1585 if (OP(n) == TAIL || n > next)
1586 stringok = 0;
22c35a8c 1587 if (PL_regkind[(U8)OP(n)] == NOTHING) {
c277df42
IZ
1588 NEXT_OFF(scan) += NEXT_OFF(n);
1589 next = n + NODE_STEP_REGNODE;
1590#ifdef DEBUGGING
1591 if (stringok)
1592 stop = n;
b81d288d 1593#endif
c277df42 1594 n = regnext(n);
a0ed51b3 1595 }
f49d4d0f 1596 else if (stringok) {
a3b680e6 1597 const int oldl = STR_LEN(scan);
c277df42 1598 regnode *nnext = regnext(n);
f49d4d0f 1599
b81d288d 1600 if (oldl + STR_LEN(n) > U8_MAX)
c277df42
IZ
1601 break;
1602 NEXT_OFF(scan) += NEXT_OFF(n);
cd439c50
IZ
1603 STR_LEN(scan) += STR_LEN(n);
1604 next = n + NODE_SZ_STR(n);
c277df42 1605 /* Now we can overwrite *n : */
f49d4d0f 1606 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
c277df42 1607#ifdef DEBUGGING
f49d4d0f 1608 stop = next - 1;
b81d288d 1609#endif
c277df42
IZ
1610 n = nnext;
1611 }
1612 }
61a36c01 1613
a3621e74 1614 if (UTF && ( OP(scan) == EXACTF ) && ( STR_LEN(scan) >= 6 ) ) {
61a36c01
JH
1615/*
1616 Two problematic code points in Unicode casefolding of EXACT nodes:
1617
1618 U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
1619 U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
1620
1621 which casefold to
1622
1623 Unicode UTF-8
1624
1625 U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
1626 U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
1627
1628 This means that in case-insensitive matching (or "loose matching",
1629 as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
1630 length of the above casefolded versions) can match a target string
1631 of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
1632 This would rather mess up the minimum length computation.
1633
1634 What we'll do is to look for the tail four bytes, and then peek
1635 at the preceding two bytes to see whether we need to decrease
1636 the minimum length by four (six minus two).
1637
1638 Thanks to the design of UTF-8, there cannot be false matches:
1639 A sequence of valid UTF-8 bytes cannot be a subsequence of
1640 another valid sequence of UTF-8 bytes.
1641
1642*/
1643 char *s0 = STRING(scan), *s, *t;
1644 char *s1 = s0 + STR_LEN(scan) - 1, *s2 = s1 - 4;
a28509cc
AL
1645 const char * const t0 = "\xcc\x88\xcc\x81";
1646 const char * const t1 = t0 + 3;
2af232bd 1647
61a36c01
JH
1648 for (s = s0 + 2;
1649 s < s2 && (t = ninstr(s, s1, t0, t1));
1650 s = t + 4) {
1651 if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
1652 ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
1653 min -= 4;
1654 }
1655 }
1656
c277df42
IZ
1657#ifdef DEBUGGING
1658 /* Allow dumping */
cd439c50 1659 n = scan + NODE_SZ_STR(scan);
c277df42 1660 while (n <= stop) {
22c35a8c 1661 if (PL_regkind[(U8)OP(n)] != NOTHING || OP(n) == NOTHING) {
c277df42
IZ
1662 OP(n) = OPTIMIZED;
1663 NEXT_OFF(n) = 0;
1664 }
1665 n++;
1666 }
653099ff 1667#endif
c277df42 1668 }
a3621e74
YO
1669
1670
1671
653099ff
GS
1672 /* Follow the next-chain of the current node and optimize
1673 away all the NOTHINGs from it. */
c277df42 1674 if (OP(scan) != CURLYX) {
a3b680e6 1675 const int max = (reg_off_by_arg[OP(scan)]
048cfca1
GS
1676 ? I32_MAX
1677 /* I32 may be smaller than U16 on CRAYs! */
1678 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
c277df42
IZ
1679 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
1680 int noff;
1681 regnode *n = scan;
b81d288d 1682
c277df42
IZ
1683 /* Skip NOTHING and LONGJMP. */
1684 while ((n = regnext(n))
22c35a8c 1685 && ((PL_regkind[(U8)OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
c277df42
IZ
1686 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
1687 && off + noff < max)
1688 off += noff;
1689 if (reg_off_by_arg[OP(scan)])
1690 ARG(scan) = off;
b81d288d 1691 else
c277df42
IZ
1692 NEXT_OFF(scan) = off;
1693 }
a3621e74 1694
653099ff
GS
1695 /* The principal pseudo-switch. Cannot be a switch, since we
1696 look into several different things. */
b81d288d 1697 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
c277df42
IZ
1698 || OP(scan) == IFTHEN || OP(scan) == SUSPEND) {
1699 next = regnext(scan);
1700 code = OP(scan);
a3621e74 1701 /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
b81d288d
AB
1702
1703 if (OP(next) == code || code == IFTHEN || code == SUSPEND) {
c277df42 1704 I32 max1 = 0, min1 = I32_MAX, num = 0;
653099ff 1705 struct regnode_charclass_class accum;
a3621e74 1706 regnode *startbranch=scan;
c277df42 1707
653099ff 1708 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
830247a4 1709 scan_commit(pRExC_state, data); /* Cannot merge strings after this. */
653099ff 1710 if (flags & SCF_DO_STCLASS)
830247a4 1711 cl_init_zero(pRExC_state, &accum);
a3621e74 1712
c277df42 1713 while (OP(scan) == code) {
830247a4 1714 I32 deltanext, minnext, f = 0, fake;
653099ff 1715 struct regnode_charclass_class this_class;
c277df42
IZ
1716
1717 num++;
1718 data_fake.flags = 0;
b81d288d 1719 if (data) {
2c2d71f5 1720 data_fake.whilem_c = data->whilem_c;
cb434fcc
IZ
1721 data_fake.last_closep = data->last_closep;
1722 }
1723 else
1724 data_fake.last_closep = &fake;
c277df42
IZ
1725 next = regnext(scan);
1726 scan = NEXTOPER(scan);
1727 if (code != BRANCH)
1728 scan = NEXTOPER(scan);
653099ff 1729 if (flags & SCF_DO_STCLASS) {
830247a4 1730 cl_init(pRExC_state, &this_class);
653099ff
GS
1731 data_fake.start_class = &this_class;
1732 f = SCF_DO_STCLASS_AND;
b81d288d 1733 }
e1901655
IZ
1734 if (flags & SCF_WHILEM_VISITED_POS)
1735 f |= SCF_WHILEM_VISITED_POS;
a3621e74 1736
653099ff 1737 /* we suppose the run is continuous, last=next...*/
830247a4 1738 minnext = study_chunk(pRExC_state, &scan, &deltanext,
a3621e74 1739 next, &data_fake, f,depth+1);
b81d288d 1740 if (min1 > minnext)
c277df42
IZ
1741 min1 = minnext;
1742 if (max1 < minnext + deltanext)
1743 max1 = minnext + deltanext;
1744 if (deltanext == I32_MAX)
aca2d497 1745 is_inf = is_inf_internal = 1;
c277df42
IZ
1746 scan = next;
1747 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
1748 pars++;
405ff068 1749 if (data && (data_fake.flags & SF_HAS_EVAL))
c277df42 1750 data->flags |= SF_HAS_EVAL;
2c2d71f5
JH
1751 if (data)
1752 data->whilem_c = data_fake.whilem_c;
653099ff 1753 if (flags & SCF_DO_STCLASS)
830247a4 1754 cl_or(pRExC_state, &accum, &this_class);
b81d288d 1755 if (code == SUSPEND)
c277df42
IZ
1756 break;
1757 }
1758 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
1759 min1 = 0;
1760 if (flags & SCF_DO_SUBSTR) {
1761 data->pos_min += min1;
1762 data->pos_delta += max1 - min1;
1763 if (max1 != min1 || is_inf)
1764 data->longest = &(data->longest_float);
1765 }
1766 min += min1;
1767 delta += max1 - min1;
653099ff 1768 if (flags & SCF_DO_STCLASS_OR) {
830247a4 1769 cl_or(pRExC_state, data->start_class, &accum);
653099ff
GS
1770 if (min1) {
1771 cl_and(data->start_class, &and_with);
1772 flags &= ~SCF_DO_STCLASS;
1773 }
1774 }
1775 else if (flags & SCF_DO_STCLASS_AND) {
de0c8cb8
GS
1776 if (min1) {
1777 cl_and(data->start_class, &accum);
653099ff 1778 flags &= ~SCF_DO_STCLASS;
de0c8cb8
GS
1779 }
1780 else {
b81d288d 1781 /* Switch to OR mode: cache the old value of
de0c8cb8
GS
1782 * data->start_class */
1783 StructCopy(data->start_class, &and_with,
1784 struct regnode_charclass_class);
1785 flags &= ~SCF_DO_STCLASS_AND;
1786 StructCopy(&accum, data->start_class,
1787 struct regnode_charclass_class);
1788 flags |= SCF_DO_STCLASS_OR;
1789 data->start_class->flags |= ANYOF_EOS;
1790 }
653099ff 1791 }
a3621e74
YO
1792
1793 /* demq.
1794
1795 Assuming this was/is a branch we are dealing with: 'scan' now
1796 points at the item that follows the branch sequence, whatever
1797 it is. We now start at the beginning of the sequence and look
1798 for subsequences of
1799
1800 BRANCH->EXACT=>X
1801 BRANCH->EXACT=>X
1802
1803 which would be constructed from a pattern like /A|LIST|OF|WORDS/
1804
1805 If we can find such a subseqence we need to turn the first
1806 element into a trie and then add the subsequent branch exact
1807 strings to the trie.
1808
1809 We have two cases
1810
1811 1. patterns where the whole set of branch can be converted to a trie,
1812
1813 2. patterns where only a subset of the alternations can be
1814 converted to a trie.
1815
1816 In case 1 we can replace the whole set with a single regop
1817 for the trie. In case 2 we need to keep the start and end
1818 branchs so
1819
1820 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
1821 becomes BRANCH TRIE; BRANCH X;
1822
1823 Hypthetically when we know the regex isnt anchored we can
1824 turn a case 1 into a DFA and let it rip... Every time it finds a match
1825 it would just call its tail, no WHILEM/CURLY needed.
1826
1827 */
0111c4fd
RGS
1828 if (DO_TRIE) {
1829 if (!re_trie_maxbuff) {
1830 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
1831 if (!SvIOK(re_trie_maxbuff))
1832 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
1833 }
a3621e74
YO
1834 if ( SvIV(re_trie_maxbuff)>=0 && OP( startbranch )==BRANCH ) {
1835 regnode *cur;
1836 regnode *first = (regnode *)NULL;
1837 regnode *last = (regnode *)NULL;
1838 regnode *tail = scan;
1839 U8 optype = 0;
1840 U32 count=0;
1841
1842#ifdef DEBUGGING
1843 SV *mysv = sv_newmortal(); /* for dumping */
1844#endif
1845 /* var tail is used because there may be a TAIL
1846 regop in the way. Ie, the exacts will point to the
1847 thing following the TAIL, but the last branch will
1848 point at the TAIL. So we advance tail. If we
1849 have nested (?:) we may have to move through several
1850 tails.
1851 */
1852
1853 while ( OP( tail ) == TAIL ) {
1854 /* this is the TAIL generated by (?:) */
1855 tail = regnext( tail );
1856 }
1857
1858 DEBUG_OPTIMISE_r({
1859 regprop( mysv, tail );
1860 PerlIO_printf( Perl_debug_log, "%*s%s%s%s\n",
cfd0369c 1861 (int)depth * 2 + 2, "", "Tail node is:", SvPV_nolen_const( mysv ),
a3621e74
YO
1862 (RExC_seen_evals) ? "[EVAL]" : ""
1863 );
1864 });
1865 /*
1866
1867 step through the branches, cur represents each
1868 branch, noper is the first thing to be matched
1869 as part of that branch and noper_next is the
1870 regnext() of that node. if noper is an EXACT
1871 and noper_next is the same as scan (our current
1872 position in the regex) then the EXACT branch is
1873 a possible optimization target. Once we have
1874 two or more consequetive such branches we can
1875 create a trie of the EXACT's contents and stich
1876 it in place. If the sequence represents all of
1877 the branches we eliminate the whole thing and
1878 replace it with a single TRIE. If it is a
1879 subsequence then we need to stitch it in. This
1880 means the first branch has to remain, and needs
1881 to be repointed at the item on the branch chain
1882 following the last branch optimized. This could
1883 be either a BRANCH, in which case the
1884 subsequence is internal, or it could be the
1885 item following the branch sequence in which
1886 case the subsequence is at the end.
1887
1888 */
1889
1890 /* dont use tail as the end marker for this traverse */
1891 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
aec46f14
AL
1892 regnode * const noper = NEXTOPER( cur );
1893 regnode * const noper_next = regnext( noper );
a3621e74 1894
a3621e74
YO
1895 DEBUG_OPTIMISE_r({
1896 regprop( mysv, cur);
1897 PerlIO_printf( Perl_debug_log, "%*s%s",
cfd0369c 1898 (int)depth * 2 + 2," ", SvPV_nolen_const( mysv ) );
a3621e74
YO
1899
1900 regprop( mysv, noper);
1901 PerlIO_printf( Perl_debug_log, " -> %s",
cfd0369c 1902 SvPV_nolen_const(mysv));
a3621e74
YO
1903
1904 if ( noper_next ) {
1905 regprop( mysv, noper_next );
1906 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
cfd0369c 1907 SvPV_nolen_const(mysv));
a3621e74
YO
1908 }
1909 PerlIO_printf( Perl_debug_log, "0x%p,0x%p,0x%p)\n",
1910 first, last, cur );
1911 });
1912 if ( ( first ? OP( noper ) == optype
1913 : PL_regkind[ (U8)OP( noper ) ] == EXACT )
1914 && noper_next == tail && count<U16_MAX)
1915 {
1916 count++;
1917 if ( !first ) {
1918 first = cur;
1919 optype = OP( noper );
1920 } else {
1921 DEBUG_OPTIMISE_r(
1922 if (!last ) {
1923 regprop( mysv, first);
1924 PerlIO_printf( Perl_debug_log, "%*s%s",
cfd0369c 1925 (int)depth * 2 + 2, "F:", SvPV_nolen_const( mysv ) );
a3621e74
YO
1926 regprop( mysv, NEXTOPER(first) );
1927 PerlIO_printf( Perl_debug_log, " -> %s\n",
cfd0369c 1928 SvPV_nolen_const( mysv ) );
a3621e74
YO
1929 }
1930 );
1931 last = cur;
1932 DEBUG_OPTIMISE_r({
1933 regprop( mysv, cur);
1934 PerlIO_printf( Perl_debug_log, "%*s%s",
cfd0369c 1935 (int)depth * 2 + 2, "N:", SvPV_nolen_const( mysv ) );
a3621e74
YO
1936 regprop( mysv, noper );
1937 PerlIO_printf( Perl_debug_log, " -> %s\n",
cfd0369c 1938 SvPV_nolen_const( mysv ) );
a3621e74
YO
1939 });
1940 }
1941 } else {
1942 if ( last ) {
1943 DEBUG_OPTIMISE_r(
1944 PerlIO_printf( Perl_debug_log, "%*s%s\n",
e4584336 1945 (int)depth * 2 + 2, "E:", "**END**" );
a3621e74
YO
1946 );
1947 make_trie( pRExC_state, startbranch, first, cur, tail, optype );
1948 }
1949 if ( PL_regkind[ (U8)OP( noper ) ] == EXACT
1950 && noper_next == tail )
1951 {
1952 count = 1;
1953 first = cur;
1954 optype = OP( noper );
1955 } else {
1956 count = 0;
1957 first = NULL;
1958 optype = 0;
1959 }
1960 last = NULL;
1961 }
1962 }
1963 DEBUG_OPTIMISE_r({
1964 regprop( mysv, cur);
1965 PerlIO_printf( Perl_debug_log,
e4584336 1966 "%*s%s\t(0x%p,0x%p,0x%p)\n", (int)depth * 2 + 2,
cfd0369c 1967 " ", SvPV_nolen_const( mysv ), first, last, cur);
a3621e74
YO
1968
1969 });
1970 if ( last ) {
1971 DEBUG_OPTIMISE_r(
1972 PerlIO_printf( Perl_debug_log, "%*s%s\n",
e4584336 1973 (int)depth * 2 + 2, "E:", "==END==" );
a3621e74
YO
1974 );
1975 make_trie( pRExC_state, startbranch, first, scan, tail, optype );
1976 }
1977 }
1978 }
a0ed51b3 1979 }
a3621e74 1980 else if ( code == BRANCHJ ) { /* single branch is optimized. */
c277df42 1981 scan = NEXTOPER(NEXTOPER(scan));
a3621e74 1982 } else /* single branch is optimized. */
c277df42
IZ
1983 scan = NEXTOPER(scan);
1984 continue;
a0ed51b3
LW
1985 }
1986 else if (OP(scan) == EXACT) {
cd439c50 1987 I32 l = STR_LEN(scan);
1aa99e6b 1988 UV uc = *((U8*)STRING(scan));
a0ed51b3 1989 if (UTF) {
a3b680e6 1990 const U8 * const s = (U8*)STRING(scan);
1aa99e6b 1991 l = utf8_length(s, s + l);
9041c2e3 1992 uc = utf8_to_uvchr(s, NULL);
a0ed51b3
LW
1993 }
1994 min += l;
c277df42 1995 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
c277df42
IZ
1996 /* The code below prefers earlier match for fixed
1997 offset, later match for variable offset. */
1998 if (data->last_end == -1) { /* Update the start info. */
1999 data->last_start_min = data->pos_min;
2000 data->last_start_max = is_inf
b81d288d 2001 ? I32_MAX : data->pos_min + data->pos_delta;
c277df42 2002 }
cd439c50 2003 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
0eda9292 2004 {
9a957fbc 2005 SV * const sv = data->last_found;
a28509cc 2006 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
0eda9292
JH
2007 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2008 if (mg && mg->mg_len >= 0)
5e43f467
JH
2009 mg->mg_len += utf8_length((U8*)STRING(scan),
2010 (U8*)STRING(scan)+STR_LEN(scan));
0eda9292 2011 }
33b8afdf
JH
2012 if (UTF)
2013 SvUTF8_on(data->last_found);
c277df42
IZ
2014 data->last_end = data->pos_min + l;
2015 data->pos_min += l; /* As in the first entry. */
2016 data->flags &= ~SF_BEFORE_EOL;
2017 }
653099ff
GS
2018 if (flags & SCF_DO_STCLASS_AND) {
2019 /* Check whether it is compatible with what we know already! */
2020 int compat = 1;
2021
1aa99e6b 2022 if (uc >= 0x100 ||
516a5887 2023 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
1aa99e6b 2024 && !ANYOF_BITMAP_TEST(data->start_class, uc)
653099ff 2025 && (!(data->start_class->flags & ANYOF_FOLD)
1aa99e6b 2026 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
516a5887 2027 )
653099ff
GS
2028 compat = 0;
2029 ANYOF_CLASS_ZERO(data->start_class);
2030 ANYOF_BITMAP_ZERO(data->start_class);
2031 if (compat)
1aa99e6b 2032 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff 2033 data->start_class->flags &= ~ANYOF_EOS;
9b877dbb
IH
2034 if (uc < 0x100)
2035 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
653099ff
GS
2036 }
2037 else if (flags & SCF_DO_STCLASS_OR) {
2038 /* false positive possible if the class is case-folded */
1aa99e6b 2039 if (uc < 0x100)
9b877dbb
IH
2040 ANYOF_BITMAP_SET(data->start_class, uc);
2041 else
2042 data->start_class->flags |= ANYOF_UNICODE_ALL;
653099ff
GS
2043 data->start_class->flags &= ~ANYOF_EOS;
2044 cl_and(data->start_class, &and_with);
2045 }
2046 flags &= ~SCF_DO_STCLASS;
a0ed51b3 2047 }
653099ff 2048 else if (PL_regkind[(U8)OP(scan)] == EXACT) { /* But OP != EXACT! */
cd439c50 2049 I32 l = STR_LEN(scan);
1aa99e6b 2050 UV uc = *((U8*)STRING(scan));
653099ff
GS
2051
2052 /* Search for fixed substrings supports EXACT only. */
b81d288d 2053 if (flags & SCF_DO_SUBSTR)
830247a4 2054 scan_commit(pRExC_state, data);
a0ed51b3 2055 if (UTF) {
1aa99e6b
IH
2056 U8 *s = (U8 *)STRING(scan);
2057 l = utf8_length(s, s + l);
9041c2e3 2058 uc = utf8_to_uvchr(s, NULL);
a0ed51b3
LW
2059 }
2060 min += l;
c277df42 2061 if (data && (flags & SCF_DO_SUBSTR))
a0ed51b3 2062 data->pos_min += l;
653099ff
GS
2063 if (flags & SCF_DO_STCLASS_AND) {
2064 /* Check whether it is compatible with what we know already! */
2065 int compat = 1;
2066
1aa99e6b 2067 if (uc >= 0x100 ||
516a5887 2068 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
1aa99e6b 2069 && !ANYOF_BITMAP_TEST(data->start_class, uc)
516a5887 2070 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
653099ff
GS
2071 compat = 0;
2072 ANYOF_CLASS_ZERO(data->start_class);
2073 ANYOF_BITMAP_ZERO(data->start_class);
2074 if (compat) {
1aa99e6b 2075 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff
GS
2076 data->start_class->flags &= ~ANYOF_EOS;
2077 data->start_class->flags |= ANYOF_FOLD;
2078 if (OP(scan) == EXACTFL)
2079 data->start_class->flags |= ANYOF_LOCALE;
2080 }
2081 }
2082 else if (flags & SCF_DO_STCLASS_OR) {
2083 if (data->start_class->flags & ANYOF_FOLD) {
2084 /* false positive possible if the class is case-folded.
2085 Assume that the locale settings are the same... */
1aa99e6b
IH
2086 if (uc < 0x100)
2087 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff
GS
2088 data->start_class->flags &= ~ANYOF_EOS;
2089 }
2090 cl_and(data->start_class, &and_with);
2091 }
2092 flags &= ~SCF_DO_STCLASS;
a0ed51b3 2093 }
bfed75c6 2094 else if (strchr((const char*)PL_varies,OP(scan))) {
9c5ffd7c 2095 I32 mincount, maxcount, minnext, deltanext, fl = 0;
aa7a4b56 2096 I32 f = flags, pos_before = 0;
c277df42 2097 regnode *oscan = scan;
653099ff
GS
2098 struct regnode_charclass_class this_class;
2099 struct regnode_charclass_class *oclass = NULL;
727f22e3 2100 I32 next_is_eval = 0;
653099ff 2101
22c35a8c 2102 switch (PL_regkind[(U8)OP(scan)]) {
653099ff 2103 case WHILEM: /* End of (?:...)* . */
c277df42
IZ
2104 scan = NEXTOPER(scan);
2105 goto finish;
2106 case PLUS:
653099ff 2107 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
c277df42 2108 next = NEXTOPER(scan);
653099ff 2109 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
b81d288d
AB
2110 mincount = 1;
2111 maxcount = REG_INFTY;
c277df42
IZ
2112 next = regnext(scan);
2113 scan = NEXTOPER(scan);
2114 goto do_curly;
2115 }
2116 }
2117 if (flags & SCF_DO_SUBSTR)
2118 data->pos_min++;
2119 min++;
2120 /* Fall through. */
2121 case STAR:
653099ff
GS
2122 if (flags & SCF_DO_STCLASS) {
2123 mincount = 0;
b81d288d 2124 maxcount = REG_INFTY;
653099ff
GS
2125 next = regnext(scan);
2126 scan = NEXTOPER(scan);
2127 goto do_curly;
2128 }
b81d288d 2129 is_inf = is_inf_internal = 1;
c277df42
IZ
2130 scan = regnext(scan);
2131 if (flags & SCF_DO_SUBSTR) {
830247a4 2132 scan_commit(pRExC_state, data); /* Cannot extend fixed substrings */
c277df42
IZ
2133 data->longest = &(data->longest_float);
2134 }
2135 goto optimize_curly_tail;
2136 case CURLY:
b81d288d 2137 mincount = ARG1(scan);
c277df42
IZ
2138 maxcount = ARG2(scan);
2139 next = regnext(scan);
cb434fcc
IZ
2140 if (OP(scan) == CURLYX) {
2141 I32 lp = (data ? *(data->last_closep) : 0);
a3621e74 2142 scan->flags = ((lp <= U8_MAX) ? (U8)lp : U8_MAX);
cb434fcc 2143 }
c277df42 2144 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
727f22e3 2145 next_is_eval = (OP(scan) == EVAL);
c277df42
IZ
2146 do_curly:
2147 if (flags & SCF_DO_SUBSTR) {
830247a4 2148 if (mincount == 0) scan_commit(pRExC_state,data); /* Cannot extend fixed substrings */
c277df42
IZ
2149 pos_before = data->pos_min;
2150 }
2151 if (data) {
2152 fl = data->flags;
2153 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
2154 if (is_inf)
2155 data->flags |= SF_IS_INF;
2156 }
653099ff 2157 if (flags & SCF_DO_STCLASS) {
830247a4 2158 cl_init(pRExC_state, &this_class);
653099ff
GS
2159 oclass = data->start_class;
2160 data->start_class = &this_class;
2161 f |= SCF_DO_STCLASS_AND;
2162 f &= ~SCF_DO_STCLASS_OR;
2163 }
e1901655
IZ
2164 /* These are the cases when once a subexpression
2165 fails at a particular position, it cannot succeed
2166 even after backtracking at the enclosing scope.
b81d288d 2167
e1901655
IZ
2168 XXXX what if minimal match and we are at the
2169 initial run of {n,m}? */
2170 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
2171 f &= ~SCF_WHILEM_VISITED_POS;
653099ff 2172
c277df42 2173 /* This will finish on WHILEM, setting scan, or on NULL: */
b81d288d 2174 minnext = study_chunk(pRExC_state, &scan, &deltanext, last, data,
a3621e74
YO
2175 (mincount == 0
2176 ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
653099ff
GS
2177
2178 if (flags & SCF_DO_STCLASS)
2179 data->start_class = oclass;
2180 if (mincount == 0 || minnext == 0) {
2181 if (flags & SCF_DO_STCLASS_OR) {
830247a4 2182 cl_or(pRExC_state, data->start_class, &this_class);
653099ff
GS
2183 }
2184 else if (flags & SCF_DO_STCLASS_AND) {
b81d288d 2185 /* Switch to OR mode: cache the old value of
653099ff
GS
2186 * data->start_class */
2187 StructCopy(data->start_class, &and_with,
2188 struct regnode_charclass_class);
2189 flags &= ~SCF_DO_STCLASS_AND;
2190 StructCopy(&this_class, data->start_class,
2191 struct regnode_charclass_class);
2192 flags |= SCF_DO_STCLASS_OR;
2193 data->start_class->flags |= ANYOF_EOS;
2194 }
2195 } else { /* Non-zero len */
2196 if (flags & SCF_DO_STCLASS_OR) {
830247a4 2197 cl_or(pRExC_state, data->start_class, &this_class);
653099ff
GS
2198 cl_and(data->start_class, &and_with);
2199 }
2200 else if (flags & SCF_DO_STCLASS_AND)
2201 cl_and(data->start_class, &this_class);
2202 flags &= ~SCF_DO_STCLASS;
2203 }
c277df42
IZ
2204 if (!scan) /* It was not CURLYX, but CURLY. */
2205 scan = next;
041457d9
DM
2206 if ( /* ? quantifier ok, except for (?{ ... }) */
2207 (next_is_eval || !(mincount == 0 && maxcount == 1))
84037bb0 2208 && (minnext == 0) && (deltanext == 0)
99799961 2209 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
041457d9
DM
2210 && maxcount <= REG_INFTY/3 /* Complement check for big count */
2211 && ckWARN(WARN_REGEXP))
b45f050a 2212 {
830247a4 2213 vWARN(RExC_parse,
b45f050a
JF
2214 "Quantifier unexpected on zero-length expression");
2215 }
2216
c277df42 2217 min += minnext * mincount;
b81d288d 2218 is_inf_internal |= ((maxcount == REG_INFTY
155aba94
GS
2219 && (minnext + deltanext) > 0)
2220 || deltanext == I32_MAX);
aca2d497 2221 is_inf |= is_inf_internal;
c277df42
IZ
2222 delta += (minnext + deltanext) * maxcount - minnext * mincount;
2223
2224 /* Try powerful optimization CURLYX => CURLYN. */
b81d288d 2225 if ( OP(oscan) == CURLYX && data
c277df42
IZ
2226 && data->flags & SF_IN_PAR
2227 && !(data->flags & SF_HAS_EVAL)
2228 && !deltanext && minnext == 1 ) {
2229 /* Try to optimize to CURLYN. */
2230 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
497b47a8
JH
2231 regnode *nxt1 = nxt;
2232#ifdef DEBUGGING
2233 regnode *nxt2;
2234#endif
c277df42
IZ
2235
2236 /* Skip open. */
2237 nxt = regnext(nxt);
bfed75c6 2238 if (!strchr((const char*)PL_simple,OP(nxt))
22c35a8c 2239 && !(PL_regkind[(U8)OP(nxt)] == EXACT
b81d288d 2240 && STR_LEN(nxt) == 1))
c277df42 2241 goto nogo;
497b47a8 2242#ifdef DEBUGGING
c277df42 2243 nxt2 = nxt;
497b47a8 2244#endif
c277df42 2245 nxt = regnext(nxt);
b81d288d 2246 if (OP(nxt) != CLOSE)
c277df42
IZ
2247 goto nogo;
2248 /* Now we know that nxt2 is the only contents: */
eb160463 2249 oscan->flags = (U8)ARG(nxt);
c277df42
IZ
2250 OP(oscan) = CURLYN;
2251 OP(nxt1) = NOTHING; /* was OPEN. */
2252#ifdef DEBUGGING
2253 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
2254 NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
2255 NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
2256 OP(nxt) = OPTIMIZED; /* was CLOSE. */
2257 OP(nxt + 1) = OPTIMIZED; /* was count. */
2258 NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
b81d288d 2259#endif
c277df42 2260 }
c277df42
IZ
2261 nogo:
2262
2263 /* Try optimization CURLYX => CURLYM. */
b81d288d 2264 if ( OP(oscan) == CURLYX && data
c277df42 2265 && !(data->flags & SF_HAS_PAR)
c277df42 2266 && !(data->flags & SF_HAS_EVAL)
0e788c72
HS
2267 && !deltanext /* atom is fixed width */
2268 && minnext != 0 /* CURLYM can't handle zero width */
2269 ) {
c277df42
IZ
2270 /* XXXX How to optimize if data == 0? */
2271 /* Optimize to a simpler form. */
2272 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
2273 regnode *nxt2;
2274
2275 OP(oscan) = CURLYM;
2276 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
b81d288d 2277 && (OP(nxt2) != WHILEM))
c277df42
IZ
2278 nxt = nxt2;
2279 OP(nxt2) = SUCCEED; /* Whas WHILEM */
c277df42
IZ
2280 /* Need to optimize away parenths. */
2281 if (data->flags & SF_IN_PAR) {
2282 /* Set the parenth number. */
2283 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
2284
b81d288d 2285 if (OP(nxt) != CLOSE)
b45f050a 2286 FAIL("Panic opt close");
eb160463 2287 oscan->flags = (U8)ARG(nxt);
c277df42
IZ
2288 OP(nxt1) = OPTIMIZED; /* was OPEN. */
2289 OP(nxt) = OPTIMIZED; /* was CLOSE. */
2290#ifdef DEBUGGING
2291 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
2292 OP(nxt + 1) = OPTIMIZED; /* was count. */
2293 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
2294 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
b81d288d 2295#endif
c277df42
IZ
2296#if 0
2297 while ( nxt1 && (OP(nxt1) != WHILEM)) {
2298 regnode *nnxt = regnext(nxt1);
b81d288d 2299
c277df42
IZ
2300 if (nnxt == nxt) {
2301 if (reg_off_by_arg[OP(nxt1)])
2302 ARG_SET(nxt1, nxt2 - nxt1);
2303 else if (nxt2 - nxt1 < U16_MAX)
2304 NEXT_OFF(nxt1) = nxt2 - nxt1;
2305 else
2306 OP(nxt) = NOTHING; /* Cannot beautify */
2307 }
2308 nxt1 = nnxt;
2309 }
2310#endif
2311 /* Optimize again: */
b81d288d 2312 study_chunk(pRExC_state, &nxt1, &deltanext, nxt,
a3621e74 2313 NULL, 0,depth+1);
a0ed51b3
LW
2314 }
2315 else
c277df42 2316 oscan->flags = 0;
c277df42 2317 }
e1901655
IZ
2318 else if ((OP(oscan) == CURLYX)
2319 && (flags & SCF_WHILEM_VISITED_POS)
2320 /* See the comment on a similar expression above.
2321 However, this time it not a subexpression
2322 we care about, but the expression itself. */
2323 && (maxcount == REG_INFTY)
2324 && data && ++data->whilem_c < 16) {
2325 /* This stays as CURLYX, we can put the count/of pair. */
2c2d71f5
JH
2326 /* Find WHILEM (as in regexec.c) */
2327 regnode *nxt = oscan + NEXT_OFF(oscan);
2328
2329 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
2330 nxt += ARG(nxt);
eb160463
GS
2331 PREVOPER(nxt)->flags = (U8)(data->whilem_c
2332 | (RExC_whilem_seen << 4)); /* On WHILEM */
2c2d71f5 2333 }
b81d288d 2334 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
c277df42
IZ
2335 pars++;
2336 if (flags & SCF_DO_SUBSTR) {
2337 SV *last_str = Nullsv;
2338 int counted = mincount != 0;
2339
2340 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
5d1c421c
JH
2341#if defined(SPARC64_GCC_WORKAROUND)
2342 I32 b = 0;
2343 STRLEN l = 0;
cfd0369c 2344 const char *s = NULL;
5d1c421c
JH
2345 I32 old = 0;
2346
2347 if (pos_before >= data->last_start_min)
2348 b = pos_before;
2349 else
2350 b = data->last_start_min;
2351
2352 l = 0;
cfd0369c 2353 s = SvPV_const(data->last_found, l);
5d1c421c
JH
2354 old = b - data->last_start_min;
2355
2356#else
b81d288d 2357 I32 b = pos_before >= data->last_start_min
c277df42
IZ
2358 ? pos_before : data->last_start_min;
2359 STRLEN l;
cfd0369c 2360 const char *s = SvPV_const(data->last_found, l);
a0ed51b3 2361 I32 old = b - data->last_start_min;
5d1c421c 2362#endif
a0ed51b3
LW
2363
2364 if (UTF)
2365 old = utf8_hop((U8*)s, old) - (U8*)s;
c277df42 2366
a0ed51b3 2367 l -= old;
c277df42 2368 /* Get the added string: */
79cb57f6 2369 last_str = newSVpvn(s + old, l);
0e933229
IH
2370 if (UTF)
2371 SvUTF8_on(last_str);
c277df42
IZ
2372 if (deltanext == 0 && pos_before == b) {
2373 /* What was added is a constant string */
2374 if (mincount > 1) {
2375 SvGROW(last_str, (mincount * l) + 1);
b81d288d 2376 repeatcpy(SvPVX(last_str) + l,
3f7c398e 2377 SvPVX_const(last_str), l, mincount - 1);
b162af07 2378 SvCUR_set(last_str, SvCUR(last_str) * mincount);
c277df42 2379 /* Add additional parts. */
b81d288d 2380 SvCUR_set(data->last_found,
c277df42
IZ
2381 SvCUR(data->last_found) - l);
2382 sv_catsv(data->last_found, last_str);
0eda9292
JH
2383 {
2384 SV * sv = data->last_found;
2385 MAGIC *mg =
2386 SvUTF8(sv) && SvMAGICAL(sv) ?
2387 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2388 if (mg && mg->mg_len >= 0)
2389 mg->mg_len += CHR_SVLEN(last_str);
2390 }
c277df42
IZ
2391 data->last_end += l * (mincount - 1);
2392 }
2a8d9689
HS
2393 } else {
2394 /* start offset must point into the last copy */
2395 data->last_start_min += minnext * (mincount - 1);
c152dc43
HS
2396 data->last_start_max += is_inf ? I32_MAX
2397 : (maxcount - 1) * (minnext + data->pos_delta);
c277df42
IZ
2398 }
2399 }
2400 /* It is counted once already... */
2401 data->pos_min += minnext * (mincount - counted);
2402 data->pos_delta += - counted * deltanext +
2403 (minnext + deltanext) * maxcount - minnext * mincount;
2404 if (mincount != maxcount) {
653099ff
GS
2405 /* Cannot extend fixed substrings found inside
2406 the group. */
830247a4 2407 scan_commit(pRExC_state,data);
c277df42
IZ
2408 if (mincount && last_str) {
2409 sv_setsv(data->last_found, last_str);
2410 data->last_end = data->pos_min;
b81d288d 2411 data->last_start_min =
a0ed51b3 2412 data->pos_min - CHR_SVLEN(last_str);
b81d288d
AB
2413 data->last_start_max = is_inf
2414 ? I32_MAX
c277df42 2415 : data->pos_min + data->pos_delta
a0ed51b3 2416 - CHR_SVLEN(last_str);
c277df42
IZ
2417 }
2418 data->longest = &(data->longest_float);
2419 }
aca2d497 2420 SvREFCNT_dec(last_str);
c277df42 2421 }
405ff068 2422 if (data && (fl & SF_HAS_EVAL))
c277df42
IZ
2423 data->flags |= SF_HAS_EVAL;
2424 optimize_curly_tail:
c277df42 2425 if (OP(oscan) != CURLYX) {
22c35a8c 2426 while (PL_regkind[(U8)OP(next = regnext(oscan))] == NOTHING
c277df42
IZ
2427 && NEXT_OFF(next))
2428 NEXT_OFF(oscan) += NEXT_OFF(next);
2429 }
c277df42 2430 continue;
653099ff 2431 default: /* REF and CLUMP only? */
c277df42 2432 if (flags & SCF_DO_SUBSTR) {
830247a4 2433 scan_commit(pRExC_state,data); /* Cannot expect anything... */
c277df42
IZ
2434 data->longest = &(data->longest_float);
2435 }
aca2d497 2436 is_inf = is_inf_internal = 1;
653099ff 2437 if (flags & SCF_DO_STCLASS_OR)
830247a4 2438 cl_anything(pRExC_state, data->start_class);
653099ff 2439 flags &= ~SCF_DO_STCLASS;
c277df42
IZ
2440 break;
2441 }
a0ed51b3 2442 }
bfed75c6 2443 else if (strchr((const char*)PL_simple,OP(scan))) {
9c5ffd7c 2444 int value = 0;
653099ff 2445
c277df42 2446 if (flags & SCF_DO_SUBSTR) {
830247a4 2447 scan_commit(pRExC_state,data);
c277df42
IZ
2448 data->pos_min++;
2449 }
2450 min++;
653099ff
GS
2451 if (flags & SCF_DO_STCLASS) {
2452 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
2453
2454 /* Some of the logic below assumes that switching
2455 locale on will only add false positives. */
2456 switch (PL_regkind[(U8)OP(scan)]) {
653099ff 2457 case SANY:
653099ff
GS
2458 default:
2459 do_default:
2460 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
2461 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
830247a4 2462 cl_anything(pRExC_state, data->start_class);
653099ff
GS
2463 break;
2464 case REG_ANY:
2465 if (OP(scan) == SANY)
2466 goto do_default;
2467 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
2468 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
2469 || (data->start_class->flags & ANYOF_CLASS));
830247a4 2470 cl_anything(pRExC_state, data->start_class);
653099ff
GS
2471 }
2472 if (flags & SCF_DO_STCLASS_AND || !value)
2473 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
2474 break;
2475 case ANYOF:
2476 if (flags & SCF_DO_STCLASS_AND)
2477 cl_and(data->start_class,
2478 (struct regnode_charclass_class*)scan);
2479 else
830247a4 2480 cl_or(pRExC_state, data->start_class,
653099ff
GS
2481 (struct regnode_charclass_class*)scan);
2482 break;
2483 case ALNUM:
2484 if (flags & SCF_DO_STCLASS_AND) {
2485 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2486 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
2487 for (value = 0; value < 256; value++)
2488 if (!isALNUM(value))
2489 ANYOF_BITMAP_CLEAR(data->start_class, value);
2490 }
2491 }
2492 else {
2493 if (data->start_class->flags & ANYOF_LOCALE)
2494 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
2495 else {
2496 for (value = 0; value < 256; value++)
2497 if (isALNUM(value))
b81d288d 2498 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
2499 }
2500 }
2501 break;
2502 case ALNUML:
2503 if (flags & SCF_DO_STCLASS_AND) {
2504 if (data->start_class->flags & ANYOF_LOCALE)
2505 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
2506 }
2507 else {
2508 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
2509 data->start_class->flags |= ANYOF_LOCALE;
2510 }
2511 break;
2512 case NALNUM:
2513 if (flags & SCF_DO_STCLASS_AND) {
2514 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2515 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
2516 for (value = 0; value < 256; value++)
2517 if (isALNUM(value))
2518 ANYOF_BITMAP_CLEAR(data->start_class, value);
2519 }
2520 }
2521 else {
2522 if (data->start_class->flags & ANYOF_LOCALE)
2523 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
2524 else {
2525 for (value = 0; value < 256; value++)
2526 if (!isALNUM(value))
b81d288d 2527 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
2528 }
2529 }
2530 break;
2531 case NALNUML:
2532 if (flags & SCF_DO_STCLASS_AND) {
2533 if (data->start_class->flags & ANYOF_LOCALE)
2534 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
2535 }
2536 else {
2537 data->start_class->flags |= ANYOF_LOCALE;
2538 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
2539 }
2540 break;
2541 case SPACE:
2542 if (flags & SCF_DO_STCLASS_AND) {
2543 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2544 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
2545 for (value = 0; value < 256; value++)
2546 if (!isSPACE(value))
2547 ANYOF_BITMAP_CLEAR(data->start_class, value);
2548 }
2549 }
2550 else {
2551 if (data->start_class->flags & ANYOF_LOCALE)
2552 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
2553 else {
2554 for (value = 0; value < 256; value++)
2555 if (isSPACE(value))
b81d288d 2556 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
2557 }
2558 }
2559 break;
2560 case SPACEL:
2561 if (flags & SCF_DO_STCLASS_AND) {
2562 if (data->start_class->flags & ANYOF_LOCALE)
2563 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
2564 }
2565 else {
2566 data->start_class->flags |= ANYOF_LOCALE;
2567 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
2568 }
2569 break;
2570 case NSPACE:
2571 if (flags & SCF_DO_STCLASS_AND) {
2572 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2573 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
2574 for (value = 0; value < 256; value++)
2575 if (isSPACE(value))
2576 ANYOF_BITMAP_CLEAR(data->start_class, value);
2577 }
2578 }
2579 else {
2580 if (data->start_class->flags & ANYOF_LOCALE)
2581 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
2582 else {
2583 for (value = 0; value < 256; value++)
2584 if (!isSPACE(value))
b81d288d 2585 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
2586 }
2587 }
2588 break;
2589 case NSPACEL:
2590 if (flags & SCF_DO_STCLASS_AND) {
2591 if (data->start_class->flags & ANYOF_LOCALE) {
2592 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
2593 for (value = 0; value < 256; value++)
2594 if (!isSPACE(value))
2595 ANYOF_BITMAP_CLEAR(data->start_class, value);
2596 }
2597 }
2598 else {
2599 data->start_class->flags |= ANYOF_LOCALE;
2600 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
2601 }
2602 break;
2603 case DIGIT:
2604 if (flags & SCF_DO_STCLASS_AND) {
2605 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
2606 for (value = 0; value < 256; value++)
2607 if (!isDIGIT(value))
2608 ANYOF_BITMAP_CLEAR(data->start_class, value);
2609 }
2610 else {
2611 if (data->start_class->flags & ANYOF_LOCALE)
2612 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
2613 else {
2614 for (value = 0; value < 256; value++)
2615 if (isDIGIT(value))
b81d288d 2616 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
2617 }
2618 }
2619 break;
2620 case NDIGIT:
2621 if (flags & SCF_DO_STCLASS_AND) {
2622 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
2623 for (value = 0; value < 256; value++)
2624 if (isDIGIT(value))
2625 ANYOF_BITMAP_CLEAR(data->start_class, value);
2626 }
2627 else {
2628 if (data->start_class->flags & ANYOF_LOCALE)
2629 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
2630 else {
2631 for (value = 0; value < 256; value++)
2632 if (!isDIGIT(value))
b81d288d 2633 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
2634 }
2635 }
2636 break;
2637 }
2638 if (flags & SCF_DO_STCLASS_OR)
2639 cl_and(data->start_class, &and_with);
2640 flags &= ~SCF_DO_STCLASS;
2641 }
a0ed51b3 2642 }
22c35a8c 2643 else if (PL_regkind[(U8)OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
c277df42
IZ
2644 data->flags |= (OP(scan) == MEOL
2645 ? SF_BEFORE_MEOL
2646 : SF_BEFORE_SEOL);
a0ed51b3 2647 }
653099ff
GS
2648 else if ( PL_regkind[(U8)OP(scan)] == BRANCHJ
2649 /* Lookbehind, or need to calculate parens/evals/stclass: */
2650 && (scan->flags || data || (flags & SCF_DO_STCLASS))
c277df42 2651 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
653099ff 2652 /* Lookahead/lookbehind */
cb434fcc 2653 I32 deltanext, minnext, fake = 0;
c277df42 2654 regnode *nscan;
653099ff
GS
2655 struct regnode_charclass_class intrnl;
2656 int f = 0;
c277df42
IZ
2657
2658 data_fake.flags = 0;
b81d288d 2659 if (data) {
2c2d71f5 2660 data_fake.whilem_c = data->whilem_c;
cb434fcc
IZ
2661 data_fake.last_closep = data->last_closep;
2662 }
2663 else
2664 data_fake.last_closep = &fake;
653099ff
GS
2665 if ( flags & SCF_DO_STCLASS && !scan->flags
2666 && OP(scan) == IFMATCH ) { /* Lookahead */
830247a4 2667 cl_init(pRExC_state, &intrnl);
653099ff 2668 data_fake.start_class = &intrnl;
e1901655 2669 f |= SCF_DO_STCLASS_AND;
653099ff 2670 }
e1901655
IZ
2671 if (flags & SCF_WHILEM_VISITED_POS)
2672 f |= SCF_WHILEM_VISITED_POS;
c277df42
IZ
2673 next = regnext(scan);
2674 nscan = NEXTOPER(NEXTOPER(scan));
a3621e74 2675 minnext = study_chunk(pRExC_state, &nscan, &deltanext, last, &data_fake, f,depth+1);
c277df42
IZ
2676 if (scan->flags) {
2677 if (deltanext) {
9baa0206 2678 vFAIL("Variable length lookbehind not implemented");
a0ed51b3
LW
2679 }
2680 else if (minnext > U8_MAX) {
9baa0206 2681 vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
c277df42 2682 }
eb160463 2683 scan->flags = (U8)minnext;
c277df42
IZ
2684 }
2685 if (data && data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
2686 pars++;
405ff068 2687 if (data && (data_fake.flags & SF_HAS_EVAL))
c277df42 2688 data->flags |= SF_HAS_EVAL;
2c2d71f5
JH
2689 if (data)
2690 data->whilem_c = data_fake.whilem_c;
e1901655 2691 if (f & SCF_DO_STCLASS_AND) {
a28509cc 2692 const int was = (data->start_class->flags & ANYOF_EOS);
653099ff
GS
2693
2694 cl_and(data->start_class, &intrnl);
2695 if (was)
2696 data->start_class->flags |= ANYOF_EOS;
2697 }
a0ed51b3
LW
2698 }
2699 else if (OP(scan) == OPEN) {
c277df42 2700 pars++;
a0ed51b3 2701 }
cb434fcc 2702 else if (OP(scan) == CLOSE) {
eb160463 2703 if ((I32)ARG(scan) == is_par) {
cb434fcc 2704 next = regnext(scan);
c277df42 2705
cb434fcc
IZ
2706 if ( next && (OP(next) != WHILEM) && next < last)
2707 is_par = 0; /* Disable optimization */
2708 }
2709 if (data)
2710 *(data->last_closep) = ARG(scan);
a0ed51b3
LW
2711 }
2712 else if (OP(scan) == EVAL) {
c277df42
IZ
2713 if (data)
2714 data->flags |= SF_HAS_EVAL;
2715 }
96776eda 2716 else if (OP(scan) == LOGICAL && scan->flags == 2) { /* Embedded follows */
0f5d15d6 2717 if (flags & SCF_DO_SUBSTR) {
830247a4 2718 scan_commit(pRExC_state,data);
0f5d15d6
IZ
2719 data->longest = &(data->longest_float);
2720 }
2721 is_inf = is_inf_internal = 1;
653099ff 2722 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
830247a4 2723 cl_anything(pRExC_state, data->start_class);
96776eda 2724 flags &= ~SCF_DO_STCLASS;
0f5d15d6 2725 }
c277df42
IZ
2726 /* Else: zero-length, ignore. */
2727 scan = regnext(scan);
2728 }
2729
2730 finish:
2731 *scanp = scan;
aca2d497 2732 *deltap = is_inf_internal ? I32_MAX : delta;
b81d288d 2733 if (flags & SCF_DO_SUBSTR && is_inf)
c277df42
IZ
2734 data->pos_delta = I32_MAX - data->pos_min;
2735 if (is_par > U8_MAX)
2736 is_par = 0;
2737 if (is_par && pars==1 && data) {
2738 data->flags |= SF_IN_PAR;
2739 data->flags &= ~SF_HAS_PAR;
a0ed51b3
LW
2740 }
2741 else if (pars && data) {
c277df42
IZ
2742 data->flags |= SF_HAS_PAR;
2743 data->flags &= ~SF_IN_PAR;
2744 }
653099ff
GS
2745 if (flags & SCF_DO_STCLASS_OR)
2746 cl_and(data->start_class, &and_with);
c277df42
IZ
2747 return min;
2748}
2749
76e3520e 2750STATIC I32
bfed75c6 2751S_add_data(pTHX_ RExC_state_t *pRExC_state, I32 n, const char *s)
c277df42 2752{
830247a4 2753 if (RExC_rx->data) {
b81d288d
AB
2754 Renewc(RExC_rx->data,
2755 sizeof(*RExC_rx->data) + sizeof(void*) * (RExC_rx->data->count + n - 1),
c277df42 2756 char, struct reg_data);
830247a4
IZ
2757 Renew(RExC_rx->data->what, RExC_rx->data->count + n, U8);
2758 RExC_rx->data->count += n;
a0ed51b3
LW
2759 }
2760 else {
a02a5408 2761 Newxc(RExC_rx->data, sizeof(*RExC_rx->data) + sizeof(void*) * (n - 1),
c277df42 2762 char, struct reg_data);
a02a5408 2763 Newx(RExC_rx->data->what, n, U8);
830247a4 2764 RExC_rx->data->count = n;
c277df42 2765 }
830247a4
IZ
2766 Copy(s, RExC_rx->data->what + RExC_rx->data->count - n, n, U8);
2767 return RExC_rx->data->count - n;
c277df42
IZ
2768}
2769
d88dccdf 2770void
864dbfa3 2771Perl_reginitcolors(pTHX)
d88dccdf 2772{
1df70142 2773 const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
d88dccdf 2774 if (s) {
1df70142
AL
2775 char *t = savepv(s);
2776 int i = 0;
2777 PL_colors[0] = t;
d88dccdf 2778 while (++i < 6) {
1df70142
AL
2779 t = strchr(t, '\t');
2780 if (t) {
2781 *t = '\0';
2782 PL_colors[i] = ++t;
d88dccdf
IZ
2783 }
2784 else
1df70142 2785 PL_colors[i] = t = (char *)"";
d88dccdf
IZ
2786 }
2787 } else {
1df70142 2788 int i = 0;
b81d288d 2789 while (i < 6)
06b5626a 2790 PL_colors[i++] = (char *)"";
d88dccdf
IZ
2791 }
2792 PL_colorset = 1;
2793}
2794
8615cb43 2795
a687059c 2796/*
e50aee73 2797 - pregcomp - compile a regular expression into internal code
a687059c
LW
2798 *
2799 * We can't allocate space until we know how big the compiled form will be,
2800 * but we can't compile it (and thus know how big it is) until we've got a
2801 * place to put the code. So we cheat: we compile it twice, once with code
2802 * generation turned off and size counting turned on, and once "for real".
2803 * This also means that we don't allocate space until we are sure that the
2804 * thing really will compile successfully, and we never have to move the
2805 * code and thus invalidate pointers into it. (Note that it has to be in
2806 * one piece because free() must be able to free it all.) [NB: not true in perl]
2807 *
2808 * Beware that the optimization-preparation code in here knows about some
2809 * of the structure of the compiled regexp. [I'll say.]
2810 */
2811regexp *
864dbfa3 2812Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
a687059c 2813{
a0d0e21e 2814 register regexp *r;
c277df42 2815 regnode *scan;
c277df42 2816 regnode *first;
a0d0e21e 2817 I32 flags;
a0d0e21e
LW
2818 I32 minlen = 0;
2819 I32 sawplus = 0;
2820 I32 sawopen = 0;
2c2d71f5 2821 scan_data_t data;
830247a4
IZ
2822 RExC_state_t RExC_state;
2823 RExC_state_t *pRExC_state = &RExC_state;
a0d0e21e 2824
a3621e74
YO
2825 GET_RE_DEBUG_FLAGS_DECL;
2826
a0d0e21e 2827 if (exp == NULL)
c277df42 2828 FAIL("NULL regexp argument");
a0d0e21e 2829
a5961de5 2830 RExC_utf8 = pm->op_pmdynflags & PMdf_CMP_UTF8;
a0ed51b3 2831
5cfc7842 2832 RExC_precomp = exp;
a3621e74
YO
2833 DEBUG_r(if (!PL_colorset) reginitcolors());
2834 DEBUG_COMPILE_r({
2835 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s \"%s%*s%s\"\n",
a5961de5
JH
2836 PL_colors[4],PL_colors[5],PL_colors[0],
2837 (int)(xend - exp), RExC_precomp, PL_colors[1]);
2838 });
e2509266 2839 RExC_flags = pm->op_pmflags;
830247a4 2840 RExC_sawback = 0;
bbce6d69 2841
830247a4
IZ
2842 RExC_seen = 0;
2843 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
2844 RExC_seen_evals = 0;
2845 RExC_extralen = 0;
c277df42 2846
bbce6d69 2847 /* First pass: determine size, legality. */
830247a4 2848 RExC_parse = exp;
fac92740 2849 RExC_start = exp;
830247a4
IZ
2850 RExC_end = xend;
2851 RExC_naughty = 0;
2852 RExC_npar = 1;
2853 RExC_size = 0L;
2854 RExC_emit = &PL_regdummy;
2855 RExC_whilem_seen = 0;
85ddcde9
JH
2856#if 0 /* REGC() is (currently) a NOP at the first pass.
2857 * Clever compilers notice this and complain. --jhi */
830247a4 2858 REGC((U8)REG_MAGIC, (char*)RExC_emit);
85ddcde9 2859#endif
830247a4 2860 if (reg(pRExC_state, 0, &flags) == NULL) {
830247a4 2861 RExC_precomp = Nullch;
a0d0e21e
LW
2862 return(NULL);
2863 }
a3621e74 2864 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "size %"IVdf" ", (IV)RExC_size));
c277df42 2865
c277df42
IZ
2866 /* Small enough for pointer-storage convention?
2867 If extralen==0, this means that we will not need long jumps. */
830247a4
IZ
2868 if (RExC_size >= 0x10000L && RExC_extralen)
2869 RExC_size += RExC_extralen;
c277df42 2870 else
830247a4
IZ
2871 RExC_extralen = 0;
2872 if (RExC_whilem_seen > 15)
2873 RExC_whilem_seen = 15;
a0d0e21e 2874
bbce6d69 2875 /* Allocate space and initialize. */
a02a5408 2876 Newxc(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode),
c277df42 2877 char, regexp);
a0d0e21e 2878 if (r == NULL)
b45f050a
JF
2879 FAIL("Regexp out of space");
2880
0f79a09d
GS
2881#ifdef DEBUGGING
2882 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
830247a4 2883 Zero(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode), char);
0f79a09d 2884#endif
c277df42 2885 r->refcnt = 1;
bbce6d69 2886 r->prelen = xend - exp;
5cfc7842 2887 r->precomp = savepvn(RExC_precomp, r->prelen);
cf93c79d 2888 r->subbeg = NULL;
f8c7b90f 2889#ifdef PERL_OLD_COPY_ON_WRITE
ed252734
NC
2890 r->saved_copy = Nullsv;
2891#endif
cf93c79d 2892 r->reganch = pm->op_pmflags & PMf_COMPILETIME;
830247a4 2893 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
4327152a
IZ
2894
2895 r->substrs = 0; /* Useful during FAIL. */
2896 r->startp = 0; /* Useful during FAIL. */
2897 r->endp = 0; /* Useful during FAIL. */
2898
a02a5408 2899 Newxz(r->offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
fac92740 2900 if (r->offsets) {
2af232bd 2901 r->offsets[0] = RExC_size;
fac92740 2902 }
a3621e74 2903 DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
2af232bd
SS
2904 "%s %"UVuf" bytes for offset annotations.\n",
2905 r->offsets ? "Got" : "Couldn't get",
392fbf5d 2906 (UV)((2*RExC_size+1) * sizeof(U32))));
fac92740 2907
830247a4 2908 RExC_rx = r;
bbce6d69 2909
2910 /* Second pass: emit code. */
e2509266 2911 RExC_flags = pm->op_pmflags; /* don't let top level (?i) bleed */
830247a4
IZ
2912 RExC_parse = exp;
2913 RExC_end = xend;
2914 RExC_naughty = 0;
2915 RExC_npar = 1;
fac92740 2916 RExC_emit_start = r->program;
830247a4 2917 RExC_emit = r->program;
2cd61cdb 2918 /* Store the count of eval-groups for security checks: */
eb160463 2919 RExC_emit->next_off = (U16)((RExC_seen_evals > U16_MAX) ? U16_MAX : RExC_seen_evals);
830247a4 2920 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
c277df42 2921 r->data = 0;
830247a4 2922 if (reg(pRExC_state, 0, &flags) == NULL)
a0d0e21e
LW
2923 return(NULL);
2924
a3621e74 2925
a0d0e21e 2926 /* Dig out information for optimizations. */
cf93c79d 2927 r->reganch = pm->op_pmflags & PMf_COMPILETIME; /* Again? */
e2509266 2928 pm->op_pmflags = RExC_flags;
a0ed51b3 2929 if (UTF)
5ff6fc6d 2930 r->reganch |= ROPT_UTF8; /* Unicode in it? */
c277df42 2931 r->regstclass = NULL;
830247a4 2932 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
a0ed51b3 2933 r->reganch |= ROPT_NAUGHTY;
c277df42 2934 scan = r->program + 1; /* First BRANCH. */
2779dcf1
IZ
2935
2936 /* XXXX To minimize changes to RE engine we always allocate
2937 3-units-long substrs field. */
a02a5408 2938 Newxz(r->substrs, 1, struct reg_substr_data);
2779dcf1 2939
2c2d71f5 2940 StructCopy(&zero_scan_data, &data, scan_data_t);
653099ff 2941 /* XXXX Should not we check for something else? Usually it is OPEN1... */
c277df42 2942 if (OP(scan) != BRANCH) { /* Only one top-level choice. */
c277df42 2943 I32 fake;
c5254dd6 2944 STRLEN longest_float_length, longest_fixed_length;
653099ff
GS
2945 struct regnode_charclass_class ch_class;
2946 int stclass_flag;
cb434fcc 2947 I32 last_close = 0;
a0d0e21e
LW
2948
2949 first = scan;
c277df42 2950 /* Skip introductions and multiplicators >= 1. */
a0d0e21e 2951 while ((OP(first) == OPEN && (sawopen = 1)) ||
653099ff 2952 /* An OR of *one* alternative - should not happen now. */
a0d0e21e
LW
2953 (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) ||
2954 (OP(first) == PLUS) ||
2955 (OP(first) == MINMOD) ||
653099ff 2956 /* An {n,m} with n>0 */
22c35a8c 2957 (PL_regkind[(U8)OP(first)] == CURLY && ARG1(first) > 0) ) {
a0d0e21e
LW
2958 if (OP(first) == PLUS)
2959 sawplus = 1;
2960 else
2961 first += regarglen[(U8)OP(first)];
2962 first = NEXTOPER(first);
a687059c
LW
2963 }
2964
a0d0e21e
LW
2965 /* Starting-point info. */
2966 again:
653099ff 2967 if (PL_regkind[(U8)OP(first)] == EXACT) {
1aa99e6b
IH
2968 if (OP(first) == EXACT)
2969 ; /* Empty, get anchored substr later. */
2970 else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
b3c9acc1
IZ
2971 r->regstclass = first;
2972 }
bfed75c6 2973 else if (strchr((const char*)PL_simple,OP(first)))
a0d0e21e 2974 r->regstclass = first;
22c35a8c
GS
2975 else if (PL_regkind[(U8)OP(first)] == BOUND ||
2976 PL_regkind[(U8)OP(first)] == NBOUND)
a0d0e21e 2977 r->regstclass = first;
22c35a8c 2978 else if (PL_regkind[(U8)OP(first)] == BOL) {
cad2e5aa
JH
2979 r->reganch |= (OP(first) == MBOL
2980 ? ROPT_ANCH_MBOL
2981 : (OP(first) == SBOL
2982 ? ROPT_ANCH_SBOL
2983 : ROPT_ANCH_BOL));
a0d0e21e 2984 first = NEXTOPER(first);
774d564b 2985 goto again;
2986 }
2987 else if (OP(first) == GPOS) {
2988 r->reganch |= ROPT_ANCH_GPOS;
2989 first = NEXTOPER(first);
2990 goto again;
a0d0e21e 2991 }
e09294f4 2992 else if (!sawopen && (OP(first) == STAR &&
22c35a8c 2993 PL_regkind[(U8)OP(NEXTOPER(first))] == REG_ANY) &&
a0d0e21e
LW
2994 !(r->reganch & ROPT_ANCH) )
2995 {
2996 /* turn .* into ^.* with an implied $*=1 */
1df70142
AL
2997 const int type =
2998 (OP(NEXTOPER(first)) == REG_ANY)
2999 ? ROPT_ANCH_MBOL
3000 : ROPT_ANCH_SBOL;
cad2e5aa 3001 r->reganch |= type | ROPT_IMPLICIT;
a0d0e21e 3002 first = NEXTOPER(first);
774d564b 3003 goto again;
a0d0e21e 3004 }
b81d288d 3005 if (sawplus && (!sawopen || !RExC_sawback)
830247a4 3006 && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
cad2e5aa
JH
3007 /* x+ must match at the 1st pos of run of x's */
3008 r->reganch |= ROPT_SKIP;
a0d0e21e 3009
c277df42 3010 /* Scan is after the zeroth branch, first is atomic matcher. */
a3621e74 3011 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
b900a521 3012 (IV)(first - scan + 1)));
a0d0e21e
LW
3013 /*
3014 * If there's something expensive in the r.e., find the
3015 * longest literal string that must appear and make it the
3016 * regmust. Resolve ties in favor of later strings, since
3017 * the regstart check works with the beginning of the r.e.
3018 * and avoiding duplication strengthens checking. Not a
3019 * strong reason, but sufficient in the absence of others.
3020 * [Now we resolve ties in favor of the earlier string if
c277df42 3021 * it happens that c_offset_min has been invalidated, since the
a0d0e21e
LW
3022 * earlier string may buy us something the later one won't.]
3023 */
a0d0e21e 3024 minlen = 0;
a687059c 3025
79cb57f6
GS
3026 data.longest_fixed = newSVpvn("",0);
3027 data.longest_float = newSVpvn("",0);
3028 data.last_found = newSVpvn("",0);
c277df42
IZ
3029 data.longest = &(data.longest_fixed);
3030 first = scan;
653099ff 3031 if (!r->regstclass) {
830247a4 3032 cl_init(pRExC_state, &ch_class);
653099ff
GS
3033 data.start_class = &ch_class;
3034 stclass_flag = SCF_DO_STCLASS_AND;
3035 } else /* XXXX Check for BOUND? */
3036 stclass_flag = 0;
cb434fcc 3037 data.last_closep = &last_close;
653099ff 3038
830247a4 3039 minlen = study_chunk(pRExC_state, &first, &fake, scan + RExC_size, /* Up to end */
a3621e74 3040 &data, SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
830247a4 3041 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
b81d288d 3042 && data.last_start_min == 0 && data.last_end > 0
830247a4
IZ
3043 && !RExC_seen_zerolen
3044 && (!(RExC_seen & REG_SEEN_GPOS) || (r->reganch & ROPT_ANCH_GPOS)))
c277df42 3045 r->reganch |= ROPT_CHECK_ALL;
830247a4 3046 scan_commit(pRExC_state, &data);
c277df42
IZ
3047 SvREFCNT_dec(data.last_found);
3048
a0ed51b3 3049 longest_float_length = CHR_SVLEN(data.longest_float);
c5254dd6 3050 if (longest_float_length
c277df42
IZ
3051 || (data.flags & SF_FL_BEFORE_EOL
3052 && (!(data.flags & SF_FL_BEFORE_MEOL)
e2509266 3053 || (RExC_flags & PMf_MULTILINE)))) {
cf93c79d
IZ
3054 int t;
3055
a0ed51b3 3056 if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
aca2d497
IZ
3057 && data.offset_fixed == data.offset_float_min
3058 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
3059 goto remove_float; /* As in (a)+. */
3060
33b8afdf
JH
3061 if (SvUTF8(data.longest_float)) {
3062 r->float_utf8 = data.longest_float;
3063 r->float_substr = Nullsv;
3064 } else {
3065 r->float_substr = data.longest_float;
3066 r->float_utf8 = Nullsv;
3067 }
c277df42
IZ
3068 r->float_min_offset = data.offset_float_min;
3069 r->float_max_offset = data.offset_float_max;
cf93c79d
IZ
3070 t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
3071 && (!(data.flags & SF_FL_BEFORE_MEOL)
e2509266 3072 || (RExC_flags & PMf_MULTILINE)));
33b8afdf 3073 fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
a0ed51b3
LW
3074 }
3075 else {
aca2d497 3076 remove_float:
33b8afdf 3077 r->float_substr = r->float_utf8 = Nullsv;
c277df42 3078 SvREFCNT_dec(data.longest_float);
c5254dd6 3079 longest_float_length = 0;
a0d0e21e 3080 }
c277df42 3081
a0ed51b3 3082 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
c5254dd6 3083 if (longest_fixed_length
c277df42
IZ
3084 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
3085 && (!(data.flags & SF_FIX_BEFORE_MEOL)
e2509266 3086 || (RExC_flags & PMf_MULTILINE)))) {
cf93c79d
IZ
3087 int t;
3088
33b8afdf
JH
3089 if (SvUTF8(data.longest_fixed)) {
3090 r->anchored_utf8 = data.longest_fixed;
3091 r->anchored_substr = Nullsv;
3092 } else {
3093 r->anchored_substr = data.longest_fixed;
3094 r->anchored_utf8 = Nullsv;
3095 }
c277df42 3096 r->anchored_offset = data.offset_fixed;
cf93c79d
IZ
3097 t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
3098 && (!(data.flags & SF_FIX_BEFORE_MEOL)
e2509266 3099 || (RExC_flags & PMf_MULTILINE)));
33b8afdf 3100 fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
a0ed51b3
LW
3101 }
3102 else {
33b8afdf 3103 r->anchored_substr = r->anchored_utf8 = Nullsv;
c277df42 3104 SvREFCNT_dec(data.longest_fixed);
c5254dd6 3105 longest_fixed_length = 0;
a0d0e21e 3106 }
b81d288d 3107 if (r->regstclass
ffc61ed2 3108 && (OP(r->regstclass) == REG_ANY || OP(r->regstclass) == SANY))
653099ff 3109 r->regstclass = NULL;
33b8afdf
JH
3110 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
3111 && stclass_flag
653099ff 3112 && !(data.start_class->flags & ANYOF_EOS)
eb160463
GS
3113 && !cl_is_anything(data.start_class))
3114 {
1df70142 3115 const I32 n = add_data(pRExC_state, 1, "f");
653099ff 3116
a02a5408 3117 Newx(RExC_rx->data->data[n], 1,
653099ff
GS
3118 struct regnode_charclass_class);
3119 StructCopy(data.start_class,
830247a4 3120 (struct regnode_charclass_class*)RExC_rx->data->data[n],
653099ff 3121 struct regnode_charclass_class);
830247a4 3122 r->regstclass = (regnode*)RExC_rx->data->data[n];
653099ff 3123 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
ffc61ed2 3124 PL_regdata = r->data; /* for regprop() */
a3621e74 3125 DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
9c5ffd7c
JH
3126 regprop(sv, (regnode*)data.start_class);
3127 PerlIO_printf(Perl_debug_log,
a0288114 3128 "synthetic stclass \"%s\".\n",
3f7c398e 3129 SvPVX_const(sv));});
653099ff 3130 }
c277df42
IZ
3131
3132 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
c5254dd6 3133 if (longest_fixed_length > longest_float_length) {
c277df42 3134 r->check_substr = r->anchored_substr;
33b8afdf 3135 r->check_utf8 = r->anchored_utf8;
c277df42
IZ
3136 r->check_offset_min = r->check_offset_max = r->anchored_offset;
3137 if (r->reganch & ROPT_ANCH_SINGLE)
3138 r->reganch |= ROPT_NOSCAN;
a0ed51b3
LW
3139 }
3140 else {
c277df42 3141 r->check_substr = r->float_substr;
33b8afdf 3142 r->check_utf8 = r->float_utf8;
c277df42
IZ
3143 r->check_offset_min = data.offset_float_min;
3144 r->check_offset_max = data.offset_float_max;
a0d0e21e 3145 }
30382c73
IZ
3146 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
3147 This should be changed ASAP! */
33b8afdf 3148 if ((r->check_substr || r->check_utf8) && !(r->reganch & ROPT_ANCH_GPOS)) {
cad2e5aa 3149 r->reganch |= RE_USE_INTUIT;
33b8afdf 3150 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
cad2e5aa
JH
3151 r->reganch |= RE_INTUIT_TAIL;
3152 }
a0ed51b3
LW
3153 }
3154 else {
c277df42
IZ
3155 /* Several toplevels. Best we can is to set minlen. */
3156 I32 fake;
653099ff 3157 struct regnode_charclass_class ch_class;
cb434fcc 3158 I32 last_close = 0;
c277df42 3159
a3621e74 3160 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "\n"));
c277df42 3161 scan = r->program + 1;
830247a4 3162 cl_init(pRExC_state, &ch_class);
653099ff 3163 data.start_class = &ch_class;
cb434fcc 3164 data.last_closep = &last_close;
a3621e74 3165 minlen = study_chunk(pRExC_state, &scan, &fake, scan + RExC_size, &data, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
33b8afdf
JH
3166 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
3167 = r->float_substr = r->float_utf8 = Nullsv;
653099ff 3168 if (!(data.start_class->flags & ANYOF_EOS)
eb160463
GS
3169 && !cl_is_anything(data.start_class))
3170 {
1df70142 3171 const I32 n = add_data(pRExC_state, 1, "f");
653099ff 3172
a02a5408 3173 Newx(RExC_rx->data->data[n], 1,
653099ff
GS
3174 struct regnode_charclass_class);
3175 StructCopy(data.start_class,
830247a4 3176 (struct regnode_charclass_class*)RExC_rx->data->data[n],
653099ff 3177 struct regnode_charclass_class);
830247a4 3178 r->regstclass = (regnode*)RExC_rx->data->data[n];
653099ff 3179 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
a3621e74 3180 DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
9c5ffd7c
JH
3181 regprop(sv, (regnode*)data.start_class);
3182 PerlIO_printf(Perl_debug_log,
a0288114 3183 "synthetic stclass \"%s\".\n",
3f7c398e 3184 SvPVX_const(sv));});
653099ff 3185 }
a0d0e21e
LW
3186 }
3187
a0d0e21e 3188 r->minlen = minlen;
b81d288d 3189 if (RExC_seen & REG_SEEN_GPOS)
c277df42 3190 r->reganch |= ROPT_GPOS_SEEN;
830247a4 3191 if (RExC_seen & REG_SEEN_LOOKBEHIND)
c277df42 3192 r->reganch |= ROPT_LOOKBEHIND_SEEN;
830247a4 3193 if (RExC_seen & REG_SEEN_EVAL)
ce862d02 3194 r->reganch |= ROPT_EVAL_SEEN;
f33976b4
DB
3195 if (RExC_seen & REG_SEEN_CANY)
3196 r->reganch |= ROPT_CANY_SEEN;
a02a5408
JC
3197 Newxz(r->startp, RExC_npar, I32);
3198 Newxz(r->endp, RExC_npar, I32);
ffc61ed2 3199 PL_regdata = r->data; /* for regprop() */
a3621e74 3200 DEBUG_COMPILE_r(regdump(r));
a0d0e21e 3201 return(r);
a687059c
LW
3202}
3203
3204/*
3205 - reg - regular expression, i.e. main body or parenthesized thing
3206 *
3207 * Caller must absorb opening parenthesis.
3208 *
3209 * Combining parenthesis handling with the base level of regular expression
3210 * is a trifle forced, but the need to tie the tails of the branches to what
3211 * follows makes it hard to avoid.
3212 */
76e3520e 3213STATIC regnode *
830247a4 3214S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp)
c277df42 3215 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
a687059c 3216{
27da23d5 3217 dVAR;
c277df42
IZ
3218 register regnode *ret; /* Will be the head of the group. */
3219 register regnode *br;
3220 register regnode *lastbr;
3221 register regnode *ender = 0;
a0d0e21e 3222 register I32 parno = 0;
e2509266 3223 I32 flags, oregflags = RExC_flags, have_branch = 0, open = 0;
9d1d55b5
JP
3224
3225 /* for (?g), (?gc), and (?o) warnings; warning
3226 about (?c) will warn about (?g) -- japhy */
3227
3228 I32 wastedflags = 0x00,
3229 wasted_o = 0x01,
3230 wasted_g = 0x02,
3231 wasted_gc = 0x02 | 0x04,
3232 wasted_c = 0x04;
3233
fac92740 3234 char * parse_start = RExC_parse; /* MJD */
a28509cc 3235 char * const oregcomp_parse = RExC_parse;
c277df42 3236 char c;
a0d0e21e 3237
821b33a5 3238 *flagp = 0; /* Tentatively. */
a0d0e21e 3239
9d1d55b5 3240
a0d0e21e
LW
3241 /* Make an OPEN node, if parenthesized. */
3242 if (paren) {
fac92740 3243 if (*RExC_parse == '?') { /* (?...) */
2b36a5a0
JH
3244 U32 posflags = 0, negflags = 0;
3245 U32 *flagsp = &posflags;
0f5d15d6 3246 int logical = 0;
a28509cc 3247 const char * const seqstart = RExC_parse;
ca9dfc88 3248
830247a4
IZ
3249 RExC_parse++;
3250 paren = *RExC_parse++;
c277df42 3251 ret = NULL; /* For look-ahead/behind. */
a0d0e21e 3252 switch (paren) {
fac92740 3253 case '<': /* (?<...) */
830247a4 3254 RExC_seen |= REG_SEEN_LOOKBEHIND;
b81d288d 3255 if (*RExC_parse == '!')
c277df42 3256 paren = ',';
b81d288d 3257 if (*RExC_parse != '=' && *RExC_parse != '!')
c277df42 3258 goto unknown;
830247a4 3259 RExC_parse++;
fac92740
MJD
3260 case '=': /* (?=...) */
3261 case '!': /* (?!...) */
830247a4 3262 RExC_seen_zerolen++;
fac92740
MJD
3263 case ':': /* (?:...) */
3264 case '>': /* (?>...) */
a0d0e21e 3265 break;
fac92740
MJD
3266 case '$': /* (?$...) */
3267 case '@': /* (?@...) */
8615cb43 3268 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
a0d0e21e 3269 break;
fac92740 3270 case '#': /* (?#...) */
830247a4
IZ
3271 while (*RExC_parse && *RExC_parse != ')')
3272 RExC_parse++;
3273 if (*RExC_parse != ')')
c277df42 3274 FAIL("Sequence (?#... not terminated");
830247a4 3275 nextchar(pRExC_state);
a0d0e21e
LW
3276 *flagp = TRYAGAIN;
3277 return NULL;
fac92740 3278 case 'p': /* (?p...) */
9014280d 3279 if (SIZE_ONLY && ckWARN2(WARN_DEPRECATED, WARN_REGEXP))
f9373011 3280 vWARNdep(RExC_parse, "(?p{}) is deprecated - use (??{})");
8c8ad484 3281 /* FALL THROUGH*/
fac92740 3282 case '?': /* (??...) */
0f5d15d6 3283 logical = 1;
438a3801
YST
3284 if (*RExC_parse != '{')
3285 goto unknown;
830247a4 3286 paren = *RExC_parse++;
0f5d15d6 3287 /* FALL THROUGH */
fac92740 3288 case '{': /* (?{...}) */
c277df42 3289 {
c277df42
IZ
3290 I32 count = 1, n = 0;
3291 char c;
830247a4 3292 char *s = RExC_parse;
c277df42
IZ
3293 SV *sv;
3294 OP_4tree *sop, *rop;
3295
830247a4
IZ
3296 RExC_seen_zerolen++;
3297 RExC_seen |= REG_SEEN_EVAL;
3298 while (count && (c = *RExC_parse)) {
3299 if (c == '\\' && RExC_parse[1])
3300 RExC_parse++;
b81d288d 3301 else if (c == '{')
c277df42 3302 count++;
b81d288d 3303 else if (c == '}')
c277df42 3304 count--;
830247a4 3305 RExC_parse++;
c277df42 3306 }
830247a4 3307 if (*RExC_parse != ')')
b45f050a 3308 {
b81d288d 3309 RExC_parse = s;
b45f050a
JF
3310 vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
3311 }
c277df42 3312 if (!SIZE_ONLY) {
f3548bdc 3313 PAD *pad;
b81d288d
AB
3314
3315 if (RExC_parse - 1 - s)
830247a4 3316 sv = newSVpvn(s, RExC_parse - 1 - s);
c277df42 3317 else
79cb57f6 3318 sv = newSVpvn("", 0);
c277df42 3319
569233ed
SB
3320 ENTER;
3321 Perl_save_re_context(aTHX);
f3548bdc 3322 rop = sv_compile_2op(sv, &sop, "re", &pad);
9b978d73
DM
3323 sop->op_private |= OPpREFCOUNTED;
3324 /* re_dup will OpREFCNT_inc */
3325 OpREFCNT_set(sop, 1);
569233ed 3326 LEAVE;
c277df42 3327
830247a4
IZ
3328 n = add_data(pRExC_state, 3, "nop");
3329 RExC_rx->data->data[n] = (void*)rop;
3330 RExC_rx->data->data[n+1] = (void*)sop;
f3548bdc 3331 RExC_rx->data->data[n+2] = (void*)pad;
c277df42 3332 SvREFCNT_dec(sv);
a0ed51b3 3333 }
e24b16f9 3334 else { /* First pass */
830247a4 3335 if (PL_reginterp_cnt < ++RExC_seen_evals
923e4eb5 3336 && IN_PERL_RUNTIME)
2cd61cdb
IZ
3337 /* No compiled RE interpolated, has runtime
3338 components ===> unsafe. */
3339 FAIL("Eval-group not allowed at runtime, use re 'eval'");
5b61d3f7 3340 if (PL_tainting && PL_tainted)
cc6b7395 3341 FAIL("Eval-group in insecure regular expression");
923e4eb5 3342 if (IN_PERL_COMPILETIME)
b5c19bd7 3343 PL_cv_has_eval = 1;
c277df42 3344 }
b5c19bd7 3345
830247a4 3346 nextchar(pRExC_state);
0f5d15d6 3347 if (logical) {
830247a4 3348 ret = reg_node(pRExC_state, LOGICAL);
0f5d15d6
IZ
3349 if (!SIZE_ONLY)
3350 ret->flags = 2;
830247a4 3351 regtail(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
fac92740 3352 /* deal with the length of this later - MJD */
0f5d15d6
IZ
3353 return ret;
3354 }
ccb2c380
MP
3355 ret = reganode(pRExC_state, EVAL, n);
3356 Set_Node_Length(ret, RExC_parse - parse_start + 1);
3357 Set_Node_Offset(ret, parse_start);
3358 return ret;
c277df42 3359 }
fac92740 3360 case '(': /* (?(?{...})...) and (?(?=...)...) */
c277df42 3361 {
fac92740 3362 if (RExC_parse[0] == '?') { /* (?(?...)) */
b81d288d
AB
3363 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
3364 || RExC_parse[1] == '<'
830247a4 3365 || RExC_parse[1] == '{') { /* Lookahead or eval. */
c277df42
IZ
3366 I32 flag;
3367
830247a4 3368 ret = reg_node(pRExC_state, LOGICAL);
0f5d15d6
IZ
3369 if (!SIZE_ONLY)
3370 ret->flags = 1;
830247a4 3371 regtail(pRExC_state, ret, reg(pRExC_state, 1, &flag));
c277df42 3372 goto insert_if;
b81d288d 3373 }
a0ed51b3 3374 }
830247a4 3375 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
fac92740 3376 /* (?(1)...) */
830247a4 3377 parno = atoi(RExC_parse++);
c277df42 3378
830247a4
IZ
3379 while (isDIGIT(*RExC_parse))
3380 RExC_parse++;
fac92740 3381 ret = reganode(pRExC_state, GROUPP, parno);
2af232bd 3382
830247a4 3383 if ((c = *nextchar(pRExC_state)) != ')')
b45f050a 3384 vFAIL("Switch condition not recognized");
c277df42 3385 insert_if:
830247a4
IZ
3386 regtail(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
3387 br = regbranch(pRExC_state, &flags, 1);
c277df42 3388 if (br == NULL)
830247a4 3389 br = reganode(pRExC_state, LONGJMP, 0);
c277df42 3390 else
830247a4
IZ
3391 regtail(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
3392 c = *nextchar(pRExC_state);
d1b80229
IZ
3393 if (flags&HASWIDTH)
3394 *flagp |= HASWIDTH;
c277df42 3395 if (c == '|') {
830247a4
IZ
3396 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
3397 regbranch(pRExC_state, &flags, 1);
3398 regtail(pRExC_state, ret, lastbr);
d1b80229
IZ
3399 if (flags&HASWIDTH)
3400 *flagp |= HASWIDTH;
830247a4 3401 c = *nextchar(pRExC_state);
a0ed51b3
LW
3402 }
3403 else
c277df42
IZ
3404 lastbr = NULL;
3405 if (c != ')')
8615cb43 3406 vFAIL("Switch (?(condition)... contains too many branches");
830247a4
IZ
3407 ender = reg_node(pRExC_state, TAIL);
3408 regtail(pRExC_state, br, ender);
c277df42 3409 if (lastbr) {
830247a4
IZ
3410 regtail(pRExC_state, lastbr, ender);
3411 regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
a0ed51b3
LW
3412 }
3413 else
830247a4 3414 regtail(pRExC_state, ret, ender);
c277df42 3415 return ret;
a0ed51b3
LW
3416 }
3417 else {
830247a4 3418 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
c277df42
IZ
3419 }
3420 }
1b1626e4 3421 case 0:
830247a4 3422 RExC_parse--; /* for vFAIL to print correctly */
8615cb43 3423 vFAIL("Sequence (? incomplete");
1b1626e4 3424 break;
a0d0e21e 3425 default:
830247a4 3426 --RExC_parse;
fac92740 3427 parse_flags: /* (?i) */
830247a4 3428 while (*RExC_parse && strchr("iogcmsx", *RExC_parse)) {
9d1d55b5
JP
3429 /* (?g), (?gc) and (?o) are useless here
3430 and must be globally applied -- japhy */
3431
3432 if (*RExC_parse == 'o' || *RExC_parse == 'g') {
3433 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
3434 I32 wflagbit = *RExC_parse == 'o' ? wasted_o : wasted_g;
3435 if (! (wastedflags & wflagbit) ) {
3436 wastedflags |= wflagbit;
3437 vWARN5(
3438 RExC_parse + 1,
3439 "Useless (%s%c) - %suse /%c modifier",
3440 flagsp == &negflags ? "?-" : "?",
3441 *RExC_parse,
3442 flagsp == &negflags ? "don't " : "",
3443 *RExC_parse
3444 );
3445 }
3446 }
3447 }
3448 else if (*RExC_parse == 'c') {
3449 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
3450 if (! (wastedflags & wasted_c) ) {
3451 wastedflags |= wasted_gc;
3452 vWARN3(
3453 RExC_parse + 1,
3454 "Useless (%sc) - %suse /gc modifier",
3455 flagsp == &negflags ? "?-" : "?",
3456 flagsp == &negflags ? "don't " : ""
3457 );
3458 }
3459 }
3460 }
3461 else { pmflag(flagsp, *RExC_parse); }
3462
830247a4 3463 ++RExC_parse;
ca9dfc88 3464 }
830247a4 3465 if (*RExC_parse == '-') {
ca9dfc88 3466 flagsp = &negflags;
9d1d55b5 3467 wastedflags = 0; /* reset so (?g-c) warns twice */
830247a4 3468 ++RExC_parse;
ca9dfc88 3469 goto parse_flags;
48c036b1 3470 }
e2509266
JH
3471 RExC_flags |= posflags;
3472 RExC_flags &= ~negflags;
830247a4
IZ
3473 if (*RExC_parse == ':') {
3474 RExC_parse++;
ca9dfc88
IZ
3475 paren = ':';
3476 break;
3477 }
c277df42 3478 unknown:
830247a4
IZ
3479 if (*RExC_parse != ')') {
3480 RExC_parse++;
3481 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
b45f050a 3482 }
830247a4 3483 nextchar(pRExC_state);
a0d0e21e
LW
3484 *flagp = TRYAGAIN;
3485 return NULL;
3486 }
3487 }
fac92740 3488 else { /* (...) */
830247a4
IZ
3489 parno = RExC_npar;
3490 RExC_npar++;
3491 ret = reganode(pRExC_state, OPEN, parno);
fac92740
MJD
3492 Set_Node_Length(ret, 1); /* MJD */
3493 Set_Node_Offset(ret, RExC_parse); /* MJD */
c277df42 3494 open = 1;
a0d0e21e 3495 }
a0ed51b3 3496 }
fac92740 3497 else /* ! paren */
a0d0e21e
LW
3498 ret = NULL;
3499
3500 /* Pick up the branches, linking them together. */
fac92740 3501 parse_start = RExC_parse; /* MJD */
830247a4 3502 br = regbranch(pRExC_state, &flags, 1);
fac92740 3503 /* branch_len = (paren != 0); */
2af232bd 3504
a0d0e21e
LW
3505 if (br == NULL)
3506 return(NULL);
830247a4
IZ
3507 if (*RExC_parse == '|') {
3508 if (!SIZE_ONLY && RExC_extralen) {
3509 reginsert(pRExC_state, BRANCHJ, br);
a0ed51b3 3510 }
fac92740 3511 else { /* MJD */
830247a4 3512 reginsert(pRExC_state, BRANCH, br);
fac92740
MJD
3513 Set_Node_Length(br, paren != 0);
3514 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
3515 }
c277df42
IZ
3516 have_branch = 1;
3517 if (SIZE_ONLY)
830247a4 3518 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
a0ed51b3
LW
3519 }
3520 else if (paren == ':') {
c277df42
IZ
3521 *flagp |= flags&SIMPLE;
3522 }
3523 if (open) { /* Starts with OPEN. */
830247a4 3524 regtail(pRExC_state, ret, br); /* OPEN -> first. */
a0ed51b3
LW
3525 }
3526 else if (paren != '?') /* Not Conditional */
a0d0e21e 3527 ret = br;
32a0ca98 3528 *flagp |= flags & (SPSTART | HASWIDTH);
c277df42 3529 lastbr = br;
830247a4
IZ
3530 while (*RExC_parse == '|') {
3531 if (!SIZE_ONLY && RExC_extralen) {
3532 ender = reganode(pRExC_state, LONGJMP,0);
3533 regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
c277df42
IZ
3534 }
3535 if (SIZE_ONLY)
830247a4
IZ
3536 RExC_extralen += 2; /* Account for LONGJMP. */
3537 nextchar(pRExC_state);
3538 br = regbranch(pRExC_state, &flags, 0);
2af232bd 3539
a687059c 3540 if (br == NULL)
a0d0e21e 3541 return(NULL);
830247a4 3542 regtail(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
c277df42 3543 lastbr = br;
821b33a5
IZ
3544 if (flags&HASWIDTH)
3545 *flagp |= HASWIDTH;
a687059c 3546 *flagp |= flags&SPSTART;
a0d0e21e
LW
3547 }
3548
c277df42
IZ
3549 if (have_branch || paren != ':') {
3550 /* Make a closing node, and hook it on the end. */
3551 switch (paren) {
3552 case ':':
830247a4 3553 ender = reg_node(pRExC_state, TAIL);
c277df42
IZ
3554 break;
3555 case 1:
830247a4 3556 ender = reganode(pRExC_state, CLOSE, parno);
fac92740
MJD
3557 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
3558 Set_Node_Length(ender,1); /* MJD */
c277df42
IZ
3559 break;
3560 case '<':
c277df42
IZ
3561 case ',':
3562 case '=':
3563 case '!':
c277df42 3564 *flagp &= ~HASWIDTH;
821b33a5
IZ
3565 /* FALL THROUGH */
3566 case '>':
830247a4 3567 ender = reg_node(pRExC_state, SUCCEED);
c277df42
IZ
3568 break;
3569 case 0:
830247a4 3570 ender = reg_node(pRExC_state, END);
c277df42
IZ
3571 break;
3572 }
830247a4 3573 regtail(pRExC_state, lastbr, ender);
a0d0e21e 3574
c277df42
IZ
3575 if (have_branch) {
3576 /* Hook the tails of the branches to the closing node. */
3577 for (br = ret; br != NULL; br = regnext(br)) {
830247a4 3578 regoptail(pRExC_state, br, ender);
c277df42
IZ
3579 }
3580 }
a0d0e21e 3581 }
c277df42
IZ
3582
3583 {
e1ec3a88
AL
3584 const char *p;
3585 static const char parens[] = "=!<,>";
c277df42
IZ
3586
3587 if (paren && (p = strchr(parens, paren))) {
eb160463 3588 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
c277df42
IZ
3589 int flag = (p - parens) > 1;
3590
3591 if (paren == '>')
3592 node = SUSPEND, flag = 0;
830247a4 3593 reginsert(pRExC_state, node,ret);
45948336
EP
3594 Set_Node_Cur_Length(ret);
3595 Set_Node_Offset(ret, parse_start + 1);
c277df42 3596 ret->flags = flag;
830247a4 3597 regtail(pRExC_state, ret, reg_node(pRExC_state, TAIL));
c277df42 3598 }
a0d0e21e
LW
3599 }
3600
3601 /* Check for proper termination. */
ce3e6498 3602 if (paren) {
e2509266 3603 RExC_flags = oregflags;
830247a4
IZ
3604 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
3605 RExC_parse = oregcomp_parse;
380a0633 3606 vFAIL("Unmatched (");
ce3e6498 3607 }
a0ed51b3 3608 }
830247a4
IZ
3609 else if (!paren && RExC_parse < RExC_end) {
3610 if (*RExC_parse == ')') {
3611 RExC_parse++;
380a0633 3612 vFAIL("Unmatched )");
a0ed51b3
LW
3613 }
3614 else
b45f050a 3615 FAIL("Junk on end of regexp"); /* "Can't happen". */
a0d0e21e
LW
3616 /* NOTREACHED */
3617 }
a687059c 3618
a0d0e21e 3619 return(ret);
a687059c
LW
3620}
3621
3622/*
3623 - regbranch - one alternative of an | operator
3624 *
3625 * Implements the concatenation operator.
3626 */
76e3520e 3627STATIC regnode *
830247a4 3628S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first)
a687059c 3629{
c277df42
IZ
3630 register regnode *ret;
3631 register regnode *chain = NULL;
3632 register regnode *latest;
3633 I32 flags = 0, c = 0;
a0d0e21e 3634
b81d288d 3635 if (first)
c277df42
IZ
3636 ret = NULL;
3637 else {
b81d288d 3638 if (!SIZE_ONLY && RExC_extralen)
830247a4 3639 ret = reganode(pRExC_state, BRANCHJ,0);
fac92740 3640 else {
830247a4 3641 ret = reg_node(pRExC_state, BRANCH);
fac92740
MJD
3642 Set_Node_Length(ret, 1);
3643 }
c277df42
IZ
3644 }
3645
b81d288d 3646 if (!first && SIZE_ONLY)
830247a4 3647 RExC_extralen += 1; /* BRANCHJ */
b81d288d 3648
c277df42 3649 *flagp = WORST; /* Tentatively. */
a0d0e21e 3650
830247a4
IZ
3651 RExC_parse--;
3652 nextchar(pRExC_state);
3653 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
a0d0e21e 3654 flags &= ~TRYAGAIN;
830247a4 3655 latest = regpiece(pRExC_state, &flags);
a0d0e21e
LW
3656 if (latest == NULL) {
3657 if (flags & TRYAGAIN)
3658 continue;
3659 return(NULL);
a0ed51b3
LW
3660 }
3661 else if (ret == NULL)
c277df42 3662 ret = latest;
a0d0e21e 3663 *flagp |= flags&HASWIDTH;
c277df42 3664 if (chain == NULL) /* First piece. */
a0d0e21e
LW
3665 *flagp |= flags&SPSTART;
3666 else {
830247a4
IZ
3667 RExC_naughty++;
3668 regtail(pRExC_state, chain, latest);
a687059c 3669 }
a0d0e21e 3670 chain = latest;
c277df42
IZ
3671 c++;
3672 }
3673 if (chain == NULL) { /* Loop ran zero times. */
830247a4 3674 chain = reg_node(pRExC_state, NOTHING);
c277df42
IZ
3675 if (ret == NULL)
3676 ret = chain;
3677 }
3678 if (c == 1) {
3679 *flagp |= flags&SIMPLE;
a0d0e21e 3680 }
a687059c 3681
a0d0e21e 3682 return(ret);
a687059c
LW
3683}
3684
3685/*
3686 - regpiece - something followed by possible [*+?]
3687 *
3688 * Note that the branching code sequences used for ? and the general cases
3689 * of * and + are somewhat optimized: they use the same NOTHING node as
3690 * both the endmarker for their branch list and the body of the last branch.
3691 * It might seem that this node could be dispensed with entirely, but the
3692 * endmarker role is not redundant.
3693 */
76e3520e 3694STATIC regnode *
830247a4 3695S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
a687059c 3696{
c277df42 3697 register regnode *ret;
a0d0e21e
LW
3698 register char op;
3699 register char *next;
3700 I32 flags;
1df70142 3701 const char * const origparse = RExC_parse;
a0d0e21e
LW
3702 char *maxpos;
3703 I32 min;
c277df42 3704 I32 max = REG_INFTY;
fac92740 3705 char *parse_start;
a0d0e21e 3706
830247a4 3707 ret = regatom(pRExC_state, &flags);
a0d0e21e
LW
3708 if (ret == NULL) {
3709 if (flags & TRYAGAIN)
3710 *flagp |= TRYAGAIN;
3711 return(NULL);
3712 }
3713
830247a4 3714 op = *RExC_parse;
a0d0e21e 3715
830247a4 3716 if (op == '{' && regcurly(RExC_parse)) {
fac92740 3717 parse_start = RExC_parse; /* MJD */
830247a4 3718 next = RExC_parse + 1;
a0d0e21e
LW
3719 maxpos = Nullch;
3720 while (isDIGIT(*next) || *next == ',') {
3721 if (*next == ',') {
3722 if (maxpos)
3723 break;
3724 else
3725 maxpos = next;
a687059c 3726 }
a0d0e21e
LW
3727 next++;
3728 }
3729 if (*next == '}') { /* got one */
3730 if (!maxpos)
3731 maxpos = next;
830247a4
IZ
3732 RExC_parse++;
3733 min = atoi(RExC_parse);
a0d0e21e
LW
3734 if (*maxpos == ',')
3735 maxpos++;
3736 else
830247a4 3737 maxpos = RExC_parse;
a0d0e21e
LW
3738 max = atoi(maxpos);
3739 if (!max && *maxpos != '0')
c277df42
IZ
3740 max = REG_INFTY; /* meaning "infinity" */
3741 else if (max >= REG_INFTY)
8615cb43 3742 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
830247a4
IZ
3743 RExC_parse = next;
3744 nextchar(pRExC_state);
a0d0e21e
LW
3745
3746 do_curly:
3747 if ((flags&SIMPLE)) {
830247a4
IZ
3748 RExC_naughty += 2 + RExC_naughty / 2;
3749 reginsert(pRExC_state, CURLY, ret);
fac92740
MJD
3750 Set_Node_Offset(ret, parse_start+1); /* MJD */
3751 Set_Node_Cur_Length(ret);
a0d0e21e
LW
3752 }
3753 else {
830247a4 3754 regnode *w = reg_node(pRExC_state, WHILEM);
2c2d71f5
JH
3755
3756 w->flags = 0;
830247a4
IZ
3757 regtail(pRExC_state, ret, w);
3758 if (!SIZE_ONLY && RExC_extralen) {
3759 reginsert(pRExC_state, LONGJMP,ret);
3760 reginsert(pRExC_state, NOTHING,ret);
c277df42
IZ
3761 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
3762 }
830247a4 3763 reginsert(pRExC_state, CURLYX,ret);
fac92740
MJD
3764 /* MJD hk */
3765 Set_Node_Offset(ret, parse_start+1);
2af232bd 3766 Set_Node_Length(ret,
fac92740 3767 op == '{' ? (RExC_parse - parse_start) : 1);
2af232bd 3768
830247a4 3769 if (!SIZE_ONLY && RExC_extralen)
c277df42 3770 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
830247a4 3771 regtail(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
c277df42 3772 if (SIZE_ONLY)
830247a4
IZ
3773 RExC_whilem_seen++, RExC_extralen += 3;
3774 RExC_naughty += 4 + RExC_naughty; /* compound interest */
a0d0e21e 3775 }
c277df42 3776 ret->flags = 0;
a0d0e21e
LW
3777
3778 if (min > 0)
821b33a5
IZ
3779 *flagp = WORST;
3780 if (max > 0)
3781 *flagp |= HASWIDTH;
a0d0e21e 3782 if (max && max < min)
8615cb43 3783 vFAIL("Can't do {n,m} with n > m");
c277df42 3784 if (!SIZE_ONLY) {
eb160463
GS
3785 ARG1_SET(ret, (U16)min);
3786 ARG2_SET(ret, (U16)max);
a687059c 3787 }
a687059c 3788
a0d0e21e 3789 goto nest_check;
a687059c 3790 }
a0d0e21e 3791 }
a687059c 3792
a0d0e21e
LW
3793 if (!ISMULT1(op)) {
3794 *flagp = flags;
a687059c 3795 return(ret);
a0d0e21e 3796 }
bb20fd44 3797
c277df42 3798#if 0 /* Now runtime fix should be reliable. */
b45f050a
JF
3799
3800 /* if this is reinstated, don't forget to put this back into perldiag:
3801
3802 =item Regexp *+ operand could be empty at {#} in regex m/%s/
3803
3804 (F) The part of the regexp subject to either the * or + quantifier
3805 could match an empty string. The {#} shows in the regular
3806 expression about where the problem was discovered.
3807
3808 */
3809
bb20fd44 3810 if (!(flags&HASWIDTH) && op != '?')
b45f050a 3811 vFAIL("Regexp *+ operand could be empty");
b81d288d 3812#endif
bb20fd44 3813
fac92740 3814 parse_start = RExC_parse;
830247a4 3815 nextchar(pRExC_state);
a0d0e21e 3816
821b33a5 3817 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
a0d0e21e
LW
3818
3819 if (op == '*' && (flags&SIMPLE)) {
830247a4 3820 reginsert(pRExC_state, STAR, ret);
c277df42 3821 ret->flags = 0;
830247a4 3822 RExC_naughty += 4;
a0d0e21e
LW
3823 }
3824 else if (op == '*') {
3825 min = 0;
3826 goto do_curly;
a0ed51b3
LW
3827 }
3828 else if (op == '+' && (flags&SIMPLE)) {
830247a4 3829 reginsert(pRExC_state, PLUS, ret);
c277df42 3830 ret->flags = 0;
830247a4 3831 RExC_naughty += 3;
a0d0e21e
LW
3832 }
3833 else if (op == '+') {
3834 min = 1;
3835 goto do_curly;
a0ed51b3
LW
3836 }
3837 else if (op == '?') {
a0d0e21e
LW
3838 min = 0; max = 1;
3839 goto do_curly;
3840 }
3841 nest_check:
041457d9 3842 if (!SIZE_ONLY && !(flags&HASWIDTH) && max > REG_INFTY/3 && ckWARN(WARN_REGEXP)) {
830247a4 3843 vWARN3(RExC_parse,
b45f050a 3844 "%.*s matches null string many times",
830247a4 3845 RExC_parse - origparse,
b45f050a 3846 origparse);
a0d0e21e
LW
3847 }
3848
830247a4
IZ
3849 if (*RExC_parse == '?') {
3850 nextchar(pRExC_state);
3851 reginsert(pRExC_state, MINMOD, ret);
3852 regtail(pRExC_state, ret, ret + NODE_STEP_REGNODE);
a0d0e21e 3853 }
830247a4
IZ
3854 if (ISMULT2(RExC_parse)) {
3855 RExC_parse++;
b45f050a
JF
3856 vFAIL("Nested quantifiers");
3857 }
a0d0e21e
LW
3858
3859 return(ret);
a687059c
LW
3860}
3861
3862/*
3863 - regatom - the lowest level
3864 *
3865 * Optimization: gobbles an entire sequence of ordinary characters so that
3866 * it can turn them into a single node, which is smaller to store and
3867 * faster to run. Backslashed characters are exceptions, each becoming a
3868 * separate node; the code is simpler that way and it's not worth fixing.
3869 *
b45f050a 3870 * [Yes, it is worth fixing, some scripts can run twice the speed.] */
76e3520e 3871STATIC regnode *
830247a4 3872S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
a687059c 3873{
c277df42 3874 register regnode *ret = 0;
a0d0e21e 3875 I32 flags;
45948336 3876 char *parse_start = RExC_parse;
a0d0e21e
LW
3877
3878 *flagp = WORST; /* Tentatively. */
3879
3880tryagain:
830247a4 3881 switch (*RExC_parse) {
a0d0e21e 3882 case '^':
830247a4
IZ
3883 RExC_seen_zerolen++;
3884 nextchar(pRExC_state);
e2509266 3885 if (RExC_flags & PMf_MULTILINE)
830247a4 3886 ret = reg_node(pRExC_state, MBOL);
e2509266 3887 else if (RExC_flags & PMf_SINGLELINE)
830247a4 3888 ret = reg_node(pRExC_state, SBOL);
a0d0e21e 3889 else
830247a4 3890 ret = reg_node(pRExC_state, BOL);
fac92740 3891 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
3892 break;
3893 case '$':
830247a4 3894 nextchar(pRExC_state);
b81d288d 3895 if (*RExC_parse)
830247a4 3896 RExC_seen_zerolen++;
e2509266 3897 if (RExC_flags & PMf_MULTILINE)
830247a4 3898 ret = reg_node(pRExC_state, MEOL);
e2509266 3899 else if (RExC_flags & PMf_SINGLELINE)
830247a4 3900 ret = reg_node(pRExC_state, SEOL);
a0d0e21e 3901 else
830247a4 3902 ret = reg_node(pRExC_state, EOL);
fac92740 3903 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
3904 break;
3905 case '.':
830247a4 3906 nextchar(pRExC_state);
e2509266 3907 if (RExC_flags & PMf_SINGLELINE)
ffc61ed2
JH
3908 ret = reg_node(pRExC_state, SANY);
3909 else
3910 ret = reg_node(pRExC_state, REG_ANY);
3911 *flagp |= HASWIDTH|SIMPLE;
830247a4 3912 RExC_naughty++;
fac92740 3913 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
3914 break;
3915 case '[':
b45f050a 3916 {
830247a4 3917 char *oregcomp_parse = ++RExC_parse;
ffc61ed2 3918 ret = regclass(pRExC_state);
830247a4
IZ
3919 if (*RExC_parse != ']') {
3920 RExC_parse = oregcomp_parse;
b45f050a
JF
3921 vFAIL("Unmatched [");
3922 }
830247a4 3923 nextchar(pRExC_state);
a0d0e21e 3924 *flagp |= HASWIDTH|SIMPLE;
fac92740 3925 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
a0d0e21e 3926 break;
b45f050a 3927 }
a0d0e21e 3928 case '(':
830247a4
IZ
3929 nextchar(pRExC_state);
3930 ret = reg(pRExC_state, 1, &flags);
a0d0e21e 3931 if (ret == NULL) {
bf93d4cc 3932 if (flags & TRYAGAIN) {
830247a4 3933 if (RExC_parse == RExC_end) {
bf93d4cc
GS
3934 /* Make parent create an empty node if needed. */
3935 *flagp |= TRYAGAIN;
3936 return(NULL);
3937 }
a0d0e21e 3938 goto tryagain;
bf93d4cc 3939 }
a0d0e21e
LW
3940 return(NULL);
3941 }
c277df42 3942 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE);
a0d0e21e
LW
3943 break;
3944 case '|':
3945 case ')':
3946 if (flags & TRYAGAIN) {
3947 *flagp |= TRYAGAIN;
3948 return NULL;
3949 }
b45f050a 3950 vFAIL("Internal urp");
a0d0e21e
LW
3951 /* Supposed to be caught earlier. */
3952 break;
85afd4ae 3953 case '{':
830247a4
IZ
3954 if (!regcurly(RExC_parse)) {
3955 RExC_parse++;
85afd4ae
CS
3956 goto defchar;
3957 }
3958 /* FALL THROUGH */
a0d0e21e
LW
3959 case '?':
3960 case '+':
3961 case '*':
830247a4 3962 RExC_parse++;
b45f050a 3963 vFAIL("Quantifier follows nothing");
a0d0e21e
LW
3964 break;
3965 case '\\':
830247a4 3966 switch (*++RExC_parse) {
a0d0e21e 3967 case 'A':
830247a4
IZ
3968 RExC_seen_zerolen++;
3969 ret = reg_node(pRExC_state, SBOL);
a0d0e21e 3970 *flagp |= SIMPLE;
830247a4 3971 nextchar(pRExC_state);
fac92740 3972 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
3973 break;
3974 case 'G':
830247a4
IZ
3975 ret = reg_node(pRExC_state, GPOS);
3976 RExC_seen |= REG_SEEN_GPOS;
a0d0e21e 3977 *flagp |= SIMPLE;
830247a4 3978 nextchar(pRExC_state);
fac92740 3979 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
3980 break;
3981 case 'Z':
830247a4 3982 ret = reg_node(pRExC_state, SEOL);
a0d0e21e 3983 *flagp |= SIMPLE;
a1917ab9 3984 RExC_seen_zerolen++; /* Do not optimize RE away */
830247a4 3985 nextchar(pRExC_state);
a0d0e21e 3986 break;
b85d18e9 3987 case 'z':
830247a4 3988 ret = reg_node(pRExC_state, EOS);
b85d18e9 3989 *flagp |= SIMPLE;
830247a4
IZ
3990 RExC_seen_zerolen++; /* Do not optimize RE away */
3991 nextchar(pRExC_state);
fac92740 3992 Set_Node_Length(ret, 2); /* MJD */
b85d18e9 3993 break;
4a2d328f 3994 case 'C':
f33976b4
DB
3995 ret = reg_node(pRExC_state, CANY);
3996 RExC_seen |= REG_SEEN_CANY;
a0ed51b3 3997 *flagp |= HASWIDTH|SIMPLE;
830247a4 3998 nextchar(pRExC_state);
fac92740 3999 Set_Node_Length(ret, 2); /* MJD */
a0ed51b3
LW
4000 break;
4001 case 'X':
830247a4 4002 ret = reg_node(pRExC_state, CLUMP);
a0ed51b3 4003 *flagp |= HASWIDTH;
830247a4 4004 nextchar(pRExC_state);
fac92740 4005 Set_Node_Length(ret, 2); /* MJD */
a0ed51b3 4006 break;
a0d0e21e 4007 case 'w':
eb160463 4008 ret = reg_node(pRExC_state, (U8)(LOC ? ALNUML : ALNUM));
a0d0e21e 4009 *flagp |= HASWIDTH|SIMPLE;
830247a4 4010 nextchar(pRExC_state);
fac92740 4011 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
4012 break;
4013 case 'W':
eb160463 4014 ret = reg_node(pRExC_state, (U8)(LOC ? NALNUML : NALNUM));
a0d0e21e 4015 *flagp |= HASWIDTH|SIMPLE;
830247a4 4016 nextchar(pRExC_state);
fac92740 4017 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
4018 break;
4019 case 'b':
830247a4
IZ
4020 RExC_seen_zerolen++;
4021 RExC_seen |= REG_SEEN_LOOKBEHIND;
eb160463 4022 ret = reg_node(pRExC_state, (U8)(LOC ? BOUNDL : BOUND));
a0d0e21e 4023 *flagp |= SIMPLE;
830247a4 4024 nextchar(pRExC_state);
fac92740 4025 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
4026 break;
4027 case 'B':
830247a4
IZ
4028 RExC_seen_zerolen++;
4029 RExC_seen |= REG_SEEN_LOOKBEHIND;
eb160463 4030 ret = reg_node(pRExC_state, (U8)(LOC ? NBOUNDL : NBOUND));
a0d0e21e 4031 *flagp |= SIMPLE;
830247a4 4032 nextchar(pRExC_state);
fac92740 4033 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
4034 break;
4035 case 's':
eb160463 4036 ret = reg_node(pRExC_state, (U8)(LOC ? SPACEL : SPACE));
a0d0e21e 4037 *flagp |= HASWIDTH|SIMPLE;
830247a4 4038 nextchar(pRExC_state);
fac92740 4039 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
4040 break;
4041 case 'S':
eb160463 4042 ret = reg_node(pRExC_state, (U8)(LOC ? NSPACEL : NSPACE));
a0d0e21e 4043 *flagp |= HASWIDTH|SIMPLE;
830247a4 4044 nextchar(pRExC_state);
fac92740 4045 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
4046 break;
4047 case 'd':
ffc61ed2 4048 ret = reg_node(pRExC_state, DIGIT);
a0d0e21e 4049 *flagp |= HASWIDTH|SIMPLE;
830247a4 4050 nextchar(pRExC_state);
fac92740 4051 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
4052 break;
4053 case 'D':
ffc61ed2 4054 ret = reg_node(pRExC_state, NDIGIT);
a0d0e21e 4055 *flagp |= HASWIDTH|SIMPLE;
830247a4 4056 nextchar(pRExC_state);
fac92740 4057 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e 4058 break;
a14b48bc
LW
4059 case 'p':
4060 case 'P':
3568d838 4061 {
830247a4 4062 char* oldregxend = RExC_end;
ccb2c380 4063 char* parse_start = RExC_parse - 2;
a14b48bc 4064
830247a4 4065 if (RExC_parse[1] == '{') {
3568d838 4066 /* a lovely hack--pretend we saw [\pX] instead */
830247a4
IZ
4067 RExC_end = strchr(RExC_parse, '}');
4068 if (!RExC_end) {
0da60cf5 4069 U8 c = (U8)*RExC_parse;
830247a4
IZ
4070 RExC_parse += 2;
4071 RExC_end = oldregxend;
0da60cf5 4072 vFAIL2("Missing right brace on \\%c{}", c);
b45f050a 4073 }
830247a4 4074 RExC_end++;
a14b48bc 4075 }
af6f566e 4076 else {
830247a4 4077 RExC_end = RExC_parse + 2;
af6f566e
HS
4078 if (RExC_end > oldregxend)
4079 RExC_end = oldregxend;
4080 }
830247a4 4081 RExC_parse--;
a14b48bc 4082
ffc61ed2 4083 ret = regclass(pRExC_state);
a14b48bc 4084
830247a4
IZ
4085 RExC_end = oldregxend;
4086 RExC_parse--;
ccb2c380
MP
4087
4088 Set_Node_Offset(ret, parse_start + 2);
4089 Set_Node_Cur_Length(ret);
830247a4 4090 nextchar(pRExC_state);
a14b48bc
LW
4091 *flagp |= HASWIDTH|SIMPLE;
4092 }
4093 break;
a0d0e21e
LW
4094 case 'n':
4095 case 'r':
4096 case 't':
4097 case 'f':
4098 case 'e':
4099 case 'a':
4100 case 'x':
4101 case 'c':
4102 case '0':
4103 goto defchar;
4104 case '1': case '2': case '3': case '4':
4105 case '5': case '6': case '7': case '8': case '9':
4106 {
1df70142 4107 const I32 num = atoi(RExC_parse);
a0d0e21e 4108
830247a4 4109 if (num > 9 && num >= RExC_npar)
a0d0e21e
LW
4110 goto defchar;
4111 else {
fac92740 4112 char * parse_start = RExC_parse - 1; /* MJD */
830247a4
IZ
4113 while (isDIGIT(*RExC_parse))
4114 RExC_parse++;
b45f050a 4115
eb160463 4116 if (!SIZE_ONLY && num > (I32)RExC_rx->nparens)
9baa0206 4117 vFAIL("Reference to nonexistent group");
830247a4 4118 RExC_sawback = 1;
eb160463
GS
4119 ret = reganode(pRExC_state,
4120 (U8)(FOLD ? (LOC ? REFFL : REFF) : REF),
4121 num);
a0d0e21e 4122 *flagp |= HASWIDTH;
2af232bd 4123
fac92740 4124 /* override incorrect value set in reganode MJD */
2af232bd 4125 Set_Node_Offset(ret, parse_start+1);
fac92740 4126 Set_Node_Cur_Length(ret); /* MJD */
830247a4
IZ
4127 RExC_parse--;
4128 nextchar(pRExC_state);
a0d0e21e
LW
4129 }
4130 }
4131 break;
4132 case '\0':
830247a4 4133 if (RExC_parse >= RExC_end)
b45f050a 4134 FAIL("Trailing \\");
a0d0e21e
LW
4135 /* FALL THROUGH */
4136 default:
a0288114 4137 /* Do not generate "unrecognized" warnings here, we fall
c9f97d15 4138 back into the quick-grab loop below */
45948336 4139 parse_start--;
a0d0e21e
LW
4140 goto defchar;
4141 }
4142 break;
4633a7c4
LW
4143
4144 case '#':
e2509266 4145 if (RExC_flags & PMf_EXTENDED) {
830247a4
IZ
4146 while (RExC_parse < RExC_end && *RExC_parse != '\n') RExC_parse++;
4147 if (RExC_parse < RExC_end)
4633a7c4
LW
4148 goto tryagain;
4149 }
4150 /* FALL THROUGH */
4151
a0d0e21e 4152 default: {
ba210ebe 4153 register STRLEN len;
58ae7d3f 4154 register UV ender;
a0d0e21e 4155 register char *p;
c277df42 4156 char *oldp, *s;
80aecb99 4157 STRLEN foldlen;
89ebb4a3 4158 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
f06dbbb7
JH
4159
4160 parse_start = RExC_parse - 1;
a0d0e21e 4161
830247a4 4162 RExC_parse++;
a0d0e21e
LW
4163
4164 defchar:
58ae7d3f 4165 ender = 0;
eb160463
GS
4166 ret = reg_node(pRExC_state,
4167 (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
cd439c50 4168 s = STRING(ret);
830247a4
IZ
4169 for (len = 0, p = RExC_parse - 1;
4170 len < 127 && p < RExC_end;
a0d0e21e
LW
4171 len++)
4172 {
4173 oldp = p;
5b5a24f7 4174
e2509266 4175 if (RExC_flags & PMf_EXTENDED)
830247a4 4176 p = regwhite(p, RExC_end);
a0d0e21e
LW
4177 switch (*p) {
4178 case '^':
4179 case '$':
4180 case '.':
4181 case '[':
4182 case '(':
4183 case ')':
4184 case '|':
4185 goto loopdone;
4186 case '\\':
4187 switch (*++p) {
4188 case 'A':
1ed8eac0
JF
4189 case 'C':
4190 case 'X':
a0d0e21e
LW
4191 case 'G':
4192 case 'Z':
b85d18e9 4193 case 'z':
a0d0e21e
LW
4194 case 'w':
4195 case 'W':
4196 case 'b':
4197 case 'B':
4198 case 's':
4199 case 'S':
4200 case 'd':
4201 case 'D':
a14b48bc
LW
4202 case 'p':
4203 case 'P':
a0d0e21e
LW
4204 --p;
4205 goto loopdone;
4206 case 'n':
4207 ender = '\n';
4208 p++;
a687059c 4209 break;
a0d0e21e
LW
4210 case 'r':
4211 ender = '\r';
4212 p++;
a687059c 4213 break;
a0d0e21e
LW
4214 case 't':
4215 ender = '\t';
4216 p++;
a687059c 4217 break;
a0d0e21e
LW
4218 case 'f':
4219 ender = '\f';
4220 p++;
a687059c 4221 break;
a0d0e21e 4222 case 'e':
c7f1f016 4223 ender = ASCII_TO_NATIVE('\033');
a0d0e21e 4224 p++;
a687059c 4225 break;
a0d0e21e 4226 case 'a':
c7f1f016 4227 ender = ASCII_TO_NATIVE('\007');
a0d0e21e 4228 p++;
a687059c 4229 break;
a0d0e21e 4230 case 'x':
a0ed51b3 4231 if (*++p == '{') {
1df70142 4232 char* const e = strchr(p, '}');
b81d288d 4233
b45f050a 4234 if (!e) {
830247a4 4235 RExC_parse = p + 1;
b45f050a
JF
4236 vFAIL("Missing right brace on \\x{}");
4237 }
de5f0749 4238 else {
a4c04bdc
NC
4239 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
4240 | PERL_SCAN_DISALLOW_PREFIX;
1df70142 4241 STRLEN numlen = e - p - 1;
53305cf1 4242 ender = grok_hex(p + 1, &numlen, &flags, NULL);
aaa80028
JH
4243 if (ender > 0xff)
4244 RExC_utf8 = 1;
a0ed51b3
LW
4245 p = e + 1;
4246 }
a0ed51b3
LW
4247 }
4248 else {
a4c04bdc 4249 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
1df70142 4250 STRLEN numlen = 2;
53305cf1 4251 ender = grok_hex(p, &numlen, &flags, NULL);
a0ed51b3
LW
4252 p += numlen;
4253 }
a687059c 4254 break;
a0d0e21e
LW
4255 case 'c':
4256 p++;
bbce6d69 4257 ender = UCHARAT(p++);
4258 ender = toCTRL(ender);
a687059c 4259 break;
a0d0e21e
LW
4260 case '0': case '1': case '2': case '3':case '4':
4261 case '5': case '6': case '7': case '8':case '9':
4262 if (*p == '0' ||
830247a4 4263 (isDIGIT(p[1]) && atoi(p) >= RExC_npar) ) {
53305cf1 4264 I32 flags = 0;
1df70142 4265 STRLEN numlen = 3;
53305cf1 4266 ender = grok_oct(p, &numlen, &flags, NULL);
a0d0e21e
LW
4267 p += numlen;
4268 }
4269 else {
4270 --p;
4271 goto loopdone;
a687059c
LW
4272 }
4273 break;
a0d0e21e 4274 case '\0':
830247a4 4275 if (p >= RExC_end)
b45f050a 4276 FAIL("Trailing \\");
a687059c 4277 /* FALL THROUGH */
a0d0e21e 4278 default:
041457d9 4279 if (!SIZE_ONLY&& isALPHA(*p) && ckWARN(WARN_REGEXP))
4193bef7 4280 vWARN2(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
a0ed51b3 4281 goto normal_default;
a0d0e21e
LW
4282 }
4283 break;
a687059c 4284 default:
a0ed51b3 4285 normal_default:
fd400ab9 4286 if (UTF8_IS_START(*p) && UTF) {
1df70142 4287 STRLEN numlen;
5e12f4fb 4288 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
ba210ebe 4289 &numlen, 0);
a0ed51b3
LW
4290 p += numlen;
4291 }
4292 else
4293 ender = *p++;
a0d0e21e 4294 break;
a687059c 4295 }
e2509266 4296 if (RExC_flags & PMf_EXTENDED)
830247a4 4297 p = regwhite(p, RExC_end);
60a8b682
JH
4298 if (UTF && FOLD) {
4299 /* Prime the casefolded buffer. */
ac7e0132 4300 ender = toFOLD_uni(ender, tmpbuf, &foldlen);
60a8b682 4301 }
a0d0e21e
LW
4302 if (ISMULT2(p)) { /* Back off on ?+*. */
4303 if (len)
4304 p = oldp;
16ea2a2e 4305 else if (UTF) {
0ebc6274
JH
4306 STRLEN unilen;
4307
80aecb99 4308 if (FOLD) {
60a8b682 4309 /* Emit all the Unicode characters. */
1df70142 4310 STRLEN numlen;
80aecb99
JH
4311 for (foldbuf = tmpbuf;
4312 foldlen;
4313 foldlen -= numlen) {
4314 ender = utf8_to_uvchr(foldbuf, &numlen);
9dc45d57 4315 if (numlen > 0) {
0ebc6274
JH
4316 reguni(pRExC_state, ender, s, &unilen);
4317 s += unilen;
4318 len += unilen;
4319 /* In EBCDIC the numlen
4320 * and unilen can differ. */
9dc45d57 4321 foldbuf += numlen;
47654450
JH
4322 if (numlen >= foldlen)
4323 break;
9dc45d57
JH
4324 }
4325 else
4326 break; /* "Can't happen." */
80aecb99
JH
4327 }
4328 }
4329 else {
0ebc6274 4330 reguni(pRExC_state, ender, s, &unilen);
9ede7db1 4331 if (unilen > 0) {
0ebc6274
JH
4332 s += unilen;
4333 len += unilen;
9dc45d57 4334 }
80aecb99 4335 }
a0ed51b3 4336 }
a0d0e21e
LW
4337 else {
4338 len++;
eb160463 4339 REGC((char)ender, s++);
a0d0e21e
LW
4340 }
4341 break;
a687059c 4342 }
16ea2a2e 4343 if (UTF) {
0ebc6274
JH
4344 STRLEN unilen;
4345
80aecb99 4346 if (FOLD) {
60a8b682 4347 /* Emit all the Unicode characters. */
1df70142 4348 STRLEN numlen;
80aecb99
JH
4349 for (foldbuf = tmpbuf;
4350 foldlen;
4351 foldlen -= numlen) {
4352 ender = utf8_to_uvchr(foldbuf, &numlen);
9dc45d57 4353 if (numlen > 0) {
0ebc6274
JH
4354 reguni(pRExC_state, ender, s, &unilen);
4355 len += unilen;
4356 s += unilen;
4357 /* In EBCDIC the numlen
4358 * and unilen can differ. */
9dc45d57 4359 foldbuf += numlen;
47654450
JH
4360 if (numlen >= foldlen)
4361 break;
9dc45d57
JH
4362 }
4363 else
4364 break;
80aecb99
JH
4365 }
4366 }
4367 else {
0ebc6274 4368 reguni(pRExC_state, ender, s, &unilen);
9ede7db1 4369 if (unilen > 0) {
0ebc6274
JH
4370 s += unilen;
4371 len += unilen;
9dc45d57 4372 }
80aecb99
JH
4373 }
4374 len--;
a0ed51b3
LW
4375 }
4376 else
eb160463 4377 REGC((char)ender, s++);
a0d0e21e
LW
4378 }
4379 loopdone:
830247a4 4380 RExC_parse = p - 1;
fac92740 4381 Set_Node_Cur_Length(ret); /* MJD */
830247a4 4382 nextchar(pRExC_state);
793db0cb
JH
4383 {
4384 /* len is STRLEN which is unsigned, need to copy to signed */
4385 IV iv = len;
4386 if (iv < 0)
4387 vFAIL("Internal disaster");
4388 }
a0d0e21e
LW
4389 if (len > 0)
4390 *flagp |= HASWIDTH;
090f7165 4391 if (len == 1 && UNI_IS_INVARIANT(ender))
a0d0e21e 4392 *flagp |= SIMPLE;
c277df42 4393 if (!SIZE_ONLY)
cd439c50
IZ
4394 STR_LEN(ret) = len;
4395 if (SIZE_ONLY)
830247a4 4396 RExC_size += STR_SZ(len);
cd439c50 4397 else
830247a4 4398 RExC_emit += STR_SZ(len);
a687059c 4399 }
a0d0e21e
LW
4400 break;
4401 }
a687059c 4402
60a8b682
JH
4403 /* If the encoding pragma is in effect recode the text of
4404 * any EXACT-kind nodes. */
22c54be3 4405 if (PL_encoding && PL_regkind[(U8)OP(ret)] == EXACT) {
d0063567
DK
4406 STRLEN oldlen = STR_LEN(ret);
4407 SV *sv = sv_2mortal(newSVpvn(STRING(ret), oldlen));
4408
4409 if (RExC_utf8)
4410 SvUTF8_on(sv);
4411 if (sv_utf8_downgrade(sv, TRUE)) {
1df70142
AL
4412 const char * const s = sv_recode_to_utf8(sv, PL_encoding);
4413 const STRLEN newlen = SvCUR(sv);
d0063567
DK
4414
4415 if (SvUTF8(sv))
4416 RExC_utf8 = 1;
4417 if (!SIZE_ONLY) {
a3621e74
YO
4418 GET_RE_DEBUG_FLAGS_DECL;
4419 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "recode %*s to %*s\n",
d0063567
DK
4420 (int)oldlen, STRING(ret),
4421 (int)newlen, s));
4422 Copy(s, STRING(ret), newlen, char);
4423 STR_LEN(ret) += newlen - oldlen;
4424 RExC_emit += STR_SZ(newlen) - STR_SZ(oldlen);
4425 } else
4426 RExC_size += STR_SZ(newlen) - STR_SZ(oldlen);
4427 }
a72c7584
JH
4428 }
4429
a0d0e21e 4430 return(ret);
a687059c
LW
4431}
4432
873ef191 4433STATIC char *
504618e9 4434S_regwhite(pTHX_ char *p, const char *e)
5b5a24f7
CS
4435{
4436 while (p < e) {
4437 if (isSPACE(*p))
4438 ++p;
4439 else if (*p == '#') {
4440 do {
4441 p++;
4442 } while (p < e && *p != '\n');
4443 }
4444 else
4445 break;
4446 }
4447 return p;
4448}
4449
b8c5462f
JH
4450/* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
4451 Character classes ([:foo:]) can also be negated ([:^foo:]).
4452 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
4453 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
beeb77fc 4454 but trigger failures because they are currently unimplemented. */
9a86a77b
JH
4455
4456#define POSIXCC_DONE(c) ((c) == ':')
4457#define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
4458#define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
4459
b8c5462f 4460STATIC I32
830247a4 4461S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
620e46c5 4462{
936ed897 4463 I32 namedclass = OOB_NAMEDCLASS;
620e46c5 4464
830247a4 4465 if (value == '[' && RExC_parse + 1 < RExC_end &&
620e46c5 4466 /* I smell either [: or [= or [. -- POSIX has been here, right? */
9a86a77b 4467 POSIXCC(UCHARAT(RExC_parse))) {
1df70142 4468 const char c = UCHARAT(RExC_parse);
830247a4 4469 char* s = RExC_parse++;
b81d288d 4470
9a86a77b 4471 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
830247a4
IZ
4472 RExC_parse++;
4473 if (RExC_parse == RExC_end)
620e46c5 4474 /* Grandfather lone [:, [=, [. */
830247a4 4475 RExC_parse = s;
620e46c5 4476 else {
1df70142 4477 const char* t = RExC_parse++; /* skip over the c */
a28509cc 4478 const char *posixcc;
b8c5462f 4479
80916619
NC
4480 assert(*t == c);
4481
9a86a77b 4482 if (UCHARAT(RExC_parse) == ']') {
830247a4 4483 RExC_parse++; /* skip over the ending ] */
b8c5462f
JH
4484 posixcc = s + 1;
4485 if (*s == ':') {
1df70142
AL
4486 const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
4487 const I32 skip = t - posixcc;
80916619
NC
4488
4489 /* Initially switch on the length of the name. */
4490 switch (skip) {
4491 case 4:
4492 if (memEQ(posixcc, "word", 4)) {
4493 /* this is not POSIX, this is the Perl \w */;
4494 namedclass
4495 = complement ? ANYOF_NALNUM : ANYOF_ALNUM;
4496 }
cc4319de 4497 break;
80916619
NC
4498 case 5:
4499 /* Names all of length 5. */
4500 /* alnum alpha ascii blank cntrl digit graph lower
4501 print punct space upper */
4502 /* Offset 4 gives the best switch position. */
4503 switch (posixcc[4]) {
4504 case 'a':
4505 if (memEQ(posixcc, "alph", 4)) {
4506 /* a */
4507 namedclass
4508 = complement ? ANYOF_NALPHA : ANYOF_ALPHA;
4509 }
4510 break;
4511 case 'e':
4512 if (memEQ(posixcc, "spac", 4)) {
4513 /* e */
4514 namedclass
4515 = complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
4516 }
4517 break;
4518 case 'h':
4519 if (memEQ(posixcc, "grap", 4)) {
4520 /* h */
4521 namedclass
4522 = complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
4523 }
4524 break;
4525 case 'i':
4526 if (memEQ(posixcc, "asci", 4)) {
4527 /* i */
4528 namedclass
4529 = complement ? ANYOF_NASCII : ANYOF_ASCII;
4530 }
4531 break;
4532 case 'k':
4533 if (memEQ(posixcc, "blan", 4)) {
4534 /* k */
4535 namedclass
4536 = complement ? ANYOF_NBLANK : ANYOF_BLANK;
4537 }
4538 break;
4539 case 'l':
4540 if (memEQ(posixcc, "cntr", 4)) {
4541 /* l */
4542 namedclass
4543 = complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
4544 }
4545 break;
4546 case 'm':
4547 if (memEQ(posixcc, "alnu", 4)) {
4548 /* m */
4549 namedclass
4550 = complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
4551 }
4552 break;
4553 case 'r':
4554 if (memEQ(posixcc, "lowe", 4)) {
4555 /* r */
4556 namedclass
4557 = complement ? ANYOF_NLOWER : ANYOF_LOWER;
4558 }
4559 if (memEQ(posixcc, "uppe", 4)) {
8fdec511 4560 /* r */
80916619
NC
4561 namedclass
4562 = complement ? ANYOF_NUPPER : ANYOF_UPPER;
4563 }
4564 break;
4565 case 't':
4566 if (memEQ(posixcc, "digi", 4)) {
4567 /* t */
4568 namedclass
4569 = complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
4570 }
4571 if (memEQ(posixcc, "prin", 4)) {
8fdec511 4572 /* t */
80916619
NC
4573 namedclass
4574 = complement ? ANYOF_NPRINT : ANYOF_PRINT;
4575 }
4576 if (memEQ(posixcc, "punc", 4)) {
4577 /* t */
4578 namedclass
4579 = complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
4580 }
4581 break;
b8c5462f
JH
4582 }
4583 break;
80916619
NC
4584 case 6:
4585 if (memEQ(posixcc, "xdigit", 6)) {
4586 namedclass
4587 = complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
b8c5462f
JH
4588 }
4589 break;
4590 }
80916619
NC
4591
4592 if (namedclass == OOB_NAMEDCLASS)
b45f050a
JF
4593 {
4594 Simple_vFAIL3("POSIX class [:%.*s:] unknown",
4595 t - s - 1, s + 1);
4596 }
80916619
NC
4597 assert (posixcc[skip] == ':');
4598 assert (posixcc[skip+1] == ']');
b45f050a 4599 } else if (!SIZE_ONLY) {
b8c5462f 4600 /* [[=foo=]] and [[.foo.]] are still future. */
b45f050a 4601
830247a4 4602 /* adjust RExC_parse so the warning shows after
b45f050a 4603 the class closes */
9a86a77b 4604 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
830247a4 4605 RExC_parse++;
b45f050a
JF
4606 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
4607 }
b8c5462f
JH
4608 } else {
4609 /* Maternal grandfather:
4610 * "[:" ending in ":" but not in ":]" */
830247a4 4611 RExC_parse = s;
767d463e 4612 }
620e46c5
JH
4613 }
4614 }
4615
b8c5462f
JH
4616 return namedclass;
4617}
4618
4619STATIC void
830247a4 4620S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
b8c5462f 4621{
b938889d 4622 if (!SIZE_ONLY && POSIXCC(UCHARAT(RExC_parse))) {
1df70142
AL
4623 const char *s = RExC_parse;
4624 const char c = *s++;
b8c5462f
JH
4625
4626 while(*s && isALNUM(*s))
4627 s++;
4628 if (*s && c == *s && s[1] == ']') {
cd84f5b2
RGS
4629 if (ckWARN(WARN_REGEXP))
4630 vWARN3(s+2,
4631 "POSIX syntax [%c %c] belongs inside character classes",
4632 c, c);
b45f050a
JF
4633
4634 /* [[=foo=]] and [[.foo.]] are still future. */
9a86a77b 4635 if (POSIXCC_NOTYET(c)) {
830247a4 4636 /* adjust RExC_parse so the error shows after
b45f050a 4637 the class closes */
9a86a77b 4638 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
b45f050a
JF
4639 ;
4640 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
4641 }
b8c5462f
JH
4642 }
4643 }
620e46c5
JH
4644}
4645
76e3520e 4646STATIC regnode *
830247a4 4647S_regclass(pTHX_ RExC_state_t *pRExC_state)
a687059c 4648{
ffc61ed2 4649 register UV value;
9a86a77b 4650 register UV nextvalue;
3568d838 4651 register IV prevvalue = OOB_UNICODE;
ffc61ed2 4652 register IV range = 0;
c277df42 4653 register regnode *ret;
ba210ebe 4654 STRLEN numlen;
ffc61ed2 4655 IV namedclass;
9c5ffd7c 4656 char *rangebegin = 0;
936ed897 4657 bool need_class = 0;
9c5ffd7c 4658 SV *listsv = Nullsv;
ffc61ed2
JH
4659 register char *e;
4660 UV n;
9e55ce06
JH
4661 bool optimize_invert = TRUE;
4662 AV* unicode_alternate = 0;
1b2d223b
JH
4663#ifdef EBCDIC
4664 UV literal_endpoint = 0;
4665#endif
ffc61ed2
JH
4666
4667 ret = reganode(pRExC_state, ANYOF, 0);
4668
4669 if (!SIZE_ONLY)
4670 ANYOF_FLAGS(ret) = 0;
4671
9a86a77b 4672 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
ffc61ed2
JH
4673 RExC_naughty++;
4674 RExC_parse++;
4675 if (!SIZE_ONLY)
4676 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
4677 }
a0d0e21e 4678
936ed897 4679 if (SIZE_ONLY)
830247a4 4680 RExC_size += ANYOF_SKIP;
936ed897 4681 else {
830247a4 4682 RExC_emit += ANYOF_SKIP;
936ed897
IZ
4683 if (FOLD)
4684 ANYOF_FLAGS(ret) |= ANYOF_FOLD;
4685 if (LOC)
4686 ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
ffc61ed2
JH
4687 ANYOF_BITMAP_ZERO(ret);
4688 listsv = newSVpvn("# comment\n", 10);
a0d0e21e 4689 }
b8c5462f 4690
9a86a77b
JH
4691 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
4692
b938889d 4693 if (!SIZE_ONLY && POSIXCC(nextvalue))
830247a4 4694 checkposixcc(pRExC_state);
b8c5462f 4695
f064b6ad
HS
4696 /* allow 1st char to be ] (allowing it to be - is dealt with later) */
4697 if (UCHARAT(RExC_parse) == ']')
4698 goto charclassloop;
ffc61ed2 4699
9a86a77b 4700 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
ffc61ed2
JH
4701
4702 charclassloop:
4703
4704 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
4705
73b437c8 4706 if (!range)
830247a4 4707 rangebegin = RExC_parse;
ffc61ed2 4708 if (UTF) {
5e12f4fb 4709 value = utf8n_to_uvchr((U8*)RExC_parse,
3568d838
JH
4710 RExC_end - RExC_parse,
4711 &numlen, 0);
ffc61ed2
JH
4712 RExC_parse += numlen;
4713 }
4714 else
4715 value = UCHARAT(RExC_parse++);
9a86a77b
JH
4716 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
4717 if (value == '[' && POSIXCC(nextvalue))
830247a4 4718 namedclass = regpposixcc(pRExC_state, value);
620e46c5 4719 else if (value == '\\') {
ffc61ed2 4720 if (UTF) {
5e12f4fb 4721 value = utf8n_to_uvchr((U8*)RExC_parse,
ffc61ed2
JH
4722 RExC_end - RExC_parse,
4723 &numlen, 0);
4724 RExC_parse += numlen;
4725 }
4726 else
4727 value = UCHARAT(RExC_parse++);
470c3474 4728 /* Some compilers cannot handle switching on 64-bit integer
ffc61ed2 4729 * values, therefore value cannot be an UV. Yes, this will
e2962f66
JH
4730 * be a problem later if we want switch on Unicode.
4731 * A similar issue a little bit later when switching on
4732 * namedclass. --jhi */
ffc61ed2 4733 switch ((I32)value) {
b8c5462f
JH
4734 case 'w': namedclass = ANYOF_ALNUM; break;
4735 case 'W': namedclass = ANYOF_NALNUM; break;
4736 case 's': namedclass = ANYOF_SPACE; break;
4737 case 'S': namedclass = ANYOF_NSPACE; break;
4738 case 'd': namedclass = ANYOF_DIGIT; break;
4739 case 'D': namedclass = ANYOF_NDIGIT; break;
ffc61ed2
JH
4740 case 'p':
4741 case 'P':
af6f566e 4742 if (RExC_parse >= RExC_end)
2a4859cd 4743 vFAIL2("Empty \\%c{}", (U8)value);
ffc61ed2 4744 if (*RExC_parse == '{') {
1df70142 4745 const U8 c = (U8)value;
ffc61ed2
JH
4746 e = strchr(RExC_parse++, '}');
4747 if (!e)
0da60cf5 4748 vFAIL2("Missing right brace on \\%c{}", c);
ab13f0c7
JH
4749 while (isSPACE(UCHARAT(RExC_parse)))
4750 RExC_parse++;
4751 if (e == RExC_parse)
0da60cf5 4752 vFAIL2("Empty \\%c{}", c);
ffc61ed2 4753 n = e - RExC_parse;
ab13f0c7
JH
4754 while (isSPACE(UCHARAT(RExC_parse + n - 1)))
4755 n--;
ffc61ed2
JH
4756 }
4757 else {
4758 e = RExC_parse;
4759 n = 1;
4760 }
4761 if (!SIZE_ONLY) {
ab13f0c7
JH
4762 if (UCHARAT(RExC_parse) == '^') {
4763 RExC_parse++;
4764 n--;
4765 value = value == 'p' ? 'P' : 'p'; /* toggle */
4766 while (isSPACE(UCHARAT(RExC_parse))) {
4767 RExC_parse++;
4768 n--;
4769 }
4770 }
ffc61ed2 4771 if (value == 'p')
ab13f0c7
JH
4772 Perl_sv_catpvf(aTHX_ listsv,
4773 "+utf8::%.*s\n", (int)n, RExC_parse);
ffc61ed2 4774 else
ab13f0c7
JH
4775 Perl_sv_catpvf(aTHX_ listsv,
4776 "!utf8::%.*s\n", (int)n, RExC_parse);
ffc61ed2
JH
4777 }
4778 RExC_parse = e + 1;
4779 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
f81125e2
JP
4780 namedclass = ANYOF_MAX; /* no official name, but it's named */
4781 break;
b8c5462f
JH
4782 case 'n': value = '\n'; break;
4783 case 'r': value = '\r'; break;
4784 case 't': value = '\t'; break;
4785 case 'f': value = '\f'; break;
4786 case 'b': value = '\b'; break;
c7f1f016
NIS
4787 case 'e': value = ASCII_TO_NATIVE('\033');break;
4788 case 'a': value = ASCII_TO_NATIVE('\007');break;
b8c5462f 4789 case 'x':
ffc61ed2 4790 if (*RExC_parse == '{') {
a4c04bdc
NC
4791 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
4792 | PERL_SCAN_DISALLOW_PREFIX;
ffc61ed2 4793 e = strchr(RExC_parse++, '}');
b81d288d 4794 if (!e)
ffc61ed2 4795 vFAIL("Missing right brace on \\x{}");
53305cf1
NC
4796
4797 numlen = e - RExC_parse;
4798 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
ffc61ed2
JH
4799 RExC_parse = e + 1;
4800 }
4801 else {
a4c04bdc 4802 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
53305cf1
NC
4803 numlen = 2;
4804 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
ffc61ed2
JH
4805 RExC_parse += numlen;
4806 }
b8c5462f
JH
4807 break;
4808 case 'c':
830247a4 4809 value = UCHARAT(RExC_parse++);
b8c5462f
JH
4810 value = toCTRL(value);
4811 break;
4812 case '0': case '1': case '2': case '3': case '4':
4813 case '5': case '6': case '7': case '8': case '9':
53305cf1
NC
4814 {
4815 I32 flags = 0;
4816 numlen = 3;
4817 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
830247a4 4818 RExC_parse += numlen;
b8c5462f 4819 break;
53305cf1 4820 }
1028017a 4821 default:
041457d9 4822 if (!SIZE_ONLY && isALPHA(value) && ckWARN(WARN_REGEXP))
ffc61ed2
JH
4823 vWARN2(RExC_parse,
4824 "Unrecognized escape \\%c in character class passed through",
4825 (int)value);
1028017a 4826 break;
b8c5462f 4827 }
ffc61ed2 4828 } /* end of \blah */
1b2d223b
JH
4829#ifdef EBCDIC
4830 else
4831 literal_endpoint++;
4832#endif
ffc61ed2
JH
4833
4834 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
4835
4836 if (!SIZE_ONLY && !need_class)
936ed897 4837 ANYOF_CLASS_ZERO(ret);
ffc61ed2 4838
936ed897 4839 need_class = 1;
ffc61ed2
JH
4840
4841 /* a bad range like a-\d, a-[:digit:] ? */
4842 if (range) {
73b437c8 4843 if (!SIZE_ONLY) {
e476b1b5 4844 if (ckWARN(WARN_REGEXP))
830247a4 4845 vWARN4(RExC_parse,
b45f050a 4846 "False [] range \"%*.*s\"",
830247a4
IZ
4847 RExC_parse - rangebegin,
4848 RExC_parse - rangebegin,
b45f050a 4849 rangebegin);
3568d838
JH
4850 if (prevvalue < 256) {
4851 ANYOF_BITMAP_SET(ret, prevvalue);
ffc61ed2
JH
4852 ANYOF_BITMAP_SET(ret, '-');
4853 }
4854 else {
4855 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
4856 Perl_sv_catpvf(aTHX_ listsv,
3568d838 4857 "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
ffc61ed2 4858 }
b8c5462f 4859 }
ffc61ed2
JH
4860
4861 range = 0; /* this was not a true range */
73b437c8 4862 }
ffc61ed2 4863
73b437c8 4864 if (!SIZE_ONLY) {
c49a72a9
NC
4865 const char *what = NULL;
4866 char yesno = 0;
4867
3568d838
JH
4868 if (namedclass > OOB_NAMEDCLASS)
4869 optimize_invert = FALSE;
e2962f66
JH
4870 /* Possible truncation here but in some 64-bit environments
4871 * the compiler gets heartburn about switch on 64-bit values.
4872 * A similar issue a little earlier when switching on value.
98f323fa 4873 * --jhi */
e2962f66 4874 switch ((I32)namedclass) {
73b437c8
JH
4875 case ANYOF_ALNUM:
4876 if (LOC)
936ed897 4877 ANYOF_CLASS_SET(ret, ANYOF_ALNUM);
73b437c8
JH
4878 else {
4879 for (value = 0; value < 256; value++)
4880 if (isALNUM(value))
936ed897 4881 ANYOF_BITMAP_SET(ret, value);
73b437c8 4882 }
c49a72a9
NC
4883 yesno = '+';
4884 what = "Word";
73b437c8
JH
4885 break;
4886 case ANYOF_NALNUM:
4887 if (LOC)
936ed897 4888 ANYOF_CLASS_SET(ret, ANYOF_NALNUM);
73b437c8
JH
4889 else {
4890 for (value = 0; value < 256; value++)
4891 if (!isALNUM(value))
936ed897 4892 ANYOF_BITMAP_SET(ret, value);
73b437c8 4893 }
c49a72a9
NC
4894 yesno = '!';
4895 what = "Word";
73b437c8 4896 break;
ffc61ed2 4897 case ANYOF_ALNUMC:
73b437c8 4898 if (LOC)
ffc61ed2 4899 ANYOF_CLASS_SET(ret, ANYOF_ALNUMC);
73b437c8
JH
4900 else {
4901 for (value = 0; value < 256; value++)
ffc61ed2 4902 if (isALNUMC(value))
936ed897 4903 ANYOF_BITMAP_SET(ret, value);
73b437c8 4904 }
c49a72a9
NC
4905 yesno = '+';
4906 what = "Alnum";
73b437c8
JH
4907 break;
4908 case ANYOF_NALNUMC:
4909 if (LOC)
936ed897 4910 ANYOF_CLASS_SET(ret, ANYOF_NALNUMC);
73b437c8
JH
4911 else {
4912 for (value = 0; value < 256; value++)
4913 if (!isALNUMC(value))
936ed897 4914 ANYOF_BITMAP_SET(ret, value);
73b437c8 4915 }
c49a72a9
NC
4916 yesno = '!';
4917 what = "Alnum";
73b437c8
JH
4918 break;
4919 case ANYOF_ALPHA:
4920 if (LOC)
936ed897 4921 ANYOF_CLASS_SET(ret, ANYOF_ALPHA);
73b437c8
JH
4922 else {
4923 for (value = 0; value < 256; value++)
4924 if (isALPHA(value))
936ed897 4925 ANYOF_BITMAP_SET(ret, value);
73b437c8 4926 }
c49a72a9
NC
4927 yesno = '+';
4928 what = "Alpha";
73b437c8
JH
4929 break;
4930 case ANYOF_NALPHA:
4931 if (LOC)
936ed897 4932 ANYOF_CLASS_SET(ret, ANYOF_NALPHA);
73b437c8
JH
4933 else {
4934 for (value = 0; value < 256; value++)
4935 if (!isALPHA(value))
936ed897 4936 ANYOF_BITMAP_SET(ret, value);
73b437c8 4937 }
c49a72a9
NC
4938 yesno = '!';
4939 what = "Alpha";
73b437c8
JH
4940 break;
4941 case ANYOF_ASCII:
4942 if (LOC)
936ed897 4943 ANYOF_CLASS_SET(ret, ANYOF_ASCII);
73b437c8 4944 else {
c7f1f016 4945#ifndef EBCDIC
1ba5c669
JH
4946 for (value = 0; value < 128; value++)
4947 ANYOF_BITMAP_SET(ret, value);
4948#else /* EBCDIC */
ffbc6a93 4949 for (value = 0; value < 256; value++) {
3a3c4447
JH
4950 if (isASCII(value))
4951 ANYOF_BITMAP_SET(ret, value);
ffbc6a93 4952 }
1ba5c669 4953#endif /* EBCDIC */
73b437c8 4954 }
c49a72a9
NC
4955 yesno = '+';
4956 what = "ASCII";
73b437c8
JH
4957 break;
4958 case ANYOF_NASCII:
4959 if (LOC)
936ed897 4960 ANYOF_CLASS_SET(ret, ANYOF_NASCII);
73b437c8 4961 else {
c7f1f016 4962#ifndef EBCDIC
1ba5c669
JH
4963 for (value = 128; value < 256; value++)
4964 ANYOF_BITMAP_SET(ret, value);
4965#else /* EBCDIC */
ffbc6a93 4966 for (value = 0; value < 256; value++) {
3a3c4447
JH
4967 if (!isASCII(value))
4968 ANYOF_BITMAP_SET(ret, value);
ffbc6a93 4969 }
1ba5c669 4970#endif /* EBCDIC */
73b437c8 4971 }
c49a72a9
NC
4972 yesno = '!';
4973 what = "ASCII";
73b437c8 4974 break;
aaa51d5e
JF
4975 case ANYOF_BLANK:
4976 if (LOC)
4977 ANYOF_CLASS_SET(ret, ANYOF_BLANK);
4978 else {
4979 for (value = 0; value < 256; value++)
4980 if (isBLANK(value))
4981 ANYOF_BITMAP_SET(ret, value);
4982 }
c49a72a9
NC
4983 yesno = '+';
4984 what = "Blank";
aaa51d5e
JF
4985 break;
4986 case ANYOF_NBLANK:
4987 if (LOC)
4988 ANYOF_CLASS_SET(ret, ANYOF_NBLANK);
4989 else {
4990 for (value = 0; value < 256; value++)
4991 if (!isBLANK(value))
4992 ANYOF_BITMAP_SET(ret, value);
4993 }
c49a72a9
NC
4994 yesno = '!';
4995 what = "Blank";
aaa51d5e 4996 break;
73b437c8
JH
4997 case ANYOF_CNTRL:
4998 if (LOC)
936ed897 4999 ANYOF_CLASS_SET(ret, ANYOF_CNTRL);
73b437c8
JH
5000 else {
5001 for (value = 0; value < 256; value++)
5002 if (isCNTRL(value))
936ed897 5003 ANYOF_BITMAP_SET(ret, value);
73b437c8 5004 }
c49a72a9
NC
5005 yesno = '+';
5006 what = "Cntrl";
73b437c8
JH
5007 break;
5008 case ANYOF_NCNTRL:
5009 if (LOC)
936ed897 5010 ANYOF_CLASS_SET(ret, ANYOF_NCNTRL);
73b437c8
JH
5011 else {
5012 for (value = 0; value < 256; value++)
5013 if (!isCNTRL(value))
936ed897 5014 ANYOF_BITMAP_SET(ret, value);
73b437c8 5015 }
c49a72a9
NC
5016 yesno = '!';
5017 what = "Cntrl";
ffc61ed2
JH
5018 break;
5019 case ANYOF_DIGIT:
5020 if (LOC)
5021 ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
5022 else {
5023 /* consecutive digits assumed */
5024 for (value = '0'; value <= '9'; value++)
5025 ANYOF_BITMAP_SET(ret, value);
5026 }
c49a72a9
NC
5027 yesno = '+';
5028 what = "Digit";
ffc61ed2
JH
5029 break;
5030 case ANYOF_NDIGIT:
5031 if (LOC)
5032 ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
5033 else {
5034 /* consecutive digits assumed */
5035 for (value = 0; value < '0'; value++)
5036 ANYOF_BITMAP_SET(ret, value);
5037 for (value = '9' + 1; value < 256; value++)
5038 ANYOF_BITMAP_SET(ret, value);
5039 }
c49a72a9
NC
5040 yesno = '!';
5041 what = "Digit";
73b437c8
JH
5042 break;
5043 case ANYOF_GRAPH:
5044 if (LOC)
936ed897 5045 ANYOF_CLASS_SET(ret, ANYOF_GRAPH);
73b437c8
JH
5046 else {
5047 for (value = 0; value < 256; value++)
5048 if (isGRAPH(value))
936ed897 5049 ANYOF_BITMAP_SET(ret, value);
73b437c8 5050 }
c49a72a9
NC
5051 yesno = '+';
5052 what = "Graph";
73b437c8
JH
5053 break;
5054 case ANYOF_NGRAPH:
5055 if (LOC)
936ed897 5056 ANYOF_CLASS_SET(ret, ANYOF_NGRAPH);
73b437c8
JH
5057 else {
5058 for (value = 0; value < 256; value++)
5059 if (!isGRAPH(value))
936ed897 5060 ANYOF_BITMAP_SET(ret, value);
73b437c8 5061 }
c49a72a9
NC
5062 yesno = '!';
5063 what = "Graph";
73b437c8
JH
5064 break;
5065 case ANYOF_LOWER:
5066 if (LOC)
936ed897 5067 ANYOF_CLASS_SET(ret, ANYOF_LOWER);
73b437c8
JH
5068 else {
5069 for (value = 0; value < 256; value++)
5070 if (isLOWER(value))
936ed897 5071 ANYOF_BITMAP_SET(ret, value);
73b437c8 5072 }
c49a72a9
NC
5073 yesno = '+';
5074 what = "Lower";
73b437c8
JH
5075 break;
5076 case ANYOF_NLOWER:
5077 if (LOC)
936ed897 5078 ANYOF_CLASS_SET(ret, ANYOF_NLOWER);
73b437c8
JH
5079 else {
5080 for (value = 0; value < 256; value++)
5081 if (!isLOWER(value))
936ed897 5082 ANYOF_BITMAP_SET(ret, value);
73b437c8 5083 }
c49a72a9
NC
5084 yesno = '!';
5085 what = "Lower";
73b437c8
JH
5086 break;
5087 case ANYOF_PRINT:
5088 if (LOC)
936ed897 5089 ANYOF_CLASS_SET(ret, ANYOF_PRINT);
73b437c8
JH
5090 else {
5091 for (value = 0; value < 256; value++)
5092 if (isPRINT(value))
936ed897 5093 ANYOF_BITMAP_SET(ret, value);
73b437c8 5094 }
c49a72a9
NC
5095 yesno = '+';
5096 what = "Print";
73b437c8
JH
5097 break;
5098 case ANYOF_NPRINT:
5099 if (LOC)
936ed897 5100 ANYOF_CLASS_SET(ret, ANYOF_NPRINT);
73b437c8
JH
5101 else {
5102 for (value = 0; value < 256; value++)
5103 if (!isPRINT(value))
936ed897 5104 ANYOF_BITMAP_SET(ret, value);
73b437c8 5105 }
c49a72a9
NC
5106 yesno = '!';
5107 what = "Print";
73b437c8 5108 break;
aaa51d5e
JF
5109 case ANYOF_PSXSPC:
5110 if (LOC)
5111 ANYOF_CLASS_SET(ret, ANYOF_PSXSPC);
5112 else {
5113 for (value = 0; value < 256; value++)
5114 if (isPSXSPC(value))
5115 ANYOF_BITMAP_SET(ret, value);
5116 }
c49a72a9
NC
5117 yesno = '+';
5118 what = "Space";
aaa51d5e
JF
5119 break;
5120 case ANYOF_NPSXSPC:
5121 if (LOC)
5122 ANYOF_CLASS_SET(ret, ANYOF_NPSXSPC);
5123 else {
5124 for (value = 0; value < 256; value++)
5125 if (!isPSXSPC(value))
5126 ANYOF_BITMAP_SET(ret, value);
5127 }
c49a72a9
NC
5128 yesno = '!';
5129 what = "Space";
aaa51d5e 5130 break;
73b437c8
JH
5131 case ANYOF_PUNCT:
5132 if (LOC)
936ed897 5133 ANYOF_CLASS_SET(ret, ANYOF_PUNCT);
73b437c8
JH
5134 else {
5135 for (value = 0; value < 256; value++)
5136 if (isPUNCT(value))
936ed897 5137 ANYOF_BITMAP_SET(ret, value);
73b437c8 5138 }
c49a72a9
NC
5139 yesno = '+';
5140 what = "Punct";
73b437c8
JH
5141 break;
5142 case ANYOF_NPUNCT:
5143 if (LOC)
936ed897 5144 ANYOF_CLASS_SET(ret, ANYOF_NPUNCT);
73b437c8
JH
5145 else {
5146 for (value = 0; value < 256; value++)
5147 if (!isPUNCT(value))
936ed897 5148 ANYOF_BITMAP_SET(ret, value);
73b437c8 5149 }
c49a72a9
NC
5150 yesno = '!';
5151 what = "Punct";
ffc61ed2
JH
5152 break;
5153 case ANYOF_SPACE:
5154 if (LOC)
5155 ANYOF_CLASS_SET(ret, ANYOF_SPACE);
5156 else {
5157 for (value = 0; value < 256; value++)
5158 if (isSPACE(value))
5159 ANYOF_BITMAP_SET(ret, value);
5160 }
c49a72a9
NC
5161 yesno = '+';
5162 what = "SpacePerl";
ffc61ed2
JH
5163 break;
5164 case ANYOF_NSPACE:
5165 if (LOC)
5166 ANYOF_CLASS_SET(ret, ANYOF_NSPACE);
5167 else {
5168 for (value = 0; value < 256; value++)
5169 if (!isSPACE(value))
5170 ANYOF_BITMAP_SET(ret, value);
5171 }
c49a72a9
NC
5172 yesno = '!';
5173 what = "SpacePerl";
73b437c8
JH
5174 break;
5175 case ANYOF_UPPER:
5176 if (LOC)
936ed897 5177 ANYOF_CLASS_SET(ret, ANYOF_UPPER);
73b437c8
JH
5178 else {
5179 for (value = 0; value < 256; value++)
5180 if (isUPPER(value))
936ed897 5181 ANYOF_BITMAP_SET(ret, value);
73b437c8 5182 }
c49a72a9
NC
5183 yesno = '+';
5184 what = "Upper";
73b437c8
JH
5185 break;
5186 case ANYOF_NUPPER:
5187 if (LOC)
936ed897 5188 ANYOF_CLASS_SET(ret, ANYOF_NUPPER);
73b437c8
JH
5189 else {
5190 for (value = 0; value < 256; value++)
5191 if (!isUPPER(value))
936ed897 5192 ANYOF_BITMAP_SET(ret, value);
73b437c8 5193 }
c49a72a9
NC
5194 yesno = '!';
5195 what = "Upper";
73b437c8
JH
5196 break;
5197 case ANYOF_XDIGIT:
5198 if (LOC)
936ed897 5199 ANYOF_CLASS_SET(ret, ANYOF_XDIGIT);
73b437c8
JH
5200 else {
5201 for (value = 0; value < 256; value++)
5202 if (isXDIGIT(value))
936ed897 5203 ANYOF_BITMAP_SET(ret, value);
73b437c8 5204 }
c49a72a9
NC
5205 yesno = '+';
5206 what = "XDigit";
73b437c8
JH
5207 break;
5208 case ANYOF_NXDIGIT:
5209 if (LOC)
936ed897 5210 ANYOF_CLASS_SET(ret, ANYOF_NXDIGIT);
73b437c8
JH
5211 else {
5212 for (value = 0; value < 256; value++)
5213 if (!isXDIGIT(value))
936ed897 5214 ANYOF_BITMAP_SET(ret, value);
73b437c8 5215 }
c49a72a9
NC
5216 yesno = '!';
5217 what = "XDigit";
73b437c8 5218 break;
f81125e2
JP
5219 case ANYOF_MAX:
5220 /* this is to handle \p and \P */
5221 break;
73b437c8 5222 default:
b45f050a 5223 vFAIL("Invalid [::] class");
73b437c8 5224 break;
b8c5462f 5225 }
c49a72a9
NC
5226 if (what) {
5227 /* Strings such as "+utf8::isWord\n" */
5228 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::Is%s\n", yesno, what);
5229 }
b8c5462f 5230 if (LOC)
936ed897 5231 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
73b437c8 5232 continue;
a0d0e21e 5233 }
ffc61ed2
JH
5234 } /* end of namedclass \blah */
5235
a0d0e21e 5236 if (range) {
eb160463 5237 if (prevvalue > (IV)value) /* b-a */ {
b45f050a 5238 Simple_vFAIL4("Invalid [] range \"%*.*s\"",
830247a4
IZ
5239 RExC_parse - rangebegin,
5240 RExC_parse - rangebegin,
b45f050a 5241 rangebegin);
3568d838 5242 range = 0; /* not a valid range */
73b437c8 5243 }
a0d0e21e
LW
5244 }
5245 else {
3568d838 5246 prevvalue = value; /* save the beginning of the range */
830247a4
IZ
5247 if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
5248 RExC_parse[1] != ']') {
5249 RExC_parse++;
ffc61ed2
JH
5250
5251 /* a bad range like \w-, [:word:]- ? */
5252 if (namedclass > OOB_NAMEDCLASS) {
e476b1b5 5253 if (ckWARN(WARN_REGEXP))
830247a4 5254 vWARN4(RExC_parse,
b45f050a 5255 "False [] range \"%*.*s\"",
830247a4
IZ
5256 RExC_parse - rangebegin,
5257 RExC_parse - rangebegin,
b45f050a 5258 rangebegin);
73b437c8 5259 if (!SIZE_ONLY)
936ed897 5260 ANYOF_BITMAP_SET(ret, '-');
73b437c8 5261 } else
ffc61ed2
JH
5262 range = 1; /* yeah, it's a range! */
5263 continue; /* but do it the next time */
a0d0e21e 5264 }
a687059c 5265 }
ffc61ed2 5266
93733859 5267 /* now is the next time */
ae5c130c 5268 if (!SIZE_ONLY) {
3568d838
JH
5269 IV i;
5270
5271 if (prevvalue < 256) {
1df70142 5272 const IV ceilvalue = value < 256 ? value : 255;
3568d838
JH
5273
5274#ifdef EBCDIC
1b2d223b
JH
5275 /* In EBCDIC [\x89-\x91] should include
5276 * the \x8e but [i-j] should not. */
5277 if (literal_endpoint == 2 &&
5278 ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
5279 (isUPPER(prevvalue) && isUPPER(ceilvalue))))
ffc61ed2 5280 {
3568d838
JH
5281 if (isLOWER(prevvalue)) {
5282 for (i = prevvalue; i <= ceilvalue; i++)
ffc61ed2
JH
5283 if (isLOWER(i))
5284 ANYOF_BITMAP_SET(ret, i);
5285 } else {
3568d838 5286 for (i = prevvalue; i <= ceilvalue; i++)
ffc61ed2
JH
5287 if (isUPPER(i))
5288 ANYOF_BITMAP_SET(ret, i);
5289 }
8ada0baa 5290 }
ffc61ed2 5291 else
8ada0baa 5292#endif
a5961de5
JH
5293 for (i = prevvalue; i <= ceilvalue; i++)
5294 ANYOF_BITMAP_SET(ret, i);
3568d838 5295 }
a5961de5 5296 if (value > 255 || UTF) {
1df70142
AL
5297 const UV prevnatvalue = NATIVE_TO_UNI(prevvalue);
5298 const UV natvalue = NATIVE_TO_UNI(value);
b08decb7 5299
ffc61ed2 5300 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
b08decb7 5301 if (prevnatvalue < natvalue) { /* what about > ? */
ffc61ed2 5302 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
b08decb7
JH
5303 prevnatvalue, natvalue);
5304 }
5305 else if (prevnatvalue == natvalue) {
5306 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue);
09091399 5307 if (FOLD) {
89ebb4a3 5308 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
254ba52a 5309 STRLEN foldlen;
1df70142 5310 const UV f = to_uni_fold(natvalue, foldbuf, &foldlen);
254ba52a 5311
c840d2a2
JH
5312 /* If folding and foldable and a single
5313 * character, insert also the folded version
5314 * to the charclass. */
9e55ce06 5315 if (f != value) {
eb160463 5316 if (foldlen == (STRLEN)UNISKIP(f))
9e55ce06
JH
5317 Perl_sv_catpvf(aTHX_ listsv,
5318 "%04"UVxf"\n", f);
5319 else {
5320 /* Any multicharacter foldings
5321 * require the following transform:
5322 * [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst)
5323 * where E folds into "pq" and F folds
5324 * into "rst", all other characters
5325 * fold to single characters. We save
5326 * away these multicharacter foldings,
5327 * to be later saved as part of the
5328 * additional "s" data. */
5329 SV *sv;
5330
5331 if (!unicode_alternate)
5332 unicode_alternate = newAV();
5333 sv = newSVpvn((char*)foldbuf, foldlen);
5334 SvUTF8_on(sv);
5335 av_push(unicode_alternate, sv);
5336 }
5337 }
254ba52a 5338
60a8b682
JH
5339 /* If folding and the value is one of the Greek
5340 * sigmas insert a few more sigmas to make the
5341 * folding rules of the sigmas to work right.
5342 * Note that not all the possible combinations
5343 * are handled here: some of them are handled
9e55ce06
JH
5344 * by the standard folding rules, and some of
5345 * them (literal or EXACTF cases) are handled
5346 * during runtime in regexec.c:S_find_byclass(). */
09091399
JH
5347 if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) {
5348 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 5349 (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA);
09091399 5350 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 5351 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
09091399
JH
5352 }
5353 else if (value == UNICODE_GREEK_CAPITAL_LETTER_SIGMA)
5354 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 5355 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
09091399
JH
5356 }
5357 }
ffc61ed2 5358 }
1b2d223b
JH
5359#ifdef EBCDIC
5360 literal_endpoint = 0;
5361#endif
8ada0baa 5362 }
ffc61ed2
JH
5363
5364 range = 0; /* this range (if it was one) is done now */
a0d0e21e 5365 }
ffc61ed2 5366
936ed897 5367 if (need_class) {
4f66b38d 5368 ANYOF_FLAGS(ret) |= ANYOF_LARGE;
936ed897 5369 if (SIZE_ONLY)
830247a4 5370 RExC_size += ANYOF_CLASS_ADD_SKIP;
936ed897 5371 else
830247a4 5372 RExC_emit += ANYOF_CLASS_ADD_SKIP;
936ed897 5373 }
ffc61ed2 5374
ae5c130c 5375 /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
b8c5462f 5376 if (!SIZE_ONLY &&
ffc61ed2 5377 /* If the only flag is folding (plus possibly inversion). */
516a5887
JH
5378 ((ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD)
5379 ) {
a0ed51b3 5380 for (value = 0; value < 256; ++value) {
936ed897 5381 if (ANYOF_BITMAP_TEST(ret, value)) {
eb160463 5382 UV fold = PL_fold[value];
ffc61ed2
JH
5383
5384 if (fold != value)
5385 ANYOF_BITMAP_SET(ret, fold);
ae5c130c
GS
5386 }
5387 }
936ed897 5388 ANYOF_FLAGS(ret) &= ~ANYOF_FOLD;
ae5c130c 5389 }
ffc61ed2 5390
ae5c130c 5391 /* optimize inverted simple patterns (e.g. [^a-z]) */
3568d838 5392 if (!SIZE_ONLY && optimize_invert &&
ffc61ed2
JH
5393 /* If the only flag is inversion. */
5394 (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
b8c5462f 5395 for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
936ed897 5396 ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL;
1aa99e6b 5397 ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
ae5c130c 5398 }
a0d0e21e 5399
b81d288d 5400 if (!SIZE_ONLY) {
fde631ed 5401 AV *av = newAV();
ffc61ed2
JH
5402 SV *rv;
5403
9e55ce06 5404 /* The 0th element stores the character class description
6a0407ee 5405 * in its textual form: used later (regexec.c:Perl_regclass_swash())
9e55ce06
JH
5406 * to initialize the appropriate swash (which gets stored in
5407 * the 1st element), and also useful for dumping the regnode.
5408 * The 2nd element stores the multicharacter foldings,
6a0407ee 5409 * used later (regexec.c:S_reginclass()). */
ffc61ed2
JH
5410 av_store(av, 0, listsv);
5411 av_store(av, 1, NULL);
9e55ce06 5412 av_store(av, 2, (SV*)unicode_alternate);
ffc61ed2 5413 rv = newRV_noinc((SV*)av);
19860706 5414 n = add_data(pRExC_state, 1, "s");
830247a4 5415 RExC_rx->data->data[n] = (void*)rv;
ffc61ed2 5416 ARG_SET(ret, n);
a0ed51b3
LW
5417 }
5418
5419 return ret;
5420}
5421
76e3520e 5422STATIC char*
830247a4 5423S_nextchar(pTHX_ RExC_state_t *pRExC_state)
a0d0e21e 5424{
830247a4 5425 char* retval = RExC_parse++;
a0d0e21e 5426
4633a7c4 5427 for (;;) {
830247a4
IZ
5428 if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
5429 RExC_parse[2] == '#') {
e994fd66
AE
5430 while (*RExC_parse != ')') {
5431 if (RExC_parse == RExC_end)
5432 FAIL("Sequence (?#... not terminated");
830247a4 5433 RExC_parse++;
e994fd66 5434 }
830247a4 5435 RExC_parse++;
4633a7c4
LW
5436 continue;
5437 }
e2509266 5438 if (RExC_flags & PMf_EXTENDED) {
830247a4
IZ
5439 if (isSPACE(*RExC_parse)) {
5440 RExC_parse++;
748a9306
LW
5441 continue;
5442 }
830247a4 5443 else if (*RExC_parse == '#') {
e994fd66
AE
5444 while (RExC_parse < RExC_end)
5445 if (*RExC_parse++ == '\n') break;
748a9306
LW
5446 continue;
5447 }
748a9306 5448 }
4633a7c4 5449 return retval;
a0d0e21e 5450 }
a687059c
LW
5451}
5452
5453/*
c277df42 5454- reg_node - emit a node
a0d0e21e 5455*/
76e3520e 5456STATIC regnode * /* Location. */
830247a4 5457S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
a687059c 5458{
c277df42 5459 register regnode *ptr;
504618e9 5460 regnode * const ret = RExC_emit;
a687059c 5461
c277df42 5462 if (SIZE_ONLY) {
830247a4
IZ
5463 SIZE_ALIGN(RExC_size);
5464 RExC_size += 1;
a0d0e21e
LW
5465 return(ret);
5466 }
a687059c 5467
c277df42 5468 NODE_ALIGN_FILL(ret);
a0d0e21e 5469 ptr = ret;
c277df42 5470 FILL_ADVANCE_NODE(ptr, op);
fac92740 5471 if (RExC_offsets) { /* MJD */
ccb2c380 5472 MJD_OFFSET_DEBUG(("%s:%u: (op %s) %s %u <- %u (len %u) (max %u).\n",
fac92740
MJD
5473 "reg_node", __LINE__,
5474 reg_name[op],
5475 RExC_emit - RExC_emit_start > RExC_offsets[0]
5476 ? "Overwriting end of array!\n" : "OK",
5477 RExC_emit - RExC_emit_start,
5478 RExC_parse - RExC_start,
5479 RExC_offsets[0]));
ccb2c380 5480 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
fac92740
MJD
5481 }
5482
830247a4 5483 RExC_emit = ptr;
a687059c 5484
a0d0e21e 5485 return(ret);
a687059c
LW
5486}
5487
5488/*
a0d0e21e
LW
5489- reganode - emit a node with an argument
5490*/
76e3520e 5491STATIC regnode * /* Location. */
830247a4 5492S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
fe14fcc3 5493{
c277df42 5494 register regnode *ptr;
504618e9 5495 regnode * const ret = RExC_emit;
fe14fcc3 5496
c277df42 5497 if (SIZE_ONLY) {
830247a4
IZ
5498 SIZE_ALIGN(RExC_size);
5499 RExC_size += 2;
a0d0e21e
LW
5500 return(ret);
5501 }
fe14fcc3 5502
c277df42 5503 NODE_ALIGN_FILL(ret);
a0d0e21e 5504 ptr = ret;
c277df42 5505 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
fac92740 5506 if (RExC_offsets) { /* MJD */
ccb2c380 5507 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %u <- %u (max %u).\n",
fac92740 5508 "reganode",
ccb2c380
MP
5509 __LINE__,
5510 reg_name[op],
fac92740
MJD
5511 RExC_emit - RExC_emit_start > RExC_offsets[0] ?
5512 "Overwriting end of array!\n" : "OK",
5513 RExC_emit - RExC_emit_start,
5514 RExC_parse - RExC_start,
5515 RExC_offsets[0]));
ccb2c380 5516 Set_Cur_Node_Offset;
fac92740
MJD
5517 }
5518
830247a4 5519 RExC_emit = ptr;
fe14fcc3 5520
a0d0e21e 5521 return(ret);
fe14fcc3
LW
5522}
5523
5524/*
cd439c50 5525- reguni - emit (if appropriate) a Unicode character
a0ed51b3
LW
5526*/
5527STATIC void
a28509cc 5528S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s, STRLEN* lenp)
a0ed51b3 5529{
5e12f4fb 5530 *lenp = SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
a0ed51b3
LW
5531}
5532
5533/*
a0d0e21e
LW
5534- reginsert - insert an operator in front of already-emitted operand
5535*
5536* Means relocating the operand.
5537*/
76e3520e 5538STATIC void
830247a4 5539S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd)
a687059c 5540{
c277df42
IZ
5541 register regnode *src;
5542 register regnode *dst;
5543 register regnode *place;
504618e9 5544 const int offset = regarglen[(U8)op];
b81d288d 5545
22c35a8c 5546/* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
c277df42
IZ
5547
5548 if (SIZE_ONLY) {
830247a4 5549 RExC_size += NODE_STEP_REGNODE + offset;
a0d0e21e
LW
5550 return;
5551 }
a687059c 5552
830247a4
IZ
5553 src = RExC_emit;
5554 RExC_emit += NODE_STEP_REGNODE + offset;
5555 dst = RExC_emit;
fac92740 5556 while (src > opnd) {
c277df42 5557 StructCopy(--src, --dst, regnode);
fac92740 5558 if (RExC_offsets) { /* MJD 20010112 */
ccb2c380 5559 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %u -> %u (max %u).\n",
fac92740 5560 "reg_insert",
ccb2c380
MP
5561 __LINE__,
5562 reg_name[op],
fac92740
MJD
5563 dst - RExC_emit_start > RExC_offsets[0]
5564 ? "Overwriting end of array!\n" : "OK",
5565 src - RExC_emit_start,
5566 dst - RExC_emit_start,
5567 RExC_offsets[0]));
ccb2c380
MP
5568 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
5569 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
fac92740
MJD
5570 }
5571 }
5572
a0d0e21e
LW
5573
5574 place = opnd; /* Op node, where operand used to be. */
fac92740 5575 if (RExC_offsets) { /* MJD */
ccb2c380 5576 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %u <- %u (max %u).\n",
fac92740 5577 "reginsert",
ccb2c380
MP
5578 __LINE__,
5579 reg_name[op],
fac92740
MJD
5580 place - RExC_emit_start > RExC_offsets[0]
5581 ? "Overwriting end of array!\n" : "OK",
5582 place - RExC_emit_start,
5583 RExC_parse - RExC_start,
5584 RExC_offsets[0]));
ccb2c380 5585 Set_Node_Offset(place, RExC_parse);
45948336 5586 Set_Node_Length(place, 1);
fac92740 5587 }
c277df42
IZ
5588 src = NEXTOPER(place);
5589 FILL_ADVANCE_NODE(place, op);
5590 Zero(src, offset, regnode);
a687059c
LW
5591}
5592
5593/*
c277df42 5594- regtail - set the next-pointer at the end of a node chain of p to val.
a0d0e21e 5595*/
76e3520e 5596STATIC void
830247a4 5597S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, regnode *val)
a687059c 5598{
c277df42 5599 register regnode *scan;
a0d0e21e 5600
c277df42 5601 if (SIZE_ONLY)
a0d0e21e
LW
5602 return;
5603
5604 /* Find last node. */
5605 scan = p;
5606 for (;;) {
504618e9 5607 regnode * const temp = regnext(scan);
a0d0e21e
LW
5608 if (temp == NULL)
5609 break;
5610 scan = temp;
5611 }
a687059c 5612
c277df42
IZ
5613 if (reg_off_by_arg[OP(scan)]) {
5614 ARG_SET(scan, val - scan);
a0ed51b3
LW
5615 }
5616 else {
c277df42
IZ
5617 NEXT_OFF(scan) = val - scan;
5618 }
a687059c
LW
5619}
5620
5621/*
a0d0e21e
LW
5622- regoptail - regtail on operand of first argument; nop if operandless
5623*/
76e3520e 5624STATIC void
830247a4 5625S_regoptail(pTHX_ RExC_state_t *pRExC_state, regnode *p, regnode *val)
a687059c 5626{
a0d0e21e 5627 /* "Operandless" and "op != BRANCH" are synonymous in practice. */
c277df42
IZ
5628 if (p == NULL || SIZE_ONLY)
5629 return;
22c35a8c 5630 if (PL_regkind[(U8)OP(p)] == BRANCH) {
830247a4 5631 regtail(pRExC_state, NEXTOPER(p), val);
a0ed51b3 5632 }
22c35a8c 5633 else if ( PL_regkind[(U8)OP(p)] == BRANCHJ) {
830247a4 5634 regtail(pRExC_state, NEXTOPER(NEXTOPER(p)), val);
a0ed51b3
LW
5635 }
5636 else
a0d0e21e 5637 return;
a687059c
LW
5638}
5639
5640/*
5641 - regcurly - a little FSA that accepts {\d+,?\d*}
5642 */
79072805 5643STATIC I32
a3b680e6 5644S_regcurly(pTHX_ register const char *s)
a687059c
LW
5645{
5646 if (*s++ != '{')
5647 return FALSE;
f0fcb552 5648 if (!isDIGIT(*s))
a687059c 5649 return FALSE;
f0fcb552 5650 while (isDIGIT(*s))
a687059c
LW
5651 s++;
5652 if (*s == ',')
5653 s++;
f0fcb552 5654 while (isDIGIT(*s))
a687059c
LW
5655 s++;
5656 if (*s != '}')
5657 return FALSE;
5658 return TRUE;
5659}
5660
a687059c
LW
5661
5662/*
fd181c75 5663 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
a687059c
LW
5664 */
5665void
864dbfa3 5666Perl_regdump(pTHX_ regexp *r)
a687059c 5667{
35ff7856 5668#ifdef DEBUGGING
46fc3d4c 5669 SV *sv = sv_newmortal();
a687059c 5670
c277df42 5671 (void)dumpuntil(r->program, r->program + 1, NULL, sv, 0);
a0d0e21e
LW
5672
5673 /* Header fields of interest. */
c277df42 5674 if (r->anchored_substr)
7b0972df 5675 PerlIO_printf(Perl_debug_log,
a0288114 5676 "anchored \"%s%.*s%s\"%s at %"IVdf" ",
3280af22 5677 PL_colors[0],
7b0972df 5678 (int)(SvCUR(r->anchored_substr) - (SvTAIL(r->anchored_substr)!=0)),
3f7c398e 5679 SvPVX_const(r->anchored_substr),
3280af22 5680 PL_colors[1],
c277df42 5681 SvTAIL(r->anchored_substr) ? "$" : "",
7b0972df 5682 (IV)r->anchored_offset);
33b8afdf
JH
5683 else if (r->anchored_utf8)
5684 PerlIO_printf(Perl_debug_log,
a0288114 5685 "anchored utf8 \"%s%.*s%s\"%s at %"IVdf" ",
33b8afdf
JH
5686 PL_colors[0],
5687 (int)(SvCUR(r->anchored_utf8) - (SvTAIL(r->anchored_utf8)!=0)),
3f7c398e 5688 SvPVX_const(r->anchored_utf8),
33b8afdf
JH
5689 PL_colors[1],
5690 SvTAIL(r->anchored_utf8) ? "$" : "",
5691 (IV)r->anchored_offset);
c277df42 5692 if (r->float_substr)
7b0972df 5693 PerlIO_printf(Perl_debug_log,
a0288114 5694 "floating \"%s%.*s%s\"%s at %"IVdf"..%"UVuf" ",
3280af22 5695 PL_colors[0],
b81d288d 5696 (int)(SvCUR(r->float_substr) - (SvTAIL(r->float_substr)!=0)),
3f7c398e 5697 SvPVX_const(r->float_substr),
3280af22 5698 PL_colors[1],
c277df42 5699 SvTAIL(r->float_substr) ? "$" : "",
7b0972df 5700 (IV)r->float_min_offset, (UV)r->float_max_offset);
33b8afdf
JH
5701 else if (r->float_utf8)
5702 PerlIO_printf(Perl_debug_log,
a0288114 5703 "floating utf8 \"%s%.*s%s\"%s at %"IVdf"..%"UVuf" ",
33b8afdf
JH
5704 PL_colors[0],
5705 (int)(SvCUR(r->float_utf8) - (SvTAIL(r->float_utf8)!=0)),
3f7c398e 5706 SvPVX_const(r->float_utf8),
33b8afdf
JH
5707 PL_colors[1],
5708 SvTAIL(r->float_utf8) ? "$" : "",
5709 (IV)r->float_min_offset, (UV)r->float_max_offset);
5710 if (r->check_substr || r->check_utf8)
b81d288d
AB
5711 PerlIO_printf(Perl_debug_log,
5712 r->check_substr == r->float_substr
33b8afdf 5713 && r->check_utf8 == r->float_utf8
c277df42
IZ
5714 ? "(checking floating" : "(checking anchored");
5715 if (r->reganch & ROPT_NOSCAN)
5716 PerlIO_printf(Perl_debug_log, " noscan");
5717 if (r->reganch & ROPT_CHECK_ALL)
5718 PerlIO_printf(Perl_debug_log, " isall");
33b8afdf 5719 if (r->check_substr || r->check_utf8)
c277df42
IZ
5720 PerlIO_printf(Perl_debug_log, ") ");
5721
46fc3d4c 5722 if (r->regstclass) {
5723 regprop(sv, r->regstclass);
3f7c398e 5724 PerlIO_printf(Perl_debug_log, "stclass \"%s\" ", SvPVX_const(sv));
46fc3d4c 5725 }
774d564b 5726 if (r->reganch & ROPT_ANCH) {
5727 PerlIO_printf(Perl_debug_log, "anchored");
5728 if (r->reganch & ROPT_ANCH_BOL)
5729 PerlIO_printf(Perl_debug_log, "(BOL)");
c277df42
IZ
5730 if (r->reganch & ROPT_ANCH_MBOL)
5731 PerlIO_printf(Perl_debug_log, "(MBOL)");
cad2e5aa
JH
5732 if (r->reganch & ROPT_ANCH_SBOL)
5733 PerlIO_printf(Perl_debug_log, "(SBOL)");
774d564b 5734 if (r->reganch & ROPT_ANCH_GPOS)
5735 PerlIO_printf(Perl_debug_log, "(GPOS)");
5736 PerlIO_putc(Perl_debug_log, ' ');
5737 }
c277df42
IZ
5738 if (r->reganch & ROPT_GPOS_SEEN)
5739 PerlIO_printf(Perl_debug_log, "GPOS ");
a0d0e21e 5740 if (r->reganch & ROPT_SKIP)
760ac839 5741 PerlIO_printf(Perl_debug_log, "plus ");
a0d0e21e 5742 if (r->reganch & ROPT_IMPLICIT)
760ac839 5743 PerlIO_printf(Perl_debug_log, "implicit ");
760ac839 5744 PerlIO_printf(Perl_debug_log, "minlen %ld ", (long) r->minlen);
ce862d02
IZ
5745 if (r->reganch & ROPT_EVAL_SEEN)
5746 PerlIO_printf(Perl_debug_log, "with eval ");
760ac839 5747 PerlIO_printf(Perl_debug_log, "\n");
fac92740 5748 if (r->offsets) {
e4584336 5749 const U32 len = r->offsets[0];
a3621e74
YO
5750 GET_RE_DEBUG_FLAGS_DECL;
5751 DEBUG_OFFSETS_r({
1df70142 5752 U32 i;
e4584336
RB
5753 PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)r->offsets[0]);
5754 for (i = 1; i <= len; i++)
5755 PerlIO_printf(Perl_debug_log, "%"UVuf"[%"UVuf"] ",
5756 (UV)r->offsets[i*2-1], (UV)r->offsets[i*2]);
5757 PerlIO_printf(Perl_debug_log, "\n");
a3621e74 5758 });
fac92740 5759 }
65e66c80
SP
5760#else
5761 PERL_UNUSED_ARG(r);
17c3b450 5762#endif /* DEBUGGING */
a687059c
LW
5763}
5764
5765/*
a0d0e21e
LW
5766- regprop - printable representation of opcode
5767*/
46fc3d4c 5768void
a3b680e6 5769Perl_regprop(pTHX_ SV *sv, const regnode *o)
a687059c 5770{
35ff7856 5771#ifdef DEBUGGING
9b155405 5772 register int k;
a0d0e21e 5773
54dc92de 5774 sv_setpvn(sv, "", 0);
9b155405 5775 if (OP(o) >= reg_num) /* regnode.type is unsigned */
830247a4
IZ
5776 /* It would be nice to FAIL() here, but this may be called from
5777 regexec.c, and it would be hard to supply pRExC_state. */
5778 Perl_croak(aTHX_ "Corrupted regexp opcode");
bfed75c6 5779 sv_catpv(sv, reg_name[OP(o)]); /* Take off const! */
9b155405
IZ
5780
5781 k = PL_regkind[(U8)OP(o)];
5782
2a782b5b 5783 if (k == EXACT) {
2d03de9c 5784 SV * const dsv = sv_2mortal(newSVpvn("", 0));
c728cb41
JH
5785 /* Using is_utf8_string() is a crude hack but it may
5786 * be the best for now since we have no flag "this EXACTish
5787 * node was UTF-8" --jhi */
1df70142 5788 const bool do_utf8 = is_utf8_string((U8*)STRING(o), STR_LEN(o));
2d03de9c 5789 const char * const s = do_utf8 ?
c728cb41
JH
5790 pv_uni_display(dsv, (U8*)STRING(o), STR_LEN(o), 60,
5791 UNI_DISPLAY_REGEX) :
2a782b5b 5792 STRING(o);
e1ec3a88 5793 const int len = do_utf8 ?
2a782b5b
JH
5794 strlen(s) :
5795 STR_LEN(o);
5796 Perl_sv_catpvf(aTHX_ sv, " <%s%.*s%s>",
5797 PL_colors[0],
5798 len, s,
5799 PL_colors[1]);
a3621e74
YO
5800 } else if (k == TRIE) {/*
5801 this isn't always safe, as Pl_regdata may not be for this regex yet
5802 (depending on where its called from) so its being moved to dumpuntil
5803 I32 n = ARG(o);
5804 reg_trie_data *trie=(reg_trie_data*)PL_regdata->data[n];
5805 Perl_sv_catpvf(aTHX_ sv, " (W:%d L:%d C:%d S:%d)",
5806 trie->wordcount,
5807 trie->charcount,
5808 trie->uniquecharcount,
5809 trie->laststate);
5810 */
5811 } else if (k == CURLY) {
cb434fcc 5812 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
cea2e8a9
GS
5813 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
5814 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
a0d0e21e 5815 }
2c2d71f5
JH
5816 else if (k == WHILEM && o->flags) /* Ordinal/of */
5817 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
9b155405 5818 else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP )
894356b3 5819 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
9b155405 5820 else if (k == LOGICAL)
04ebc1ab 5821 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
653099ff
GS
5822 else if (k == ANYOF) {
5823 int i, rangestart = -1;
2d03de9c 5824 const U8 flags = ANYOF_FLAGS(o);
a6d05634 5825 const char * const anyofs[] = { /* Should be synchronized with
19860706 5826 * ANYOF_ #xdefines in regcomp.h */
653099ff
GS
5827 "\\w",
5828 "\\W",
5829 "\\s",
5830 "\\S",
5831 "\\d",
5832 "\\D",
5833 "[:alnum:]",
5834 "[:^alnum:]",
5835 "[:alpha:]",
5836 "[:^alpha:]",
5837 "[:ascii:]",
5838 "[:^ascii:]",
5839 "[:ctrl:]",
5840 "[:^ctrl:]",
5841 "[:graph:]",
5842 "[:^graph:]",
5843 "[:lower:]",
5844 "[:^lower:]",
5845 "[:print:]",
5846 "[:^print:]",
5847 "[:punct:]",
5848 "[:^punct:]",
5849 "[:upper:]",
aaa51d5e 5850 "[:^upper:]",
653099ff 5851 "[:xdigit:]",
aaa51d5e
JF
5852 "[:^xdigit:]",
5853 "[:space:]",
5854 "[:^space:]",
5855 "[:blank:]",
5856 "[:^blank:]"
653099ff
GS
5857 };
5858
19860706 5859 if (flags & ANYOF_LOCALE)
653099ff 5860 sv_catpv(sv, "{loc}");
19860706 5861 if (flags & ANYOF_FOLD)
653099ff
GS
5862 sv_catpv(sv, "{i}");
5863 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
19860706 5864 if (flags & ANYOF_INVERT)
653099ff 5865 sv_catpv(sv, "^");
ffc61ed2
JH
5866 for (i = 0; i <= 256; i++) {
5867 if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
5868 if (rangestart == -1)
5869 rangestart = i;
5870 } else if (rangestart != -1) {
5871 if (i <= rangestart + 3)
5872 for (; rangestart < i; rangestart++)
653099ff 5873 put_byte(sv, rangestart);
ffc61ed2
JH
5874 else {
5875 put_byte(sv, rangestart);
5876 sv_catpv(sv, "-");
5877 put_byte(sv, i - 1);
653099ff 5878 }
ffc61ed2 5879 rangestart = -1;
653099ff 5880 }
847a199f 5881 }
ffc61ed2
JH
5882
5883 if (o->flags & ANYOF_CLASS)
5884 for (i = 0; i < sizeof(anyofs)/sizeof(char*); i++)
5885 if (ANYOF_CLASS_TEST(o,i))
5886 sv_catpv(sv, anyofs[i]);
5887
5888 if (flags & ANYOF_UNICODE)
5889 sv_catpv(sv, "{unicode}");
1aa99e6b 5890 else if (flags & ANYOF_UNICODE_ALL)
2a782b5b 5891 sv_catpv(sv, "{unicode_all}");
ffc61ed2
JH
5892
5893 {
5894 SV *lv;
2d03de9c 5895 SV * const sw = regclass_swash(o, FALSE, &lv, 0);
b81d288d 5896
ffc61ed2
JH
5897 if (lv) {
5898 if (sw) {
89ebb4a3 5899 U8 s[UTF8_MAXBYTES_CASE+1];
b81d288d 5900
ffc61ed2 5901 for (i = 0; i <= 256; i++) { /* just the first 256 */
1df70142 5902 uvchr_to_utf8(s, i);
ffc61ed2 5903
3568d838 5904 if (i < 256 && swash_fetch(sw, s, TRUE)) {
ffc61ed2
JH
5905 if (rangestart == -1)
5906 rangestart = i;
5907 } else if (rangestart != -1) {
ffc61ed2
JH
5908 if (i <= rangestart + 3)
5909 for (; rangestart < i; rangestart++) {
2d03de9c
AL
5910 const U8 * const e = uvchr_to_utf8(s,rangestart);
5911 U8 *p;
5912 for(p = s; p < e; p++)
ffc61ed2
JH
5913 put_byte(sv, *p);
5914 }
5915 else {
2d03de9c
AL
5916 const U8 *e = uvchr_to_utf8(s,rangestart);
5917 U8 *p;
5918 for (p = s; p < e; p++)
ffc61ed2 5919 put_byte(sv, *p);
2d03de9c
AL
5920 sv_catpvn(sv, "-", 1);
5921 e = uvchr_to_utf8(s, i-1);
5922 for (p = s; p < e; p++)
1df70142 5923 put_byte(sv, *p);
ffc61ed2
JH
5924 }
5925 rangestart = -1;
5926 }
19860706 5927 }
ffc61ed2
JH
5928
5929 sv_catpv(sv, "..."); /* et cetera */
19860706 5930 }
fde631ed 5931
ffc61ed2 5932 {
2e0de35c 5933 char *s = savesvpv(lv);
ffc61ed2 5934 char *origs = s;
b81d288d 5935
ffc61ed2 5936 while(*s && *s != '\n') s++;
b81d288d 5937
ffc61ed2 5938 if (*s == '\n') {
2d03de9c 5939 const char * const t = ++s;
ffc61ed2
JH
5940
5941 while (*s) {
5942 if (*s == '\n')
5943 *s = ' ';
5944 s++;
5945 }
5946 if (s[-1] == ' ')
5947 s[-1] = 0;
5948
5949 sv_catpv(sv, t);
fde631ed 5950 }
b81d288d 5951
ffc61ed2 5952 Safefree(origs);
fde631ed
JH
5953 }
5954 }
653099ff 5955 }
ffc61ed2 5956
653099ff
GS
5957 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
5958 }
9b155405 5959 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
cea2e8a9 5960 Perl_sv_catpvf(aTHX_ sv, "[-%d]", o->flags);
65e66c80
SP
5961#else
5962 PERL_UNUSED_ARG(sv);
5963 PERL_UNUSED_ARG(o);
17c3b450 5964#endif /* DEBUGGING */
35ff7856 5965}
a687059c 5966
cad2e5aa
JH
5967SV *
5968Perl_re_intuit_string(pTHX_ regexp *prog)
5969{ /* Assume that RE_INTUIT is set */
a3621e74
YO
5970 GET_RE_DEBUG_FLAGS_DECL;
5971 DEBUG_COMPILE_r(
cfd0369c 5972 {
2d03de9c 5973 const char * const s = SvPV_nolen_const(prog->check_substr
cfd0369c 5974 ? prog->check_substr : prog->check_utf8);
cad2e5aa
JH
5975
5976 if (!PL_colorset) reginitcolors();
5977 PerlIO_printf(Perl_debug_log,
a0288114 5978 "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
33b8afdf
JH
5979 PL_colors[4],
5980 prog->check_substr ? "" : "utf8 ",
5981 PL_colors[5],PL_colors[0],
cad2e5aa
JH
5982 s,
5983 PL_colors[1],
5984 (strlen(s) > 60 ? "..." : ""));
5985 } );
5986
33b8afdf 5987 return prog->check_substr ? prog->check_substr : prog->check_utf8;
cad2e5aa
JH
5988}
5989
2b69d0c2 5990void
864dbfa3 5991Perl_pregfree(pTHX_ struct regexp *r)
a687059c 5992{
27da23d5 5993 dVAR;
9e55ce06
JH
5994#ifdef DEBUGGING
5995 SV *dsv = PERL_DEBUG_PAD_ZERO(0);
a3621e74 5996 SV *re_debug_flags=get_sv(RE_DEBUG_FLAGS,0);
9e55ce06 5997#endif
7821416a 5998
a3621e74 5999
7821416a
IZ
6000 if (!r || (--r->refcnt > 0))
6001 return;
a3621e74 6002 DEBUG_r(if (re_debug_flags && (SvIV(re_debug_flags) & RE_DEBUG_COMPILE)) {
e1ec3a88
AL
6003 const char *s = (r->reganch & ROPT_UTF8)
6004 ? pv_uni_display(dsv, (U8*)r->precomp, r->prelen, 60, UNI_DISPLAY_REGEX)
9f369894 6005 : pv_display(dsv, r->precomp, r->prelen, 0, 60);
e1ec3a88 6006 const int len = SvCUR(dsv);
9e55ce06
JH
6007 if (!PL_colorset)
6008 reginitcolors();
6009 PerlIO_printf(Perl_debug_log,
a3621e74 6010 "%sFreeing REx:%s %s%*.*s%s%s\n",
9e55ce06
JH
6011 PL_colors[4],PL_colors[5],PL_colors[0],
6012 len, len, s,
6013 PL_colors[1],
6014 len > 60 ? "..." : "");
6015 });
cad2e5aa 6016
43c5f42d
NC
6017 /* gcov results gave these as non-null 100% of the time, so there's no
6018 optimisation in checking them before calling Safefree */
6019 Safefree(r->precomp);
6020 Safefree(r->offsets); /* 20010421 MJD */
ed252734 6021 RX_MATCH_COPY_FREE(r);
f8c7b90f 6022#ifdef PERL_OLD_COPY_ON_WRITE
ed252734
NC
6023 if (r->saved_copy)
6024 SvREFCNT_dec(r->saved_copy);
6025#endif
a193d654
GS
6026 if (r->substrs) {
6027 if (r->anchored_substr)
6028 SvREFCNT_dec(r->anchored_substr);
33b8afdf
JH
6029 if (r->anchored_utf8)
6030 SvREFCNT_dec(r->anchored_utf8);
a193d654
GS
6031 if (r->float_substr)
6032 SvREFCNT_dec(r->float_substr);
33b8afdf
JH
6033 if (r->float_utf8)
6034 SvREFCNT_dec(r->float_utf8);
2779dcf1 6035 Safefree(r->substrs);
a193d654 6036 }
c277df42
IZ
6037 if (r->data) {
6038 int n = r->data->count;
f3548bdc
DM
6039 PAD* new_comppad = NULL;
6040 PAD* old_comppad;
4026c95a 6041 PADOFFSET refcnt;
dfad63ad 6042
c277df42 6043 while (--n >= 0) {
261faec3 6044 /* If you add a ->what type here, update the comment in regcomp.h */
c277df42
IZ
6045 switch (r->data->what[n]) {
6046 case 's':
6047 SvREFCNT_dec((SV*)r->data->data[n]);
6048 break;
653099ff
GS
6049 case 'f':
6050 Safefree(r->data->data[n]);
6051 break;
dfad63ad
HS
6052 case 'p':
6053 new_comppad = (AV*)r->data->data[n];
6054 break;
c277df42 6055 case 'o':
dfad63ad 6056 if (new_comppad == NULL)
cea2e8a9 6057 Perl_croak(aTHX_ "panic: pregfree comppad");
f3548bdc
DM
6058 PAD_SAVE_LOCAL(old_comppad,
6059 /* Watch out for global destruction's random ordering. */
6060 (SvTYPE(new_comppad) == SVt_PVAV) ?
6061 new_comppad : Null(PAD *)
6062 );
b34c0dd4 6063 OP_REFCNT_LOCK;
4026c95a
SH
6064 refcnt = OpREFCNT_dec((OP_4tree*)r->data->data[n]);
6065 OP_REFCNT_UNLOCK;
6066 if (!refcnt)
9b978d73 6067 op_free((OP_4tree*)r->data->data[n]);
9b978d73 6068
f3548bdc 6069 PAD_RESTORE_LOCAL(old_comppad);
dfad63ad
HS
6070 SvREFCNT_dec((SV*)new_comppad);
6071 new_comppad = NULL;
c277df42
IZ
6072 break;
6073 case 'n':
9e55ce06 6074 break;
a3621e74
YO
6075 case 't':
6076 {
6077 reg_trie_data *trie=(reg_trie_data*)r->data->data[n];
6078 U32 refcount;
6079 OP_REFCNT_LOCK;
e27afef8 6080 refcount = --trie->refcount;
a3621e74
YO
6081 OP_REFCNT_UNLOCK;
6082 if ( !refcount ) {
43c5f42d 6083 Safefree(trie->charmap);
a3621e74
YO
6084 if (trie->widecharmap)
6085 SvREFCNT_dec((SV*)trie->widecharmap);
43c5f42d
NC
6086 Safefree(trie->states);
6087 Safefree(trie->trans);
a3621e74
YO
6088#ifdef DEBUGGING
6089 if (trie->words)
6090 SvREFCNT_dec((SV*)trie->words);
6091 if (trie->revcharmap)
6092 SvREFCNT_dec((SV*)trie->revcharmap);
6093#endif
6094 Safefree(r->data->data[n]); /* do this last!!!! */
6095 }
6096 break;
6097 }
c277df42 6098 default:
830247a4 6099 Perl_croak(aTHX_ "panic: regfree data code '%c'", r->data->what[n]);
c277df42
IZ
6100 }
6101 }
6102 Safefree(r->data->what);
6103 Safefree(r->data);
a0d0e21e
LW
6104 }
6105 Safefree(r->startp);
6106 Safefree(r->endp);
6107 Safefree(r);
a687059c 6108}
c277df42
IZ
6109
6110/*
6111 - regnext - dig the "next" pointer out of a node
c277df42
IZ
6112 */
6113regnode *
864dbfa3 6114Perl_regnext(pTHX_ register regnode *p)
c277df42
IZ
6115{
6116 register I32 offset;
6117
3280af22 6118 if (p == &PL_regdummy)
c277df42
IZ
6119 return(NULL);
6120
6121 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
6122 if (offset == 0)
6123 return(NULL);
6124
c277df42 6125 return(p+offset);
c277df42
IZ
6126}
6127
01f988be 6128STATIC void
cea2e8a9 6129S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
c277df42
IZ
6130{
6131 va_list args;
6132 STRLEN l1 = strlen(pat1);
6133 STRLEN l2 = strlen(pat2);
6134 char buf[512];
06bf62c7 6135 SV *msv;
73d840c0 6136 const char *message;
c277df42
IZ
6137
6138 if (l1 > 510)
6139 l1 = 510;
6140 if (l1 + l2 > 510)
6141 l2 = 510 - l1;
6142 Copy(pat1, buf, l1 , char);
6143 Copy(pat2, buf + l1, l2 , char);
3b818b81
GS
6144 buf[l1 + l2] = '\n';
6145 buf[l1 + l2 + 1] = '\0';
8736538c
AS
6146#ifdef I_STDARG
6147 /* ANSI variant takes additional second argument */
c277df42 6148 va_start(args, pat2);
8736538c
AS
6149#else
6150 va_start(args);
6151#endif
5a844595 6152 msv = vmess(buf, &args);
c277df42 6153 va_end(args);
cfd0369c 6154 message = SvPV_const(msv,l1);
c277df42
IZ
6155 if (l1 > 512)
6156 l1 = 512;
6157 Copy(message, buf, l1 , char);
197cf9b9 6158 buf[l1-1] = '\0'; /* Overwrite \n */
cea2e8a9 6159 Perl_croak(aTHX_ "%s", buf);
c277df42 6160}
a0ed51b3
LW
6161
6162/* XXX Here's a total kludge. But we need to re-enter for swash routines. */
6163
6164void
864dbfa3 6165Perl_save_re_context(pTHX)
b81d288d 6166{
830247a4 6167 SAVEI32(PL_reg_flags); /* from regexec.c */
a0ed51b3 6168 SAVEPPTR(PL_bostr);
a0ed51b3
LW
6169 SAVEPPTR(PL_reginput); /* String-input pointer. */
6170 SAVEPPTR(PL_regbol); /* Beginning of input, for ^ check. */
6171 SAVEPPTR(PL_regeol); /* End of input, for $ check. */
7766f137
GS
6172 SAVEVPTR(PL_regstartp); /* Pointer to startp array. */
6173 SAVEVPTR(PL_regendp); /* Ditto for endp. */
6174 SAVEVPTR(PL_reglastparen); /* Similarly for lastparen. */
a5db57d6 6175 SAVEVPTR(PL_reglastcloseparen); /* Similarly for lastcloseparen. */
a0ed51b3 6176 SAVEPPTR(PL_regtill); /* How far we are required to go. */
b81d288d 6177 SAVEGENERICPV(PL_reg_start_tmp); /* from regexec.c */
a0ed51b3 6178 PL_reg_start_tmp = 0;
a0ed51b3
LW
6179 SAVEI32(PL_reg_start_tmpl); /* from regexec.c */
6180 PL_reg_start_tmpl = 0;
7766f137 6181 SAVEVPTR(PL_regdata);
a0ed51b3
LW
6182 SAVEI32(PL_reg_eval_set); /* from regexec.c */
6183 SAVEI32(PL_regnarrate); /* from regexec.c */
7766f137 6184 SAVEVPTR(PL_regprogram); /* from regexec.c */
a0ed51b3 6185 SAVEINT(PL_regindent); /* from regexec.c */
7766f137
GS
6186 SAVEVPTR(PL_regcc); /* from regexec.c */
6187 SAVEVPTR(PL_curcop);
7766f137
GS
6188 SAVEVPTR(PL_reg_call_cc); /* from regexec.c */
6189 SAVEVPTR(PL_reg_re); /* from regexec.c */
54b6e2fa
IZ
6190 SAVEPPTR(PL_reg_ganch); /* from regexec.c */
6191 SAVESPTR(PL_reg_sv); /* from regexec.c */
9febdf04 6192 SAVEBOOL(PL_reg_match_utf8); /* from regexec.c */
7766f137 6193 SAVEVPTR(PL_reg_magic); /* from regexec.c */
54b6e2fa 6194 SAVEI32(PL_reg_oldpos); /* from regexec.c */
7766f137
GS
6195 SAVEVPTR(PL_reg_oldcurpm); /* from regexec.c */
6196 SAVEVPTR(PL_reg_curpm); /* from regexec.c */
a5db57d6
GS
6197 SAVEPPTR(PL_reg_oldsaved); /* old saved substr during match */
6198 PL_reg_oldsaved = Nullch;
6199 SAVEI32(PL_reg_oldsavedlen); /* old length of saved substr during match */
6200 PL_reg_oldsavedlen = 0;
f8c7b90f 6201#ifdef PERL_OLD_COPY_ON_WRITE
ed252734
NC
6202 SAVESPTR(PL_nrs);
6203 PL_nrs = Nullsv;
6204#endif
a5db57d6
GS
6205 SAVEI32(PL_reg_maxiter); /* max wait until caching pos */
6206 PL_reg_maxiter = 0;
6207 SAVEI32(PL_reg_leftiter); /* wait until caching pos */
6208 PL_reg_leftiter = 0;
6209 SAVEGENERICPV(PL_reg_poscache); /* cache of pos of WHILEM */
6210 PL_reg_poscache = Nullch;
6211 SAVEI32(PL_reg_poscache_size); /* size of pos cache of WHILEM */
6212 PL_reg_poscache_size = 0;
6213 SAVEPPTR(PL_regprecomp); /* uncompiled string. */
5fb7366e 6214 SAVEI32(PL_regnpar); /* () count. */
e49a9654 6215 SAVEI32(PL_regsize); /* from regexec.c */
ada6e8a9
AMS
6216
6217 {
6218 /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
ada6e8a9 6219 REGEXP *rx;
ada6e8a9
AMS
6220
6221 if (PL_curpm && (rx = PM_GETRE(PL_curpm))) {
1df70142 6222 U32 i;
ada6e8a9 6223 for (i = 1; i <= rx->nparens; i++) {
1df70142
AL
6224 GV *mgv;
6225 char digits[TYPE_CHARS(long)];
e5105eda 6226 const STRLEN len = my_sprintf(digits, "%lu", (long)i);
86c11942 6227 if ((mgv = gv_fetchpvn_flags(digits, len, FALSE, SVt_PV)))
ada6e8a9
AMS
6228 save_scalar(mgv);
6229 }
6230 }
6231 }
6232
54b6e2fa 6233#ifdef DEBUGGING
b81d288d 6234 SAVEPPTR(PL_reg_starttry); /* from regexec.c */
54b6e2fa 6235#endif
a0ed51b3 6236}
51371543 6237
51371543 6238static void
acfe0abc 6239clear_re(pTHX_ void *r)
51371543
GS
6240{
6241 ReREFCNT_dec((regexp *)r);
6242}
ffbc6a93 6243
a28509cc
AL
6244#ifdef DEBUGGING
6245
6246STATIC void
6247S_put_byte(pTHX_ SV *sv, int c)
6248{
6249 if (isCNTRL(c) || c == 255 || !isPRINT(c))
6250 Perl_sv_catpvf(aTHX_ sv, "\\%o", c);
6251 else if (c == '-' || c == ']' || c == '\\' || c == '^')
6252 Perl_sv_catpvf(aTHX_ sv, "\\%c", c);
6253 else
6254 Perl_sv_catpvf(aTHX_ sv, "%c", c);
6255}
6256
6257
6258STATIC regnode *
6259S_dumpuntil(pTHX_ regnode *start, regnode *node, regnode *last, SV* sv, I32 l)
6260{
6261 register U8 op = EXACT; /* Arbitrary non-END op. */
6262 register regnode *next;
6263
6264 while (op != END && (!last || node < last)) {
6265 /* While that wasn't END last time... */
6266
6267 NODE_ALIGN(node);
6268 op = OP(node);
6269 if (op == CLOSE)
6270 l--;
6271 next = regnext(node);
6272 /* Where, what. */
6273 if (OP(node) == OPTIMIZED)
6274 goto after_print;
6275 regprop(sv, node);
6276 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
6277 (int)(2*l + 1), "", SvPVX_const(sv));
6278 if (next == NULL) /* Next ptr. */
6279 PerlIO_printf(Perl_debug_log, "(0)");
6280 else
6281 PerlIO_printf(Perl_debug_log, "(%"IVdf")", (IV)(next - start));
6282 (void)PerlIO_putc(Perl_debug_log, '\n');
6283 after_print:
6284 if (PL_regkind[(U8)op] == BRANCHJ) {
6285 register regnode *nnode = (OP(next) == LONGJMP
6286 ? regnext(next)
6287 : next);
6288 if (last && nnode > last)
6289 nnode = last;
6290 node = dumpuntil(start, NEXTOPER(NEXTOPER(node)), nnode, sv, l + 1);
6291 }
6292 else if (PL_regkind[(U8)op] == BRANCH) {
6293 node = dumpuntil(start, NEXTOPER(node), next, sv, l + 1);
6294 }
6295 else if ( PL_regkind[(U8)op] == TRIE ) {
6296 const I32 n = ARG(node);
6297 const reg_trie_data * const trie = (reg_trie_data*)PL_regdata->data[n];
6298 const I32 arry_len = av_len(trie->words)+1;
6299 I32 word_idx;
6300 PerlIO_printf(Perl_debug_log,
6301 "%*s[Words:%d Chars Stored:%d Unique Chars:%d States:%"IVdf"%s]\n",
6302 (int)(2*(l+3)),
6303 "",
6304 trie->wordcount,
6305 (int)trie->charcount,
6306 trie->uniquecharcount,
6307 (IV)trie->laststate-1,
6308 node->flags ? " EVAL mode" : "");
6309
6310 for (word_idx=0; word_idx < arry_len; word_idx++) {
6311 SV **elem_ptr=av_fetch(trie->words,word_idx,0);
6312 if (elem_ptr) {
6313 PerlIO_printf(Perl_debug_log, "%*s<%s%s%s>\n",
6314 (int)(2*(l+4)), "",
6315 PL_colors[0],
cfd0369c 6316 SvPV_nolen_const(*elem_ptr),
a28509cc
AL
6317 PL_colors[1]
6318 );
6319 /*
6320 if (next == NULL)
6321 PerlIO_printf(Perl_debug_log, "(0)\n");
6322 else
6323 PerlIO_printf(Perl_debug_log, "(%"IVdf")\n", (IV)(next - start));
6324 */
6325 }
6326
6327 }
6328
6329 node = NEXTOPER(node);
6330 node += regarglen[(U8)op];
6331
6332 }
6333 else if ( op == CURLY) { /* "next" might be very big: optimizer */
6334 node = dumpuntil(start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
6335 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1, sv, l + 1);
6336 }
6337 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
6338 node = dumpuntil(start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
6339 next, sv, l + 1);
6340 }
6341 else if ( op == PLUS || op == STAR) {
6342 node = dumpuntil(start, NEXTOPER(node), NEXTOPER(node) + 1, sv, l + 1);
6343 }
6344 else if (op == ANYOF) {
6345 /* arglen 1 + class block */
6346 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_LARGE)
6347 ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
6348 node = NEXTOPER(node);
6349 }
6350 else if (PL_regkind[(U8)op] == EXACT) {
6351 /* Literal string, where present. */
6352 node += NODE_SZ_STR(node) - 1;
6353 node = NEXTOPER(node);
6354 }
6355 else {
6356 node = NEXTOPER(node);
6357 node += regarglen[(U8)op];
6358 }
6359 if (op == CURLYX || op == OPEN)
6360 l++;
6361 else if (op == WHILEM)
6362 l--;
6363 }
6364 return node;
6365}
6366
6367#endif /* DEBUGGING */
6368
241d1a3b
NC
6369/*
6370 * Local variables:
6371 * c-indentation-style: bsd
6372 * c-basic-offset: 4
6373 * indent-tabs-mode: t
6374 * End:
6375 *
37442d52
RGS
6376 * ex: set ts=8 sts=4 sw=4 noet:
6377 */