This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
hints/dec_osf.sh (Re: blead@26701 compilation warning from tru64 cc)
[perl5.git] / regcomp.c
CommitLineData
a0d0e21e
LW
1/* regcomp.c
2 */
3
4/*
5 * "A fair jaw-cracker dwarf-language must be." --Samwise Gamgee
6 */
7
61296642
DM
8/* This file contains functions for compiling a regular expression. See
9 * also regexec.c which funnily enough, contains functions for executing
166f8a29 10 * a regular expression.
e4a054ea
DM
11 *
12 * This file is also copied at build time to ext/re/re_comp.c, where
13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14 * This causes the main functions to be compiled under new names and with
15 * debugging support added, which makes "use re 'debug'" work.
166f8a29
DM
16 */
17
a687059c
LW
18/* NOTE: this is derived from Henry Spencer's regexp code, and should not
19 * confused with the original package (see point 3 below). Thanks, Henry!
20 */
21
22/* Additional note: this code is very heavily munged from Henry's version
23 * in places. In some spots I've traded clarity for efficiency, so don't
24 * blame Henry for some of the lack of readability.
25 */
26
e50aee73
AD
27/* The names of the functions have been changed from regcomp and
28 * regexec to pregcomp and pregexec in order to avoid conflicts
29 * with the POSIX routines of the same names.
30*/
31
b9d5759e
AD
32#ifdef PERL_EXT_RE_BUILD
33/* need to replace pregcomp et al, so enable that */
34# ifndef PERL_IN_XSUB_RE
35# define PERL_IN_XSUB_RE
36# endif
37/* need access to debugger hooks */
cad2e5aa 38# if defined(PERL_EXT_RE_DEBUG) && !defined(DEBUGGING)
b9d5759e
AD
39# define DEBUGGING
40# endif
41#endif
42
43#ifdef PERL_IN_XSUB_RE
d06ea78c 44/* We *really* need to overwrite these symbols: */
56953603
IZ
45# define Perl_pregcomp my_regcomp
46# define Perl_regdump my_regdump
47# define Perl_regprop my_regprop
d06ea78c 48# define Perl_pregfree my_regfree
cad2e5aa
JH
49# define Perl_re_intuit_string my_re_intuit_string
50/* *These* symbols are masked to allow static link. */
d06ea78c 51# define Perl_regnext my_regnext
f0b8d043 52# define Perl_save_re_context my_save_re_context
b81d288d 53# define Perl_reginitcolors my_reginitcolors
c5be433b
GS
54
55# define PERL_NO_GET_CONTEXT
b81d288d 56#endif
56953603 57
a687059c 58/*
e50aee73 59 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
60 *
61 * Copyright (c) 1986 by University of Toronto.
62 * Written by Henry Spencer. Not derived from licensed software.
63 *
64 * Permission is granted to anyone to use this software for any
65 * purpose on any computer system, and to redistribute it freely,
66 * subject to the following restrictions:
67 *
68 * 1. The author is not responsible for the consequences of use of
69 * this software, no matter how awful, even if they arise
70 * from defects in it.
71 *
72 * 2. The origin of this software must not be misrepresented, either
73 * by explicit claim or by omission.
74 *
75 * 3. Altered versions must be plainly marked as such, and must not
76 * be misrepresented as being the original software.
77 *
78 *
79 **** Alterations to Henry's code are...
80 ****
4bb101f2 81 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
b94e2f88 82 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, by Larry Wall and others
a687059c 83 ****
9ef589d8
LW
84 **** You may distribute under the terms of either the GNU General Public
85 **** License or the Artistic License, as specified in the README file.
86
a687059c
LW
87 *
88 * Beware that some of this code is subtly aware of the way operator
89 * precedence is structured in regular expressions. Serious changes in
90 * regular-expression syntax might require a total rethink.
91 */
92#include "EXTERN.h"
864dbfa3 93#define PERL_IN_REGCOMP_C
a687059c 94#include "perl.h"
d06ea78c 95
acfe0abc 96#ifndef PERL_IN_XSUB_RE
d06ea78c
GS
97# include "INTERN.h"
98#endif
c277df42
IZ
99
100#define REG_COMP_C
a687059c
LW
101#include "regcomp.h"
102
d4cce5f1 103#ifdef op
11343788 104#undef op
d4cce5f1 105#endif /* op */
11343788 106
fe14fcc3 107#ifdef MSDOS
7e4e8c89 108# if defined(BUGGY_MSC6)
fe14fcc3 109 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
7e4e8c89 110# pragma optimize("a",off)
fe14fcc3 111 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
7e4e8c89
NC
112# pragma optimize("w",on )
113# endif /* BUGGY_MSC6 */
fe14fcc3
LW
114#endif /* MSDOS */
115
a687059c
LW
116#ifndef STATIC
117#define STATIC static
118#endif
119
830247a4 120typedef struct RExC_state_t {
e2509266 121 U32 flags; /* are we folding, multilining? */
830247a4
IZ
122 char *precomp; /* uncompiled string. */
123 regexp *rx;
fac92740 124 char *start; /* Start of input for compile */
830247a4
IZ
125 char *end; /* End of input for compile */
126 char *parse; /* Input-scan pointer. */
127 I32 whilem_seen; /* number of WHILEM in this expr */
fac92740 128 regnode *emit_start; /* Start of emitted-code area */
ffc61ed2 129 regnode *emit; /* Code-emit pointer; &regdummy = don't = compiling */
830247a4
IZ
130 I32 naughty; /* How bad is this pattern? */
131 I32 sawback; /* Did we see \1, ...? */
132 U32 seen;
133 I32 size; /* Code size. */
134 I32 npar; /* () count. */
135 I32 extralen;
136 I32 seen_zerolen;
137 I32 seen_evals;
1aa99e6b 138 I32 utf8;
830247a4
IZ
139#if ADD_TO_REGEXEC
140 char *starttry; /* -Dr: where regtry was called. */
141#define RExC_starttry (pRExC_state->starttry)
142#endif
143} RExC_state_t;
144
e2509266 145#define RExC_flags (pRExC_state->flags)
830247a4
IZ
146#define RExC_precomp (pRExC_state->precomp)
147#define RExC_rx (pRExC_state->rx)
fac92740 148#define RExC_start (pRExC_state->start)
830247a4
IZ
149#define RExC_end (pRExC_state->end)
150#define RExC_parse (pRExC_state->parse)
151#define RExC_whilem_seen (pRExC_state->whilem_seen)
fac92740 152#define RExC_offsets (pRExC_state->rx->offsets) /* I am not like the others */
830247a4 153#define RExC_emit (pRExC_state->emit)
fac92740 154#define RExC_emit_start (pRExC_state->emit_start)
830247a4
IZ
155#define RExC_naughty (pRExC_state->naughty)
156#define RExC_sawback (pRExC_state->sawback)
157#define RExC_seen (pRExC_state->seen)
158#define RExC_size (pRExC_state->size)
159#define RExC_npar (pRExC_state->npar)
160#define RExC_extralen (pRExC_state->extralen)
161#define RExC_seen_zerolen (pRExC_state->seen_zerolen)
162#define RExC_seen_evals (pRExC_state->seen_evals)
1aa99e6b 163#define RExC_utf8 (pRExC_state->utf8)
830247a4 164
a687059c
LW
165#define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
166#define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
167 ((*s) == '{' && regcurly(s)))
a687059c 168
35c8bce7
LW
169#ifdef SPSTART
170#undef SPSTART /* dratted cpp namespace... */
171#endif
a687059c
LW
172/*
173 * Flags to be passed up and down.
174 */
a687059c 175#define WORST 0 /* Worst case. */
821b33a5 176#define HASWIDTH 0x1 /* Known to match non-null strings. */
a0d0e21e
LW
177#define SIMPLE 0x2 /* Simple enough to be STAR/PLUS operand. */
178#define SPSTART 0x4 /* Starts with * or +. */
179#define TRYAGAIN 0x8 /* Weeded out a declaration. */
a687059c 180
2c2d71f5
JH
181/* Length of a variant. */
182
183typedef struct scan_data_t {
184 I32 len_min;
185 I32 len_delta;
186 I32 pos_min;
187 I32 pos_delta;
188 SV *last_found;
189 I32 last_end; /* min value, <0 unless valid. */
190 I32 last_start_min;
191 I32 last_start_max;
192 SV **longest; /* Either &l_fixed, or &l_float. */
193 SV *longest_fixed;
194 I32 offset_fixed;
195 SV *longest_float;
196 I32 offset_float_min;
197 I32 offset_float_max;
198 I32 flags;
199 I32 whilem_c;
cb434fcc 200 I32 *last_closep;
653099ff 201 struct regnode_charclass_class *start_class;
2c2d71f5
JH
202} scan_data_t;
203
a687059c 204/*
e50aee73 205 * Forward declarations for pregcomp()'s friends.
a687059c 206 */
a0d0e21e 207
27da23d5
JH
208static const scan_data_t zero_scan_data =
209 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
c277df42
IZ
210
211#define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
212#define SF_BEFORE_SEOL 0x1
213#define SF_BEFORE_MEOL 0x2
214#define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
215#define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
216
09b7f37c
CB
217#ifdef NO_UNARY_PLUS
218# define SF_FIX_SHIFT_EOL (0+2)
219# define SF_FL_SHIFT_EOL (0+4)
220#else
221# define SF_FIX_SHIFT_EOL (+2)
222# define SF_FL_SHIFT_EOL (+4)
223#endif
c277df42
IZ
224
225#define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
226#define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
227
228#define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
229#define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
230#define SF_IS_INF 0x40
231#define SF_HAS_PAR 0x80
232#define SF_IN_PAR 0x100
233#define SF_HAS_EVAL 0x200
4bfe0158 234#define SCF_DO_SUBSTR 0x400
653099ff
GS
235#define SCF_DO_STCLASS_AND 0x0800
236#define SCF_DO_STCLASS_OR 0x1000
237#define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
e1901655 238#define SCF_WHILEM_VISITED_POS 0x2000
c277df42 239
eb160463 240#define UTF (RExC_utf8 != 0)
e2509266
JH
241#define LOC ((RExC_flags & PMf_LOCALE) != 0)
242#define FOLD ((RExC_flags & PMf_FOLD) != 0)
a0ed51b3 243
ffc61ed2 244#define OOB_UNICODE 12345678
93733859 245#define OOB_NAMEDCLASS -1
b8c5462f 246
a0ed51b3
LW
247#define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
248#define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
249
8615cb43 250
b45f050a
JF
251/* length of regex to show in messages that don't mark a position within */
252#define RegexLengthToShowInErrorMessages 127
253
254/*
255 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
256 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
257 * op/pragma/warn/regcomp.
258 */
7253e4e3
RK
259#define MARKER1 "<-- HERE" /* marker as it appears in the description */
260#define MARKER2 " <-- HERE " /* marker as it appears within the regex */
b81d288d 261
7253e4e3 262#define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
b45f050a
JF
263
264/*
265 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
266 * arg. Show regex, up to a maximum length. If it's too long, chop and add
267 * "...".
268 */
ccb2c380 269#define FAIL(msg) STMT_START { \
bfed75c6 270 const char *ellipses = ""; \
ccb2c380
MP
271 IV len = RExC_end - RExC_precomp; \
272 \
273 if (!SIZE_ONLY) \
274 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
275 if (len > RegexLengthToShowInErrorMessages) { \
276 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
277 len = RegexLengthToShowInErrorMessages - 10; \
278 ellipses = "..."; \
279 } \
280 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
281 msg, (int)len, RExC_precomp, ellipses); \
282} STMT_END
8615cb43 283
b45f050a
JF
284/*
285 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
286 * args. Show regex, up to a maximum length. If it's too long, chop and add
287 * "...".
288 */
ccb2c380 289#define FAIL2(pat,msg) STMT_START { \
bfed75c6 290 const char *ellipses = ""; \
ccb2c380
MP
291 IV len = RExC_end - RExC_precomp; \
292 \
293 if (!SIZE_ONLY) \
294 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
295 if (len > RegexLengthToShowInErrorMessages) { \
296 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
297 len = RegexLengthToShowInErrorMessages - 10; \
298 ellipses = "..."; \
299 } \
300 S_re_croak2(aTHX_ pat, " in regex m/%.*s%s/", \
301 msg, (int)len, RExC_precomp, ellipses); \
302} STMT_END
b45f050a
JF
303
304
305/*
306 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
307 */
ccb2c380 308#define Simple_vFAIL(m) STMT_START { \
a28509cc 309 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
310 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
311 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
312} STMT_END
b45f050a
JF
313
314/*
315 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
316 */
ccb2c380
MP
317#define vFAIL(m) STMT_START { \
318 if (!SIZE_ONLY) \
319 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
320 Simple_vFAIL(m); \
321} STMT_END
b45f050a
JF
322
323/*
324 * Like Simple_vFAIL(), but accepts two arguments.
325 */
ccb2c380 326#define Simple_vFAIL2(m,a1) STMT_START { \
a28509cc 327 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
328 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
329 (int)offset, RExC_precomp, RExC_precomp + offset); \
330} STMT_END
b45f050a
JF
331
332/*
333 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
334 */
ccb2c380
MP
335#define vFAIL2(m,a1) STMT_START { \
336 if (!SIZE_ONLY) \
337 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
338 Simple_vFAIL2(m, a1); \
339} STMT_END
b45f050a
JF
340
341
342/*
343 * Like Simple_vFAIL(), but accepts three arguments.
344 */
ccb2c380 345#define Simple_vFAIL3(m, a1, a2) STMT_START { \
a28509cc 346 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
347 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
348 (int)offset, RExC_precomp, RExC_precomp + offset); \
349} STMT_END
b45f050a
JF
350
351/*
352 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
353 */
ccb2c380
MP
354#define vFAIL3(m,a1,a2) STMT_START { \
355 if (!SIZE_ONLY) \
356 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
357 Simple_vFAIL3(m, a1, a2); \
358} STMT_END
b45f050a
JF
359
360/*
361 * Like Simple_vFAIL(), but accepts four arguments.
362 */
ccb2c380 363#define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
a28509cc 364 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
365 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
366 (int)offset, RExC_precomp, RExC_precomp + offset); \
367} STMT_END
b45f050a 368
ccb2c380 369#define vWARN(loc,m) STMT_START { \
a28509cc 370 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
371 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \
372 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
373} STMT_END
374
375#define vWARNdep(loc,m) STMT_START { \
a28509cc 376 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
377 Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
378 "%s" REPORT_LOCATION, \
379 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
380} STMT_END
381
382
383#define vWARN2(loc, m, a1) STMT_START { \
a28509cc 384 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
385 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
386 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
387} STMT_END
388
389#define vWARN3(loc, m, a1, a2) STMT_START { \
a28509cc 390 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
391 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
392 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
393} STMT_END
394
395#define vWARN4(loc, m, a1, a2, a3) STMT_START { \
a28509cc 396 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
397 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
398 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
399} STMT_END
400
401#define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
a28509cc 402 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
403 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
404 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
405} STMT_END
9d1d55b5 406
8615cb43 407
cd439c50 408/* Allow for side effects in s */
ccb2c380
MP
409#define REGC(c,s) STMT_START { \
410 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
411} STMT_END
cd439c50 412
fac92740
MJD
413/* Macros for recording node offsets. 20001227 mjd@plover.com
414 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
415 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
416 * Element 0 holds the number n.
417 */
418
419#define MJD_OFFSET_DEBUG(x)
a3621e74 420/* #define MJD_OFFSET_DEBUG(x) DEBUG_r(Perl_warn_nocontext x) */
ccb2c380
MP
421
422
423#define Set_Node_Offset_To_R(node,byte) STMT_START { \
424 if (! SIZE_ONLY) { \
425 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
426 __LINE__, (node), (byte))); \
427 if((node) < 0) { \
551405c4 428 Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
ccb2c380
MP
429 } else { \
430 RExC_offsets[2*(node)-1] = (byte); \
431 } \
432 } \
433} STMT_END
434
435#define Set_Node_Offset(node,byte) \
436 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
437#define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
438
439#define Set_Node_Length_To_R(node,len) STMT_START { \
440 if (! SIZE_ONLY) { \
441 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
551405c4 442 __LINE__, (int)(node), (int)(len))); \
ccb2c380 443 if((node) < 0) { \
551405c4 444 Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
ccb2c380
MP
445 } else { \
446 RExC_offsets[2*(node)] = (len); \
447 } \
448 } \
449} STMT_END
450
451#define Set_Node_Length(node,len) \
452 Set_Node_Length_To_R((node)-RExC_emit_start, len)
453#define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
454#define Set_Node_Cur_Length(node) \
455 Set_Node_Length(node, RExC_parse - parse_start)
fac92740
MJD
456
457/* Get offsets and lengths */
458#define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
459#define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
460
acfe0abc 461static void clear_re(pTHX_ void *r);
4327152a 462
653099ff
GS
463/* Mark that we cannot extend a found fixed substring at this point.
464 Updata the longest found anchored substring and the longest found
465 floating substrings if needed. */
466
4327152a 467STATIC void
830247a4 468S_scan_commit(pTHX_ RExC_state_t *pRExC_state, scan_data_t *data)
c277df42 469{
e1ec3a88
AL
470 const STRLEN l = CHR_SVLEN(data->last_found);
471 const STRLEN old_l = CHR_SVLEN(*data->longest);
b81d288d 472
c277df42 473 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
6b43b216 474 SvSetMagicSV(*data->longest, data->last_found);
c277df42
IZ
475 if (*data->longest == data->longest_fixed) {
476 data->offset_fixed = l ? data->last_start_min : data->pos_min;
477 if (data->flags & SF_BEFORE_EOL)
b81d288d 478 data->flags
c277df42
IZ
479 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
480 else
481 data->flags &= ~SF_FIX_BEFORE_EOL;
a0ed51b3
LW
482 }
483 else {
c277df42 484 data->offset_float_min = l ? data->last_start_min : data->pos_min;
b81d288d
AB
485 data->offset_float_max = (l
486 ? data->last_start_max
c277df42 487 : data->pos_min + data->pos_delta);
9051bda5
HS
488 if ((U32)data->offset_float_max > (U32)I32_MAX)
489 data->offset_float_max = I32_MAX;
c277df42 490 if (data->flags & SF_BEFORE_EOL)
b81d288d 491 data->flags
c277df42
IZ
492 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
493 else
494 data->flags &= ~SF_FL_BEFORE_EOL;
495 }
496 }
497 SvCUR_set(data->last_found, 0);
0eda9292 498 {
a28509cc
AL
499 SV * const sv = data->last_found;
500 MAGIC * const mg =
0eda9292
JH
501 SvUTF8(sv) && SvMAGICAL(sv) ? mg_find(sv, PERL_MAGIC_utf8) : NULL;
502 if (mg && mg->mg_len > 0)
503 mg->mg_len = 0;
504 }
c277df42
IZ
505 data->last_end = -1;
506 data->flags &= ~SF_BEFORE_EOL;
507}
508
653099ff
GS
509/* Can match anything (initialization) */
510STATIC void
830247a4 511S_cl_anything(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 512{
653099ff 513 ANYOF_CLASS_ZERO(cl);
f8bef550 514 ANYOF_BITMAP_SETALL(cl);
1aa99e6b 515 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
653099ff
GS
516 if (LOC)
517 cl->flags |= ANYOF_LOCALE;
518}
519
520/* Can match anything (initialization) */
521STATIC int
a28509cc 522S_cl_is_anything(pTHX_ const struct regnode_charclass_class *cl)
653099ff
GS
523{
524 int value;
525
aaa51d5e 526 for (value = 0; value <= ANYOF_MAX; value += 2)
653099ff
GS
527 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
528 return 1;
1aa99e6b
IH
529 if (!(cl->flags & ANYOF_UNICODE_ALL))
530 return 0;
f8bef550
NC
531 if (!ANYOF_BITMAP_TESTALLSET(cl))
532 return 0;
653099ff
GS
533 return 1;
534}
535
536/* Can match anything (initialization) */
537STATIC void
830247a4 538S_cl_init(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 539{
8ecf7187 540 Zero(cl, 1, struct regnode_charclass_class);
653099ff 541 cl->type = ANYOF;
830247a4 542 cl_anything(pRExC_state, cl);
653099ff
GS
543}
544
545STATIC void
830247a4 546S_cl_init_zero(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 547{
8ecf7187 548 Zero(cl, 1, struct regnode_charclass_class);
653099ff 549 cl->type = ANYOF;
830247a4 550 cl_anything(pRExC_state, cl);
653099ff
GS
551 if (LOC)
552 cl->flags |= ANYOF_LOCALE;
553}
554
555/* 'And' a given class with another one. Can create false positives */
556/* We assume that cl is not inverted */
557STATIC void
558S_cl_and(pTHX_ struct regnode_charclass_class *cl,
a28509cc 559 const struct regnode_charclass_class *and_with)
653099ff 560{
653099ff
GS
561 if (!(and_with->flags & ANYOF_CLASS)
562 && !(cl->flags & ANYOF_CLASS)
563 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
564 && !(and_with->flags & ANYOF_FOLD)
565 && !(cl->flags & ANYOF_FOLD)) {
566 int i;
567
568 if (and_with->flags & ANYOF_INVERT)
569 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
570 cl->bitmap[i] &= ~and_with->bitmap[i];
571 else
572 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
573 cl->bitmap[i] &= and_with->bitmap[i];
574 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
575 if (!(and_with->flags & ANYOF_EOS))
576 cl->flags &= ~ANYOF_EOS;
1aa99e6b 577
14ebb1a2
JH
578 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE &&
579 !(and_with->flags & ANYOF_INVERT)) {
1aa99e6b
IH
580 cl->flags &= ~ANYOF_UNICODE_ALL;
581 cl->flags |= ANYOF_UNICODE;
582 ARG_SET(cl, ARG(and_with));
583 }
14ebb1a2
JH
584 if (!(and_with->flags & ANYOF_UNICODE_ALL) &&
585 !(and_with->flags & ANYOF_INVERT))
1aa99e6b 586 cl->flags &= ~ANYOF_UNICODE_ALL;
14ebb1a2
JH
587 if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)) &&
588 !(and_with->flags & ANYOF_INVERT))
1aa99e6b 589 cl->flags &= ~ANYOF_UNICODE;
653099ff
GS
590}
591
592/* 'OR' a given class with another one. Can create false positives */
593/* We assume that cl is not inverted */
594STATIC void
a28509cc 595S_cl_or(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
653099ff 596{
653099ff
GS
597 if (or_with->flags & ANYOF_INVERT) {
598 /* We do not use
599 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
600 * <= (B1 | !B2) | (CL1 | !CL2)
601 * which is wasteful if CL2 is small, but we ignore CL2:
602 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
603 * XXXX Can we handle case-fold? Unclear:
604 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
605 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
606 */
607 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
608 && !(or_with->flags & ANYOF_FOLD)
609 && !(cl->flags & ANYOF_FOLD) ) {
610 int i;
611
612 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
613 cl->bitmap[i] |= ~or_with->bitmap[i];
614 } /* XXXX: logic is complicated otherwise */
615 else {
830247a4 616 cl_anything(pRExC_state, cl);
653099ff
GS
617 }
618 } else {
619 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
620 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
b81d288d 621 && (!(or_with->flags & ANYOF_FOLD)
653099ff
GS
622 || (cl->flags & ANYOF_FOLD)) ) {
623 int i;
624
625 /* OR char bitmap and class bitmap separately */
626 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
627 cl->bitmap[i] |= or_with->bitmap[i];
628 if (or_with->flags & ANYOF_CLASS) {
629 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
630 cl->classflags[i] |= or_with->classflags[i];
631 cl->flags |= ANYOF_CLASS;
632 }
633 }
634 else { /* XXXX: logic is complicated, leave it along for a moment. */
830247a4 635 cl_anything(pRExC_state, cl);
653099ff
GS
636 }
637 }
638 if (or_with->flags & ANYOF_EOS)
639 cl->flags |= ANYOF_EOS;
1aa99e6b
IH
640
641 if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
642 ARG(cl) != ARG(or_with)) {
643 cl->flags |= ANYOF_UNICODE_ALL;
644 cl->flags &= ~ANYOF_UNICODE;
645 }
646 if (or_with->flags & ANYOF_UNICODE_ALL) {
647 cl->flags |= ANYOF_UNICODE_ALL;
648 cl->flags &= ~ANYOF_UNICODE;
649 }
653099ff
GS
650}
651
5d1c421c 652/*
a3621e74
YO
653
654 make_trie(startbranch,first,last,tail,flags)
655 startbranch: the first branch in the whole branch sequence
656 first : start branch of sequence of branch-exact nodes.
657 May be the same as startbranch
658 last : Thing following the last branch.
659 May be the same as tail.
660 tail : item following the branch sequence
661 flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/
662
663Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
664
665A trie is an N'ary tree where the branches are determined by digital
666decomposition of the key. IE, at the root node you look up the 1st character and
667follow that branch repeat until you find the end of the branches. Nodes can be
668marked as "accepting" meaning they represent a complete word. Eg:
669
670 /he|she|his|hers/
671
672would convert into the following structure. Numbers represent states, letters
673following numbers represent valid transitions on the letter from that state, if
674the number is in square brackets it represents an accepting state, otherwise it
675will be in parenthesis.
676
677 +-h->+-e->[3]-+-r->(8)-+-s->[9]
678 | |
679 | (2)
680 | |
681 (1) +-i->(6)-+-s->[7]
682 |
683 +-s->(3)-+-h->(4)-+-e->[5]
684
685 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
686
687This shows that when matching against the string 'hers' we will begin at state 1
688read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
689then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
690is also accepting. Thus we know that we can match both 'he' and 'hers' with a
691single traverse. We store a mapping from accepting to state to which word was
692matched, and then when we have multiple possibilities we try to complete the
693rest of the regex in the order in which they occured in the alternation.
694
695The only prior NFA like behaviour that would be changed by the TRIE support is
696the silent ignoring of duplicate alternations which are of the form:
697
698 / (DUPE|DUPE) X? (?{ ... }) Y /x
699
700Thus EVAL blocks follwing a trie may be called a different number of times with
701and without the optimisation. With the optimisations dupes will be silently
702ignored. This inconsistant behaviour of EVAL type nodes is well established as
703the following demonstrates:
704
705 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
706
707which prints out 'word' three times, but
708
709 'words'=~/(word|word|word)(?{ print $1 })S/
710
711which doesnt print it out at all. This is due to other optimisations kicking in.
712
713Example of what happens on a structural level:
714
715The regexp /(ac|ad|ab)+/ will produce the folowing debug output:
716
717 1: CURLYM[1] {1,32767}(18)
718 5: BRANCH(8)
719 6: EXACT <ac>(16)
720 8: BRANCH(11)
721 9: EXACT <ad>(16)
722 11: BRANCH(14)
723 12: EXACT <ab>(16)
724 16: SUCCEED(0)
725 17: NOTHING(18)
726 18: END(0)
727
728This would be optimizable with startbranch=5, first=5, last=16, tail=16
729and should turn into:
730
731 1: CURLYM[1] {1,32767}(18)
732 5: TRIE(16)
733 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
734 <ac>
735 <ad>
736 <ab>
737 16: SUCCEED(0)
738 17: NOTHING(18)
739 18: END(0)
740
741Cases where tail != last would be like /(?foo|bar)baz/:
742
743 1: BRANCH(4)
744 2: EXACT <foo>(8)
745 4: BRANCH(7)
746 5: EXACT <bar>(8)
747 7: TAIL(8)
748 8: EXACT <baz>(10)
749 10: END(0)
750
751which would be optimizable with startbranch=1, first=1, last=7, tail=8
752and would end up looking like:
753
754 1: TRIE(8)
755 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
756 <foo>
757 <bar>
758 7: TAIL(8)
759 8: EXACT <baz>(10)
760 10: END(0)
761
762*/
763
764#define TRIE_DEBUG_CHAR \
765 DEBUG_TRIE_COMPILE_r({ \
766 SV *tmp; \
767 if ( UTF ) { \
d0043bd1 768 tmp = newSVpvn( "", 0 ); \
a3621e74
YO
769 pv_uni_display( tmp, uc, len, 60, UNI_DISPLAY_REGEX ); \
770 } else { \
e4584336 771 tmp = Perl_newSVpvf_nocontext( "%c", (int)uvc ); \
a3621e74
YO
772 } \
773 av_push( trie->revcharmap, tmp ); \
774 })
775
776#define TRIE_READ_CHAR STMT_START { \
777 if ( UTF ) { \
778 if ( folder ) { \
779 if ( foldlen > 0 ) { \
780 uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags ); \
781 foldlen -= len; \
782 scan += len; \
783 len = 0; \
784 } else { \
e1ec3a88 785 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
a3621e74
YO
786 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
787 foldlen -= UNISKIP( uvc ); \
788 scan = foldbuf + UNISKIP( uvc ); \
789 } \
790 } else { \
e1ec3a88 791 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
a3621e74
YO
792 } \
793 } else { \
794 uvc = (U32)*uc; \
795 len = 1; \
796 } \
797} STMT_END
798
799
800#define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
801#define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
802#define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
803#define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
804
805#define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
806 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
807 TRIE_LIST_LEN( state ) *= 2; \
808 Renew( trie->states[ state ].trans.list, \
809 TRIE_LIST_LEN( state ), reg_trie_trans_le ); \
810 } \
811 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
812 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
813 TRIE_LIST_CUR( state )++; \
814} STMT_END
815
816#define TRIE_LIST_NEW(state) STMT_START { \
a02a5408 817 Newxz( trie->states[ state ].trans.list, \
a3621e74
YO
818 4, reg_trie_trans_le ); \
819 TRIE_LIST_CUR( state ) = 1; \
820 TRIE_LIST_LEN( state ) = 4; \
821} STMT_END
822
823STATIC I32
824S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 flags)
825{
27da23d5 826 dVAR;
a3621e74
YO
827 /* first pass, loop through and scan words */
828 reg_trie_data *trie;
829 regnode *cur;
e1ec3a88 830 const U32 uniflags = ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY;
a3621e74
YO
831 STRLEN len = 0;
832 UV uvc = 0;
833 U16 curword = 0;
834 U32 next_alloc = 0;
835 /* we just use folder as a flag in utf8 */
e1ec3a88 836 const U8 * const folder = ( flags == EXACTF
a3621e74
YO
837 ? PL_fold
838 : ( flags == EXACTFL
839 ? PL_fold_locale
840 : NULL
841 )
842 );
843
e1ec3a88 844 const U32 data_slot = add_data( pRExC_state, 1, "t" );
a3621e74
YO
845 SV *re_trie_maxbuff;
846
847 GET_RE_DEBUG_FLAGS_DECL;
848
a02a5408 849 Newxz( trie, 1, reg_trie_data );
a3621e74
YO
850 trie->refcount = 1;
851 RExC_rx->data->data[ data_slot ] = (void*)trie;
a02a5408 852 Newxz( trie->charmap, 256, U16 );
a3621e74
YO
853 DEBUG_r({
854 trie->words = newAV();
855 trie->revcharmap = newAV();
856 });
857
858
0111c4fd 859 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
a3621e74 860 if (!SvIOK(re_trie_maxbuff)) {
0111c4fd 861 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
a3621e74
YO
862 }
863
864 /* -- First loop and Setup --
865
866 We first traverse the branches and scan each word to determine if it
867 contains widechars, and how many unique chars there are, this is
868 important as we have to build a table with at least as many columns as we
869 have unique chars.
870
871 We use an array of integers to represent the character codes 0..255
872 (trie->charmap) and we use a an HV* to store unicode characters. We use the
873 native representation of the character value as the key and IV's for the
874 coded index.
875
876 *TODO* If we keep track of how many times each character is used we can
877 remap the columns so that the table compression later on is more
878 efficient in terms of memory by ensuring most common value is in the
879 middle and the least common are on the outside. IMO this would be better
880 than a most to least common mapping as theres a decent chance the most
881 common letter will share a node with the least common, meaning the node
882 will not be compressable. With a middle is most common approach the worst
883 case is when we have the least common nodes twice.
884
885 */
886
887
888 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
c445ea15 889 regnode * const noper = NEXTOPER( cur );
e1ec3a88 890 const U8 *uc = (U8*)STRING( noper );
a28509cc 891 const U8 * const e = uc + STR_LEN( noper );
a3621e74
YO
892 STRLEN foldlen = 0;
893 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2af232bd 894 const U8 *scan = (U8*)NULL;
a3621e74
YO
895
896 for ( ; uc < e ; uc += len ) {
897 trie->charcount++;
898 TRIE_READ_CHAR;
899 if ( uvc < 256 ) {
900 if ( !trie->charmap[ uvc ] ) {
901 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
902 if ( folder )
903 trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
904 TRIE_DEBUG_CHAR;
905 }
906 } else {
907 SV** svpp;
908 if ( !trie->widecharmap )
909 trie->widecharmap = newHV();
910
911 svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 1 );
912
913 if ( !svpp )
e4584336 914 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
a3621e74
YO
915
916 if ( !SvTRUE( *svpp ) ) {
917 sv_setiv( *svpp, ++trie->uniquecharcount );
918 TRIE_DEBUG_CHAR;
919 }
920 }
921 }
922 trie->wordcount++;
923 } /* end first pass */
924 DEBUG_TRIE_COMPILE_r(
925 PerlIO_printf( Perl_debug_log, "TRIE(%s): W:%d C:%d Uq:%d \n",
926 ( trie->widecharmap ? "UTF8" : "NATIVE" ), trie->wordcount,
5d7488b2 927 (int)trie->charcount, trie->uniquecharcount )
a3621e74
YO
928 );
929
930
931 /*
932 We now know what we are dealing with in terms of unique chars and
933 string sizes so we can calculate how much memory a naive
0111c4fd
RGS
934 representation using a flat table will take. If it's over a reasonable
935 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
a3621e74
YO
936 conservative but potentially much slower representation using an array
937 of lists.
938
939 At the end we convert both representations into the same compressed
940 form that will be used in regexec.c for matching with. The latter
941 is a form that cannot be used to construct with but has memory
942 properties similar to the list form and access properties similar
943 to the table form making it both suitable for fast searches and
944 small enough that its feasable to store for the duration of a program.
945
946 See the comment in the code where the compressed table is produced
947 inplace from the flat tabe representation for an explanation of how
948 the compression works.
949
950 */
951
952
953 if ( (IV)( ( trie->charcount + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
954 /*
955 Second Pass -- Array Of Lists Representation
956
957 Each state will be represented by a list of charid:state records
958 (reg_trie_trans_le) the first such element holds the CUR and LEN
959 points of the allocated array. (See defines above).
960
961 We build the initial structure using the lists, and then convert
962 it into the compressed table form which allows faster lookups
963 (but cant be modified once converted).
964
965
966 */
967
968
969 STRLEN transcount = 1;
970
a02a5408 971 Newxz( trie->states, trie->charcount + 2, reg_trie_state );
a3621e74
YO
972 TRIE_LIST_NEW(1);
973 next_alloc = 2;
974
975 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
976
c445ea15
AL
977 regnode * const noper = NEXTOPER( cur );
978 U8 *uc = (U8*)STRING( noper );
979 const U8 * const e = uc + STR_LEN( noper );
980 U32 state = 1; /* required init */
981 U16 charid = 0; /* sanity init */
982 U8 *scan = (U8*)NULL; /* sanity init */
983 STRLEN foldlen = 0; /* required init */
984 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
985
986 for ( ; uc < e ; uc += len ) {
987
988 TRIE_READ_CHAR;
989
990 if ( uvc < 256 ) {
991 charid = trie->charmap[ uvc ];
992 } else {
993 SV** const svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 0);
994 if ( !svpp ) {
995 charid = 0;
996 } else {
997 charid=(U16)SvIV( *svpp );
998 }
999 }
1000 if ( charid ) {
a3621e74 1001
c445ea15
AL
1002 U16 check;
1003 U32 newstate = 0;
a3621e74 1004
c445ea15
AL
1005 charid--;
1006 if ( !trie->states[ state ].trans.list ) {
1007 TRIE_LIST_NEW( state );
1008 }
1009 for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1010 if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1011 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1012 break;
1013 }
1014 }
1015 if ( ! newstate ) {
1016 newstate = next_alloc++;
1017 TRIE_LIST_PUSH( state, charid, newstate );
1018 transcount++;
1019 }
1020 state = newstate;
1021 } else {
1022 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
a28509cc 1023 }
c445ea15
AL
1024 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1025 }
a3621e74 1026
c445ea15
AL
1027 if ( !trie->states[ state ].wordnum ) {
1028 /* we havent inserted this word into the structure yet. */
1029 trie->states[ state ].wordnum = ++curword;
a3621e74 1030
c445ea15
AL
1031 DEBUG_r({
1032 /* store the word for dumping */
1033 SV* tmp = newSVpvn( STRING( noper ), STR_LEN( noper ) );
1034 if ( UTF ) SvUTF8_on( tmp );
1035 av_push( trie->words, tmp );
1036 });
a3621e74 1037
c445ea15
AL
1038 } else {
1039 /* Its a dupe. So ignore it. */
1040 }
a3621e74
YO
1041
1042 } /* end second pass */
1043
1044 trie->laststate = next_alloc;
1045 Renew( trie->states, next_alloc, reg_trie_state );
1046
1047 DEBUG_TRIE_COMPILE_MORE_r({
1048 U32 state;
a3621e74 1049
a28509cc 1050 /* print out the table precompression. */
a3621e74
YO
1051
1052 PerlIO_printf( Perl_debug_log, "\nState :Word | Transition Data\n" );
1053 PerlIO_printf( Perl_debug_log, "------:-----+-----------------" );
1054
1055 for( state=1 ; state < next_alloc ; state ++ ) {
a28509cc 1056 U16 charid;
a3621e74 1057
e4584336 1058 PerlIO_printf( Perl_debug_log, "\n %04"UVXf" :", (UV)state );
a3621e74
YO
1059 if ( ! trie->states[ state ].wordnum ) {
1060 PerlIO_printf( Perl_debug_log, "%5s| ","");
1061 } else {
e4584336 1062 PerlIO_printf( Perl_debug_log, "W%04x| ",
a3621e74
YO
1063 trie->states[ state ].wordnum
1064 );
1065 }
1066 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
1067 SV **tmp = av_fetch( trie->revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
e4584336 1068 PerlIO_printf( Perl_debug_log, "%s:%3X=%04"UVXf" | ",
cfd0369c 1069 SvPV_nolen_const( *tmp ),
a3621e74 1070 TRIE_LIST_ITEM(state,charid).forid,
e4584336 1071 (UV)TRIE_LIST_ITEM(state,charid).newstate
a3621e74
YO
1072 );
1073 }
1074
1075 }
1076 PerlIO_printf( Perl_debug_log, "\n\n" );
1077 });
1078
a02a5408 1079 Newxz( trie->trans, transcount ,reg_trie_trans );
a3621e74
YO
1080 {
1081 U32 state;
a3621e74
YO
1082 U32 tp = 0;
1083 U32 zp = 0;
1084
1085
1086 for( state=1 ; state < next_alloc ; state ++ ) {
1087 U32 base=0;
1088
1089 /*
1090 DEBUG_TRIE_COMPILE_MORE_r(
1091 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1092 );
1093 */
1094
1095 if (trie->states[state].trans.list) {
1096 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1097 U16 maxid=minid;
a28509cc 1098 U16 idx;
a3621e74
YO
1099
1100 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
c445ea15
AL
1101 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1102 if ( forid < minid ) {
1103 minid=forid;
1104 } else if ( forid > maxid ) {
1105 maxid=forid;
1106 }
a3621e74
YO
1107 }
1108 if ( transcount < tp + maxid - minid + 1) {
1109 transcount *= 2;
1110 Renew( trie->trans, transcount, reg_trie_trans );
1111 Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1112 }
1113 base = trie->uniquecharcount + tp - minid;
1114 if ( maxid == minid ) {
1115 U32 set = 0;
1116 for ( ; zp < tp ; zp++ ) {
1117 if ( ! trie->trans[ zp ].next ) {
1118 base = trie->uniquecharcount + zp - minid;
1119 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1120 trie->trans[ zp ].check = state;
1121 set = 1;
1122 break;
1123 }
1124 }
1125 if ( !set ) {
1126 trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1127 trie->trans[ tp ].check = state;
1128 tp++;
1129 zp = tp;
1130 }
1131 } else {
1132 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
c445ea15 1133 const U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
a3621e74
YO
1134 trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1135 trie->trans[ tid ].check = state;
1136 }
1137 tp += ( maxid - minid + 1 );
1138 }
1139 Safefree(trie->states[ state ].trans.list);
1140 }
1141 /*
1142 DEBUG_TRIE_COMPILE_MORE_r(
1143 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1144 );
1145 */
1146 trie->states[ state ].trans.base=base;
1147 }
cc601c31 1148 trie->lasttrans = tp + 1;
a3621e74
YO
1149 }
1150 } else {
1151 /*
1152 Second Pass -- Flat Table Representation.
1153
1154 we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1155 We know that we will need Charcount+1 trans at most to store the data
1156 (one row per char at worst case) So we preallocate both structures
1157 assuming worst case.
1158
1159 We then construct the trie using only the .next slots of the entry
1160 structs.
1161
1162 We use the .check field of the first entry of the node temporarily to
1163 make compression both faster and easier by keeping track of how many non
1164 zero fields are in the node.
1165
1166 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1167 transition.
1168
1169 There are two terms at use here: state as a TRIE_NODEIDX() which is a
1170 number representing the first entry of the node, and state as a
1171 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1172 TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1173 are 2 entrys per node. eg:
1174
1175 A B A B
1176 1. 2 4 1. 3 7
1177 2. 0 3 3. 0 5
1178 3. 0 0 5. 0 0
1179 4. 0 0 7. 0 0
1180
1181 The table is internally in the right hand, idx form. However as we also
1182 have to deal with the states array which is indexed by nodenum we have to
1183 use TRIE_NODENUM() to convert.
1184
1185 */
1186
a02a5408 1187 Newxz( trie->trans, ( trie->charcount + 1 ) * trie->uniquecharcount + 1,
a3621e74 1188 reg_trie_trans );
a02a5408 1189 Newxz( trie->states, trie->charcount + 2, reg_trie_state );
a3621e74
YO
1190 next_alloc = trie->uniquecharcount + 1;
1191
1192 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1193
c445ea15 1194 regnode * const noper = NEXTOPER( cur );
a28509cc
AL
1195 const U8 *uc = (U8*)STRING( noper );
1196 const U8 * const e = uc + STR_LEN( noper );
a3621e74
YO
1197
1198 U32 state = 1; /* required init */
1199
1200 U16 charid = 0; /* sanity init */
1201 U32 accept_state = 0; /* sanity init */
1202 U8 *scan = (U8*)NULL; /* sanity init */
1203
1204 STRLEN foldlen = 0; /* required init */
1205 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1206
1207
1208 for ( ; uc < e ; uc += len ) {
1209
1210 TRIE_READ_CHAR;
1211
1212 if ( uvc < 256 ) {
1213 charid = trie->charmap[ uvc ];
1214 } else {
c445ea15
AL
1215 SV* const * const svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 0);
1216 charid = svpp ? (U16)SvIV(*svpp) : 0;
a3621e74
YO
1217 }
1218 if ( charid ) {
1219 charid--;
1220 if ( !trie->trans[ state + charid ].next ) {
1221 trie->trans[ state + charid ].next = next_alloc;
1222 trie->trans[ state ].check++;
1223 next_alloc += trie->uniquecharcount;
1224 }
1225 state = trie->trans[ state + charid ].next;
1226 } else {
e4584336 1227 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
a3621e74
YO
1228 }
1229 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1230 }
1231
1232 accept_state = TRIE_NODENUM( state );
1233 if ( !trie->states[ accept_state ].wordnum ) {
1234 /* we havent inserted this word into the structure yet. */
1235 trie->states[ accept_state ].wordnum = ++curword;
1236
1237 DEBUG_r({
1238 /* store the word for dumping */
1239 SV* tmp = newSVpvn( STRING( noper ), STR_LEN( noper ) );
1240 if ( UTF ) SvUTF8_on( tmp );
1241 av_push( trie->words, tmp );
1242 });
1243
1244 } else {
1245 /* Its a dupe. So ignore it. */
1246 }
1247
1248 } /* end second pass */
1249
1250 DEBUG_TRIE_COMPILE_MORE_r({
1251 /*
1252 print out the table precompression so that we can do a visual check
1253 that they are identical.
1254 */
1255 U32 state;
1256 U16 charid;
1257 PerlIO_printf( Perl_debug_log, "\nChar : " );
1258
1259 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1260 SV **tmp = av_fetch( trie->revcharmap, charid, 0);
1261 if ( tmp ) {
cfd0369c 1262 PerlIO_printf( Perl_debug_log, "%4.4s ", SvPV_nolen_const( *tmp ) );
a3621e74
YO
1263 }
1264 }
1265
1266 PerlIO_printf( Perl_debug_log, "\nState+-" );
1267
1268 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
1269 PerlIO_printf( Perl_debug_log, "%4s-", "----" );
1270 }
1271
1272 PerlIO_printf( Perl_debug_log, "\n" );
1273
1274 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1275
e4584336 1276 PerlIO_printf( Perl_debug_log, "%04"UVXf" : ", (UV)TRIE_NODENUM( state ) );
a3621e74
YO
1277
1278 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
e4584336
RB
1279 PerlIO_printf( Perl_debug_log, "%04"UVXf" ",
1280 (UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next ) );
a3621e74
YO
1281 }
1282 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
e4584336 1283 PerlIO_printf( Perl_debug_log, " (%04"UVXf")\n", (UV)trie->trans[ state ].check );
a3621e74 1284 } else {
e4584336 1285 PerlIO_printf( Perl_debug_log, " (%04"UVXf") W%04X\n", (UV)trie->trans[ state ].check,
a3621e74
YO
1286 trie->states[ TRIE_NODENUM( state ) ].wordnum );
1287 }
1288 }
1289 PerlIO_printf( Perl_debug_log, "\n\n" );
1290 });
1291 {
1292 /*
1293 * Inplace compress the table.*
1294
1295 For sparse data sets the table constructed by the trie algorithm will
1296 be mostly 0/FAIL transitions or to put it another way mostly empty.
1297 (Note that leaf nodes will not contain any transitions.)
1298
1299 This algorithm compresses the tables by eliminating most such
1300 transitions, at the cost of a modest bit of extra work during lookup:
1301
1302 - Each states[] entry contains a .base field which indicates the
1303 index in the state[] array wheres its transition data is stored.
1304
1305 - If .base is 0 there are no valid transitions from that node.
1306
1307 - If .base is nonzero then charid is added to it to find an entry in
1308 the trans array.
1309
1310 -If trans[states[state].base+charid].check!=state then the
1311 transition is taken to be a 0/Fail transition. Thus if there are fail
1312 transitions at the front of the node then the .base offset will point
1313 somewhere inside the previous nodes data (or maybe even into a node
1314 even earlier), but the .check field determines if the transition is
1315 valid.
1316
1317 The following process inplace converts the table to the compressed
1318 table: We first do not compress the root node 1,and mark its all its
1319 .check pointers as 1 and set its .base pointer as 1 as well. This
1320 allows to do a DFA construction from the compressed table later, and
1321 ensures that any .base pointers we calculate later are greater than
1322 0.
1323
1324 - We set 'pos' to indicate the first entry of the second node.
1325
1326 - We then iterate over the columns of the node, finding the first and
1327 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
1328 and set the .check pointers accordingly, and advance pos
1329 appropriately and repreat for the next node. Note that when we copy
1330 the next pointers we have to convert them from the original
1331 NODEIDX form to NODENUM form as the former is not valid post
1332 compression.
1333
1334 - If a node has no transitions used we mark its base as 0 and do not
1335 advance the pos pointer.
1336
1337 - If a node only has one transition we use a second pointer into the
1338 structure to fill in allocated fail transitions from other states.
1339 This pointer is independent of the main pointer and scans forward
1340 looking for null transitions that are allocated to a state. When it
1341 finds one it writes the single transition into the "hole". If the
1342 pointer doesnt find one the single transition is appeneded as normal.
1343
1344 - Once compressed we can Renew/realloc the structures to release the
1345 excess space.
1346
1347 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
1348 specifically Fig 3.47 and the associated pseudocode.
1349
1350 demq
1351 */
a3b680e6 1352 const U32 laststate = TRIE_NODENUM( next_alloc );
a28509cc 1353 U32 state, charid;
a3621e74
YO
1354 U32 pos = 0, zp=0;
1355 trie->laststate = laststate;
1356
1357 for ( state = 1 ; state < laststate ; state++ ) {
1358 U8 flag = 0;
a28509cc
AL
1359 const U32 stateidx = TRIE_NODEIDX( state );
1360 const U32 o_used = trie->trans[ stateidx ].check;
1361 U32 used = trie->trans[ stateidx ].check;
a3621e74
YO
1362 trie->trans[ stateidx ].check = 0;
1363
1364 for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
1365 if ( flag || trie->trans[ stateidx + charid ].next ) {
1366 if ( trie->trans[ stateidx + charid ].next ) {
1367 if (o_used == 1) {
1368 for ( ; zp < pos ; zp++ ) {
1369 if ( ! trie->trans[ zp ].next ) {
1370 break;
1371 }
1372 }
1373 trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
1374 trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1375 trie->trans[ zp ].check = state;
1376 if ( ++zp > pos ) pos = zp;
1377 break;
1378 }
1379 used--;
1380 }
1381 if ( !flag ) {
1382 flag = 1;
1383 trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
1384 }
1385 trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1386 trie->trans[ pos ].check = state;
1387 pos++;
1388 }
1389 }
1390 }
cc601c31 1391 trie->lasttrans = pos + 1;
a3621e74
YO
1392 Renew( trie->states, laststate + 1, reg_trie_state);
1393 DEBUG_TRIE_COMPILE_MORE_r(
e4584336
RB
1394 PerlIO_printf( Perl_debug_log,
1395 " Alloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
5d7488b2
AL
1396 (int)( ( trie->charcount + 1 ) * trie->uniquecharcount + 1 ),
1397 (IV)next_alloc,
1398 (IV)pos,
a3621e74
YO
1399 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
1400 );
1401
1402 } /* end table compress */
1403 }
cc601c31
YO
1404 /* resize the trans array to remove unused space */
1405 Renew( trie->trans, trie->lasttrans, reg_trie_trans);
a3621e74
YO
1406
1407 DEBUG_TRIE_COMPILE_r({
1408 U32 state;
1409 /*
1410 Now we print it out again, in a slightly different form as there is additional
1411 info we want to be able to see when its compressed. They are close enough for
1412 visual comparison though.
1413 */
1414 PerlIO_printf( Perl_debug_log, "\nChar : %-6s%-6s%-4s ","Match","Base","Ofs" );
1415
1416 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
1417 SV **tmp = av_fetch( trie->revcharmap, state, 0);
1418 if ( tmp ) {
cfd0369c 1419 PerlIO_printf( Perl_debug_log, "%4.4s ", SvPV_nolen_const( *tmp ) );
a3621e74
YO
1420 }
1421 }
1422 PerlIO_printf( Perl_debug_log, "\n-----:-----------------------");
cc601c31 1423
a3621e74
YO
1424 for( state = 0 ; state < trie->uniquecharcount ; state++ )
1425 PerlIO_printf( Perl_debug_log, "-----");
1426 PerlIO_printf( Perl_debug_log, "\n");
cc601c31 1427
a3621e74 1428 for( state = 1 ; state < trie->laststate ; state++ ) {
a28509cc 1429 const U32 base = trie->states[ state ].trans.base;
a3621e74 1430
e4584336 1431 PerlIO_printf( Perl_debug_log, "#%04"UVXf" ", (UV)state);
a3621e74
YO
1432
1433 if ( trie->states[ state ].wordnum ) {
1434 PerlIO_printf( Perl_debug_log, " W%04X", trie->states[ state ].wordnum );
1435 } else {
1436 PerlIO_printf( Perl_debug_log, "%6s", "" );
1437 }
1438
e4584336 1439 PerlIO_printf( Perl_debug_log, " @%04"UVXf" ", (UV)base );
a3621e74
YO
1440
1441 if ( base ) {
1442 U32 ofs = 0;
1443
cc601c31
YO
1444 while( ( base + ofs < trie->uniquecharcount ) ||
1445 ( base + ofs - trie->uniquecharcount < trie->lasttrans
1446 && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
a3621e74
YO
1447 ofs++;
1448
e4584336 1449 PerlIO_printf( Perl_debug_log, "+%02"UVXf"[ ", (UV)ofs);
a3621e74
YO
1450
1451 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
cc601c31
YO
1452 if ( ( base + ofs >= trie->uniquecharcount ) &&
1453 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
a3621e74
YO
1454 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1455 {
e4584336
RB
1456 PerlIO_printf( Perl_debug_log, "%04"UVXf" ",
1457 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
a3621e74
YO
1458 } else {
1459 PerlIO_printf( Perl_debug_log, "%4s "," 0" );
1460 }
1461 }
1462
e4584336 1463 PerlIO_printf( Perl_debug_log, "]");
a3621e74
YO
1464
1465 }
1466 PerlIO_printf( Perl_debug_log, "\n" );
1467 }
1468 });
1469
1470 {
1471 /* now finally we "stitch in" the new TRIE node
1472 This means we convert either the first branch or the first Exact,
1473 depending on whether the thing following (in 'last') is a branch
1474 or not and whther first is the startbranch (ie is it a sub part of
1475 the alternation or is it the whole thing.)
1476 Assuming its a sub part we conver the EXACT otherwise we convert
1477 the whole branch sequence, including the first.
1478 */
1479 regnode *convert;
1480
1481
1482
1483
1484 if ( first == startbranch && OP( last ) != BRANCH ) {
1485 convert = first;
1486 } else {
1487 convert = NEXTOPER( first );
1488 NEXT_OFF( first ) = (U16)(last - first);
1489 }
1490
1491 OP( convert ) = TRIE + (U8)( flags - EXACT );
1492 NEXT_OFF( convert ) = (U16)(tail - convert);
1493 ARG_SET( convert, data_slot );
1494
1495 /* tells us if we need to handle accept buffers specially */
1496 convert->flags = ( RExC_seen_evals ? 1 : 0 );
1497
1498
1499 /* needed for dumping*/
1500 DEBUG_r({
1501 regnode *optimize = convert + NODE_STEP_REGNODE + regarglen[ TRIE ];
1502 /* We now need to mark all of the space originally used by the
1503 branches as optimized away. This keeps the dumpuntil from
1504 throwing a wobbly as it doesnt use regnext() to traverse the
1505 opcodes.
1506 */
1507 while( optimize < last ) {
1508 OP( optimize ) = OPTIMIZED;
1509 optimize++;
1510 }
1511 });
1512 } /* end node insert */
1513 return 1;
1514}
1515
1516
1517
1518/*
5d1c421c
JH
1519 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
1520 * These need to be revisited when a newer toolchain becomes available.
1521 */
1522#if defined(__sparc64__) && defined(__GNUC__)
1523# if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
1524# undef SPARC64_GCC_WORKAROUND
1525# define SPARC64_GCC_WORKAROUND 1
1526# endif
1527#endif
1528
653099ff
GS
1529/* REx optimizer. Converts nodes into quickier variants "in place".
1530 Finds fixed substrings. */
1531
a0288114 1532/* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
c277df42
IZ
1533 to the position after last scanned or to NULL. */
1534
a3621e74 1535
76e3520e 1536STATIC I32
9a957fbc
AL
1537S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap,
1538 regnode *last, scan_data_t *data, U32 flags, U32 depth)
c277df42
IZ
1539 /* scanp: Start here (read-write). */
1540 /* deltap: Write maxlen-minlen here. */
1541 /* last: Stop before this one. */
1542{
97aff369 1543 dVAR;
c277df42
IZ
1544 I32 min = 0, pars = 0, code;
1545 regnode *scan = *scanp, *next;
1546 I32 delta = 0;
1547 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
aca2d497 1548 int is_inf_internal = 0; /* The studied chunk is infinite */
c277df42
IZ
1549 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
1550 scan_data_t data_fake;
653099ff 1551 struct regnode_charclass_class and_with; /* Valid if flags & SCF_DO_STCLASS_OR */
a3621e74
YO
1552 SV *re_trie_maxbuff = NULL;
1553
1554 GET_RE_DEBUG_FLAGS_DECL;
b81d288d 1555
c277df42
IZ
1556 while (scan && OP(scan) != END && scan < last) {
1557 /* Peephole optimizer: */
a3621e74 1558 DEBUG_OPTIMISE_r({
c445ea15 1559 SV * const mysv=sv_newmortal();
a3621e74 1560 regprop( mysv, scan);
e4584336 1561 PerlIO_printf(Perl_debug_log, "%*speep: %s (0x%08"UVXf")\n",
cfd0369c 1562 (int)depth*2, "", SvPV_nolen_const(mysv), PTR2UV(scan));
a3621e74 1563 });
c277df42 1564
22c35a8c 1565 if (PL_regkind[(U8)OP(scan)] == EXACT) {
653099ff 1566 /* Merge several consecutive EXACTish nodes into one. */
c277df42
IZ
1567 regnode *n = regnext(scan);
1568 U32 stringok = 1;
1569#ifdef DEBUGGING
1570 regnode *stop = scan;
b81d288d 1571#endif
c277df42 1572
cd439c50 1573 next = scan + NODE_SZ_STR(scan);
c277df42
IZ
1574 /* Skip NOTHING, merge EXACT*. */
1575 while (n &&
b81d288d 1576 ( PL_regkind[(U8)OP(n)] == NOTHING ||
c277df42
IZ
1577 (stringok && (OP(n) == OP(scan))))
1578 && NEXT_OFF(n)
1579 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
1580 if (OP(n) == TAIL || n > next)
1581 stringok = 0;
22c35a8c 1582 if (PL_regkind[(U8)OP(n)] == NOTHING) {
c277df42
IZ
1583 NEXT_OFF(scan) += NEXT_OFF(n);
1584 next = n + NODE_STEP_REGNODE;
1585#ifdef DEBUGGING
1586 if (stringok)
1587 stop = n;
b81d288d 1588#endif
c277df42 1589 n = regnext(n);
a0ed51b3 1590 }
f49d4d0f 1591 else if (stringok) {
a3b680e6 1592 const int oldl = STR_LEN(scan);
c445ea15 1593 regnode * const nnext = regnext(n);
f49d4d0f 1594
b81d288d 1595 if (oldl + STR_LEN(n) > U8_MAX)
c277df42
IZ
1596 break;
1597 NEXT_OFF(scan) += NEXT_OFF(n);
cd439c50
IZ
1598 STR_LEN(scan) += STR_LEN(n);
1599 next = n + NODE_SZ_STR(n);
c277df42 1600 /* Now we can overwrite *n : */
f49d4d0f 1601 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
c277df42 1602#ifdef DEBUGGING
f49d4d0f 1603 stop = next - 1;
b81d288d 1604#endif
c277df42
IZ
1605 n = nnext;
1606 }
1607 }
61a36c01 1608
a3621e74 1609 if (UTF && ( OP(scan) == EXACTF ) && ( STR_LEN(scan) >= 6 ) ) {
61a36c01
JH
1610/*
1611 Two problematic code points in Unicode casefolding of EXACT nodes:
1612
1613 U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
1614 U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
1615
1616 which casefold to
1617
1618 Unicode UTF-8
1619
1620 U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
1621 U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
1622
1623 This means that in case-insensitive matching (or "loose matching",
1624 as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
1625 length of the above casefolded versions) can match a target string
1626 of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
1627 This would rather mess up the minimum length computation.
1628
1629 What we'll do is to look for the tail four bytes, and then peek
1630 at the preceding two bytes to see whether we need to decrease
1631 the minimum length by four (six minus two).
1632
1633 Thanks to the design of UTF-8, there cannot be false matches:
1634 A sequence of valid UTF-8 bytes cannot be a subsequence of
1635 another valid sequence of UTF-8 bytes.
1636
1637*/
c445ea15
AL
1638 char * const s0 = STRING(scan), *s, *t;
1639 char * const s1 = s0 + STR_LEN(scan) - 1;
1640 char * const s2 = s1 - 4;
a28509cc
AL
1641 const char * const t0 = "\xcc\x88\xcc\x81";
1642 const char * const t1 = t0 + 3;
2af232bd 1643
61a36c01
JH
1644 for (s = s0 + 2;
1645 s < s2 && (t = ninstr(s, s1, t0, t1));
1646 s = t + 4) {
1647 if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
1648 ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
1649 min -= 4;
1650 }
1651 }
1652
c277df42
IZ
1653#ifdef DEBUGGING
1654 /* Allow dumping */
cd439c50 1655 n = scan + NODE_SZ_STR(scan);
c277df42 1656 while (n <= stop) {
22c35a8c 1657 if (PL_regkind[(U8)OP(n)] != NOTHING || OP(n) == NOTHING) {
c277df42
IZ
1658 OP(n) = OPTIMIZED;
1659 NEXT_OFF(n) = 0;
1660 }
1661 n++;
1662 }
653099ff 1663#endif
c277df42 1664 }
a3621e74
YO
1665
1666
1667
653099ff
GS
1668 /* Follow the next-chain of the current node and optimize
1669 away all the NOTHINGs from it. */
c277df42 1670 if (OP(scan) != CURLYX) {
a3b680e6 1671 const int max = (reg_off_by_arg[OP(scan)]
048cfca1
GS
1672 ? I32_MAX
1673 /* I32 may be smaller than U16 on CRAYs! */
1674 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
c277df42
IZ
1675 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
1676 int noff;
1677 regnode *n = scan;
b81d288d 1678
c277df42
IZ
1679 /* Skip NOTHING and LONGJMP. */
1680 while ((n = regnext(n))
22c35a8c 1681 && ((PL_regkind[(U8)OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
c277df42
IZ
1682 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
1683 && off + noff < max)
1684 off += noff;
1685 if (reg_off_by_arg[OP(scan)])
1686 ARG(scan) = off;
b81d288d 1687 else
c277df42
IZ
1688 NEXT_OFF(scan) = off;
1689 }
a3621e74 1690
653099ff
GS
1691 /* The principal pseudo-switch. Cannot be a switch, since we
1692 look into several different things. */
b81d288d 1693 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
c277df42
IZ
1694 || OP(scan) == IFTHEN || OP(scan) == SUSPEND) {
1695 next = regnext(scan);
1696 code = OP(scan);
a3621e74 1697 /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
b81d288d
AB
1698
1699 if (OP(next) == code || code == IFTHEN || code == SUSPEND) {
c277df42 1700 I32 max1 = 0, min1 = I32_MAX, num = 0;
653099ff 1701 struct regnode_charclass_class accum;
a3621e74 1702 regnode *startbranch=scan;
c277df42 1703
653099ff 1704 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
830247a4 1705 scan_commit(pRExC_state, data); /* Cannot merge strings after this. */
653099ff 1706 if (flags & SCF_DO_STCLASS)
830247a4 1707 cl_init_zero(pRExC_state, &accum);
a3621e74 1708
c277df42 1709 while (OP(scan) == code) {
830247a4 1710 I32 deltanext, minnext, f = 0, fake;
653099ff 1711 struct regnode_charclass_class this_class;
c277df42
IZ
1712
1713 num++;
1714 data_fake.flags = 0;
b81d288d 1715 if (data) {
2c2d71f5 1716 data_fake.whilem_c = data->whilem_c;
cb434fcc
IZ
1717 data_fake.last_closep = data->last_closep;
1718 }
1719 else
1720 data_fake.last_closep = &fake;
c277df42
IZ
1721 next = regnext(scan);
1722 scan = NEXTOPER(scan);
1723 if (code != BRANCH)
1724 scan = NEXTOPER(scan);
653099ff 1725 if (flags & SCF_DO_STCLASS) {
830247a4 1726 cl_init(pRExC_state, &this_class);
653099ff
GS
1727 data_fake.start_class = &this_class;
1728 f = SCF_DO_STCLASS_AND;
b81d288d 1729 }
e1901655
IZ
1730 if (flags & SCF_WHILEM_VISITED_POS)
1731 f |= SCF_WHILEM_VISITED_POS;
a3621e74 1732
653099ff 1733 /* we suppose the run is continuous, last=next...*/
830247a4 1734 minnext = study_chunk(pRExC_state, &scan, &deltanext,
a3621e74 1735 next, &data_fake, f,depth+1);
b81d288d 1736 if (min1 > minnext)
c277df42
IZ
1737 min1 = minnext;
1738 if (max1 < minnext + deltanext)
1739 max1 = minnext + deltanext;
1740 if (deltanext == I32_MAX)
aca2d497 1741 is_inf = is_inf_internal = 1;
c277df42
IZ
1742 scan = next;
1743 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
1744 pars++;
405ff068 1745 if (data && (data_fake.flags & SF_HAS_EVAL))
c277df42 1746 data->flags |= SF_HAS_EVAL;
2c2d71f5
JH
1747 if (data)
1748 data->whilem_c = data_fake.whilem_c;
653099ff 1749 if (flags & SCF_DO_STCLASS)
830247a4 1750 cl_or(pRExC_state, &accum, &this_class);
b81d288d 1751 if (code == SUSPEND)
c277df42
IZ
1752 break;
1753 }
1754 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
1755 min1 = 0;
1756 if (flags & SCF_DO_SUBSTR) {
1757 data->pos_min += min1;
1758 data->pos_delta += max1 - min1;
1759 if (max1 != min1 || is_inf)
1760 data->longest = &(data->longest_float);
1761 }
1762 min += min1;
1763 delta += max1 - min1;
653099ff 1764 if (flags & SCF_DO_STCLASS_OR) {
830247a4 1765 cl_or(pRExC_state, data->start_class, &accum);
653099ff
GS
1766 if (min1) {
1767 cl_and(data->start_class, &and_with);
1768 flags &= ~SCF_DO_STCLASS;
1769 }
1770 }
1771 else if (flags & SCF_DO_STCLASS_AND) {
de0c8cb8
GS
1772 if (min1) {
1773 cl_and(data->start_class, &accum);
653099ff 1774 flags &= ~SCF_DO_STCLASS;
de0c8cb8
GS
1775 }
1776 else {
b81d288d 1777 /* Switch to OR mode: cache the old value of
de0c8cb8
GS
1778 * data->start_class */
1779 StructCopy(data->start_class, &and_with,
1780 struct regnode_charclass_class);
1781 flags &= ~SCF_DO_STCLASS_AND;
1782 StructCopy(&accum, data->start_class,
1783 struct regnode_charclass_class);
1784 flags |= SCF_DO_STCLASS_OR;
1785 data->start_class->flags |= ANYOF_EOS;
1786 }
653099ff 1787 }
a3621e74
YO
1788
1789 /* demq.
1790
1791 Assuming this was/is a branch we are dealing with: 'scan' now
1792 points at the item that follows the branch sequence, whatever
1793 it is. We now start at the beginning of the sequence and look
1794 for subsequences of
1795
1796 BRANCH->EXACT=>X
1797 BRANCH->EXACT=>X
1798
1799 which would be constructed from a pattern like /A|LIST|OF|WORDS/
1800
1801 If we can find such a subseqence we need to turn the first
1802 element into a trie and then add the subsequent branch exact
1803 strings to the trie.
1804
1805 We have two cases
1806
1807 1. patterns where the whole set of branch can be converted to a trie,
1808
1809 2. patterns where only a subset of the alternations can be
1810 converted to a trie.
1811
1812 In case 1 we can replace the whole set with a single regop
1813 for the trie. In case 2 we need to keep the start and end
1814 branchs so
1815
1816 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
1817 becomes BRANCH TRIE; BRANCH X;
1818
1819 Hypthetically when we know the regex isnt anchored we can
1820 turn a case 1 into a DFA and let it rip... Every time it finds a match
1821 it would just call its tail, no WHILEM/CURLY needed.
1822
1823 */
0111c4fd
RGS
1824 if (DO_TRIE) {
1825 if (!re_trie_maxbuff) {
1826 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
1827 if (!SvIOK(re_trie_maxbuff))
1828 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
1829 }
a3621e74
YO
1830 if ( SvIV(re_trie_maxbuff)>=0 && OP( startbranch )==BRANCH ) {
1831 regnode *cur;
1832 regnode *first = (regnode *)NULL;
1833 regnode *last = (regnode *)NULL;
1834 regnode *tail = scan;
1835 U8 optype = 0;
1836 U32 count=0;
1837
1838#ifdef DEBUGGING
c445ea15 1839 SV * const mysv = sv_newmortal(); /* for dumping */
a3621e74
YO
1840#endif
1841 /* var tail is used because there may be a TAIL
1842 regop in the way. Ie, the exacts will point to the
1843 thing following the TAIL, but the last branch will
1844 point at the TAIL. So we advance tail. If we
1845 have nested (?:) we may have to move through several
1846 tails.
1847 */
1848
1849 while ( OP( tail ) == TAIL ) {
1850 /* this is the TAIL generated by (?:) */
1851 tail = regnext( tail );
1852 }
1853
1854 DEBUG_OPTIMISE_r({
1855 regprop( mysv, tail );
1856 PerlIO_printf( Perl_debug_log, "%*s%s%s%s\n",
cfd0369c 1857 (int)depth * 2 + 2, "", "Tail node is:", SvPV_nolen_const( mysv ),
a3621e74
YO
1858 (RExC_seen_evals) ? "[EVAL]" : ""
1859 );
1860 });
1861 /*
1862
1863 step through the branches, cur represents each
1864 branch, noper is the first thing to be matched
1865 as part of that branch and noper_next is the
1866 regnext() of that node. if noper is an EXACT
1867 and noper_next is the same as scan (our current
1868 position in the regex) then the EXACT branch is
1869 a possible optimization target. Once we have
1870 two or more consequetive such branches we can
1871 create a trie of the EXACT's contents and stich
1872 it in place. If the sequence represents all of
1873 the branches we eliminate the whole thing and
1874 replace it with a single TRIE. If it is a
1875 subsequence then we need to stitch it in. This
1876 means the first branch has to remain, and needs
1877 to be repointed at the item on the branch chain
1878 following the last branch optimized. This could
1879 be either a BRANCH, in which case the
1880 subsequence is internal, or it could be the
1881 item following the branch sequence in which
1882 case the subsequence is at the end.
1883
1884 */
1885
1886 /* dont use tail as the end marker for this traverse */
1887 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
aec46f14
AL
1888 regnode * const noper = NEXTOPER( cur );
1889 regnode * const noper_next = regnext( noper );
a3621e74 1890
a3621e74
YO
1891 DEBUG_OPTIMISE_r({
1892 regprop( mysv, cur);
1893 PerlIO_printf( Perl_debug_log, "%*s%s",
cfd0369c 1894 (int)depth * 2 + 2," ", SvPV_nolen_const( mysv ) );
a3621e74
YO
1895
1896 regprop( mysv, noper);
1897 PerlIO_printf( Perl_debug_log, " -> %s",
cfd0369c 1898 SvPV_nolen_const(mysv));
a3621e74
YO
1899
1900 if ( noper_next ) {
1901 regprop( mysv, noper_next );
1902 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
cfd0369c 1903 SvPV_nolen_const(mysv));
a3621e74
YO
1904 }
1905 PerlIO_printf( Perl_debug_log, "0x%p,0x%p,0x%p)\n",
1906 first, last, cur );
1907 });
1908 if ( ( first ? OP( noper ) == optype
1909 : PL_regkind[ (U8)OP( noper ) ] == EXACT )
1910 && noper_next == tail && count<U16_MAX)
1911 {
1912 count++;
1913 if ( !first ) {
1914 first = cur;
1915 optype = OP( noper );
1916 } else {
1917 DEBUG_OPTIMISE_r(
1918 if (!last ) {
1919 regprop( mysv, first);
1920 PerlIO_printf( Perl_debug_log, "%*s%s",
cfd0369c 1921 (int)depth * 2 + 2, "F:", SvPV_nolen_const( mysv ) );
a3621e74
YO
1922 regprop( mysv, NEXTOPER(first) );
1923 PerlIO_printf( Perl_debug_log, " -> %s\n",
cfd0369c 1924 SvPV_nolen_const( mysv ) );
a3621e74
YO
1925 }
1926 );
1927 last = cur;
1928 DEBUG_OPTIMISE_r({
1929 regprop( mysv, cur);
1930 PerlIO_printf( Perl_debug_log, "%*s%s",
cfd0369c 1931 (int)depth * 2 + 2, "N:", SvPV_nolen_const( mysv ) );
a3621e74
YO
1932 regprop( mysv, noper );
1933 PerlIO_printf( Perl_debug_log, " -> %s\n",
cfd0369c 1934 SvPV_nolen_const( mysv ) );
a3621e74
YO
1935 });
1936 }
1937 } else {
1938 if ( last ) {
1939 DEBUG_OPTIMISE_r(
1940 PerlIO_printf( Perl_debug_log, "%*s%s\n",
e4584336 1941 (int)depth * 2 + 2, "E:", "**END**" );
a3621e74
YO
1942 );
1943 make_trie( pRExC_state, startbranch, first, cur, tail, optype );
1944 }
1945 if ( PL_regkind[ (U8)OP( noper ) ] == EXACT
1946 && noper_next == tail )
1947 {
1948 count = 1;
1949 first = cur;
1950 optype = OP( noper );
1951 } else {
1952 count = 0;
1953 first = NULL;
1954 optype = 0;
1955 }
1956 last = NULL;
1957 }
1958 }
1959 DEBUG_OPTIMISE_r({
1960 regprop( mysv, cur);
1961 PerlIO_printf( Perl_debug_log,
e4584336 1962 "%*s%s\t(0x%p,0x%p,0x%p)\n", (int)depth * 2 + 2,
cfd0369c 1963 " ", SvPV_nolen_const( mysv ), first, last, cur);
a3621e74
YO
1964
1965 });
1966 if ( last ) {
1967 DEBUG_OPTIMISE_r(
1968 PerlIO_printf( Perl_debug_log, "%*s%s\n",
e4584336 1969 (int)depth * 2 + 2, "E:", "==END==" );
a3621e74
YO
1970 );
1971 make_trie( pRExC_state, startbranch, first, scan, tail, optype );
1972 }
1973 }
1974 }
a0ed51b3 1975 }
a3621e74 1976 else if ( code == BRANCHJ ) { /* single branch is optimized. */
c277df42 1977 scan = NEXTOPER(NEXTOPER(scan));
a3621e74 1978 } else /* single branch is optimized. */
c277df42
IZ
1979 scan = NEXTOPER(scan);
1980 continue;
a0ed51b3
LW
1981 }
1982 else if (OP(scan) == EXACT) {
cd439c50 1983 I32 l = STR_LEN(scan);
c445ea15 1984 UV uc;
a0ed51b3 1985 if (UTF) {
a3b680e6 1986 const U8 * const s = (U8*)STRING(scan);
1aa99e6b 1987 l = utf8_length(s, s + l);
9041c2e3 1988 uc = utf8_to_uvchr(s, NULL);
c445ea15
AL
1989 } else {
1990 uc = *((U8*)STRING(scan));
a0ed51b3
LW
1991 }
1992 min += l;
c277df42 1993 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
c277df42
IZ
1994 /* The code below prefers earlier match for fixed
1995 offset, later match for variable offset. */
1996 if (data->last_end == -1) { /* Update the start info. */
1997 data->last_start_min = data->pos_min;
1998 data->last_start_max = is_inf
b81d288d 1999 ? I32_MAX : data->pos_min + data->pos_delta;
c277df42 2000 }
cd439c50 2001 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
0eda9292 2002 {
9a957fbc 2003 SV * const sv = data->last_found;
a28509cc 2004 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
0eda9292
JH
2005 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2006 if (mg && mg->mg_len >= 0)
5e43f467
JH
2007 mg->mg_len += utf8_length((U8*)STRING(scan),
2008 (U8*)STRING(scan)+STR_LEN(scan));
0eda9292 2009 }
33b8afdf
JH
2010 if (UTF)
2011 SvUTF8_on(data->last_found);
c277df42
IZ
2012 data->last_end = data->pos_min + l;
2013 data->pos_min += l; /* As in the first entry. */
2014 data->flags &= ~SF_BEFORE_EOL;
2015 }
653099ff
GS
2016 if (flags & SCF_DO_STCLASS_AND) {
2017 /* Check whether it is compatible with what we know already! */
2018 int compat = 1;
2019
1aa99e6b 2020 if (uc >= 0x100 ||
516a5887 2021 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
1aa99e6b 2022 && !ANYOF_BITMAP_TEST(data->start_class, uc)
653099ff 2023 && (!(data->start_class->flags & ANYOF_FOLD)
1aa99e6b 2024 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
516a5887 2025 )
653099ff
GS
2026 compat = 0;
2027 ANYOF_CLASS_ZERO(data->start_class);
2028 ANYOF_BITMAP_ZERO(data->start_class);
2029 if (compat)
1aa99e6b 2030 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff 2031 data->start_class->flags &= ~ANYOF_EOS;
9b877dbb
IH
2032 if (uc < 0x100)
2033 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
653099ff
GS
2034 }
2035 else if (flags & SCF_DO_STCLASS_OR) {
2036 /* false positive possible if the class is case-folded */
1aa99e6b 2037 if (uc < 0x100)
9b877dbb
IH
2038 ANYOF_BITMAP_SET(data->start_class, uc);
2039 else
2040 data->start_class->flags |= ANYOF_UNICODE_ALL;
653099ff
GS
2041 data->start_class->flags &= ~ANYOF_EOS;
2042 cl_and(data->start_class, &and_with);
2043 }
2044 flags &= ~SCF_DO_STCLASS;
a0ed51b3 2045 }
653099ff 2046 else if (PL_regkind[(U8)OP(scan)] == EXACT) { /* But OP != EXACT! */
cd439c50 2047 I32 l = STR_LEN(scan);
1aa99e6b 2048 UV uc = *((U8*)STRING(scan));
653099ff
GS
2049
2050 /* Search for fixed substrings supports EXACT only. */
b81d288d 2051 if (flags & SCF_DO_SUBSTR)
830247a4 2052 scan_commit(pRExC_state, data);
a0ed51b3 2053 if (UTF) {
1aa99e6b
IH
2054 U8 *s = (U8 *)STRING(scan);
2055 l = utf8_length(s, s + l);
9041c2e3 2056 uc = utf8_to_uvchr(s, NULL);
a0ed51b3
LW
2057 }
2058 min += l;
c277df42 2059 if (data && (flags & SCF_DO_SUBSTR))
a0ed51b3 2060 data->pos_min += l;
653099ff
GS
2061 if (flags & SCF_DO_STCLASS_AND) {
2062 /* Check whether it is compatible with what we know already! */
2063 int compat = 1;
2064
1aa99e6b 2065 if (uc >= 0x100 ||
516a5887 2066 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
1aa99e6b 2067 && !ANYOF_BITMAP_TEST(data->start_class, uc)
516a5887 2068 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
653099ff
GS
2069 compat = 0;
2070 ANYOF_CLASS_ZERO(data->start_class);
2071 ANYOF_BITMAP_ZERO(data->start_class);
2072 if (compat) {
1aa99e6b 2073 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff
GS
2074 data->start_class->flags &= ~ANYOF_EOS;
2075 data->start_class->flags |= ANYOF_FOLD;
2076 if (OP(scan) == EXACTFL)
2077 data->start_class->flags |= ANYOF_LOCALE;
2078 }
2079 }
2080 else if (flags & SCF_DO_STCLASS_OR) {
2081 if (data->start_class->flags & ANYOF_FOLD) {
2082 /* false positive possible if the class is case-folded.
2083 Assume that the locale settings are the same... */
1aa99e6b
IH
2084 if (uc < 0x100)
2085 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff
GS
2086 data->start_class->flags &= ~ANYOF_EOS;
2087 }
2088 cl_and(data->start_class, &and_with);
2089 }
2090 flags &= ~SCF_DO_STCLASS;
a0ed51b3 2091 }
bfed75c6 2092 else if (strchr((const char*)PL_varies,OP(scan))) {
9c5ffd7c 2093 I32 mincount, maxcount, minnext, deltanext, fl = 0;
aa7a4b56 2094 I32 f = flags, pos_before = 0;
c277df42 2095 regnode *oscan = scan;
653099ff
GS
2096 struct regnode_charclass_class this_class;
2097 struct regnode_charclass_class *oclass = NULL;
727f22e3 2098 I32 next_is_eval = 0;
653099ff 2099
22c35a8c 2100 switch (PL_regkind[(U8)OP(scan)]) {
653099ff 2101 case WHILEM: /* End of (?:...)* . */
c277df42
IZ
2102 scan = NEXTOPER(scan);
2103 goto finish;
2104 case PLUS:
653099ff 2105 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
c277df42 2106 next = NEXTOPER(scan);
653099ff 2107 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
b81d288d
AB
2108 mincount = 1;
2109 maxcount = REG_INFTY;
c277df42
IZ
2110 next = regnext(scan);
2111 scan = NEXTOPER(scan);
2112 goto do_curly;
2113 }
2114 }
2115 if (flags & SCF_DO_SUBSTR)
2116 data->pos_min++;
2117 min++;
2118 /* Fall through. */
2119 case STAR:
653099ff
GS
2120 if (flags & SCF_DO_STCLASS) {
2121 mincount = 0;
b81d288d 2122 maxcount = REG_INFTY;
653099ff
GS
2123 next = regnext(scan);
2124 scan = NEXTOPER(scan);
2125 goto do_curly;
2126 }
b81d288d 2127 is_inf = is_inf_internal = 1;
c277df42
IZ
2128 scan = regnext(scan);
2129 if (flags & SCF_DO_SUBSTR) {
830247a4 2130 scan_commit(pRExC_state, data); /* Cannot extend fixed substrings */
c277df42
IZ
2131 data->longest = &(data->longest_float);
2132 }
2133 goto optimize_curly_tail;
2134 case CURLY:
b81d288d 2135 mincount = ARG1(scan);
c277df42
IZ
2136 maxcount = ARG2(scan);
2137 next = regnext(scan);
cb434fcc
IZ
2138 if (OP(scan) == CURLYX) {
2139 I32 lp = (data ? *(data->last_closep) : 0);
a3621e74 2140 scan->flags = ((lp <= U8_MAX) ? (U8)lp : U8_MAX);
cb434fcc 2141 }
c277df42 2142 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
727f22e3 2143 next_is_eval = (OP(scan) == EVAL);
c277df42
IZ
2144 do_curly:
2145 if (flags & SCF_DO_SUBSTR) {
830247a4 2146 if (mincount == 0) scan_commit(pRExC_state,data); /* Cannot extend fixed substrings */
c277df42
IZ
2147 pos_before = data->pos_min;
2148 }
2149 if (data) {
2150 fl = data->flags;
2151 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
2152 if (is_inf)
2153 data->flags |= SF_IS_INF;
2154 }
653099ff 2155 if (flags & SCF_DO_STCLASS) {
830247a4 2156 cl_init(pRExC_state, &this_class);
653099ff
GS
2157 oclass = data->start_class;
2158 data->start_class = &this_class;
2159 f |= SCF_DO_STCLASS_AND;
2160 f &= ~SCF_DO_STCLASS_OR;
2161 }
e1901655
IZ
2162 /* These are the cases when once a subexpression
2163 fails at a particular position, it cannot succeed
2164 even after backtracking at the enclosing scope.
b81d288d 2165
e1901655
IZ
2166 XXXX what if minimal match and we are at the
2167 initial run of {n,m}? */
2168 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
2169 f &= ~SCF_WHILEM_VISITED_POS;
653099ff 2170
c277df42 2171 /* This will finish on WHILEM, setting scan, or on NULL: */
b81d288d 2172 minnext = study_chunk(pRExC_state, &scan, &deltanext, last, data,
a3621e74
YO
2173 (mincount == 0
2174 ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
653099ff
GS
2175
2176 if (flags & SCF_DO_STCLASS)
2177 data->start_class = oclass;
2178 if (mincount == 0 || minnext == 0) {
2179 if (flags & SCF_DO_STCLASS_OR) {
830247a4 2180 cl_or(pRExC_state, data->start_class, &this_class);
653099ff
GS
2181 }
2182 else if (flags & SCF_DO_STCLASS_AND) {
b81d288d 2183 /* Switch to OR mode: cache the old value of
653099ff
GS
2184 * data->start_class */
2185 StructCopy(data->start_class, &and_with,
2186 struct regnode_charclass_class);
2187 flags &= ~SCF_DO_STCLASS_AND;
2188 StructCopy(&this_class, data->start_class,
2189 struct regnode_charclass_class);
2190 flags |= SCF_DO_STCLASS_OR;
2191 data->start_class->flags |= ANYOF_EOS;
2192 }
2193 } else { /* Non-zero len */
2194 if (flags & SCF_DO_STCLASS_OR) {
830247a4 2195 cl_or(pRExC_state, data->start_class, &this_class);
653099ff
GS
2196 cl_and(data->start_class, &and_with);
2197 }
2198 else if (flags & SCF_DO_STCLASS_AND)
2199 cl_and(data->start_class, &this_class);
2200 flags &= ~SCF_DO_STCLASS;
2201 }
c277df42
IZ
2202 if (!scan) /* It was not CURLYX, but CURLY. */
2203 scan = next;
041457d9
DM
2204 if ( /* ? quantifier ok, except for (?{ ... }) */
2205 (next_is_eval || !(mincount == 0 && maxcount == 1))
84037bb0 2206 && (minnext == 0) && (deltanext == 0)
99799961 2207 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
041457d9
DM
2208 && maxcount <= REG_INFTY/3 /* Complement check for big count */
2209 && ckWARN(WARN_REGEXP))
b45f050a 2210 {
830247a4 2211 vWARN(RExC_parse,
b45f050a
JF
2212 "Quantifier unexpected on zero-length expression");
2213 }
2214
c277df42 2215 min += minnext * mincount;
b81d288d 2216 is_inf_internal |= ((maxcount == REG_INFTY
155aba94
GS
2217 && (minnext + deltanext) > 0)
2218 || deltanext == I32_MAX);
aca2d497 2219 is_inf |= is_inf_internal;
c277df42
IZ
2220 delta += (minnext + deltanext) * maxcount - minnext * mincount;
2221
2222 /* Try powerful optimization CURLYX => CURLYN. */
b81d288d 2223 if ( OP(oscan) == CURLYX && data
c277df42
IZ
2224 && data->flags & SF_IN_PAR
2225 && !(data->flags & SF_HAS_EVAL)
2226 && !deltanext && minnext == 1 ) {
2227 /* Try to optimize to CURLYN. */
2228 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
497b47a8
JH
2229 regnode *nxt1 = nxt;
2230#ifdef DEBUGGING
2231 regnode *nxt2;
2232#endif
c277df42
IZ
2233
2234 /* Skip open. */
2235 nxt = regnext(nxt);
bfed75c6 2236 if (!strchr((const char*)PL_simple,OP(nxt))
22c35a8c 2237 && !(PL_regkind[(U8)OP(nxt)] == EXACT
b81d288d 2238 && STR_LEN(nxt) == 1))
c277df42 2239 goto nogo;
497b47a8 2240#ifdef DEBUGGING
c277df42 2241 nxt2 = nxt;
497b47a8 2242#endif
c277df42 2243 nxt = regnext(nxt);
b81d288d 2244 if (OP(nxt) != CLOSE)
c277df42
IZ
2245 goto nogo;
2246 /* Now we know that nxt2 is the only contents: */
eb160463 2247 oscan->flags = (U8)ARG(nxt);
c277df42
IZ
2248 OP(oscan) = CURLYN;
2249 OP(nxt1) = NOTHING; /* was OPEN. */
2250#ifdef DEBUGGING
2251 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
2252 NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
2253 NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
2254 OP(nxt) = OPTIMIZED; /* was CLOSE. */
2255 OP(nxt + 1) = OPTIMIZED; /* was count. */
2256 NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
b81d288d 2257#endif
c277df42 2258 }
c277df42
IZ
2259 nogo:
2260
2261 /* Try optimization CURLYX => CURLYM. */
b81d288d 2262 if ( OP(oscan) == CURLYX && data
c277df42 2263 && !(data->flags & SF_HAS_PAR)
c277df42 2264 && !(data->flags & SF_HAS_EVAL)
0e788c72
HS
2265 && !deltanext /* atom is fixed width */
2266 && minnext != 0 /* CURLYM can't handle zero width */
2267 ) {
c277df42
IZ
2268 /* XXXX How to optimize if data == 0? */
2269 /* Optimize to a simpler form. */
2270 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
2271 regnode *nxt2;
2272
2273 OP(oscan) = CURLYM;
2274 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
b81d288d 2275 && (OP(nxt2) != WHILEM))
c277df42
IZ
2276 nxt = nxt2;
2277 OP(nxt2) = SUCCEED; /* Whas WHILEM */
c277df42
IZ
2278 /* Need to optimize away parenths. */
2279 if (data->flags & SF_IN_PAR) {
2280 /* Set the parenth number. */
2281 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
2282
b81d288d 2283 if (OP(nxt) != CLOSE)
b45f050a 2284 FAIL("Panic opt close");
eb160463 2285 oscan->flags = (U8)ARG(nxt);
c277df42
IZ
2286 OP(nxt1) = OPTIMIZED; /* was OPEN. */
2287 OP(nxt) = OPTIMIZED; /* was CLOSE. */
2288#ifdef DEBUGGING
2289 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
2290 OP(nxt + 1) = OPTIMIZED; /* was count. */
2291 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
2292 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
b81d288d 2293#endif
c277df42
IZ
2294#if 0
2295 while ( nxt1 && (OP(nxt1) != WHILEM)) {
2296 regnode *nnxt = regnext(nxt1);
b81d288d 2297
c277df42
IZ
2298 if (nnxt == nxt) {
2299 if (reg_off_by_arg[OP(nxt1)])
2300 ARG_SET(nxt1, nxt2 - nxt1);
2301 else if (nxt2 - nxt1 < U16_MAX)
2302 NEXT_OFF(nxt1) = nxt2 - nxt1;
2303 else
2304 OP(nxt) = NOTHING; /* Cannot beautify */
2305 }
2306 nxt1 = nnxt;
2307 }
2308#endif
2309 /* Optimize again: */
b81d288d 2310 study_chunk(pRExC_state, &nxt1, &deltanext, nxt,
a3621e74 2311 NULL, 0,depth+1);
a0ed51b3
LW
2312 }
2313 else
c277df42 2314 oscan->flags = 0;
c277df42 2315 }
e1901655
IZ
2316 else if ((OP(oscan) == CURLYX)
2317 && (flags & SCF_WHILEM_VISITED_POS)
2318 /* See the comment on a similar expression above.
2319 However, this time it not a subexpression
2320 we care about, but the expression itself. */
2321 && (maxcount == REG_INFTY)
2322 && data && ++data->whilem_c < 16) {
2323 /* This stays as CURLYX, we can put the count/of pair. */
2c2d71f5
JH
2324 /* Find WHILEM (as in regexec.c) */
2325 regnode *nxt = oscan + NEXT_OFF(oscan);
2326
2327 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
2328 nxt += ARG(nxt);
eb160463
GS
2329 PREVOPER(nxt)->flags = (U8)(data->whilem_c
2330 | (RExC_whilem_seen << 4)); /* On WHILEM */
2c2d71f5 2331 }
b81d288d 2332 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
c277df42
IZ
2333 pars++;
2334 if (flags & SCF_DO_SUBSTR) {
c445ea15 2335 SV *last_str = NULL;
c277df42
IZ
2336 int counted = mincount != 0;
2337
2338 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
5d1c421c
JH
2339#if defined(SPARC64_GCC_WORKAROUND)
2340 I32 b = 0;
2341 STRLEN l = 0;
cfd0369c 2342 const char *s = NULL;
5d1c421c
JH
2343 I32 old = 0;
2344
2345 if (pos_before >= data->last_start_min)
2346 b = pos_before;
2347 else
2348 b = data->last_start_min;
2349
2350 l = 0;
cfd0369c 2351 s = SvPV_const(data->last_found, l);
5d1c421c
JH
2352 old = b - data->last_start_min;
2353
2354#else
b81d288d 2355 I32 b = pos_before >= data->last_start_min
c277df42
IZ
2356 ? pos_before : data->last_start_min;
2357 STRLEN l;
cfd0369c 2358 const char *s = SvPV_const(data->last_found, l);
a0ed51b3 2359 I32 old = b - data->last_start_min;
5d1c421c 2360#endif
a0ed51b3
LW
2361
2362 if (UTF)
2363 old = utf8_hop((U8*)s, old) - (U8*)s;
c277df42 2364
a0ed51b3 2365 l -= old;
c277df42 2366 /* Get the added string: */
79cb57f6 2367 last_str = newSVpvn(s + old, l);
0e933229
IH
2368 if (UTF)
2369 SvUTF8_on(last_str);
c277df42
IZ
2370 if (deltanext == 0 && pos_before == b) {
2371 /* What was added is a constant string */
2372 if (mincount > 1) {
2373 SvGROW(last_str, (mincount * l) + 1);
b81d288d 2374 repeatcpy(SvPVX(last_str) + l,
3f7c398e 2375 SvPVX_const(last_str), l, mincount - 1);
b162af07 2376 SvCUR_set(last_str, SvCUR(last_str) * mincount);
c277df42 2377 /* Add additional parts. */
b81d288d 2378 SvCUR_set(data->last_found,
c277df42
IZ
2379 SvCUR(data->last_found) - l);
2380 sv_catsv(data->last_found, last_str);
0eda9292
JH
2381 {
2382 SV * sv = data->last_found;
2383 MAGIC *mg =
2384 SvUTF8(sv) && SvMAGICAL(sv) ?
2385 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2386 if (mg && mg->mg_len >= 0)
2387 mg->mg_len += CHR_SVLEN(last_str);
2388 }
c277df42
IZ
2389 data->last_end += l * (mincount - 1);
2390 }
2a8d9689
HS
2391 } else {
2392 /* start offset must point into the last copy */
2393 data->last_start_min += minnext * (mincount - 1);
c152dc43
HS
2394 data->last_start_max += is_inf ? I32_MAX
2395 : (maxcount - 1) * (minnext + data->pos_delta);
c277df42
IZ
2396 }
2397 }
2398 /* It is counted once already... */
2399 data->pos_min += minnext * (mincount - counted);
2400 data->pos_delta += - counted * deltanext +
2401 (minnext + deltanext) * maxcount - minnext * mincount;
2402 if (mincount != maxcount) {
653099ff
GS
2403 /* Cannot extend fixed substrings found inside
2404 the group. */
830247a4 2405 scan_commit(pRExC_state,data);
c277df42
IZ
2406 if (mincount && last_str) {
2407 sv_setsv(data->last_found, last_str);
2408 data->last_end = data->pos_min;
b81d288d 2409 data->last_start_min =
a0ed51b3 2410 data->pos_min - CHR_SVLEN(last_str);
b81d288d
AB
2411 data->last_start_max = is_inf
2412 ? I32_MAX
c277df42 2413 : data->pos_min + data->pos_delta
a0ed51b3 2414 - CHR_SVLEN(last_str);
c277df42
IZ
2415 }
2416 data->longest = &(data->longest_float);
2417 }
aca2d497 2418 SvREFCNT_dec(last_str);
c277df42 2419 }
405ff068 2420 if (data && (fl & SF_HAS_EVAL))
c277df42
IZ
2421 data->flags |= SF_HAS_EVAL;
2422 optimize_curly_tail:
c277df42 2423 if (OP(oscan) != CURLYX) {
22c35a8c 2424 while (PL_regkind[(U8)OP(next = regnext(oscan))] == NOTHING
c277df42
IZ
2425 && NEXT_OFF(next))
2426 NEXT_OFF(oscan) += NEXT_OFF(next);
2427 }
c277df42 2428 continue;
653099ff 2429 default: /* REF and CLUMP only? */
c277df42 2430 if (flags & SCF_DO_SUBSTR) {
830247a4 2431 scan_commit(pRExC_state,data); /* Cannot expect anything... */
c277df42
IZ
2432 data->longest = &(data->longest_float);
2433 }
aca2d497 2434 is_inf = is_inf_internal = 1;
653099ff 2435 if (flags & SCF_DO_STCLASS_OR)
830247a4 2436 cl_anything(pRExC_state, data->start_class);
653099ff 2437 flags &= ~SCF_DO_STCLASS;
c277df42
IZ
2438 break;
2439 }
a0ed51b3 2440 }
bfed75c6 2441 else if (strchr((const char*)PL_simple,OP(scan))) {
9c5ffd7c 2442 int value = 0;
653099ff 2443
c277df42 2444 if (flags & SCF_DO_SUBSTR) {
830247a4 2445 scan_commit(pRExC_state,data);
c277df42
IZ
2446 data->pos_min++;
2447 }
2448 min++;
653099ff
GS
2449 if (flags & SCF_DO_STCLASS) {
2450 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
2451
2452 /* Some of the logic below assumes that switching
2453 locale on will only add false positives. */
2454 switch (PL_regkind[(U8)OP(scan)]) {
653099ff 2455 case SANY:
653099ff
GS
2456 default:
2457 do_default:
2458 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
2459 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
830247a4 2460 cl_anything(pRExC_state, data->start_class);
653099ff
GS
2461 break;
2462 case REG_ANY:
2463 if (OP(scan) == SANY)
2464 goto do_default;
2465 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
2466 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
2467 || (data->start_class->flags & ANYOF_CLASS));
830247a4 2468 cl_anything(pRExC_state, data->start_class);
653099ff
GS
2469 }
2470 if (flags & SCF_DO_STCLASS_AND || !value)
2471 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
2472 break;
2473 case ANYOF:
2474 if (flags & SCF_DO_STCLASS_AND)
2475 cl_and(data->start_class,
2476 (struct regnode_charclass_class*)scan);
2477 else
830247a4 2478 cl_or(pRExC_state, data->start_class,
653099ff
GS
2479 (struct regnode_charclass_class*)scan);
2480 break;
2481 case ALNUM:
2482 if (flags & SCF_DO_STCLASS_AND) {
2483 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2484 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
2485 for (value = 0; value < 256; value++)
2486 if (!isALNUM(value))
2487 ANYOF_BITMAP_CLEAR(data->start_class, value);
2488 }
2489 }
2490 else {
2491 if (data->start_class->flags & ANYOF_LOCALE)
2492 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
2493 else {
2494 for (value = 0; value < 256; value++)
2495 if (isALNUM(value))
b81d288d 2496 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
2497 }
2498 }
2499 break;
2500 case ALNUML:
2501 if (flags & SCF_DO_STCLASS_AND) {
2502 if (data->start_class->flags & ANYOF_LOCALE)
2503 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
2504 }
2505 else {
2506 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
2507 data->start_class->flags |= ANYOF_LOCALE;
2508 }
2509 break;
2510 case NALNUM:
2511 if (flags & SCF_DO_STCLASS_AND) {
2512 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2513 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
2514 for (value = 0; value < 256; value++)
2515 if (isALNUM(value))
2516 ANYOF_BITMAP_CLEAR(data->start_class, value);
2517 }
2518 }
2519 else {
2520 if (data->start_class->flags & ANYOF_LOCALE)
2521 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
2522 else {
2523 for (value = 0; value < 256; value++)
2524 if (!isALNUM(value))
b81d288d 2525 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
2526 }
2527 }
2528 break;
2529 case NALNUML:
2530 if (flags & SCF_DO_STCLASS_AND) {
2531 if (data->start_class->flags & ANYOF_LOCALE)
2532 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
2533 }
2534 else {
2535 data->start_class->flags |= ANYOF_LOCALE;
2536 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
2537 }
2538 break;
2539 case SPACE:
2540 if (flags & SCF_DO_STCLASS_AND) {
2541 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2542 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
2543 for (value = 0; value < 256; value++)
2544 if (!isSPACE(value))
2545 ANYOF_BITMAP_CLEAR(data->start_class, value);
2546 }
2547 }
2548 else {
2549 if (data->start_class->flags & ANYOF_LOCALE)
2550 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
2551 else {
2552 for (value = 0; value < 256; value++)
2553 if (isSPACE(value))
b81d288d 2554 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
2555 }
2556 }
2557 break;
2558 case SPACEL:
2559 if (flags & SCF_DO_STCLASS_AND) {
2560 if (data->start_class->flags & ANYOF_LOCALE)
2561 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
2562 }
2563 else {
2564 data->start_class->flags |= ANYOF_LOCALE;
2565 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
2566 }
2567 break;
2568 case NSPACE:
2569 if (flags & SCF_DO_STCLASS_AND) {
2570 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2571 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
2572 for (value = 0; value < 256; value++)
2573 if (isSPACE(value))
2574 ANYOF_BITMAP_CLEAR(data->start_class, value);
2575 }
2576 }
2577 else {
2578 if (data->start_class->flags & ANYOF_LOCALE)
2579 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
2580 else {
2581 for (value = 0; value < 256; value++)
2582 if (!isSPACE(value))
b81d288d 2583 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
2584 }
2585 }
2586 break;
2587 case NSPACEL:
2588 if (flags & SCF_DO_STCLASS_AND) {
2589 if (data->start_class->flags & ANYOF_LOCALE) {
2590 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
2591 for (value = 0; value < 256; value++)
2592 if (!isSPACE(value))
2593 ANYOF_BITMAP_CLEAR(data->start_class, value);
2594 }
2595 }
2596 else {
2597 data->start_class->flags |= ANYOF_LOCALE;
2598 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
2599 }
2600 break;
2601 case DIGIT:
2602 if (flags & SCF_DO_STCLASS_AND) {
2603 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
2604 for (value = 0; value < 256; value++)
2605 if (!isDIGIT(value))
2606 ANYOF_BITMAP_CLEAR(data->start_class, value);
2607 }
2608 else {
2609 if (data->start_class->flags & ANYOF_LOCALE)
2610 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
2611 else {
2612 for (value = 0; value < 256; value++)
2613 if (isDIGIT(value))
b81d288d 2614 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
2615 }
2616 }
2617 break;
2618 case NDIGIT:
2619 if (flags & SCF_DO_STCLASS_AND) {
2620 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
2621 for (value = 0; value < 256; value++)
2622 if (isDIGIT(value))
2623 ANYOF_BITMAP_CLEAR(data->start_class, value);
2624 }
2625 else {
2626 if (data->start_class->flags & ANYOF_LOCALE)
2627 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
2628 else {
2629 for (value = 0; value < 256; value++)
2630 if (!isDIGIT(value))
b81d288d 2631 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
2632 }
2633 }
2634 break;
2635 }
2636 if (flags & SCF_DO_STCLASS_OR)
2637 cl_and(data->start_class, &and_with);
2638 flags &= ~SCF_DO_STCLASS;
2639 }
a0ed51b3 2640 }
22c35a8c 2641 else if (PL_regkind[(U8)OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
c277df42
IZ
2642 data->flags |= (OP(scan) == MEOL
2643 ? SF_BEFORE_MEOL
2644 : SF_BEFORE_SEOL);
a0ed51b3 2645 }
653099ff
GS
2646 else if ( PL_regkind[(U8)OP(scan)] == BRANCHJ
2647 /* Lookbehind, or need to calculate parens/evals/stclass: */
2648 && (scan->flags || data || (flags & SCF_DO_STCLASS))
c277df42 2649 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
653099ff 2650 /* Lookahead/lookbehind */
cb434fcc 2651 I32 deltanext, minnext, fake = 0;
c277df42 2652 regnode *nscan;
653099ff
GS
2653 struct regnode_charclass_class intrnl;
2654 int f = 0;
c277df42
IZ
2655
2656 data_fake.flags = 0;
b81d288d 2657 if (data) {
2c2d71f5 2658 data_fake.whilem_c = data->whilem_c;
cb434fcc
IZ
2659 data_fake.last_closep = data->last_closep;
2660 }
2661 else
2662 data_fake.last_closep = &fake;
653099ff
GS
2663 if ( flags & SCF_DO_STCLASS && !scan->flags
2664 && OP(scan) == IFMATCH ) { /* Lookahead */
830247a4 2665 cl_init(pRExC_state, &intrnl);
653099ff 2666 data_fake.start_class = &intrnl;
e1901655 2667 f |= SCF_DO_STCLASS_AND;
653099ff 2668 }
e1901655
IZ
2669 if (flags & SCF_WHILEM_VISITED_POS)
2670 f |= SCF_WHILEM_VISITED_POS;
c277df42
IZ
2671 next = regnext(scan);
2672 nscan = NEXTOPER(NEXTOPER(scan));
a3621e74 2673 minnext = study_chunk(pRExC_state, &nscan, &deltanext, last, &data_fake, f,depth+1);
c277df42
IZ
2674 if (scan->flags) {
2675 if (deltanext) {
9baa0206 2676 vFAIL("Variable length lookbehind not implemented");
a0ed51b3
LW
2677 }
2678 else if (minnext > U8_MAX) {
9baa0206 2679 vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
c277df42 2680 }
eb160463 2681 scan->flags = (U8)minnext;
c277df42
IZ
2682 }
2683 if (data && data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
2684 pars++;
405ff068 2685 if (data && (data_fake.flags & SF_HAS_EVAL))
c277df42 2686 data->flags |= SF_HAS_EVAL;
2c2d71f5
JH
2687 if (data)
2688 data->whilem_c = data_fake.whilem_c;
e1901655 2689 if (f & SCF_DO_STCLASS_AND) {
a28509cc 2690 const int was = (data->start_class->flags & ANYOF_EOS);
653099ff
GS
2691
2692 cl_and(data->start_class, &intrnl);
2693 if (was)
2694 data->start_class->flags |= ANYOF_EOS;
2695 }
a0ed51b3
LW
2696 }
2697 else if (OP(scan) == OPEN) {
c277df42 2698 pars++;
a0ed51b3 2699 }
cb434fcc 2700 else if (OP(scan) == CLOSE) {
eb160463 2701 if ((I32)ARG(scan) == is_par) {
cb434fcc 2702 next = regnext(scan);
c277df42 2703
cb434fcc
IZ
2704 if ( next && (OP(next) != WHILEM) && next < last)
2705 is_par = 0; /* Disable optimization */
2706 }
2707 if (data)
2708 *(data->last_closep) = ARG(scan);
a0ed51b3
LW
2709 }
2710 else if (OP(scan) == EVAL) {
c277df42
IZ
2711 if (data)
2712 data->flags |= SF_HAS_EVAL;
2713 }
96776eda 2714 else if (OP(scan) == LOGICAL && scan->flags == 2) { /* Embedded follows */
0f5d15d6 2715 if (flags & SCF_DO_SUBSTR) {
830247a4 2716 scan_commit(pRExC_state,data);
0f5d15d6
IZ
2717 data->longest = &(data->longest_float);
2718 }
2719 is_inf = is_inf_internal = 1;
653099ff 2720 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
830247a4 2721 cl_anything(pRExC_state, data->start_class);
96776eda 2722 flags &= ~SCF_DO_STCLASS;
0f5d15d6 2723 }
c277df42
IZ
2724 /* Else: zero-length, ignore. */
2725 scan = regnext(scan);
2726 }
2727
2728 finish:
2729 *scanp = scan;
aca2d497 2730 *deltap = is_inf_internal ? I32_MAX : delta;
b81d288d 2731 if (flags & SCF_DO_SUBSTR && is_inf)
c277df42
IZ
2732 data->pos_delta = I32_MAX - data->pos_min;
2733 if (is_par > U8_MAX)
2734 is_par = 0;
2735 if (is_par && pars==1 && data) {
2736 data->flags |= SF_IN_PAR;
2737 data->flags &= ~SF_HAS_PAR;
a0ed51b3
LW
2738 }
2739 else if (pars && data) {
c277df42
IZ
2740 data->flags |= SF_HAS_PAR;
2741 data->flags &= ~SF_IN_PAR;
2742 }
653099ff
GS
2743 if (flags & SCF_DO_STCLASS_OR)
2744 cl_and(data->start_class, &and_with);
c277df42
IZ
2745 return min;
2746}
2747
76e3520e 2748STATIC I32
bfed75c6 2749S_add_data(pTHX_ RExC_state_t *pRExC_state, I32 n, const char *s)
c277df42 2750{
830247a4 2751 if (RExC_rx->data) {
b81d288d
AB
2752 Renewc(RExC_rx->data,
2753 sizeof(*RExC_rx->data) + sizeof(void*) * (RExC_rx->data->count + n - 1),
c277df42 2754 char, struct reg_data);
830247a4
IZ
2755 Renew(RExC_rx->data->what, RExC_rx->data->count + n, U8);
2756 RExC_rx->data->count += n;
a0ed51b3
LW
2757 }
2758 else {
a02a5408 2759 Newxc(RExC_rx->data, sizeof(*RExC_rx->data) + sizeof(void*) * (n - 1),
c277df42 2760 char, struct reg_data);
a02a5408 2761 Newx(RExC_rx->data->what, n, U8);
830247a4 2762 RExC_rx->data->count = n;
c277df42 2763 }
830247a4
IZ
2764 Copy(s, RExC_rx->data->what + RExC_rx->data->count - n, n, U8);
2765 return RExC_rx->data->count - n;
c277df42
IZ
2766}
2767
d88dccdf 2768void
864dbfa3 2769Perl_reginitcolors(pTHX)
d88dccdf 2770{
97aff369 2771 dVAR;
1df70142 2772 const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
d88dccdf 2773 if (s) {
1df70142
AL
2774 char *t = savepv(s);
2775 int i = 0;
2776 PL_colors[0] = t;
d88dccdf 2777 while (++i < 6) {
1df70142
AL
2778 t = strchr(t, '\t');
2779 if (t) {
2780 *t = '\0';
2781 PL_colors[i] = ++t;
d88dccdf
IZ
2782 }
2783 else
1df70142 2784 PL_colors[i] = t = (char *)"";
d88dccdf
IZ
2785 }
2786 } else {
1df70142 2787 int i = 0;
b81d288d 2788 while (i < 6)
06b5626a 2789 PL_colors[i++] = (char *)"";
d88dccdf
IZ
2790 }
2791 PL_colorset = 1;
2792}
2793
8615cb43 2794
a687059c 2795/*
e50aee73 2796 - pregcomp - compile a regular expression into internal code
a687059c
LW
2797 *
2798 * We can't allocate space until we know how big the compiled form will be,
2799 * but we can't compile it (and thus know how big it is) until we've got a
2800 * place to put the code. So we cheat: we compile it twice, once with code
2801 * generation turned off and size counting turned on, and once "for real".
2802 * This also means that we don't allocate space until we are sure that the
2803 * thing really will compile successfully, and we never have to move the
2804 * code and thus invalidate pointers into it. (Note that it has to be in
2805 * one piece because free() must be able to free it all.) [NB: not true in perl]
2806 *
2807 * Beware that the optimization-preparation code in here knows about some
2808 * of the structure of the compiled regexp. [I'll say.]
2809 */
2810regexp *
864dbfa3 2811Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
a687059c 2812{
97aff369 2813 dVAR;
a0d0e21e 2814 register regexp *r;
c277df42 2815 regnode *scan;
c277df42 2816 regnode *first;
a0d0e21e 2817 I32 flags;
a0d0e21e
LW
2818 I32 minlen = 0;
2819 I32 sawplus = 0;
2820 I32 sawopen = 0;
2c2d71f5 2821 scan_data_t data;
830247a4
IZ
2822 RExC_state_t RExC_state;
2823 RExC_state_t *pRExC_state = &RExC_state;
a0d0e21e 2824
a3621e74
YO
2825 GET_RE_DEBUG_FLAGS_DECL;
2826
a0d0e21e 2827 if (exp == NULL)
c277df42 2828 FAIL("NULL regexp argument");
a0d0e21e 2829
a5961de5 2830 RExC_utf8 = pm->op_pmdynflags & PMdf_CMP_UTF8;
a0ed51b3 2831
5cfc7842 2832 RExC_precomp = exp;
a3621e74
YO
2833 DEBUG_r(if (!PL_colorset) reginitcolors());
2834 DEBUG_COMPILE_r({
2835 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s \"%s%*s%s\"\n",
a5961de5
JH
2836 PL_colors[4],PL_colors[5],PL_colors[0],
2837 (int)(xend - exp), RExC_precomp, PL_colors[1]);
2838 });
e2509266 2839 RExC_flags = pm->op_pmflags;
830247a4 2840 RExC_sawback = 0;
bbce6d69 2841
830247a4
IZ
2842 RExC_seen = 0;
2843 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
2844 RExC_seen_evals = 0;
2845 RExC_extralen = 0;
c277df42 2846
bbce6d69 2847 /* First pass: determine size, legality. */
830247a4 2848 RExC_parse = exp;
fac92740 2849 RExC_start = exp;
830247a4
IZ
2850 RExC_end = xend;
2851 RExC_naughty = 0;
2852 RExC_npar = 1;
2853 RExC_size = 0L;
2854 RExC_emit = &PL_regdummy;
2855 RExC_whilem_seen = 0;
85ddcde9
JH
2856#if 0 /* REGC() is (currently) a NOP at the first pass.
2857 * Clever compilers notice this and complain. --jhi */
830247a4 2858 REGC((U8)REG_MAGIC, (char*)RExC_emit);
85ddcde9 2859#endif
830247a4 2860 if (reg(pRExC_state, 0, &flags) == NULL) {
c445ea15 2861 RExC_precomp = NULL;
a0d0e21e
LW
2862 return(NULL);
2863 }
a3621e74 2864 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "size %"IVdf" ", (IV)RExC_size));
c277df42 2865
c277df42
IZ
2866 /* Small enough for pointer-storage convention?
2867 If extralen==0, this means that we will not need long jumps. */
830247a4
IZ
2868 if (RExC_size >= 0x10000L && RExC_extralen)
2869 RExC_size += RExC_extralen;
c277df42 2870 else
830247a4
IZ
2871 RExC_extralen = 0;
2872 if (RExC_whilem_seen > 15)
2873 RExC_whilem_seen = 15;
a0d0e21e 2874
bbce6d69 2875 /* Allocate space and initialize. */
a02a5408 2876 Newxc(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode),
c277df42 2877 char, regexp);
a0d0e21e 2878 if (r == NULL)
b45f050a
JF
2879 FAIL("Regexp out of space");
2880
0f79a09d
GS
2881#ifdef DEBUGGING
2882 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
830247a4 2883 Zero(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode), char);
0f79a09d 2884#endif
c277df42 2885 r->refcnt = 1;
bbce6d69 2886 r->prelen = xend - exp;
5cfc7842 2887 r->precomp = savepvn(RExC_precomp, r->prelen);
cf93c79d 2888 r->subbeg = NULL;
f8c7b90f 2889#ifdef PERL_OLD_COPY_ON_WRITE
c445ea15 2890 r->saved_copy = NULL;
ed252734 2891#endif
cf93c79d 2892 r->reganch = pm->op_pmflags & PMf_COMPILETIME;
830247a4 2893 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
4327152a
IZ
2894
2895 r->substrs = 0; /* Useful during FAIL. */
2896 r->startp = 0; /* Useful during FAIL. */
2897 r->endp = 0; /* Useful during FAIL. */
2898
a02a5408 2899 Newxz(r->offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
fac92740 2900 if (r->offsets) {
2af232bd 2901 r->offsets[0] = RExC_size;
fac92740 2902 }
a3621e74 2903 DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
2af232bd
SS
2904 "%s %"UVuf" bytes for offset annotations.\n",
2905 r->offsets ? "Got" : "Couldn't get",
392fbf5d 2906 (UV)((2*RExC_size+1) * sizeof(U32))));
fac92740 2907
830247a4 2908 RExC_rx = r;
bbce6d69 2909
2910 /* Second pass: emit code. */
e2509266 2911 RExC_flags = pm->op_pmflags; /* don't let top level (?i) bleed */
830247a4
IZ
2912 RExC_parse = exp;
2913 RExC_end = xend;
2914 RExC_naughty = 0;
2915 RExC_npar = 1;
fac92740 2916 RExC_emit_start = r->program;
830247a4 2917 RExC_emit = r->program;
2cd61cdb 2918 /* Store the count of eval-groups for security checks: */
eb160463 2919 RExC_emit->next_off = (U16)((RExC_seen_evals > U16_MAX) ? U16_MAX : RExC_seen_evals);
830247a4 2920 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
c277df42 2921 r->data = 0;
830247a4 2922 if (reg(pRExC_state, 0, &flags) == NULL)
a0d0e21e
LW
2923 return(NULL);
2924
a3621e74 2925
a0d0e21e 2926 /* Dig out information for optimizations. */
cf93c79d 2927 r->reganch = pm->op_pmflags & PMf_COMPILETIME; /* Again? */
e2509266 2928 pm->op_pmflags = RExC_flags;
a0ed51b3 2929 if (UTF)
5ff6fc6d 2930 r->reganch |= ROPT_UTF8; /* Unicode in it? */
c277df42 2931 r->regstclass = NULL;
830247a4 2932 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
a0ed51b3 2933 r->reganch |= ROPT_NAUGHTY;
c277df42 2934 scan = r->program + 1; /* First BRANCH. */
2779dcf1
IZ
2935
2936 /* XXXX To minimize changes to RE engine we always allocate
2937 3-units-long substrs field. */
a02a5408 2938 Newxz(r->substrs, 1, struct reg_substr_data);
2779dcf1 2939
2c2d71f5 2940 StructCopy(&zero_scan_data, &data, scan_data_t);
653099ff 2941 /* XXXX Should not we check for something else? Usually it is OPEN1... */
c277df42 2942 if (OP(scan) != BRANCH) { /* Only one top-level choice. */
c277df42 2943 I32 fake;
c5254dd6 2944 STRLEN longest_float_length, longest_fixed_length;
653099ff
GS
2945 struct regnode_charclass_class ch_class;
2946 int stclass_flag;
cb434fcc 2947 I32 last_close = 0;
a0d0e21e
LW
2948
2949 first = scan;
c277df42 2950 /* Skip introductions and multiplicators >= 1. */
a0d0e21e 2951 while ((OP(first) == OPEN && (sawopen = 1)) ||
653099ff 2952 /* An OR of *one* alternative - should not happen now. */
a0d0e21e
LW
2953 (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) ||
2954 (OP(first) == PLUS) ||
2955 (OP(first) == MINMOD) ||
653099ff 2956 /* An {n,m} with n>0 */
22c35a8c 2957 (PL_regkind[(U8)OP(first)] == CURLY && ARG1(first) > 0) ) {
a0d0e21e
LW
2958 if (OP(first) == PLUS)
2959 sawplus = 1;
2960 else
2961 first += regarglen[(U8)OP(first)];
2962 first = NEXTOPER(first);
a687059c
LW
2963 }
2964
a0d0e21e
LW
2965 /* Starting-point info. */
2966 again:
653099ff 2967 if (PL_regkind[(U8)OP(first)] == EXACT) {
1aa99e6b
IH
2968 if (OP(first) == EXACT)
2969 ; /* Empty, get anchored substr later. */
2970 else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
b3c9acc1
IZ
2971 r->regstclass = first;
2972 }
bfed75c6 2973 else if (strchr((const char*)PL_simple,OP(first)))
a0d0e21e 2974 r->regstclass = first;
22c35a8c
GS
2975 else if (PL_regkind[(U8)OP(first)] == BOUND ||
2976 PL_regkind[(U8)OP(first)] == NBOUND)
a0d0e21e 2977 r->regstclass = first;
22c35a8c 2978 else if (PL_regkind[(U8)OP(first)] == BOL) {
cad2e5aa
JH
2979 r->reganch |= (OP(first) == MBOL
2980 ? ROPT_ANCH_MBOL
2981 : (OP(first) == SBOL
2982 ? ROPT_ANCH_SBOL
2983 : ROPT_ANCH_BOL));
a0d0e21e 2984 first = NEXTOPER(first);
774d564b 2985 goto again;
2986 }
2987 else if (OP(first) == GPOS) {
2988 r->reganch |= ROPT_ANCH_GPOS;
2989 first = NEXTOPER(first);
2990 goto again;
a0d0e21e 2991 }
e09294f4 2992 else if (!sawopen && (OP(first) == STAR &&
22c35a8c 2993 PL_regkind[(U8)OP(NEXTOPER(first))] == REG_ANY) &&
a0d0e21e
LW
2994 !(r->reganch & ROPT_ANCH) )
2995 {
2996 /* turn .* into ^.* with an implied $*=1 */
1df70142
AL
2997 const int type =
2998 (OP(NEXTOPER(first)) == REG_ANY)
2999 ? ROPT_ANCH_MBOL
3000 : ROPT_ANCH_SBOL;
cad2e5aa 3001 r->reganch |= type | ROPT_IMPLICIT;
a0d0e21e 3002 first = NEXTOPER(first);
774d564b 3003 goto again;
a0d0e21e 3004 }
b81d288d 3005 if (sawplus && (!sawopen || !RExC_sawback)
830247a4 3006 && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
cad2e5aa
JH
3007 /* x+ must match at the 1st pos of run of x's */
3008 r->reganch |= ROPT_SKIP;
a0d0e21e 3009
c277df42 3010 /* Scan is after the zeroth branch, first is atomic matcher. */
a3621e74 3011 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
b900a521 3012 (IV)(first - scan + 1)));
a0d0e21e
LW
3013 /*
3014 * If there's something expensive in the r.e., find the
3015 * longest literal string that must appear and make it the
3016 * regmust. Resolve ties in favor of later strings, since
3017 * the regstart check works with the beginning of the r.e.
3018 * and avoiding duplication strengthens checking. Not a
3019 * strong reason, but sufficient in the absence of others.
3020 * [Now we resolve ties in favor of the earlier string if
c277df42 3021 * it happens that c_offset_min has been invalidated, since the
a0d0e21e
LW
3022 * earlier string may buy us something the later one won't.]
3023 */
a0d0e21e 3024 minlen = 0;
a687059c 3025
396482e1
GA
3026 data.longest_fixed = newSVpvs("");
3027 data.longest_float = newSVpvs("");
3028 data.last_found = newSVpvs("");
c277df42
IZ
3029 data.longest = &(data.longest_fixed);
3030 first = scan;
653099ff 3031 if (!r->regstclass) {
830247a4 3032 cl_init(pRExC_state, &ch_class);
653099ff
GS
3033 data.start_class = &ch_class;
3034 stclass_flag = SCF_DO_STCLASS_AND;
3035 } else /* XXXX Check for BOUND? */
3036 stclass_flag = 0;
cb434fcc 3037 data.last_closep = &last_close;
653099ff 3038
830247a4 3039 minlen = study_chunk(pRExC_state, &first, &fake, scan + RExC_size, /* Up to end */
a3621e74 3040 &data, SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
830247a4 3041 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
b81d288d 3042 && data.last_start_min == 0 && data.last_end > 0
830247a4
IZ
3043 && !RExC_seen_zerolen
3044 && (!(RExC_seen & REG_SEEN_GPOS) || (r->reganch & ROPT_ANCH_GPOS)))
c277df42 3045 r->reganch |= ROPT_CHECK_ALL;
830247a4 3046 scan_commit(pRExC_state, &data);
c277df42
IZ
3047 SvREFCNT_dec(data.last_found);
3048
a0ed51b3 3049 longest_float_length = CHR_SVLEN(data.longest_float);
c5254dd6 3050 if (longest_float_length
c277df42
IZ
3051 || (data.flags & SF_FL_BEFORE_EOL
3052 && (!(data.flags & SF_FL_BEFORE_MEOL)
e2509266 3053 || (RExC_flags & PMf_MULTILINE)))) {
cf93c79d
IZ
3054 int t;
3055
a0ed51b3 3056 if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
aca2d497
IZ
3057 && data.offset_fixed == data.offset_float_min
3058 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
3059 goto remove_float; /* As in (a)+. */
3060
33b8afdf
JH
3061 if (SvUTF8(data.longest_float)) {
3062 r->float_utf8 = data.longest_float;
c445ea15 3063 r->float_substr = NULL;
33b8afdf
JH
3064 } else {
3065 r->float_substr = data.longest_float;
c445ea15 3066 r->float_utf8 = NULL;
33b8afdf 3067 }
c277df42
IZ
3068 r->float_min_offset = data.offset_float_min;
3069 r->float_max_offset = data.offset_float_max;
cf93c79d
IZ
3070 t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
3071 && (!(data.flags & SF_FL_BEFORE_MEOL)
e2509266 3072 || (RExC_flags & PMf_MULTILINE)));
33b8afdf 3073 fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
a0ed51b3
LW
3074 }
3075 else {
aca2d497 3076 remove_float:
c445ea15 3077 r->float_substr = r->float_utf8 = NULL;
c277df42 3078 SvREFCNT_dec(data.longest_float);
c5254dd6 3079 longest_float_length = 0;
a0d0e21e 3080 }
c277df42 3081
a0ed51b3 3082 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
c5254dd6 3083 if (longest_fixed_length
c277df42
IZ
3084 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
3085 && (!(data.flags & SF_FIX_BEFORE_MEOL)
e2509266 3086 || (RExC_flags & PMf_MULTILINE)))) {
cf93c79d
IZ
3087 int t;
3088
33b8afdf
JH
3089 if (SvUTF8(data.longest_fixed)) {
3090 r->anchored_utf8 = data.longest_fixed;
c445ea15 3091 r->anchored_substr = NULL;
33b8afdf
JH
3092 } else {
3093 r->anchored_substr = data.longest_fixed;
c445ea15 3094 r->anchored_utf8 = NULL;
33b8afdf 3095 }
c277df42 3096 r->anchored_offset = data.offset_fixed;
cf93c79d
IZ
3097 t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
3098 && (!(data.flags & SF_FIX_BEFORE_MEOL)
e2509266 3099 || (RExC_flags & PMf_MULTILINE)));
33b8afdf 3100 fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
a0ed51b3
LW
3101 }
3102 else {
c445ea15 3103 r->anchored_substr = r->anchored_utf8 = NULL;
c277df42 3104 SvREFCNT_dec(data.longest_fixed);
c5254dd6 3105 longest_fixed_length = 0;
a0d0e21e 3106 }
b81d288d 3107 if (r->regstclass
ffc61ed2 3108 && (OP(r->regstclass) == REG_ANY || OP(r->regstclass) == SANY))
653099ff 3109 r->regstclass = NULL;
33b8afdf
JH
3110 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
3111 && stclass_flag
653099ff 3112 && !(data.start_class->flags & ANYOF_EOS)
eb160463
GS
3113 && !cl_is_anything(data.start_class))
3114 {
1df70142 3115 const I32 n = add_data(pRExC_state, 1, "f");
653099ff 3116
a02a5408 3117 Newx(RExC_rx->data->data[n], 1,
653099ff
GS
3118 struct regnode_charclass_class);
3119 StructCopy(data.start_class,
830247a4 3120 (struct regnode_charclass_class*)RExC_rx->data->data[n],
653099ff 3121 struct regnode_charclass_class);
830247a4 3122 r->regstclass = (regnode*)RExC_rx->data->data[n];
653099ff 3123 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
ffc61ed2 3124 PL_regdata = r->data; /* for regprop() */
a3621e74 3125 DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
9c5ffd7c
JH
3126 regprop(sv, (regnode*)data.start_class);
3127 PerlIO_printf(Perl_debug_log,
a0288114 3128 "synthetic stclass \"%s\".\n",
3f7c398e 3129 SvPVX_const(sv));});
653099ff 3130 }
c277df42
IZ
3131
3132 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
c5254dd6 3133 if (longest_fixed_length > longest_float_length) {
c277df42 3134 r->check_substr = r->anchored_substr;
33b8afdf 3135 r->check_utf8 = r->anchored_utf8;
c277df42
IZ
3136 r->check_offset_min = r->check_offset_max = r->anchored_offset;
3137 if (r->reganch & ROPT_ANCH_SINGLE)
3138 r->reganch |= ROPT_NOSCAN;
a0ed51b3
LW
3139 }
3140 else {
c277df42 3141 r->check_substr = r->float_substr;
33b8afdf 3142 r->check_utf8 = r->float_utf8;
c277df42
IZ
3143 r->check_offset_min = data.offset_float_min;
3144 r->check_offset_max = data.offset_float_max;
a0d0e21e 3145 }
30382c73
IZ
3146 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
3147 This should be changed ASAP! */
33b8afdf 3148 if ((r->check_substr || r->check_utf8) && !(r->reganch & ROPT_ANCH_GPOS)) {
cad2e5aa 3149 r->reganch |= RE_USE_INTUIT;
33b8afdf 3150 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
cad2e5aa
JH
3151 r->reganch |= RE_INTUIT_TAIL;
3152 }
a0ed51b3
LW
3153 }
3154 else {
c277df42
IZ
3155 /* Several toplevels. Best we can is to set minlen. */
3156 I32 fake;
653099ff 3157 struct regnode_charclass_class ch_class;
cb434fcc 3158 I32 last_close = 0;
c277df42 3159
a3621e74 3160 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "\n"));
c277df42 3161 scan = r->program + 1;
830247a4 3162 cl_init(pRExC_state, &ch_class);
653099ff 3163 data.start_class = &ch_class;
cb434fcc 3164 data.last_closep = &last_close;
a3621e74 3165 minlen = study_chunk(pRExC_state, &scan, &fake, scan + RExC_size, &data, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
33b8afdf 3166 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
c445ea15 3167 = r->float_substr = r->float_utf8 = NULL;
653099ff 3168 if (!(data.start_class->flags & ANYOF_EOS)
eb160463
GS
3169 && !cl_is_anything(data.start_class))
3170 {
1df70142 3171 const I32 n = add_data(pRExC_state, 1, "f");
653099ff 3172
a02a5408 3173 Newx(RExC_rx->data->data[n], 1,
653099ff
GS
3174 struct regnode_charclass_class);
3175 StructCopy(data.start_class,
830247a4 3176 (struct regnode_charclass_class*)RExC_rx->data->data[n],
653099ff 3177 struct regnode_charclass_class);
830247a4 3178 r->regstclass = (regnode*)RExC_rx->data->data[n];
653099ff 3179 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
a3621e74 3180 DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
9c5ffd7c
JH
3181 regprop(sv, (regnode*)data.start_class);
3182 PerlIO_printf(Perl_debug_log,
a0288114 3183 "synthetic stclass \"%s\".\n",
3f7c398e 3184 SvPVX_const(sv));});
653099ff 3185 }
a0d0e21e
LW
3186 }
3187
a0d0e21e 3188 r->minlen = minlen;
b81d288d 3189 if (RExC_seen & REG_SEEN_GPOS)
c277df42 3190 r->reganch |= ROPT_GPOS_SEEN;
830247a4 3191 if (RExC_seen & REG_SEEN_LOOKBEHIND)
c277df42 3192 r->reganch |= ROPT_LOOKBEHIND_SEEN;
830247a4 3193 if (RExC_seen & REG_SEEN_EVAL)
ce862d02 3194 r->reganch |= ROPT_EVAL_SEEN;
f33976b4
DB
3195 if (RExC_seen & REG_SEEN_CANY)
3196 r->reganch |= ROPT_CANY_SEEN;
a02a5408
JC
3197 Newxz(r->startp, RExC_npar, I32);
3198 Newxz(r->endp, RExC_npar, I32);
ffc61ed2 3199 PL_regdata = r->data; /* for regprop() */
a3621e74 3200 DEBUG_COMPILE_r(regdump(r));
a0d0e21e 3201 return(r);
a687059c
LW
3202}
3203
3204/*
3205 - reg - regular expression, i.e. main body or parenthesized thing
3206 *
3207 * Caller must absorb opening parenthesis.
3208 *
3209 * Combining parenthesis handling with the base level of regular expression
3210 * is a trifle forced, but the need to tie the tails of the branches to what
3211 * follows makes it hard to avoid.
3212 */
76e3520e 3213STATIC regnode *
830247a4 3214S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp)
c277df42 3215 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
a687059c 3216{
27da23d5 3217 dVAR;
c277df42
IZ
3218 register regnode *ret; /* Will be the head of the group. */
3219 register regnode *br;
3220 register regnode *lastbr;
cbbf8932 3221 register regnode *ender = NULL;
a0d0e21e 3222 register I32 parno = 0;
cbbf8932
AL
3223 I32 flags;
3224 const I32 oregflags = RExC_flags;
3225 I32 have_branch = 0;
3226 I32 open = 0;
9d1d55b5
JP
3227
3228 /* for (?g), (?gc), and (?o) warnings; warning
3229 about (?c) will warn about (?g) -- japhy */
3230
cbbf8932
AL
3231 I32 wastedflags = 0x00;
3232 const I32 wasted_o = 0x01;
3233 const I32 wasted_g = 0x02;
3234 const I32 wasted_gc = 0x02 | 0x04;
3235 const I32 wasted_c = 0x04;
9d1d55b5 3236
fac92740 3237 char * parse_start = RExC_parse; /* MJD */
a28509cc 3238 char * const oregcomp_parse = RExC_parse;
c277df42 3239 char c;
a0d0e21e 3240
821b33a5 3241 *flagp = 0; /* Tentatively. */
a0d0e21e 3242
9d1d55b5 3243
a0d0e21e
LW
3244 /* Make an OPEN node, if parenthesized. */
3245 if (paren) {
fac92740 3246 if (*RExC_parse == '?') { /* (?...) */
2b36a5a0
JH
3247 U32 posflags = 0, negflags = 0;
3248 U32 *flagsp = &posflags;
0f5d15d6 3249 int logical = 0;
a28509cc 3250 const char * const seqstart = RExC_parse;
ca9dfc88 3251
830247a4
IZ
3252 RExC_parse++;
3253 paren = *RExC_parse++;
c277df42 3254 ret = NULL; /* For look-ahead/behind. */
a0d0e21e 3255 switch (paren) {
fac92740 3256 case '<': /* (?<...) */
830247a4 3257 RExC_seen |= REG_SEEN_LOOKBEHIND;
b81d288d 3258 if (*RExC_parse == '!')
c277df42 3259 paren = ',';
b81d288d 3260 if (*RExC_parse != '=' && *RExC_parse != '!')
c277df42 3261 goto unknown;
830247a4 3262 RExC_parse++;
fac92740
MJD
3263 case '=': /* (?=...) */
3264 case '!': /* (?!...) */
830247a4 3265 RExC_seen_zerolen++;
fac92740
MJD
3266 case ':': /* (?:...) */
3267 case '>': /* (?>...) */
a0d0e21e 3268 break;
fac92740
MJD
3269 case '$': /* (?$...) */
3270 case '@': /* (?@...) */
8615cb43 3271 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
a0d0e21e 3272 break;
fac92740 3273 case '#': /* (?#...) */
830247a4
IZ
3274 while (*RExC_parse && *RExC_parse != ')')
3275 RExC_parse++;
3276 if (*RExC_parse != ')')
c277df42 3277 FAIL("Sequence (?#... not terminated");
830247a4 3278 nextchar(pRExC_state);
a0d0e21e
LW
3279 *flagp = TRYAGAIN;
3280 return NULL;
fac92740 3281 case 'p': /* (?p...) */
9014280d 3282 if (SIZE_ONLY && ckWARN2(WARN_DEPRECATED, WARN_REGEXP))
f9373011 3283 vWARNdep(RExC_parse, "(?p{}) is deprecated - use (??{})");
8c8ad484 3284 /* FALL THROUGH*/
fac92740 3285 case '?': /* (??...) */
0f5d15d6 3286 logical = 1;
438a3801
YST
3287 if (*RExC_parse != '{')
3288 goto unknown;
830247a4 3289 paren = *RExC_parse++;
0f5d15d6 3290 /* FALL THROUGH */
fac92740 3291 case '{': /* (?{...}) */
c277df42 3292 {
c277df42
IZ
3293 I32 count = 1, n = 0;
3294 char c;
830247a4 3295 char *s = RExC_parse;
c277df42
IZ
3296 SV *sv;
3297 OP_4tree *sop, *rop;
3298
830247a4
IZ
3299 RExC_seen_zerolen++;
3300 RExC_seen |= REG_SEEN_EVAL;
3301 while (count && (c = *RExC_parse)) {
3302 if (c == '\\' && RExC_parse[1])
3303 RExC_parse++;
b81d288d 3304 else if (c == '{')
c277df42 3305 count++;
b81d288d 3306 else if (c == '}')
c277df42 3307 count--;
830247a4 3308 RExC_parse++;
c277df42 3309 }
830247a4 3310 if (*RExC_parse != ')')
b45f050a 3311 {
b81d288d 3312 RExC_parse = s;
b45f050a
JF
3313 vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
3314 }
c277df42 3315 if (!SIZE_ONLY) {
f3548bdc 3316 PAD *pad;
b81d288d
AB
3317
3318 if (RExC_parse - 1 - s)
830247a4 3319 sv = newSVpvn(s, RExC_parse - 1 - s);
c277df42 3320 else
396482e1 3321 sv = newSVpvs("");
c277df42 3322
569233ed
SB
3323 ENTER;
3324 Perl_save_re_context(aTHX);
f3548bdc 3325 rop = sv_compile_2op(sv, &sop, "re", &pad);
9b978d73
DM
3326 sop->op_private |= OPpREFCOUNTED;
3327 /* re_dup will OpREFCNT_inc */
3328 OpREFCNT_set(sop, 1);
569233ed 3329 LEAVE;
c277df42 3330
830247a4
IZ
3331 n = add_data(pRExC_state, 3, "nop");
3332 RExC_rx->data->data[n] = (void*)rop;
3333 RExC_rx->data->data[n+1] = (void*)sop;
f3548bdc 3334 RExC_rx->data->data[n+2] = (void*)pad;
c277df42 3335 SvREFCNT_dec(sv);
a0ed51b3 3336 }
e24b16f9 3337 else { /* First pass */
830247a4 3338 if (PL_reginterp_cnt < ++RExC_seen_evals
923e4eb5 3339 && IN_PERL_RUNTIME)
2cd61cdb
IZ
3340 /* No compiled RE interpolated, has runtime
3341 components ===> unsafe. */
3342 FAIL("Eval-group not allowed at runtime, use re 'eval'");
5b61d3f7 3343 if (PL_tainting && PL_tainted)
cc6b7395 3344 FAIL("Eval-group in insecure regular expression");
923e4eb5 3345 if (IN_PERL_COMPILETIME)
b5c19bd7 3346 PL_cv_has_eval = 1;
c277df42 3347 }
b5c19bd7 3348
830247a4 3349 nextchar(pRExC_state);
0f5d15d6 3350 if (logical) {
830247a4 3351 ret = reg_node(pRExC_state, LOGICAL);
0f5d15d6
IZ
3352 if (!SIZE_ONLY)
3353 ret->flags = 2;
830247a4 3354 regtail(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
fac92740 3355 /* deal with the length of this later - MJD */
0f5d15d6
IZ
3356 return ret;
3357 }
ccb2c380
MP
3358 ret = reganode(pRExC_state, EVAL, n);
3359 Set_Node_Length(ret, RExC_parse - parse_start + 1);
3360 Set_Node_Offset(ret, parse_start);
3361 return ret;
c277df42 3362 }
fac92740 3363 case '(': /* (?(?{...})...) and (?(?=...)...) */
c277df42 3364 {
fac92740 3365 if (RExC_parse[0] == '?') { /* (?(?...)) */
b81d288d
AB
3366 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
3367 || RExC_parse[1] == '<'
830247a4 3368 || RExC_parse[1] == '{') { /* Lookahead or eval. */
c277df42
IZ
3369 I32 flag;
3370
830247a4 3371 ret = reg_node(pRExC_state, LOGICAL);
0f5d15d6
IZ
3372 if (!SIZE_ONLY)
3373 ret->flags = 1;
830247a4 3374 regtail(pRExC_state, ret, reg(pRExC_state, 1, &flag));
c277df42 3375 goto insert_if;
b81d288d 3376 }
a0ed51b3 3377 }
830247a4 3378 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
fac92740 3379 /* (?(1)...) */
830247a4 3380 parno = atoi(RExC_parse++);
c277df42 3381
830247a4
IZ
3382 while (isDIGIT(*RExC_parse))
3383 RExC_parse++;
fac92740 3384 ret = reganode(pRExC_state, GROUPP, parno);
2af232bd 3385
830247a4 3386 if ((c = *nextchar(pRExC_state)) != ')')
b45f050a 3387 vFAIL("Switch condition not recognized");
c277df42 3388 insert_if:
830247a4
IZ
3389 regtail(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
3390 br = regbranch(pRExC_state, &flags, 1);
c277df42 3391 if (br == NULL)
830247a4 3392 br = reganode(pRExC_state, LONGJMP, 0);
c277df42 3393 else
830247a4
IZ
3394 regtail(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
3395 c = *nextchar(pRExC_state);
d1b80229
IZ
3396 if (flags&HASWIDTH)
3397 *flagp |= HASWIDTH;
c277df42 3398 if (c == '|') {
830247a4
IZ
3399 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
3400 regbranch(pRExC_state, &flags, 1);
3401 regtail(pRExC_state, ret, lastbr);
d1b80229
IZ
3402 if (flags&HASWIDTH)
3403 *flagp |= HASWIDTH;
830247a4 3404 c = *nextchar(pRExC_state);
a0ed51b3
LW
3405 }
3406 else
c277df42
IZ
3407 lastbr = NULL;
3408 if (c != ')')
8615cb43 3409 vFAIL("Switch (?(condition)... contains too many branches");
830247a4
IZ
3410 ender = reg_node(pRExC_state, TAIL);
3411 regtail(pRExC_state, br, ender);
c277df42 3412 if (lastbr) {
830247a4
IZ
3413 regtail(pRExC_state, lastbr, ender);
3414 regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
a0ed51b3
LW
3415 }
3416 else
830247a4 3417 regtail(pRExC_state, ret, ender);
c277df42 3418 return ret;
a0ed51b3
LW
3419 }
3420 else {
830247a4 3421 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
c277df42
IZ
3422 }
3423 }
1b1626e4 3424 case 0:
830247a4 3425 RExC_parse--; /* for vFAIL to print correctly */
8615cb43 3426 vFAIL("Sequence (? incomplete");
1b1626e4 3427 break;
a0d0e21e 3428 default:
830247a4 3429 --RExC_parse;
fac92740 3430 parse_flags: /* (?i) */
830247a4 3431 while (*RExC_parse && strchr("iogcmsx", *RExC_parse)) {
9d1d55b5
JP
3432 /* (?g), (?gc) and (?o) are useless here
3433 and must be globally applied -- japhy */
3434
3435 if (*RExC_parse == 'o' || *RExC_parse == 'g') {
3436 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
3437 I32 wflagbit = *RExC_parse == 'o' ? wasted_o : wasted_g;
3438 if (! (wastedflags & wflagbit) ) {
3439 wastedflags |= wflagbit;
3440 vWARN5(
3441 RExC_parse + 1,
3442 "Useless (%s%c) - %suse /%c modifier",
3443 flagsp == &negflags ? "?-" : "?",
3444 *RExC_parse,
3445 flagsp == &negflags ? "don't " : "",
3446 *RExC_parse
3447 );
3448 }
3449 }
3450 }
3451 else if (*RExC_parse == 'c') {
3452 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
3453 if (! (wastedflags & wasted_c) ) {
3454 wastedflags |= wasted_gc;
3455 vWARN3(
3456 RExC_parse + 1,
3457 "Useless (%sc) - %suse /gc modifier",
3458 flagsp == &negflags ? "?-" : "?",
3459 flagsp == &negflags ? "don't " : ""
3460 );
3461 }
3462 }
3463 }
3464 else { pmflag(flagsp, *RExC_parse); }
3465
830247a4 3466 ++RExC_parse;
ca9dfc88 3467 }
830247a4 3468 if (*RExC_parse == '-') {
ca9dfc88 3469 flagsp = &negflags;
9d1d55b5 3470 wastedflags = 0; /* reset so (?g-c) warns twice */
830247a4 3471 ++RExC_parse;
ca9dfc88 3472 goto parse_flags;
48c036b1 3473 }
e2509266
JH
3474 RExC_flags |= posflags;
3475 RExC_flags &= ~negflags;
830247a4
IZ
3476 if (*RExC_parse == ':') {
3477 RExC_parse++;
ca9dfc88
IZ
3478 paren = ':';
3479 break;
3480 }
c277df42 3481 unknown:
830247a4
IZ
3482 if (*RExC_parse != ')') {
3483 RExC_parse++;
3484 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
b45f050a 3485 }
830247a4 3486 nextchar(pRExC_state);
a0d0e21e
LW
3487 *flagp = TRYAGAIN;
3488 return NULL;
3489 }
3490 }
fac92740 3491 else { /* (...) */
830247a4
IZ
3492 parno = RExC_npar;
3493 RExC_npar++;
3494 ret = reganode(pRExC_state, OPEN, parno);
fac92740
MJD
3495 Set_Node_Length(ret, 1); /* MJD */
3496 Set_Node_Offset(ret, RExC_parse); /* MJD */
c277df42 3497 open = 1;
a0d0e21e 3498 }
a0ed51b3 3499 }
fac92740 3500 else /* ! paren */
a0d0e21e
LW
3501 ret = NULL;
3502
3503 /* Pick up the branches, linking them together. */
fac92740 3504 parse_start = RExC_parse; /* MJD */
830247a4 3505 br = regbranch(pRExC_state, &flags, 1);
fac92740 3506 /* branch_len = (paren != 0); */
2af232bd 3507
a0d0e21e
LW
3508 if (br == NULL)
3509 return(NULL);
830247a4
IZ
3510 if (*RExC_parse == '|') {
3511 if (!SIZE_ONLY && RExC_extralen) {
3512 reginsert(pRExC_state, BRANCHJ, br);
a0ed51b3 3513 }
fac92740 3514 else { /* MJD */
830247a4 3515 reginsert(pRExC_state, BRANCH, br);
fac92740
MJD
3516 Set_Node_Length(br, paren != 0);
3517 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
3518 }
c277df42
IZ
3519 have_branch = 1;
3520 if (SIZE_ONLY)
830247a4 3521 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
a0ed51b3
LW
3522 }
3523 else if (paren == ':') {
c277df42
IZ
3524 *flagp |= flags&SIMPLE;
3525 }
3526 if (open) { /* Starts with OPEN. */
830247a4 3527 regtail(pRExC_state, ret, br); /* OPEN -> first. */
a0ed51b3
LW
3528 }
3529 else if (paren != '?') /* Not Conditional */
a0d0e21e 3530 ret = br;
32a0ca98 3531 *flagp |= flags & (SPSTART | HASWIDTH);
c277df42 3532 lastbr = br;
830247a4
IZ
3533 while (*RExC_parse == '|') {
3534 if (!SIZE_ONLY && RExC_extralen) {
3535 ender = reganode(pRExC_state, LONGJMP,0);
3536 regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
c277df42
IZ
3537 }
3538 if (SIZE_ONLY)
830247a4
IZ
3539 RExC_extralen += 2; /* Account for LONGJMP. */
3540 nextchar(pRExC_state);
3541 br = regbranch(pRExC_state, &flags, 0);
2af232bd 3542
a687059c 3543 if (br == NULL)
a0d0e21e 3544 return(NULL);
830247a4 3545 regtail(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
c277df42 3546 lastbr = br;
821b33a5
IZ
3547 if (flags&HASWIDTH)
3548 *flagp |= HASWIDTH;
a687059c 3549 *flagp |= flags&SPSTART;
a0d0e21e
LW
3550 }
3551
c277df42
IZ
3552 if (have_branch || paren != ':') {
3553 /* Make a closing node, and hook it on the end. */
3554 switch (paren) {
3555 case ':':
830247a4 3556 ender = reg_node(pRExC_state, TAIL);
c277df42
IZ
3557 break;
3558 case 1:
830247a4 3559 ender = reganode(pRExC_state, CLOSE, parno);
fac92740
MJD
3560 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
3561 Set_Node_Length(ender,1); /* MJD */
c277df42
IZ
3562 break;
3563 case '<':
c277df42
IZ
3564 case ',':
3565 case '=':
3566 case '!':
c277df42 3567 *flagp &= ~HASWIDTH;
821b33a5
IZ
3568 /* FALL THROUGH */
3569 case '>':
830247a4 3570 ender = reg_node(pRExC_state, SUCCEED);
c277df42
IZ
3571 break;
3572 case 0:
830247a4 3573 ender = reg_node(pRExC_state, END);
c277df42
IZ
3574 break;
3575 }
830247a4 3576 regtail(pRExC_state, lastbr, ender);
a0d0e21e 3577
c277df42
IZ
3578 if (have_branch) {
3579 /* Hook the tails of the branches to the closing node. */
3580 for (br = ret; br != NULL; br = regnext(br)) {
830247a4 3581 regoptail(pRExC_state, br, ender);
c277df42
IZ
3582 }
3583 }
a0d0e21e 3584 }
c277df42
IZ
3585
3586 {
e1ec3a88
AL
3587 const char *p;
3588 static const char parens[] = "=!<,>";
c277df42
IZ
3589
3590 if (paren && (p = strchr(parens, paren))) {
eb160463 3591 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
c277df42
IZ
3592 int flag = (p - parens) > 1;
3593
3594 if (paren == '>')
3595 node = SUSPEND, flag = 0;
830247a4 3596 reginsert(pRExC_state, node,ret);
45948336
EP
3597 Set_Node_Cur_Length(ret);
3598 Set_Node_Offset(ret, parse_start + 1);
c277df42 3599 ret->flags = flag;
830247a4 3600 regtail(pRExC_state, ret, reg_node(pRExC_state, TAIL));
c277df42 3601 }
a0d0e21e
LW
3602 }
3603
3604 /* Check for proper termination. */
ce3e6498 3605 if (paren) {
e2509266 3606 RExC_flags = oregflags;
830247a4
IZ
3607 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
3608 RExC_parse = oregcomp_parse;
380a0633 3609 vFAIL("Unmatched (");
ce3e6498 3610 }
a0ed51b3 3611 }
830247a4
IZ
3612 else if (!paren && RExC_parse < RExC_end) {
3613 if (*RExC_parse == ')') {
3614 RExC_parse++;
380a0633 3615 vFAIL("Unmatched )");
a0ed51b3
LW
3616 }
3617 else
b45f050a 3618 FAIL("Junk on end of regexp"); /* "Can't happen". */
a0d0e21e
LW
3619 /* NOTREACHED */
3620 }
a687059c 3621
a0d0e21e 3622 return(ret);
a687059c
LW
3623}
3624
3625/*
3626 - regbranch - one alternative of an | operator
3627 *
3628 * Implements the concatenation operator.
3629 */
76e3520e 3630STATIC regnode *
830247a4 3631S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first)
a687059c 3632{
97aff369 3633 dVAR;
c277df42
IZ
3634 register regnode *ret;
3635 register regnode *chain = NULL;
3636 register regnode *latest;
3637 I32 flags = 0, c = 0;
a0d0e21e 3638
b81d288d 3639 if (first)
c277df42
IZ
3640 ret = NULL;
3641 else {
b81d288d 3642 if (!SIZE_ONLY && RExC_extralen)
830247a4 3643 ret = reganode(pRExC_state, BRANCHJ,0);
fac92740 3644 else {
830247a4 3645 ret = reg_node(pRExC_state, BRANCH);
fac92740
MJD
3646 Set_Node_Length(ret, 1);
3647 }
c277df42
IZ
3648 }
3649
b81d288d 3650 if (!first && SIZE_ONLY)
830247a4 3651 RExC_extralen += 1; /* BRANCHJ */
b81d288d 3652
c277df42 3653 *flagp = WORST; /* Tentatively. */
a0d0e21e 3654
830247a4
IZ
3655 RExC_parse--;
3656 nextchar(pRExC_state);
3657 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
a0d0e21e 3658 flags &= ~TRYAGAIN;
830247a4 3659 latest = regpiece(pRExC_state, &flags);
a0d0e21e
LW
3660 if (latest == NULL) {
3661 if (flags & TRYAGAIN)
3662 continue;
3663 return(NULL);
a0ed51b3
LW
3664 }
3665 else if (ret == NULL)
c277df42 3666 ret = latest;
a0d0e21e 3667 *flagp |= flags&HASWIDTH;
c277df42 3668 if (chain == NULL) /* First piece. */
a0d0e21e
LW
3669 *flagp |= flags&SPSTART;
3670 else {
830247a4
IZ
3671 RExC_naughty++;
3672 regtail(pRExC_state, chain, latest);
a687059c 3673 }
a0d0e21e 3674 chain = latest;
c277df42
IZ
3675 c++;
3676 }
3677 if (chain == NULL) { /* Loop ran zero times. */
830247a4 3678 chain = reg_node(pRExC_state, NOTHING);
c277df42
IZ
3679 if (ret == NULL)
3680 ret = chain;
3681 }
3682 if (c == 1) {
3683 *flagp |= flags&SIMPLE;
a0d0e21e 3684 }
a687059c 3685
a0d0e21e 3686 return(ret);
a687059c
LW
3687}
3688
3689/*
3690 - regpiece - something followed by possible [*+?]
3691 *
3692 * Note that the branching code sequences used for ? and the general cases
3693 * of * and + are somewhat optimized: they use the same NOTHING node as
3694 * both the endmarker for their branch list and the body of the last branch.
3695 * It might seem that this node could be dispensed with entirely, but the
3696 * endmarker role is not redundant.
3697 */
76e3520e 3698STATIC regnode *
830247a4 3699S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
a687059c 3700{
97aff369 3701 dVAR;
c277df42 3702 register regnode *ret;
a0d0e21e
LW
3703 register char op;
3704 register char *next;
3705 I32 flags;
1df70142 3706 const char * const origparse = RExC_parse;
a0d0e21e
LW
3707 char *maxpos;
3708 I32 min;
c277df42 3709 I32 max = REG_INFTY;
fac92740 3710 char *parse_start;
a0d0e21e 3711
830247a4 3712 ret = regatom(pRExC_state, &flags);
a0d0e21e
LW
3713 if (ret == NULL) {
3714 if (flags & TRYAGAIN)
3715 *flagp |= TRYAGAIN;
3716 return(NULL);
3717 }
3718
830247a4 3719 op = *RExC_parse;
a0d0e21e 3720
830247a4 3721 if (op == '{' && regcurly(RExC_parse)) {
fac92740 3722 parse_start = RExC_parse; /* MJD */
830247a4 3723 next = RExC_parse + 1;
c445ea15 3724 maxpos = NULL;
a0d0e21e
LW
3725 while (isDIGIT(*next) || *next == ',') {
3726 if (*next == ',') {
3727 if (maxpos)
3728 break;
3729 else
3730 maxpos = next;
a687059c 3731 }
a0d0e21e
LW
3732 next++;
3733 }
3734 if (*next == '}') { /* got one */
3735 if (!maxpos)
3736 maxpos = next;
830247a4
IZ
3737 RExC_parse++;
3738 min = atoi(RExC_parse);
a0d0e21e
LW
3739 if (*maxpos == ',')
3740 maxpos++;
3741 else
830247a4 3742 maxpos = RExC_parse;
a0d0e21e
LW
3743 max = atoi(maxpos);
3744 if (!max && *maxpos != '0')
c277df42
IZ
3745 max = REG_INFTY; /* meaning "infinity" */
3746 else if (max >= REG_INFTY)
8615cb43 3747 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
830247a4
IZ
3748 RExC_parse = next;
3749 nextchar(pRExC_state);
a0d0e21e
LW
3750
3751 do_curly:
3752 if ((flags&SIMPLE)) {
830247a4
IZ
3753 RExC_naughty += 2 + RExC_naughty / 2;
3754 reginsert(pRExC_state, CURLY, ret);
fac92740
MJD
3755 Set_Node_Offset(ret, parse_start+1); /* MJD */
3756 Set_Node_Cur_Length(ret);
a0d0e21e
LW
3757 }
3758 else {
830247a4 3759 regnode *w = reg_node(pRExC_state, WHILEM);
2c2d71f5
JH
3760
3761 w->flags = 0;
830247a4
IZ
3762 regtail(pRExC_state, ret, w);
3763 if (!SIZE_ONLY && RExC_extralen) {
3764 reginsert(pRExC_state, LONGJMP,ret);
3765 reginsert(pRExC_state, NOTHING,ret);
c277df42
IZ
3766 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
3767 }
830247a4 3768 reginsert(pRExC_state, CURLYX,ret);
fac92740
MJD
3769 /* MJD hk */
3770 Set_Node_Offset(ret, parse_start+1);
2af232bd 3771 Set_Node_Length(ret,
fac92740 3772 op == '{' ? (RExC_parse - parse_start) : 1);
2af232bd 3773
830247a4 3774 if (!SIZE_ONLY && RExC_extralen)
c277df42 3775 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
830247a4 3776 regtail(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
c277df42 3777 if (SIZE_ONLY)
830247a4
IZ
3778 RExC_whilem_seen++, RExC_extralen += 3;
3779 RExC_naughty += 4 + RExC_naughty; /* compound interest */
a0d0e21e 3780 }
c277df42 3781 ret->flags = 0;
a0d0e21e
LW
3782
3783 if (min > 0)
821b33a5
IZ
3784 *flagp = WORST;
3785 if (max > 0)
3786 *flagp |= HASWIDTH;
a0d0e21e 3787 if (max && max < min)
8615cb43 3788 vFAIL("Can't do {n,m} with n > m");
c277df42 3789 if (!SIZE_ONLY) {
eb160463
GS
3790 ARG1_SET(ret, (U16)min);
3791 ARG2_SET(ret, (U16)max);
a687059c 3792 }
a687059c 3793
a0d0e21e 3794 goto nest_check;
a687059c 3795 }
a0d0e21e 3796 }
a687059c 3797
a0d0e21e
LW
3798 if (!ISMULT1(op)) {
3799 *flagp = flags;
a687059c 3800 return(ret);
a0d0e21e 3801 }
bb20fd44 3802
c277df42 3803#if 0 /* Now runtime fix should be reliable. */
b45f050a
JF
3804
3805 /* if this is reinstated, don't forget to put this back into perldiag:
3806
3807 =item Regexp *+ operand could be empty at {#} in regex m/%s/
3808
3809 (F) The part of the regexp subject to either the * or + quantifier
3810 could match an empty string. The {#} shows in the regular
3811 expression about where the problem was discovered.
3812
3813 */
3814
bb20fd44 3815 if (!(flags&HASWIDTH) && op != '?')
b45f050a 3816 vFAIL("Regexp *+ operand could be empty");
b81d288d 3817#endif
bb20fd44 3818
fac92740 3819 parse_start = RExC_parse;
830247a4 3820 nextchar(pRExC_state);
a0d0e21e 3821
821b33a5 3822 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
a0d0e21e
LW
3823
3824 if (op == '*' && (flags&SIMPLE)) {
830247a4 3825 reginsert(pRExC_state, STAR, ret);
c277df42 3826 ret->flags = 0;
830247a4 3827 RExC_naughty += 4;
a0d0e21e
LW
3828 }
3829 else if (op == '*') {
3830 min = 0;
3831 goto do_curly;
a0ed51b3
LW
3832 }
3833 else if (op == '+' && (flags&SIMPLE)) {
830247a4 3834 reginsert(pRExC_state, PLUS, ret);
c277df42 3835 ret->flags = 0;
830247a4 3836 RExC_naughty += 3;
a0d0e21e
LW
3837 }
3838 else if (op == '+') {
3839 min = 1;
3840 goto do_curly;
a0ed51b3
LW
3841 }
3842 else if (op == '?') {
a0d0e21e
LW
3843 min = 0; max = 1;
3844 goto do_curly;
3845 }
3846 nest_check:
041457d9 3847 if (!SIZE_ONLY && !(flags&HASWIDTH) && max > REG_INFTY/3 && ckWARN(WARN_REGEXP)) {
830247a4 3848 vWARN3(RExC_parse,
b45f050a 3849 "%.*s matches null string many times",
830247a4 3850 RExC_parse - origparse,
b45f050a 3851 origparse);
a0d0e21e
LW
3852 }
3853
830247a4
IZ
3854 if (*RExC_parse == '?') {
3855 nextchar(pRExC_state);
3856 reginsert(pRExC_state, MINMOD, ret);
3857 regtail(pRExC_state, ret, ret + NODE_STEP_REGNODE);
a0d0e21e 3858 }
830247a4
IZ
3859 if (ISMULT2(RExC_parse)) {
3860 RExC_parse++;
b45f050a
JF
3861 vFAIL("Nested quantifiers");
3862 }
a0d0e21e
LW
3863
3864 return(ret);
a687059c
LW
3865}
3866
3867/*
3868 - regatom - the lowest level
3869 *
3870 * Optimization: gobbles an entire sequence of ordinary characters so that
3871 * it can turn them into a single node, which is smaller to store and
3872 * faster to run. Backslashed characters are exceptions, each becoming a
3873 * separate node; the code is simpler that way and it's not worth fixing.
3874 *
b45f050a 3875 * [Yes, it is worth fixing, some scripts can run twice the speed.] */
76e3520e 3876STATIC regnode *
830247a4 3877S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
a687059c 3878{
97aff369 3879 dVAR;
cbbf8932 3880 register regnode *ret = NULL;
a0d0e21e 3881 I32 flags;
45948336 3882 char *parse_start = RExC_parse;
a0d0e21e
LW
3883
3884 *flagp = WORST; /* Tentatively. */
3885
3886tryagain:
830247a4 3887 switch (*RExC_parse) {
a0d0e21e 3888 case '^':
830247a4
IZ
3889 RExC_seen_zerolen++;
3890 nextchar(pRExC_state);
e2509266 3891 if (RExC_flags & PMf_MULTILINE)
830247a4 3892 ret = reg_node(pRExC_state, MBOL);
e2509266 3893 else if (RExC_flags & PMf_SINGLELINE)
830247a4 3894 ret = reg_node(pRExC_state, SBOL);
a0d0e21e 3895 else
830247a4 3896 ret = reg_node(pRExC_state, BOL);
fac92740 3897 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
3898 break;
3899 case '$':
830247a4 3900 nextchar(pRExC_state);
b81d288d 3901 if (*RExC_parse)
830247a4 3902 RExC_seen_zerolen++;
e2509266 3903 if (RExC_flags & PMf_MULTILINE)
830247a4 3904 ret = reg_node(pRExC_state, MEOL);
e2509266 3905 else if (RExC_flags & PMf_SINGLELINE)
830247a4 3906 ret = reg_node(pRExC_state, SEOL);
a0d0e21e 3907 else
830247a4 3908 ret = reg_node(pRExC_state, EOL);
fac92740 3909 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
3910 break;
3911 case '.':
830247a4 3912 nextchar(pRExC_state);
e2509266 3913 if (RExC_flags & PMf_SINGLELINE)
ffc61ed2
JH
3914 ret = reg_node(pRExC_state, SANY);
3915 else
3916 ret = reg_node(pRExC_state, REG_ANY);
3917 *flagp |= HASWIDTH|SIMPLE;
830247a4 3918 RExC_naughty++;
fac92740 3919 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
3920 break;
3921 case '[':
b45f050a 3922 {
830247a4 3923 char *oregcomp_parse = ++RExC_parse;
ffc61ed2 3924 ret = regclass(pRExC_state);
830247a4
IZ
3925 if (*RExC_parse != ']') {
3926 RExC_parse = oregcomp_parse;
b45f050a
JF
3927 vFAIL("Unmatched [");
3928 }
830247a4 3929 nextchar(pRExC_state);
a0d0e21e 3930 *flagp |= HASWIDTH|SIMPLE;
fac92740 3931 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
a0d0e21e 3932 break;
b45f050a 3933 }
a0d0e21e 3934 case '(':
830247a4
IZ
3935 nextchar(pRExC_state);
3936 ret = reg(pRExC_state, 1, &flags);
a0d0e21e 3937 if (ret == NULL) {
bf93d4cc 3938 if (flags & TRYAGAIN) {
830247a4 3939 if (RExC_parse == RExC_end) {
bf93d4cc
GS
3940 /* Make parent create an empty node if needed. */
3941 *flagp |= TRYAGAIN;
3942 return(NULL);
3943 }
a0d0e21e 3944 goto tryagain;
bf93d4cc 3945 }
a0d0e21e
LW
3946 return(NULL);
3947 }
c277df42 3948 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE);
a0d0e21e
LW
3949 break;
3950 case '|':
3951 case ')':
3952 if (flags & TRYAGAIN) {
3953 *flagp |= TRYAGAIN;
3954 return NULL;
3955 }
b45f050a 3956 vFAIL("Internal urp");
a0d0e21e
LW
3957 /* Supposed to be caught earlier. */
3958 break;
85afd4ae 3959 case '{':
830247a4
IZ
3960 if (!regcurly(RExC_parse)) {
3961 RExC_parse++;
85afd4ae
CS
3962 goto defchar;
3963 }
3964 /* FALL THROUGH */
a0d0e21e
LW
3965 case '?':
3966 case '+':
3967 case '*':
830247a4 3968 RExC_parse++;
b45f050a 3969 vFAIL("Quantifier follows nothing");
a0d0e21e
LW
3970 break;
3971 case '\\':
830247a4 3972 switch (*++RExC_parse) {
a0d0e21e 3973 case 'A':
830247a4
IZ
3974 RExC_seen_zerolen++;
3975 ret = reg_node(pRExC_state, SBOL);
a0d0e21e 3976 *flagp |= SIMPLE;
830247a4 3977 nextchar(pRExC_state);
fac92740 3978 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
3979 break;
3980 case 'G':
830247a4
IZ
3981 ret = reg_node(pRExC_state, GPOS);
3982 RExC_seen |= REG_SEEN_GPOS;
a0d0e21e 3983 *flagp |= SIMPLE;
830247a4 3984 nextchar(pRExC_state);
fac92740 3985 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
3986 break;
3987 case 'Z':
830247a4 3988 ret = reg_node(pRExC_state, SEOL);
a0d0e21e 3989 *flagp |= SIMPLE;
a1917ab9 3990 RExC_seen_zerolen++; /* Do not optimize RE away */
830247a4 3991 nextchar(pRExC_state);
a0d0e21e 3992 break;
b85d18e9 3993 case 'z':
830247a4 3994 ret = reg_node(pRExC_state, EOS);
b85d18e9 3995 *flagp |= SIMPLE;
830247a4
IZ
3996 RExC_seen_zerolen++; /* Do not optimize RE away */
3997 nextchar(pRExC_state);
fac92740 3998 Set_Node_Length(ret, 2); /* MJD */
b85d18e9 3999 break;
4a2d328f 4000 case 'C':
f33976b4
DB
4001 ret = reg_node(pRExC_state, CANY);
4002 RExC_seen |= REG_SEEN_CANY;
a0ed51b3 4003 *flagp |= HASWIDTH|SIMPLE;
830247a4 4004 nextchar(pRExC_state);
fac92740 4005 Set_Node_Length(ret, 2); /* MJD */
a0ed51b3
LW
4006 break;
4007 case 'X':
830247a4 4008 ret = reg_node(pRExC_state, CLUMP);
a0ed51b3 4009 *flagp |= HASWIDTH;
830247a4 4010 nextchar(pRExC_state);
fac92740 4011 Set_Node_Length(ret, 2); /* MJD */
a0ed51b3 4012 break;
a0d0e21e 4013 case 'w':
eb160463 4014 ret = reg_node(pRExC_state, (U8)(LOC ? ALNUML : ALNUM));
a0d0e21e 4015 *flagp |= HASWIDTH|SIMPLE;
830247a4 4016 nextchar(pRExC_state);
fac92740 4017 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
4018 break;
4019 case 'W':
eb160463 4020 ret = reg_node(pRExC_state, (U8)(LOC ? NALNUML : NALNUM));
a0d0e21e 4021 *flagp |= HASWIDTH|SIMPLE;
830247a4 4022 nextchar(pRExC_state);
fac92740 4023 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
4024 break;
4025 case 'b':
830247a4
IZ
4026 RExC_seen_zerolen++;
4027 RExC_seen |= REG_SEEN_LOOKBEHIND;
eb160463 4028 ret = reg_node(pRExC_state, (U8)(LOC ? BOUNDL : BOUND));
a0d0e21e 4029 *flagp |= SIMPLE;
830247a4 4030 nextchar(pRExC_state);
fac92740 4031 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
4032 break;
4033 case 'B':
830247a4
IZ
4034 RExC_seen_zerolen++;
4035 RExC_seen |= REG_SEEN_LOOKBEHIND;
eb160463 4036 ret = reg_node(pRExC_state, (U8)(LOC ? NBOUNDL : NBOUND));
a0d0e21e 4037 *flagp |= SIMPLE;
830247a4 4038 nextchar(pRExC_state);
fac92740 4039 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
4040 break;
4041 case 's':
eb160463 4042 ret = reg_node(pRExC_state, (U8)(LOC ? SPACEL : SPACE));
a0d0e21e 4043 *flagp |= HASWIDTH|SIMPLE;
830247a4 4044 nextchar(pRExC_state);
fac92740 4045 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
4046 break;
4047 case 'S':
eb160463 4048 ret = reg_node(pRExC_state, (U8)(LOC ? NSPACEL : NSPACE));
a0d0e21e 4049 *flagp |= HASWIDTH|SIMPLE;
830247a4 4050 nextchar(pRExC_state);
fac92740 4051 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
4052 break;
4053 case 'd':
ffc61ed2 4054 ret = reg_node(pRExC_state, DIGIT);
a0d0e21e 4055 *flagp |= HASWIDTH|SIMPLE;
830247a4 4056 nextchar(pRExC_state);
fac92740 4057 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
4058 break;
4059 case 'D':
ffc61ed2 4060 ret = reg_node(pRExC_state, NDIGIT);
a0d0e21e 4061 *flagp |= HASWIDTH|SIMPLE;
830247a4 4062 nextchar(pRExC_state);
fac92740 4063 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e 4064 break;
a14b48bc
LW
4065 case 'p':
4066 case 'P':
3568d838 4067 {
830247a4 4068 char* oldregxend = RExC_end;
ccb2c380 4069 char* parse_start = RExC_parse - 2;
a14b48bc 4070
830247a4 4071 if (RExC_parse[1] == '{') {
3568d838 4072 /* a lovely hack--pretend we saw [\pX] instead */
830247a4
IZ
4073 RExC_end = strchr(RExC_parse, '}');
4074 if (!RExC_end) {
0da60cf5 4075 U8 c = (U8)*RExC_parse;
830247a4
IZ
4076 RExC_parse += 2;
4077 RExC_end = oldregxend;
0da60cf5 4078 vFAIL2("Missing right brace on \\%c{}", c);
b45f050a 4079 }
830247a4 4080 RExC_end++;
a14b48bc 4081 }
af6f566e 4082 else {
830247a4 4083 RExC_end = RExC_parse + 2;
af6f566e
HS
4084 if (RExC_end > oldregxend)
4085 RExC_end = oldregxend;
4086 }
830247a4 4087 RExC_parse--;
a14b48bc 4088
ffc61ed2 4089 ret = regclass(pRExC_state);
a14b48bc 4090
830247a4
IZ
4091 RExC_end = oldregxend;
4092 RExC_parse--;
ccb2c380
MP
4093
4094 Set_Node_Offset(ret, parse_start + 2);
4095 Set_Node_Cur_Length(ret);
830247a4 4096 nextchar(pRExC_state);
a14b48bc
LW
4097 *flagp |= HASWIDTH|SIMPLE;
4098 }
4099 break;
a0d0e21e
LW
4100 case 'n':
4101 case 'r':
4102 case 't':
4103 case 'f':
4104 case 'e':
4105 case 'a':
4106 case 'x':
4107 case 'c':
4108 case '0':
4109 goto defchar;
4110 case '1': case '2': case '3': case '4':
4111 case '5': case '6': case '7': case '8': case '9':
4112 {
1df70142 4113 const I32 num = atoi(RExC_parse);
a0d0e21e 4114
830247a4 4115 if (num > 9 && num >= RExC_npar)
a0d0e21e
LW
4116 goto defchar;
4117 else {
fac92740 4118 char * parse_start = RExC_parse - 1; /* MJD */
830247a4
IZ
4119 while (isDIGIT(*RExC_parse))
4120 RExC_parse++;
b45f050a 4121
eb160463 4122 if (!SIZE_ONLY && num > (I32)RExC_rx->nparens)
9baa0206 4123 vFAIL("Reference to nonexistent group");
830247a4 4124 RExC_sawback = 1;
eb160463
GS
4125 ret = reganode(pRExC_state,
4126 (U8)(FOLD ? (LOC ? REFFL : REFF) : REF),
4127 num);
a0d0e21e 4128 *flagp |= HASWIDTH;
2af232bd 4129
fac92740 4130 /* override incorrect value set in reganode MJD */
2af232bd 4131 Set_Node_Offset(ret, parse_start+1);
fac92740 4132 Set_Node_Cur_Length(ret); /* MJD */
830247a4
IZ
4133 RExC_parse--;
4134 nextchar(pRExC_state);
a0d0e21e
LW
4135 }
4136 }
4137 break;
4138 case '\0':
830247a4 4139 if (RExC_parse >= RExC_end)
b45f050a 4140 FAIL("Trailing \\");
a0d0e21e
LW
4141 /* FALL THROUGH */
4142 default:
a0288114 4143 /* Do not generate "unrecognized" warnings here, we fall
c9f97d15 4144 back into the quick-grab loop below */
45948336 4145 parse_start--;
a0d0e21e
LW
4146 goto defchar;
4147 }
4148 break;
4633a7c4
LW
4149
4150 case '#':
e2509266 4151 if (RExC_flags & PMf_EXTENDED) {
830247a4
IZ
4152 while (RExC_parse < RExC_end && *RExC_parse != '\n') RExC_parse++;
4153 if (RExC_parse < RExC_end)
4633a7c4
LW
4154 goto tryagain;
4155 }
4156 /* FALL THROUGH */
4157
a0d0e21e 4158 default: {
ba210ebe 4159 register STRLEN len;
58ae7d3f 4160 register UV ender;
a0d0e21e 4161 register char *p;
c277df42 4162 char *oldp, *s;
80aecb99 4163 STRLEN foldlen;
89ebb4a3 4164 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
f06dbbb7
JH
4165
4166 parse_start = RExC_parse - 1;
a0d0e21e 4167
830247a4 4168 RExC_parse++;
a0d0e21e
LW
4169
4170 defchar:
58ae7d3f 4171 ender = 0;
eb160463
GS
4172 ret = reg_node(pRExC_state,
4173 (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
cd439c50 4174 s = STRING(ret);
830247a4
IZ
4175 for (len = 0, p = RExC_parse - 1;
4176 len < 127 && p < RExC_end;
a0d0e21e
LW
4177 len++)
4178 {
4179 oldp = p;
5b5a24f7 4180
e2509266 4181 if (RExC_flags & PMf_EXTENDED)
830247a4 4182 p = regwhite(p, RExC_end);
a0d0e21e
LW
4183 switch (*p) {
4184 case '^':
4185 case '$':
4186 case '.':
4187 case '[':
4188 case '(':
4189 case ')':
4190 case '|':
4191 goto loopdone;
4192 case '\\':
4193 switch (*++p) {
4194 case 'A':
1ed8eac0
JF
4195 case 'C':
4196 case 'X':
a0d0e21e
LW
4197 case 'G':
4198 case 'Z':
b85d18e9 4199 case 'z':
a0d0e21e
LW
4200 case 'w':
4201 case 'W':
4202 case 'b':
4203 case 'B':
4204 case 's':
4205 case 'S':
4206 case 'd':
4207 case 'D':
a14b48bc
LW
4208 case 'p':
4209 case 'P':
a0d0e21e
LW
4210 --p;
4211 goto loopdone;
4212 case 'n':
4213 ender = '\n';
4214 p++;
a687059c 4215 break;
a0d0e21e
LW
4216 case 'r':
4217 ender = '\r';
4218 p++;
a687059c 4219 break;
a0d0e21e
LW
4220 case 't':
4221 ender = '\t';
4222 p++;
a687059c 4223 break;
a0d0e21e
LW
4224 case 'f':
4225 ender = '\f';
4226 p++;
a687059c 4227 break;
a0d0e21e 4228 case 'e':
c7f1f016 4229 ender = ASCII_TO_NATIVE('\033');
a0d0e21e 4230 p++;
a687059c 4231 break;
a0d0e21e 4232 case 'a':
c7f1f016 4233 ender = ASCII_TO_NATIVE('\007');
a0d0e21e 4234 p++;
a687059c 4235 break;
a0d0e21e 4236 case 'x':
a0ed51b3 4237 if (*++p == '{') {
1df70142 4238 char* const e = strchr(p, '}');
b81d288d 4239
b45f050a 4240 if (!e) {
830247a4 4241 RExC_parse = p + 1;
b45f050a
JF
4242 vFAIL("Missing right brace on \\x{}");
4243 }
de5f0749 4244 else {
a4c04bdc
NC
4245 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
4246 | PERL_SCAN_DISALLOW_PREFIX;
1df70142 4247 STRLEN numlen = e - p - 1;
53305cf1 4248 ender = grok_hex(p + 1, &numlen, &flags, NULL);
aaa80028
JH
4249 if (ender > 0xff)
4250 RExC_utf8 = 1;
a0ed51b3
LW
4251 p = e + 1;
4252 }
a0ed51b3
LW
4253 }
4254 else {
a4c04bdc 4255 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
1df70142 4256 STRLEN numlen = 2;
53305cf1 4257 ender = grok_hex(p, &numlen, &flags, NULL);
a0ed51b3
LW
4258 p += numlen;
4259 }
a687059c 4260 break;
a0d0e21e
LW
4261 case 'c':
4262 p++;
bbce6d69 4263 ender = UCHARAT(p++);
4264 ender = toCTRL(ender);
a687059c 4265 break;
a0d0e21e
LW
4266 case '0': case '1': case '2': case '3':case '4':
4267 case '5': case '6': case '7': case '8':case '9':
4268 if (*p == '0' ||
830247a4 4269 (isDIGIT(p[1]) && atoi(p) >= RExC_npar) ) {
53305cf1 4270 I32 flags = 0;
1df70142 4271 STRLEN numlen = 3;
53305cf1 4272 ender = grok_oct(p, &numlen, &flags, NULL);
a0d0e21e
LW
4273 p += numlen;
4274 }
4275 else {
4276 --p;
4277 goto loopdone;
a687059c
LW
4278 }
4279 break;
a0d0e21e 4280 case '\0':
830247a4 4281 if (p >= RExC_end)
b45f050a 4282 FAIL("Trailing \\");
a687059c 4283 /* FALL THROUGH */
a0d0e21e 4284 default:
041457d9 4285 if (!SIZE_ONLY&& isALPHA(*p) && ckWARN(WARN_REGEXP))
4193bef7 4286 vWARN2(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
a0ed51b3 4287 goto normal_default;
a0d0e21e
LW
4288 }
4289 break;
a687059c 4290 default:
a0ed51b3 4291 normal_default:
fd400ab9 4292 if (UTF8_IS_START(*p) && UTF) {
1df70142 4293 STRLEN numlen;
5e12f4fb 4294 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
ba210ebe 4295 &numlen, 0);
a0ed51b3
LW
4296 p += numlen;
4297 }
4298 else
4299 ender = *p++;
a0d0e21e 4300 break;
a687059c 4301 }
e2509266 4302 if (RExC_flags & PMf_EXTENDED)
830247a4 4303 p = regwhite(p, RExC_end);
60a8b682
JH
4304 if (UTF && FOLD) {
4305 /* Prime the casefolded buffer. */
ac7e0132 4306 ender = toFOLD_uni(ender, tmpbuf, &foldlen);
60a8b682 4307 }
a0d0e21e
LW
4308 if (ISMULT2(p)) { /* Back off on ?+*. */
4309 if (len)
4310 p = oldp;
16ea2a2e 4311 else if (UTF) {
0ebc6274
JH
4312 STRLEN unilen;
4313
80aecb99 4314 if (FOLD) {
60a8b682 4315 /* Emit all the Unicode characters. */
1df70142 4316 STRLEN numlen;
80aecb99
JH
4317 for (foldbuf = tmpbuf;
4318 foldlen;
4319 foldlen -= numlen) {
4320 ender = utf8_to_uvchr(foldbuf, &numlen);
9dc45d57 4321 if (numlen > 0) {
0ebc6274
JH
4322 reguni(pRExC_state, ender, s, &unilen);
4323 s += unilen;
4324 len += unilen;
4325 /* In EBCDIC the numlen
4326 * and unilen can differ. */
9dc45d57 4327 foldbuf += numlen;
47654450
JH
4328 if (numlen >= foldlen)
4329 break;
9dc45d57
JH
4330 }
4331 else
4332 break; /* "Can't happen." */
80aecb99
JH
4333 }
4334 }
4335 else {
0ebc6274 4336 reguni(pRExC_state, ender, s, &unilen);
9ede7db1 4337 if (unilen > 0) {
0ebc6274
JH
4338 s += unilen;
4339 len += unilen;
9dc45d57 4340 }
80aecb99 4341 }
a0ed51b3 4342 }
a0d0e21e
LW
4343 else {
4344 len++;
eb160463 4345 REGC((char)ender, s++);
a0d0e21e
LW
4346 }
4347 break;
a687059c 4348 }
16ea2a2e 4349 if (UTF) {
0ebc6274
JH
4350 STRLEN unilen;
4351
80aecb99 4352 if (FOLD) {
60a8b682 4353 /* Emit all the Unicode characters. */
1df70142 4354 STRLEN numlen;
80aecb99
JH
4355 for (foldbuf = tmpbuf;
4356 foldlen;
4357 foldlen -= numlen) {
4358 ender = utf8_to_uvchr(foldbuf, &numlen);
9dc45d57 4359 if (numlen > 0) {
0ebc6274
JH
4360 reguni(pRExC_state, ender, s, &unilen);
4361 len += unilen;
4362 s += unilen;
4363 /* In EBCDIC the numlen
4364 * and unilen can differ. */
9dc45d57 4365 foldbuf += numlen;
47654450
JH
4366 if (numlen >= foldlen)
4367 break;
9dc45d57
JH
4368 }
4369 else
4370 break;
80aecb99
JH
4371 }
4372 }
4373 else {
0ebc6274 4374 reguni(pRExC_state, ender, s, &unilen);
9ede7db1 4375 if (unilen > 0) {
0ebc6274
JH
4376 s += unilen;
4377 len += unilen;
9dc45d57 4378 }
80aecb99
JH
4379 }
4380 len--;
a0ed51b3
LW
4381 }
4382 else
eb160463 4383 REGC((char)ender, s++);
a0d0e21e
LW
4384 }
4385 loopdone:
830247a4 4386 RExC_parse = p - 1;
fac92740 4387 Set_Node_Cur_Length(ret); /* MJD */
830247a4 4388 nextchar(pRExC_state);
793db0cb
JH
4389 {
4390 /* len is STRLEN which is unsigned, need to copy to signed */
4391 IV iv = len;
4392 if (iv < 0)
4393 vFAIL("Internal disaster");
4394 }
a0d0e21e
LW
4395 if (len > 0)
4396 *flagp |= HASWIDTH;
090f7165 4397 if (len == 1 && UNI_IS_INVARIANT(ender))
a0d0e21e 4398 *flagp |= SIMPLE;
c277df42 4399 if (!SIZE_ONLY)
cd439c50
IZ
4400 STR_LEN(ret) = len;
4401 if (SIZE_ONLY)
830247a4 4402 RExC_size += STR_SZ(len);
cd439c50 4403 else
830247a4 4404 RExC_emit += STR_SZ(len);
a687059c 4405 }
a0d0e21e
LW
4406 break;
4407 }
a687059c 4408
60a8b682
JH
4409 /* If the encoding pragma is in effect recode the text of
4410 * any EXACT-kind nodes. */
22c54be3 4411 if (PL_encoding && PL_regkind[(U8)OP(ret)] == EXACT) {
d0063567
DK
4412 STRLEN oldlen = STR_LEN(ret);
4413 SV *sv = sv_2mortal(newSVpvn(STRING(ret), oldlen));
4414
4415 if (RExC_utf8)
4416 SvUTF8_on(sv);
4417 if (sv_utf8_downgrade(sv, TRUE)) {
1df70142
AL
4418 const char * const s = sv_recode_to_utf8(sv, PL_encoding);
4419 const STRLEN newlen = SvCUR(sv);
d0063567
DK
4420
4421 if (SvUTF8(sv))
4422 RExC_utf8 = 1;
4423 if (!SIZE_ONLY) {
a3621e74
YO
4424 GET_RE_DEBUG_FLAGS_DECL;
4425 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "recode %*s to %*s\n",
d0063567
DK
4426 (int)oldlen, STRING(ret),
4427 (int)newlen, s));
4428 Copy(s, STRING(ret), newlen, char);
4429 STR_LEN(ret) += newlen - oldlen;
4430 RExC_emit += STR_SZ(newlen) - STR_SZ(oldlen);
4431 } else
4432 RExC_size += STR_SZ(newlen) - STR_SZ(oldlen);
4433 }
a72c7584
JH
4434 }
4435
a0d0e21e 4436 return(ret);
a687059c
LW
4437}
4438
873ef191 4439STATIC char *
504618e9 4440S_regwhite(pTHX_ char *p, const char *e)
5b5a24f7
CS
4441{
4442 while (p < e) {
4443 if (isSPACE(*p))
4444 ++p;
4445 else if (*p == '#') {
4446 do {
4447 p++;
4448 } while (p < e && *p != '\n');
4449 }
4450 else
4451 break;
4452 }
4453 return p;
4454}
4455
b8c5462f
JH
4456/* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
4457 Character classes ([:foo:]) can also be negated ([:^foo:]).
4458 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
4459 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
beeb77fc 4460 but trigger failures because they are currently unimplemented. */
9a86a77b
JH
4461
4462#define POSIXCC_DONE(c) ((c) == ':')
4463#define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
4464#define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
4465
b8c5462f 4466STATIC I32
830247a4 4467S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
620e46c5 4468{
97aff369 4469 dVAR;
936ed897 4470 I32 namedclass = OOB_NAMEDCLASS;
620e46c5 4471
830247a4 4472 if (value == '[' && RExC_parse + 1 < RExC_end &&
620e46c5 4473 /* I smell either [: or [= or [. -- POSIX has been here, right? */
9a86a77b 4474 POSIXCC(UCHARAT(RExC_parse))) {
1df70142 4475 const char c = UCHARAT(RExC_parse);
830247a4 4476 char* s = RExC_parse++;
b81d288d 4477
9a86a77b 4478 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
830247a4
IZ
4479 RExC_parse++;
4480 if (RExC_parse == RExC_end)
620e46c5 4481 /* Grandfather lone [:, [=, [. */
830247a4 4482 RExC_parse = s;
620e46c5 4483 else {
1df70142 4484 const char* t = RExC_parse++; /* skip over the c */
a28509cc 4485 const char *posixcc;
b8c5462f 4486
80916619
NC
4487 assert(*t == c);
4488
9a86a77b 4489 if (UCHARAT(RExC_parse) == ']') {
830247a4 4490 RExC_parse++; /* skip over the ending ] */
b8c5462f
JH
4491 posixcc = s + 1;
4492 if (*s == ':') {
1df70142
AL
4493 const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
4494 const I32 skip = t - posixcc;
80916619
NC
4495
4496 /* Initially switch on the length of the name. */
4497 switch (skip) {
4498 case 4:
4499 if (memEQ(posixcc, "word", 4)) {
4500 /* this is not POSIX, this is the Perl \w */;
4501 namedclass
4502 = complement ? ANYOF_NALNUM : ANYOF_ALNUM;
4503 }
cc4319de 4504 break;
80916619
NC
4505 case 5:
4506 /* Names all of length 5. */
4507 /* alnum alpha ascii blank cntrl digit graph lower
4508 print punct space upper */
4509 /* Offset 4 gives the best switch position. */
4510 switch (posixcc[4]) {
4511 case 'a':
4512 if (memEQ(posixcc, "alph", 4)) {
4513 /* a */
4514 namedclass
4515 = complement ? ANYOF_NALPHA : ANYOF_ALPHA;
4516 }
4517 break;
4518 case 'e':
4519 if (memEQ(posixcc, "spac", 4)) {
4520 /* e */
4521 namedclass
4522 = complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
4523 }
4524 break;
4525 case 'h':
4526 if (memEQ(posixcc, "grap", 4)) {
4527 /* h */
4528 namedclass
4529 = complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
4530 }
4531 break;
4532 case 'i':
4533 if (memEQ(posixcc, "asci", 4)) {
4534 /* i */
4535 namedclass
4536 = complement ? ANYOF_NASCII : ANYOF_ASCII;
4537 }
4538 break;
4539 case 'k':
4540 if (memEQ(posixcc, "blan", 4)) {
4541 /* k */
4542 namedclass
4543 = complement ? ANYOF_NBLANK : ANYOF_BLANK;
4544 }
4545 break;
4546 case 'l':
4547 if (memEQ(posixcc, "cntr", 4)) {
4548 /* l */
4549 namedclass
4550 = complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
4551 }
4552 break;
4553 case 'm':
4554 if (memEQ(posixcc, "alnu", 4)) {
4555 /* m */
4556 namedclass
4557 = complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
4558 }
4559 break;
4560 case 'r':
4561 if (memEQ(posixcc, "lowe", 4)) {
4562 /* r */
4563 namedclass
4564 = complement ? ANYOF_NLOWER : ANYOF_LOWER;
4565 }
4566 if (memEQ(posixcc, "uppe", 4)) {
8fdec511 4567 /* r */
80916619
NC
4568 namedclass
4569 = complement ? ANYOF_NUPPER : ANYOF_UPPER;
4570 }
4571 break;
4572 case 't':
4573 if (memEQ(posixcc, "digi", 4)) {
4574 /* t */
4575 namedclass
4576 = complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
4577 }
4578 if (memEQ(posixcc, "prin", 4)) {
8fdec511 4579 /* t */
80916619
NC
4580 namedclass
4581 = complement ? ANYOF_NPRINT : ANYOF_PRINT;
4582 }
4583 if (memEQ(posixcc, "punc", 4)) {
4584 /* t */
4585 namedclass
4586 = complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
4587 }
4588 break;
b8c5462f
JH
4589 }
4590 break;
80916619
NC
4591 case 6:
4592 if (memEQ(posixcc, "xdigit", 6)) {
4593 namedclass
4594 = complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
b8c5462f
JH
4595 }
4596 break;
4597 }
80916619
NC
4598
4599 if (namedclass == OOB_NAMEDCLASS)
b45f050a
JF
4600 {
4601 Simple_vFAIL3("POSIX class [:%.*s:] unknown",
4602 t - s - 1, s + 1);
4603 }
80916619
NC
4604 assert (posixcc[skip] == ':');
4605 assert (posixcc[skip+1] == ']');
b45f050a 4606 } else if (!SIZE_ONLY) {
b8c5462f 4607 /* [[=foo=]] and [[.foo.]] are still future. */
b45f050a 4608
830247a4 4609 /* adjust RExC_parse so the warning shows after
b45f050a 4610 the class closes */
9a86a77b 4611 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
830247a4 4612 RExC_parse++;
b45f050a
JF
4613 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
4614 }
b8c5462f
JH
4615 } else {
4616 /* Maternal grandfather:
4617 * "[:" ending in ":" but not in ":]" */
830247a4 4618 RExC_parse = s;
767d463e 4619 }
620e46c5
JH
4620 }
4621 }
4622
b8c5462f
JH
4623 return namedclass;
4624}
4625
4626STATIC void
830247a4 4627S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
b8c5462f 4628{
97aff369 4629 dVAR;
b938889d 4630 if (!SIZE_ONLY && POSIXCC(UCHARAT(RExC_parse))) {
1df70142
AL
4631 const char *s = RExC_parse;
4632 const char c = *s++;
b8c5462f
JH
4633
4634 while(*s && isALNUM(*s))
4635 s++;
4636 if (*s && c == *s && s[1] == ']') {
cd84f5b2
RGS
4637 if (ckWARN(WARN_REGEXP))
4638 vWARN3(s+2,
4639 "POSIX syntax [%c %c] belongs inside character classes",
4640 c, c);
b45f050a
JF
4641
4642 /* [[=foo=]] and [[.foo.]] are still future. */
9a86a77b 4643 if (POSIXCC_NOTYET(c)) {
830247a4 4644 /* adjust RExC_parse so the error shows after
b45f050a 4645 the class closes */
9a86a77b 4646 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
b45f050a
JF
4647 ;
4648 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
4649 }
b8c5462f
JH
4650 }
4651 }
620e46c5
JH
4652}
4653
76e3520e 4654STATIC regnode *
830247a4 4655S_regclass(pTHX_ RExC_state_t *pRExC_state)
a687059c 4656{
97aff369 4657 dVAR;
ffc61ed2 4658 register UV value;
9a86a77b 4659 register UV nextvalue;
3568d838 4660 register IV prevvalue = OOB_UNICODE;
ffc61ed2 4661 register IV range = 0;
c277df42 4662 register regnode *ret;
ba210ebe 4663 STRLEN numlen;
ffc61ed2 4664 IV namedclass;
cbbf8932 4665 char *rangebegin = NULL;
936ed897 4666 bool need_class = 0;
c445ea15 4667 SV *listsv = NULL;
ffc61ed2
JH
4668 register char *e;
4669 UV n;
9e55ce06 4670 bool optimize_invert = TRUE;
cbbf8932 4671 AV* unicode_alternate = NULL;
1b2d223b
JH
4672#ifdef EBCDIC
4673 UV literal_endpoint = 0;
4674#endif
ffc61ed2
JH
4675
4676 ret = reganode(pRExC_state, ANYOF, 0);
4677
4678 if (!SIZE_ONLY)
4679 ANYOF_FLAGS(ret) = 0;
4680
9a86a77b 4681 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
ffc61ed2
JH
4682 RExC_naughty++;
4683 RExC_parse++;
4684 if (!SIZE_ONLY)
4685 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
4686 }
a0d0e21e 4687
936ed897 4688 if (SIZE_ONLY)
830247a4 4689 RExC_size += ANYOF_SKIP;
936ed897 4690 else {
830247a4 4691 RExC_emit += ANYOF_SKIP;
936ed897
IZ
4692 if (FOLD)
4693 ANYOF_FLAGS(ret) |= ANYOF_FOLD;
4694 if (LOC)
4695 ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
ffc61ed2 4696 ANYOF_BITMAP_ZERO(ret);
396482e1 4697 listsv = newSVpvs("# comment\n");
a0d0e21e 4698 }
b8c5462f 4699
9a86a77b
JH
4700 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
4701
b938889d 4702 if (!SIZE_ONLY && POSIXCC(nextvalue))
830247a4 4703 checkposixcc(pRExC_state);
b8c5462f 4704
f064b6ad
HS
4705 /* allow 1st char to be ] (allowing it to be - is dealt with later) */
4706 if (UCHARAT(RExC_parse) == ']')
4707 goto charclassloop;
ffc61ed2 4708
9a86a77b 4709 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
ffc61ed2
JH
4710
4711 charclassloop:
4712
4713 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
4714
73b437c8 4715 if (!range)
830247a4 4716 rangebegin = RExC_parse;
ffc61ed2 4717 if (UTF) {
5e12f4fb 4718 value = utf8n_to_uvchr((U8*)RExC_parse,
3568d838
JH
4719 RExC_end - RExC_parse,
4720 &numlen, 0);
ffc61ed2
JH
4721 RExC_parse += numlen;
4722 }
4723 else
4724 value = UCHARAT(RExC_parse++);
9a86a77b
JH
4725 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
4726 if (value == '[' && POSIXCC(nextvalue))
830247a4 4727 namedclass = regpposixcc(pRExC_state, value);
620e46c5 4728 else if (value == '\\') {
ffc61ed2 4729 if (UTF) {
5e12f4fb 4730 value = utf8n_to_uvchr((U8*)RExC_parse,
ffc61ed2
JH
4731 RExC_end - RExC_parse,
4732 &numlen, 0);
4733 RExC_parse += numlen;
4734 }
4735 else
4736 value = UCHARAT(RExC_parse++);
470c3474 4737 /* Some compilers cannot handle switching on 64-bit integer
ffc61ed2 4738 * values, therefore value cannot be an UV. Yes, this will
e2962f66
JH
4739 * be a problem later if we want switch on Unicode.
4740 * A similar issue a little bit later when switching on
4741 * namedclass. --jhi */
ffc61ed2 4742 switch ((I32)value) {
b8c5462f
JH
4743 case 'w': namedclass = ANYOF_ALNUM; break;
4744 case 'W': namedclass = ANYOF_NALNUM; break;
4745 case 's': namedclass = ANYOF_SPACE; break;
4746 case 'S': namedclass = ANYOF_NSPACE; break;
4747 case 'd': namedclass = ANYOF_DIGIT; break;
4748 case 'D': namedclass = ANYOF_NDIGIT; break;
ffc61ed2
JH
4749 case 'p':
4750 case 'P':
af6f566e 4751 if (RExC_parse >= RExC_end)
2a4859cd 4752 vFAIL2("Empty \\%c{}", (U8)value);
ffc61ed2 4753 if (*RExC_parse == '{') {
1df70142 4754 const U8 c = (U8)value;
ffc61ed2
JH
4755 e = strchr(RExC_parse++, '}');
4756 if (!e)
0da60cf5 4757 vFAIL2("Missing right brace on \\%c{}", c);
ab13f0c7
JH
4758 while (isSPACE(UCHARAT(RExC_parse)))
4759 RExC_parse++;
4760 if (e == RExC_parse)
0da60cf5 4761 vFAIL2("Empty \\%c{}", c);
ffc61ed2 4762 n = e - RExC_parse;
ab13f0c7
JH
4763 while (isSPACE(UCHARAT(RExC_parse + n - 1)))
4764 n--;
ffc61ed2
JH
4765 }
4766 else {
4767 e = RExC_parse;
4768 n = 1;
4769 }
4770 if (!SIZE_ONLY) {
ab13f0c7
JH
4771 if (UCHARAT(RExC_parse) == '^') {
4772 RExC_parse++;
4773 n--;
4774 value = value == 'p' ? 'P' : 'p'; /* toggle */
4775 while (isSPACE(UCHARAT(RExC_parse))) {
4776 RExC_parse++;
4777 n--;
4778 }
4779 }
ffc61ed2 4780 if (value == 'p')
ab13f0c7
JH
4781 Perl_sv_catpvf(aTHX_ listsv,
4782 "+utf8::%.*s\n", (int)n, RExC_parse);
ffc61ed2 4783 else
ab13f0c7
JH
4784 Perl_sv_catpvf(aTHX_ listsv,
4785 "!utf8::%.*s\n", (int)n, RExC_parse);
ffc61ed2
JH
4786 }
4787 RExC_parse = e + 1;
4788 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
f81125e2
JP
4789 namedclass = ANYOF_MAX; /* no official name, but it's named */
4790 break;
b8c5462f
JH
4791 case 'n': value = '\n'; break;
4792 case 'r': value = '\r'; break;
4793 case 't': value = '\t'; break;
4794 case 'f': value = '\f'; break;
4795 case 'b': value = '\b'; break;
c7f1f016
NIS
4796 case 'e': value = ASCII_TO_NATIVE('\033');break;
4797 case 'a': value = ASCII_TO_NATIVE('\007');break;
b8c5462f 4798 case 'x':
ffc61ed2 4799 if (*RExC_parse == '{') {
a4c04bdc
NC
4800 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
4801 | PERL_SCAN_DISALLOW_PREFIX;
ffc61ed2 4802 e = strchr(RExC_parse++, '}');
b81d288d 4803 if (!e)
ffc61ed2 4804 vFAIL("Missing right brace on \\x{}");
53305cf1
NC
4805
4806 numlen = e - RExC_parse;
4807 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
ffc61ed2
JH
4808 RExC_parse = e + 1;
4809 }
4810 else {
a4c04bdc 4811 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
53305cf1
NC
4812 numlen = 2;
4813 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
ffc61ed2
JH
4814 RExC_parse += numlen;
4815 }
b8c5462f
JH
4816 break;
4817 case 'c':
830247a4 4818 value = UCHARAT(RExC_parse++);
b8c5462f
JH
4819 value = toCTRL(value);
4820 break;
4821 case '0': case '1': case '2': case '3': case '4':
4822 case '5': case '6': case '7': case '8': case '9':
53305cf1
NC
4823 {
4824 I32 flags = 0;
4825 numlen = 3;
4826 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
830247a4 4827 RExC_parse += numlen;
b8c5462f 4828 break;
53305cf1 4829 }
1028017a 4830 default:
041457d9 4831 if (!SIZE_ONLY && isALPHA(value) && ckWARN(WARN_REGEXP))
ffc61ed2
JH
4832 vWARN2(RExC_parse,
4833 "Unrecognized escape \\%c in character class passed through",
4834 (int)value);
1028017a 4835 break;
b8c5462f 4836 }
ffc61ed2 4837 } /* end of \blah */
1b2d223b
JH
4838#ifdef EBCDIC
4839 else
4840 literal_endpoint++;
4841#endif
ffc61ed2
JH
4842
4843 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
4844
4845 if (!SIZE_ONLY && !need_class)
936ed897 4846 ANYOF_CLASS_ZERO(ret);
ffc61ed2 4847
936ed897 4848 need_class = 1;
ffc61ed2
JH
4849
4850 /* a bad range like a-\d, a-[:digit:] ? */
4851 if (range) {
73b437c8 4852 if (!SIZE_ONLY) {
e476b1b5 4853 if (ckWARN(WARN_REGEXP))
830247a4 4854 vWARN4(RExC_parse,
b45f050a 4855 "False [] range \"%*.*s\"",
830247a4
IZ
4856 RExC_parse - rangebegin,
4857 RExC_parse - rangebegin,
b45f050a 4858 rangebegin);
3568d838
JH
4859 if (prevvalue < 256) {
4860 ANYOF_BITMAP_SET(ret, prevvalue);
ffc61ed2
JH
4861 ANYOF_BITMAP_SET(ret, '-');
4862 }
4863 else {
4864 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
4865 Perl_sv_catpvf(aTHX_ listsv,
3568d838 4866 "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
ffc61ed2 4867 }
b8c5462f 4868 }
ffc61ed2
JH
4869
4870 range = 0; /* this was not a true range */
73b437c8 4871 }
ffc61ed2 4872
73b437c8 4873 if (!SIZE_ONLY) {
c49a72a9
NC
4874 const char *what = NULL;
4875 char yesno = 0;
4876
3568d838
JH
4877 if (namedclass > OOB_NAMEDCLASS)
4878 optimize_invert = FALSE;
e2962f66
JH
4879 /* Possible truncation here but in some 64-bit environments
4880 * the compiler gets heartburn about switch on 64-bit values.
4881 * A similar issue a little earlier when switching on value.
98f323fa 4882 * --jhi */
e2962f66 4883 switch ((I32)namedclass) {
73b437c8
JH
4884 case ANYOF_ALNUM:
4885 if (LOC)
936ed897 4886 ANYOF_CLASS_SET(ret, ANYOF_ALNUM);
73b437c8
JH
4887 else {
4888 for (value = 0; value < 256; value++)
4889 if (isALNUM(value))
936ed897 4890 ANYOF_BITMAP_SET(ret, value);
73b437c8 4891 }
c49a72a9
NC
4892 yesno = '+';
4893 what = "Word";
73b437c8
JH
4894 break;
4895 case ANYOF_NALNUM:
4896 if (LOC)
936ed897 4897 ANYOF_CLASS_SET(ret, ANYOF_NALNUM);
73b437c8
JH
4898 else {
4899 for (value = 0; value < 256; value++)
4900 if (!isALNUM(value))
936ed897 4901 ANYOF_BITMAP_SET(ret, value);
73b437c8 4902 }
c49a72a9
NC
4903 yesno = '!';
4904 what = "Word";
73b437c8 4905 break;
ffc61ed2 4906 case ANYOF_ALNUMC:
73b437c8 4907 if (LOC)
ffc61ed2 4908 ANYOF_CLASS_SET(ret, ANYOF_ALNUMC);
73b437c8
JH
4909 else {
4910 for (value = 0; value < 256; value++)
ffc61ed2 4911 if (isALNUMC(value))
936ed897 4912 ANYOF_BITMAP_SET(ret, value);
73b437c8 4913 }
c49a72a9
NC
4914 yesno = '+';
4915 what = "Alnum";
73b437c8
JH
4916 break;
4917 case ANYOF_NALNUMC:
4918 if (LOC)
936ed897 4919 ANYOF_CLASS_SET(ret, ANYOF_NALNUMC);
73b437c8
JH
4920 else {
4921 for (value = 0; value < 256; value++)
4922 if (!isALNUMC(value))
936ed897 4923 ANYOF_BITMAP_SET(ret, value);
73b437c8 4924 }
c49a72a9
NC
4925 yesno = '!';
4926 what = "Alnum";
73b437c8
JH
4927 break;
4928 case ANYOF_ALPHA:
4929 if (LOC)
936ed897 4930 ANYOF_CLASS_SET(ret, ANYOF_ALPHA);
73b437c8
JH
4931 else {
4932 for (value = 0; value < 256; value++)
4933 if (isALPHA(value))
936ed897 4934 ANYOF_BITMAP_SET(ret, value);
73b437c8 4935 }
c49a72a9
NC
4936 yesno = '+';
4937 what = "Alpha";
73b437c8
JH
4938 break;
4939 case ANYOF_NALPHA:
4940 if (LOC)
936ed897 4941 ANYOF_CLASS_SET(ret, ANYOF_NALPHA);
73b437c8
JH
4942 else {
4943 for (value = 0; value < 256; value++)
4944 if (!isALPHA(value))
936ed897 4945 ANYOF_BITMAP_SET(ret, value);
73b437c8 4946 }
c49a72a9
NC
4947 yesno = '!';
4948 what = "Alpha";
73b437c8
JH
4949 break;
4950 case ANYOF_ASCII:
4951 if (LOC)
936ed897 4952 ANYOF_CLASS_SET(ret, ANYOF_ASCII);
73b437c8 4953 else {
c7f1f016 4954#ifndef EBCDIC
1ba5c669
JH
4955 for (value = 0; value < 128; value++)
4956 ANYOF_BITMAP_SET(ret, value);
4957#else /* EBCDIC */
ffbc6a93 4958 for (value = 0; value < 256; value++) {
3a3c4447
JH
4959 if (isASCII(value))
4960 ANYOF_BITMAP_SET(ret, value);
ffbc6a93 4961 }
1ba5c669 4962#endif /* EBCDIC */
73b437c8 4963 }
c49a72a9
NC
4964 yesno = '+';
4965 what = "ASCII";
73b437c8
JH
4966 break;
4967 case ANYOF_NASCII:
4968 if (LOC)
936ed897 4969 ANYOF_CLASS_SET(ret, ANYOF_NASCII);
73b437c8 4970 else {
c7f1f016 4971#ifndef EBCDIC
1ba5c669
JH
4972 for (value = 128; value < 256; value++)
4973 ANYOF_BITMAP_SET(ret, value);
4974#else /* EBCDIC */
ffbc6a93 4975 for (value = 0; value < 256; value++) {
3a3c4447
JH
4976 if (!isASCII(value))
4977 ANYOF_BITMAP_SET(ret, value);
ffbc6a93 4978 }
1ba5c669 4979#endif /* EBCDIC */
73b437c8 4980 }
c49a72a9
NC
4981 yesno = '!';
4982 what = "ASCII";
73b437c8 4983 break;
aaa51d5e
JF
4984 case ANYOF_BLANK:
4985 if (LOC)
4986 ANYOF_CLASS_SET(ret, ANYOF_BLANK);
4987 else {
4988 for (value = 0; value < 256; value++)
4989 if (isBLANK(value))
4990 ANYOF_BITMAP_SET(ret, value);
4991 }
c49a72a9
NC
4992 yesno = '+';
4993 what = "Blank";
aaa51d5e
JF
4994 break;
4995 case ANYOF_NBLANK:
4996 if (LOC)
4997 ANYOF_CLASS_SET(ret, ANYOF_NBLANK);
4998 else {
4999 for (value = 0; value < 256; value++)
5000 if (!isBLANK(value))
5001 ANYOF_BITMAP_SET(ret, value);
5002 }
c49a72a9
NC
5003 yesno = '!';
5004 what = "Blank";
aaa51d5e 5005 break;
73b437c8
JH
5006 case ANYOF_CNTRL:
5007 if (LOC)
936ed897 5008 ANYOF_CLASS_SET(ret, ANYOF_CNTRL);
73b437c8
JH
5009 else {
5010 for (value = 0; value < 256; value++)
5011 if (isCNTRL(value))
936ed897 5012 ANYOF_BITMAP_SET(ret, value);
73b437c8 5013 }
c49a72a9
NC
5014 yesno = '+';
5015 what = "Cntrl";
73b437c8
JH
5016 break;
5017 case ANYOF_NCNTRL:
5018 if (LOC)
936ed897 5019 ANYOF_CLASS_SET(ret, ANYOF_NCNTRL);
73b437c8
JH
5020 else {
5021 for (value = 0; value < 256; value++)
5022 if (!isCNTRL(value))
936ed897 5023 ANYOF_BITMAP_SET(ret, value);
73b437c8 5024 }
c49a72a9
NC
5025 yesno = '!';
5026 what = "Cntrl";
ffc61ed2
JH
5027 break;
5028 case ANYOF_DIGIT:
5029 if (LOC)
5030 ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
5031 else {
5032 /* consecutive digits assumed */
5033 for (value = '0'; value <= '9'; value++)
5034 ANYOF_BITMAP_SET(ret, value);
5035 }
c49a72a9
NC
5036 yesno = '+';
5037 what = "Digit";
ffc61ed2
JH
5038 break;
5039 case ANYOF_NDIGIT:
5040 if (LOC)
5041 ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
5042 else {
5043 /* consecutive digits assumed */
5044 for (value = 0; value < '0'; value++)
5045 ANYOF_BITMAP_SET(ret, value);
5046 for (value = '9' + 1; value < 256; value++)
5047 ANYOF_BITMAP_SET(ret, value);
5048 }
c49a72a9
NC
5049 yesno = '!';
5050 what = "Digit";
73b437c8
JH
5051 break;
5052 case ANYOF_GRAPH:
5053 if (LOC)
936ed897 5054 ANYOF_CLASS_SET(ret, ANYOF_GRAPH);
73b437c8
JH
5055 else {
5056 for (value = 0; value < 256; value++)
5057 if (isGRAPH(value))
936ed897 5058 ANYOF_BITMAP_SET(ret, value);
73b437c8 5059 }
c49a72a9
NC
5060 yesno = '+';
5061 what = "Graph";
73b437c8
JH
5062 break;
5063 case ANYOF_NGRAPH:
5064 if (LOC)
936ed897 5065 ANYOF_CLASS_SET(ret, ANYOF_NGRAPH);
73b437c8
JH
5066 else {
5067 for (value = 0; value < 256; value++)
5068 if (!isGRAPH(value))
936ed897 5069 ANYOF_BITMAP_SET(ret, value);
73b437c8 5070 }
c49a72a9
NC
5071 yesno = '!';
5072 what = "Graph";
73b437c8
JH
5073 break;
5074 case ANYOF_LOWER:
5075 if (LOC)
936ed897 5076 ANYOF_CLASS_SET(ret, ANYOF_LOWER);
73b437c8
JH
5077 else {
5078 for (value = 0; value < 256; value++)
5079 if (isLOWER(value))
936ed897 5080 ANYOF_BITMAP_SET(ret, value);
73b437c8 5081 }
c49a72a9
NC
5082 yesno = '+';
5083 what = "Lower";
73b437c8
JH
5084 break;
5085 case ANYOF_NLOWER:
5086 if (LOC)
936ed897 5087 ANYOF_CLASS_SET(ret, ANYOF_NLOWER);
73b437c8
JH
5088 else {
5089 for (value = 0; value < 256; value++)
5090 if (!isLOWER(value))
936ed897 5091 ANYOF_BITMAP_SET(ret, value);
73b437c8 5092 }
c49a72a9
NC
5093 yesno = '!';
5094 what = "Lower";
73b437c8
JH
5095 break;
5096 case ANYOF_PRINT:
5097 if (LOC)
936ed897 5098 ANYOF_CLASS_SET(ret, ANYOF_PRINT);
73b437c8
JH
5099 else {
5100 for (value = 0; value < 256; value++)
5101 if (isPRINT(value))
936ed897 5102 ANYOF_BITMAP_SET(ret, value);
73b437c8 5103 }
c49a72a9
NC
5104 yesno = '+';
5105 what = "Print";
73b437c8
JH
5106 break;
5107 case ANYOF_NPRINT:
5108 if (LOC)
936ed897 5109 ANYOF_CLASS_SET(ret, ANYOF_NPRINT);
73b437c8
JH
5110 else {
5111 for (value = 0; value < 256; value++)
5112 if (!isPRINT(value))
936ed897 5113 ANYOF_BITMAP_SET(ret, value);
73b437c8 5114 }
c49a72a9
NC
5115 yesno = '!';
5116 what = "Print";
73b437c8 5117 break;
aaa51d5e
JF
5118 case ANYOF_PSXSPC:
5119 if (LOC)
5120 ANYOF_CLASS_SET(ret, ANYOF_PSXSPC);
5121 else {
5122 for (value = 0; value < 256; value++)
5123 if (isPSXSPC(value))
5124 ANYOF_BITMAP_SET(ret, value);
5125 }
c49a72a9
NC
5126 yesno = '+';
5127 what = "Space";
aaa51d5e
JF
5128 break;
5129 case ANYOF_NPSXSPC:
5130 if (LOC)
5131 ANYOF_CLASS_SET(ret, ANYOF_NPSXSPC);
5132 else {
5133 for (value = 0; value < 256; value++)
5134 if (!isPSXSPC(value))
5135 ANYOF_BITMAP_SET(ret, value);
5136 }
c49a72a9
NC
5137 yesno = '!';
5138 what = "Space";
aaa51d5e 5139 break;
73b437c8
JH
5140 case ANYOF_PUNCT:
5141 if (LOC)
936ed897 5142 ANYOF_CLASS_SET(ret, ANYOF_PUNCT);
73b437c8
JH
5143 else {
5144 for (value = 0; value < 256; value++)
5145 if (isPUNCT(value))
936ed897 5146 ANYOF_BITMAP_SET(ret, value);
73b437c8 5147 }
c49a72a9
NC
5148 yesno = '+';
5149 what = "Punct";
73b437c8
JH
5150 break;
5151 case ANYOF_NPUNCT:
5152 if (LOC)
936ed897 5153 ANYOF_CLASS_SET(ret, ANYOF_NPUNCT);
73b437c8
JH
5154 else {
5155 for (value = 0; value < 256; value++)
5156 if (!isPUNCT(value))
936ed897 5157 ANYOF_BITMAP_SET(ret, value);
73b437c8 5158 }
c49a72a9
NC
5159 yesno = '!';
5160 what = "Punct";
ffc61ed2
JH
5161 break;
5162 case ANYOF_SPACE:
5163 if (LOC)
5164 ANYOF_CLASS_SET(ret, ANYOF_SPACE);
5165 else {
5166 for (value = 0; value < 256; value++)
5167 if (isSPACE(value))
5168 ANYOF_BITMAP_SET(ret, value);
5169 }
c49a72a9
NC
5170 yesno = '+';
5171 what = "SpacePerl";
ffc61ed2
JH
5172 break;
5173 case ANYOF_NSPACE:
5174 if (LOC)
5175 ANYOF_CLASS_SET(ret, ANYOF_NSPACE);
5176 else {
5177 for (value = 0; value < 256; value++)
5178 if (!isSPACE(value))
5179 ANYOF_BITMAP_SET(ret, value);
5180 }
c49a72a9
NC
5181 yesno = '!';
5182 what = "SpacePerl";
73b437c8
JH
5183 break;
5184 case ANYOF_UPPER:
5185 if (LOC)
936ed897 5186 ANYOF_CLASS_SET(ret, ANYOF_UPPER);
73b437c8
JH
5187 else {
5188 for (value = 0; value < 256; value++)
5189 if (isUPPER(value))
936ed897 5190 ANYOF_BITMAP_SET(ret, value);
73b437c8 5191 }
c49a72a9
NC
5192 yesno = '+';
5193 what = "Upper";
73b437c8
JH
5194 break;
5195 case ANYOF_NUPPER:
5196 if (LOC)
936ed897 5197 ANYOF_CLASS_SET(ret, ANYOF_NUPPER);
73b437c8
JH
5198 else {
5199 for (value = 0; value < 256; value++)
5200 if (!isUPPER(value))
936ed897 5201 ANYOF_BITMAP_SET(ret, value);
73b437c8 5202 }
c49a72a9
NC
5203 yesno = '!';
5204 what = "Upper";
73b437c8
JH
5205 break;
5206 case ANYOF_XDIGIT:
5207 if (LOC)
936ed897 5208 ANYOF_CLASS_SET(ret, ANYOF_XDIGIT);
73b437c8
JH
5209 else {
5210 for (value = 0; value < 256; value++)
5211 if (isXDIGIT(value))
936ed897 5212 ANYOF_BITMAP_SET(ret, value);
73b437c8 5213 }
c49a72a9
NC
5214 yesno = '+';
5215 what = "XDigit";
73b437c8
JH
5216 break;
5217 case ANYOF_NXDIGIT:
5218 if (LOC)
936ed897 5219 ANYOF_CLASS_SET(ret, ANYOF_NXDIGIT);
73b437c8
JH
5220 else {
5221 for (value = 0; value < 256; value++)
5222 if (!isXDIGIT(value))
936ed897 5223 ANYOF_BITMAP_SET(ret, value);
73b437c8 5224 }
c49a72a9
NC
5225 yesno = '!';
5226 what = "XDigit";
73b437c8 5227 break;
f81125e2
JP
5228 case ANYOF_MAX:
5229 /* this is to handle \p and \P */
5230 break;
73b437c8 5231 default:
b45f050a 5232 vFAIL("Invalid [::] class");
73b437c8 5233 break;
b8c5462f 5234 }
c49a72a9
NC
5235 if (what) {
5236 /* Strings such as "+utf8::isWord\n" */
5237 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::Is%s\n", yesno, what);
5238 }
b8c5462f 5239 if (LOC)
936ed897 5240 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
73b437c8 5241 continue;
a0d0e21e 5242 }
ffc61ed2
JH
5243 } /* end of namedclass \blah */
5244
a0d0e21e 5245 if (range) {
eb160463 5246 if (prevvalue > (IV)value) /* b-a */ {
b45f050a 5247 Simple_vFAIL4("Invalid [] range \"%*.*s\"",
830247a4
IZ
5248 RExC_parse - rangebegin,
5249 RExC_parse - rangebegin,
b45f050a 5250 rangebegin);
3568d838 5251 range = 0; /* not a valid range */
73b437c8 5252 }
a0d0e21e
LW
5253 }
5254 else {
3568d838 5255 prevvalue = value; /* save the beginning of the range */
830247a4
IZ
5256 if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
5257 RExC_parse[1] != ']') {
5258 RExC_parse++;
ffc61ed2
JH
5259
5260 /* a bad range like \w-, [:word:]- ? */
5261 if (namedclass > OOB_NAMEDCLASS) {
e476b1b5 5262 if (ckWARN(WARN_REGEXP))
830247a4 5263 vWARN4(RExC_parse,
b45f050a 5264 "False [] range \"%*.*s\"",
830247a4
IZ
5265 RExC_parse - rangebegin,
5266 RExC_parse - rangebegin,
b45f050a 5267 rangebegin);
73b437c8 5268 if (!SIZE_ONLY)
936ed897 5269 ANYOF_BITMAP_SET(ret, '-');
73b437c8 5270 } else
ffc61ed2
JH
5271 range = 1; /* yeah, it's a range! */
5272 continue; /* but do it the next time */
a0d0e21e 5273 }
a687059c 5274 }
ffc61ed2 5275
93733859 5276 /* now is the next time */
ae5c130c 5277 if (!SIZE_ONLY) {
3568d838
JH
5278 IV i;
5279
5280 if (prevvalue < 256) {
1df70142 5281 const IV ceilvalue = value < 256 ? value : 255;
3568d838
JH
5282
5283#ifdef EBCDIC
1b2d223b
JH
5284 /* In EBCDIC [\x89-\x91] should include
5285 * the \x8e but [i-j] should not. */
5286 if (literal_endpoint == 2 &&
5287 ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
5288 (isUPPER(prevvalue) && isUPPER(ceilvalue))))
ffc61ed2 5289 {
3568d838
JH
5290 if (isLOWER(prevvalue)) {
5291 for (i = prevvalue; i <= ceilvalue; i++)
ffc61ed2
JH
5292 if (isLOWER(i))
5293 ANYOF_BITMAP_SET(ret, i);
5294 } else {
3568d838 5295 for (i = prevvalue; i <= ceilvalue; i++)
ffc61ed2
JH
5296 if (isUPPER(i))
5297 ANYOF_BITMAP_SET(ret, i);
5298 }
8ada0baa 5299 }
ffc61ed2 5300 else
8ada0baa 5301#endif
a5961de5
JH
5302 for (i = prevvalue; i <= ceilvalue; i++)
5303 ANYOF_BITMAP_SET(ret, i);
3568d838 5304 }
a5961de5 5305 if (value > 255 || UTF) {
1df70142
AL
5306 const UV prevnatvalue = NATIVE_TO_UNI(prevvalue);
5307 const UV natvalue = NATIVE_TO_UNI(value);
b08decb7 5308
ffc61ed2 5309 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
b08decb7 5310 if (prevnatvalue < natvalue) { /* what about > ? */
ffc61ed2 5311 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
b08decb7
JH
5312 prevnatvalue, natvalue);
5313 }
5314 else if (prevnatvalue == natvalue) {
5315 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue);
09091399 5316 if (FOLD) {
89ebb4a3 5317 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
254ba52a 5318 STRLEN foldlen;
1df70142 5319 const UV f = to_uni_fold(natvalue, foldbuf, &foldlen);
254ba52a 5320
c840d2a2
JH
5321 /* If folding and foldable and a single
5322 * character, insert also the folded version
5323 * to the charclass. */
9e55ce06 5324 if (f != value) {
eb160463 5325 if (foldlen == (STRLEN)UNISKIP(f))
9e55ce06
JH
5326 Perl_sv_catpvf(aTHX_ listsv,
5327 "%04"UVxf"\n", f);
5328 else {
5329 /* Any multicharacter foldings
5330 * require the following transform:
5331 * [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst)
5332 * where E folds into "pq" and F folds
5333 * into "rst", all other characters
5334 * fold to single characters. We save
5335 * away these multicharacter foldings,
5336 * to be later saved as part of the
5337 * additional "s" data. */
5338 SV *sv;
5339
5340 if (!unicode_alternate)
5341 unicode_alternate = newAV();
5342 sv = newSVpvn((char*)foldbuf, foldlen);
5343 SvUTF8_on(sv);
5344 av_push(unicode_alternate, sv);
5345 }
5346 }
254ba52a 5347
60a8b682
JH
5348 /* If folding and the value is one of the Greek
5349 * sigmas insert a few more sigmas to make the
5350 * folding rules of the sigmas to work right.
5351 * Note that not all the possible combinations
5352 * are handled here: some of them are handled
9e55ce06
JH
5353 * by the standard folding rules, and some of
5354 * them (literal or EXACTF cases) are handled
5355 * during runtime in regexec.c:S_find_byclass(). */
09091399
JH
5356 if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) {
5357 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 5358 (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA);
09091399 5359 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 5360 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
09091399
JH
5361 }
5362 else if (value == UNICODE_GREEK_CAPITAL_LETTER_SIGMA)
5363 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 5364 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
09091399
JH
5365 }
5366 }
ffc61ed2 5367 }
1b2d223b
JH
5368#ifdef EBCDIC
5369 literal_endpoint = 0;
5370#endif
8ada0baa 5371 }
ffc61ed2
JH
5372
5373 range = 0; /* this range (if it was one) is done now */
a0d0e21e 5374 }
ffc61ed2 5375
936ed897 5376 if (need_class) {
4f66b38d 5377 ANYOF_FLAGS(ret) |= ANYOF_LARGE;
936ed897 5378 if (SIZE_ONLY)
830247a4 5379 RExC_size += ANYOF_CLASS_ADD_SKIP;
936ed897 5380 else
830247a4 5381 RExC_emit += ANYOF_CLASS_ADD_SKIP;
936ed897 5382 }
ffc61ed2 5383
ae5c130c 5384 /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
b8c5462f 5385 if (!SIZE_ONLY &&
ffc61ed2 5386 /* If the only flag is folding (plus possibly inversion). */
516a5887
JH
5387 ((ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD)
5388 ) {
a0ed51b3 5389 for (value = 0; value < 256; ++value) {
936ed897 5390 if (ANYOF_BITMAP_TEST(ret, value)) {
eb160463 5391 UV fold = PL_fold[value];
ffc61ed2
JH
5392
5393 if (fold != value)
5394 ANYOF_BITMAP_SET(ret, fold);
ae5c130c
GS
5395 }
5396 }
936ed897 5397 ANYOF_FLAGS(ret) &= ~ANYOF_FOLD;
ae5c130c 5398 }
ffc61ed2 5399
ae5c130c 5400 /* optimize inverted simple patterns (e.g. [^a-z]) */
3568d838 5401 if (!SIZE_ONLY && optimize_invert &&
ffc61ed2
JH
5402 /* If the only flag is inversion. */
5403 (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
b8c5462f 5404 for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
936ed897 5405 ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL;
1aa99e6b 5406 ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
ae5c130c 5407 }
a0d0e21e 5408
b81d288d 5409 if (!SIZE_ONLY) {
fde631ed 5410 AV *av = newAV();
ffc61ed2
JH
5411 SV *rv;
5412
9e55ce06 5413 /* The 0th element stores the character class description
6a0407ee 5414 * in its textual form: used later (regexec.c:Perl_regclass_swash())
9e55ce06
JH
5415 * to initialize the appropriate swash (which gets stored in
5416 * the 1st element), and also useful for dumping the regnode.
5417 * The 2nd element stores the multicharacter foldings,
6a0407ee 5418 * used later (regexec.c:S_reginclass()). */
ffc61ed2
JH
5419 av_store(av, 0, listsv);
5420 av_store(av, 1, NULL);
9e55ce06 5421 av_store(av, 2, (SV*)unicode_alternate);
ffc61ed2 5422 rv = newRV_noinc((SV*)av);
19860706 5423 n = add_data(pRExC_state, 1, "s");
830247a4 5424 RExC_rx->data->data[n] = (void*)rv;
ffc61ed2 5425 ARG_SET(ret, n);
a0ed51b3
LW
5426 }
5427
5428 return ret;
5429}
5430
76e3520e 5431STATIC char*
830247a4 5432S_nextchar(pTHX_ RExC_state_t *pRExC_state)
a0d0e21e 5433{
97aff369 5434 dVAR;
830247a4 5435 char* retval = RExC_parse++;
a0d0e21e 5436
4633a7c4 5437 for (;;) {
830247a4
IZ
5438 if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
5439 RExC_parse[2] == '#') {
e994fd66
AE
5440 while (*RExC_parse != ')') {
5441 if (RExC_parse == RExC_end)
5442 FAIL("Sequence (?#... not terminated");
830247a4 5443 RExC_parse++;
e994fd66 5444 }
830247a4 5445 RExC_parse++;
4633a7c4
LW
5446 continue;
5447 }
e2509266 5448 if (RExC_flags & PMf_EXTENDED) {
830247a4
IZ
5449 if (isSPACE(*RExC_parse)) {
5450 RExC_parse++;
748a9306
LW
5451 continue;
5452 }
830247a4 5453 else if (*RExC_parse == '#') {
e994fd66
AE
5454 while (RExC_parse < RExC_end)
5455 if (*RExC_parse++ == '\n') break;
748a9306
LW
5456 continue;
5457 }
748a9306 5458 }
4633a7c4 5459 return retval;
a0d0e21e 5460 }
a687059c
LW
5461}
5462
5463/*
c277df42 5464- reg_node - emit a node
a0d0e21e 5465*/
76e3520e 5466STATIC regnode * /* Location. */
830247a4 5467S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
a687059c 5468{
97aff369 5469 dVAR;
c277df42 5470 register regnode *ptr;
504618e9 5471 regnode * const ret = RExC_emit;
a687059c 5472
c277df42 5473 if (SIZE_ONLY) {
830247a4
IZ
5474 SIZE_ALIGN(RExC_size);
5475 RExC_size += 1;
a0d0e21e
LW
5476 return(ret);
5477 }
a687059c 5478
c277df42 5479 NODE_ALIGN_FILL(ret);
a0d0e21e 5480 ptr = ret;
c277df42 5481 FILL_ADVANCE_NODE(ptr, op);
fac92740 5482 if (RExC_offsets) { /* MJD */
ccb2c380 5483 MJD_OFFSET_DEBUG(("%s:%u: (op %s) %s %u <- %u (len %u) (max %u).\n",
fac92740
MJD
5484 "reg_node", __LINE__,
5485 reg_name[op],
5486 RExC_emit - RExC_emit_start > RExC_offsets[0]
5487 ? "Overwriting end of array!\n" : "OK",
5488 RExC_emit - RExC_emit_start,
5489 RExC_parse - RExC_start,
5490 RExC_offsets[0]));
ccb2c380 5491 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
fac92740
MJD
5492 }
5493
830247a4 5494 RExC_emit = ptr;
a687059c 5495
a0d0e21e 5496 return(ret);
a687059c
LW
5497}
5498
5499/*
a0d0e21e
LW
5500- reganode - emit a node with an argument
5501*/
76e3520e 5502STATIC regnode * /* Location. */
830247a4 5503S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
fe14fcc3 5504{
97aff369 5505 dVAR;
c277df42 5506 register regnode *ptr;
504618e9 5507 regnode * const ret = RExC_emit;
fe14fcc3 5508
c277df42 5509 if (SIZE_ONLY) {
830247a4
IZ
5510 SIZE_ALIGN(RExC_size);
5511 RExC_size += 2;
a0d0e21e
LW
5512 return(ret);
5513 }
fe14fcc3 5514
c277df42 5515 NODE_ALIGN_FILL(ret);
a0d0e21e 5516 ptr = ret;
c277df42 5517 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
fac92740 5518 if (RExC_offsets) { /* MJD */
ccb2c380 5519 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %u <- %u (max %u).\n",
fac92740 5520 "reganode",
ccb2c380
MP
5521 __LINE__,
5522 reg_name[op],
fac92740
MJD
5523 RExC_emit - RExC_emit_start > RExC_offsets[0] ?
5524 "Overwriting end of array!\n" : "OK",
5525 RExC_emit - RExC_emit_start,
5526 RExC_parse - RExC_start,
5527 RExC_offsets[0]));
ccb2c380 5528 Set_Cur_Node_Offset;
fac92740
MJD
5529 }
5530
830247a4 5531 RExC_emit = ptr;
fe14fcc3 5532
a0d0e21e 5533 return(ret);
fe14fcc3
LW
5534}
5535
5536/*
cd439c50 5537- reguni - emit (if appropriate) a Unicode character
a0ed51b3
LW
5538*/
5539STATIC void
a28509cc 5540S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s, STRLEN* lenp)
a0ed51b3 5541{
97aff369 5542 dVAR;
5e12f4fb 5543 *lenp = SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
a0ed51b3
LW
5544}
5545
5546/*
a0d0e21e
LW
5547- reginsert - insert an operator in front of already-emitted operand
5548*
5549* Means relocating the operand.
5550*/
76e3520e 5551STATIC void
830247a4 5552S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd)
a687059c 5553{
97aff369 5554 dVAR;
c277df42
IZ
5555 register regnode *src;
5556 register regnode *dst;
5557 register regnode *place;
504618e9 5558 const int offset = regarglen[(U8)op];
b81d288d 5559
22c35a8c 5560/* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
c277df42
IZ
5561
5562 if (SIZE_ONLY) {
830247a4 5563 RExC_size += NODE_STEP_REGNODE + offset;
a0d0e21e
LW
5564 return;
5565 }
a687059c 5566
830247a4
IZ
5567 src = RExC_emit;
5568 RExC_emit += NODE_STEP_REGNODE + offset;
5569 dst = RExC_emit;
fac92740 5570 while (src > opnd) {
c277df42 5571 StructCopy(--src, --dst, regnode);
fac92740 5572 if (RExC_offsets) { /* MJD 20010112 */
ccb2c380 5573 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %u -> %u (max %u).\n",
fac92740 5574 "reg_insert",
ccb2c380
MP
5575 __LINE__,
5576 reg_name[op],
fac92740
MJD
5577 dst - RExC_emit_start > RExC_offsets[0]
5578 ? "Overwriting end of array!\n" : "OK",
5579 src - RExC_emit_start,
5580 dst - RExC_emit_start,
5581 RExC_offsets[0]));
ccb2c380
MP
5582 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
5583 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
fac92740
MJD
5584 }
5585 }
5586
a0d0e21e
LW
5587
5588 place = opnd; /* Op node, where operand used to be. */
fac92740 5589 if (RExC_offsets) { /* MJD */
ccb2c380 5590 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %u <- %u (max %u).\n",
fac92740 5591 "reginsert",
ccb2c380
MP
5592 __LINE__,
5593 reg_name[op],
fac92740
MJD
5594 place - RExC_emit_start > RExC_offsets[0]
5595 ? "Overwriting end of array!\n" : "OK",
5596 place - RExC_emit_start,
5597 RExC_parse - RExC_start,
5598 RExC_offsets[0]));
ccb2c380 5599 Set_Node_Offset(place, RExC_parse);
45948336 5600 Set_Node_Length(place, 1);
fac92740 5601 }
c277df42
IZ
5602 src = NEXTOPER(place);
5603 FILL_ADVANCE_NODE(place, op);
5604 Zero(src, offset, regnode);
a687059c
LW
5605}
5606
5607/*
c277df42 5608- regtail - set the next-pointer at the end of a node chain of p to val.
a0d0e21e 5609*/
76e3520e 5610STATIC void
830247a4 5611S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, regnode *val)
a687059c 5612{
97aff369 5613 dVAR;
c277df42 5614 register regnode *scan;
a0d0e21e 5615
c277df42 5616 if (SIZE_ONLY)
a0d0e21e
LW
5617 return;
5618
5619 /* Find last node. */
5620 scan = p;
5621 for (;;) {
504618e9 5622 regnode * const temp = regnext(scan);
a0d0e21e
LW
5623 if (temp == NULL)
5624 break;
5625 scan = temp;
5626 }
a687059c 5627
c277df42
IZ
5628 if (reg_off_by_arg[OP(scan)]) {
5629 ARG_SET(scan, val - scan);
a0ed51b3
LW
5630 }
5631 else {
c277df42
IZ
5632 NEXT_OFF(scan) = val - scan;
5633 }
a687059c
LW
5634}
5635
5636/*
a0d0e21e
LW
5637- regoptail - regtail on operand of first argument; nop if operandless
5638*/
76e3520e 5639STATIC void
830247a4 5640S_regoptail(pTHX_ RExC_state_t *pRExC_state, regnode *p, regnode *val)
a687059c 5641{
97aff369 5642 dVAR;
a0d0e21e 5643 /* "Operandless" and "op != BRANCH" are synonymous in practice. */
c277df42
IZ
5644 if (p == NULL || SIZE_ONLY)
5645 return;
22c35a8c 5646 if (PL_regkind[(U8)OP(p)] == BRANCH) {
830247a4 5647 regtail(pRExC_state, NEXTOPER(p), val);
a0ed51b3 5648 }
22c35a8c 5649 else if ( PL_regkind[(U8)OP(p)] == BRANCHJ) {
830247a4 5650 regtail(pRExC_state, NEXTOPER(NEXTOPER(p)), val);
a0ed51b3
LW
5651 }
5652 else
a0d0e21e 5653 return;
a687059c
LW
5654}
5655
5656/*
5657 - regcurly - a little FSA that accepts {\d+,?\d*}
5658 */
79072805 5659STATIC I32
a3b680e6 5660S_regcurly(pTHX_ register const char *s)
a687059c
LW
5661{
5662 if (*s++ != '{')
5663 return FALSE;
f0fcb552 5664 if (!isDIGIT(*s))
a687059c 5665 return FALSE;
f0fcb552 5666 while (isDIGIT(*s))
a687059c
LW
5667 s++;
5668 if (*s == ',')
5669 s++;
f0fcb552 5670 while (isDIGIT(*s))
a687059c
LW
5671 s++;
5672 if (*s != '}')
5673 return FALSE;
5674 return TRUE;
5675}
5676
a687059c
LW
5677
5678/*
fd181c75 5679 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
a687059c
LW
5680 */
5681void
864dbfa3 5682Perl_regdump(pTHX_ regexp *r)
a687059c 5683{
35ff7856 5684#ifdef DEBUGGING
97aff369 5685 dVAR;
c445ea15 5686 SV * const sv = sv_newmortal();
a687059c 5687
c277df42 5688 (void)dumpuntil(r->program, r->program + 1, NULL, sv, 0);
a0d0e21e
LW
5689
5690 /* Header fields of interest. */
c277df42 5691 if (r->anchored_substr)
7b0972df 5692 PerlIO_printf(Perl_debug_log,
a0288114 5693 "anchored \"%s%.*s%s\"%s at %"IVdf" ",
3280af22 5694 PL_colors[0],
7b0972df 5695 (int)(SvCUR(r->anchored_substr) - (SvTAIL(r->anchored_substr)!=0)),
3f7c398e 5696 SvPVX_const(r->anchored_substr),
3280af22 5697 PL_colors[1],
c277df42 5698 SvTAIL(r->anchored_substr) ? "$" : "",
7b0972df 5699 (IV)r->anchored_offset);
33b8afdf
JH
5700 else if (r->anchored_utf8)
5701 PerlIO_printf(Perl_debug_log,
a0288114 5702 "anchored utf8 \"%s%.*s%s\"%s at %"IVdf" ",
33b8afdf
JH
5703 PL_colors[0],
5704 (int)(SvCUR(r->anchored_utf8) - (SvTAIL(r->anchored_utf8)!=0)),
3f7c398e 5705 SvPVX_const(r->anchored_utf8),
33b8afdf
JH
5706 PL_colors[1],
5707 SvTAIL(r->anchored_utf8) ? "$" : "",
5708 (IV)r->anchored_offset);
c277df42 5709 if (r->float_substr)
7b0972df 5710 PerlIO_printf(Perl_debug_log,
a0288114 5711 "floating \"%s%.*s%s\"%s at %"IVdf"..%"UVuf" ",
3280af22 5712 PL_colors[0],
b81d288d 5713 (int)(SvCUR(r->float_substr) - (SvTAIL(r->float_substr)!=0)),
3f7c398e 5714 SvPVX_const(r->float_substr),
3280af22 5715 PL_colors[1],
c277df42 5716 SvTAIL(r->float_substr) ? "$" : "",
7b0972df 5717 (IV)r->float_min_offset, (UV)r->float_max_offset);
33b8afdf
JH
5718 else if (r->float_utf8)
5719 PerlIO_printf(Perl_debug_log,
a0288114 5720 "floating utf8 \"%s%.*s%s\"%s at %"IVdf"..%"UVuf" ",
33b8afdf
JH
5721 PL_colors[0],
5722 (int)(SvCUR(r->float_utf8) - (SvTAIL(r->float_utf8)!=0)),
3f7c398e 5723 SvPVX_const(r->float_utf8),
33b8afdf
JH
5724 PL_colors[1],
5725 SvTAIL(r->float_utf8) ? "$" : "",
5726 (IV)r->float_min_offset, (UV)r->float_max_offset);
5727 if (r->check_substr || r->check_utf8)
b81d288d
AB
5728 PerlIO_printf(Perl_debug_log,
5729 r->check_substr == r->float_substr
33b8afdf 5730 && r->check_utf8 == r->float_utf8
c277df42
IZ
5731 ? "(checking floating" : "(checking anchored");
5732 if (r->reganch & ROPT_NOSCAN)
5733 PerlIO_printf(Perl_debug_log, " noscan");
5734 if (r->reganch & ROPT_CHECK_ALL)
5735 PerlIO_printf(Perl_debug_log, " isall");
33b8afdf 5736 if (r->check_substr || r->check_utf8)
c277df42
IZ
5737 PerlIO_printf(Perl_debug_log, ") ");
5738
46fc3d4c 5739 if (r->regstclass) {
5740 regprop(sv, r->regstclass);
3f7c398e 5741 PerlIO_printf(Perl_debug_log, "stclass \"%s\" ", SvPVX_const(sv));
46fc3d4c 5742 }
774d564b 5743 if (r->reganch & ROPT_ANCH) {
5744 PerlIO_printf(Perl_debug_log, "anchored");
5745 if (r->reganch & ROPT_ANCH_BOL)
5746 PerlIO_printf(Perl_debug_log, "(BOL)");
c277df42
IZ
5747 if (r->reganch & ROPT_ANCH_MBOL)
5748 PerlIO_printf(Perl_debug_log, "(MBOL)");
cad2e5aa
JH
5749 if (r->reganch & ROPT_ANCH_SBOL)
5750 PerlIO_printf(Perl_debug_log, "(SBOL)");
774d564b 5751 if (r->reganch & ROPT_ANCH_GPOS)
5752 PerlIO_printf(Perl_debug_log, "(GPOS)");
5753 PerlIO_putc(Perl_debug_log, ' ');
5754 }
c277df42
IZ
5755 if (r->reganch & ROPT_GPOS_SEEN)
5756 PerlIO_printf(Perl_debug_log, "GPOS ");
a0d0e21e 5757 if (r->reganch & ROPT_SKIP)
760ac839 5758 PerlIO_printf(Perl_debug_log, "plus ");
a0d0e21e 5759 if (r->reganch & ROPT_IMPLICIT)
760ac839 5760 PerlIO_printf(Perl_debug_log, "implicit ");
760ac839 5761 PerlIO_printf(Perl_debug_log, "minlen %ld ", (long) r->minlen);
ce862d02
IZ
5762 if (r->reganch & ROPT_EVAL_SEEN)
5763 PerlIO_printf(Perl_debug_log, "with eval ");
760ac839 5764 PerlIO_printf(Perl_debug_log, "\n");
fac92740 5765 if (r->offsets) {
e4584336 5766 const U32 len = r->offsets[0];
a3621e74
YO
5767 GET_RE_DEBUG_FLAGS_DECL;
5768 DEBUG_OFFSETS_r({
1df70142 5769 U32 i;
e4584336
RB
5770 PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)r->offsets[0]);
5771 for (i = 1; i <= len; i++)
5772 PerlIO_printf(Perl_debug_log, "%"UVuf"[%"UVuf"] ",
5773 (UV)r->offsets[i*2-1], (UV)r->offsets[i*2]);
5774 PerlIO_printf(Perl_debug_log, "\n");
a3621e74 5775 });
fac92740 5776 }
65e66c80
SP
5777#else
5778 PERL_UNUSED_ARG(r);
17c3b450 5779#endif /* DEBUGGING */
a687059c
LW
5780}
5781
5782/*
a0d0e21e
LW
5783- regprop - printable representation of opcode
5784*/
46fc3d4c 5785void
a3b680e6 5786Perl_regprop(pTHX_ SV *sv, const regnode *o)
a687059c 5787{
35ff7856 5788#ifdef DEBUGGING
97aff369 5789 dVAR;
9b155405 5790 register int k;
a0d0e21e 5791
54dc92de 5792 sv_setpvn(sv, "", 0);
9b155405 5793 if (OP(o) >= reg_num) /* regnode.type is unsigned */
830247a4
IZ
5794 /* It would be nice to FAIL() here, but this may be called from
5795 regexec.c, and it would be hard to supply pRExC_state. */
5796 Perl_croak(aTHX_ "Corrupted regexp opcode");
bfed75c6 5797 sv_catpv(sv, reg_name[OP(o)]); /* Take off const! */
9b155405
IZ
5798
5799 k = PL_regkind[(U8)OP(o)];
5800
2a782b5b 5801 if (k == EXACT) {
396482e1 5802 SV * const dsv = sv_2mortal(newSVpvs(""));
c728cb41
JH
5803 /* Using is_utf8_string() is a crude hack but it may
5804 * be the best for now since we have no flag "this EXACTish
5805 * node was UTF-8" --jhi */
1df70142 5806 const bool do_utf8 = is_utf8_string((U8*)STRING(o), STR_LEN(o));
2d03de9c 5807 const char * const s = do_utf8 ?
c728cb41
JH
5808 pv_uni_display(dsv, (U8*)STRING(o), STR_LEN(o), 60,
5809 UNI_DISPLAY_REGEX) :
2a782b5b 5810 STRING(o);
e1ec3a88 5811 const int len = do_utf8 ?
2a782b5b
JH
5812 strlen(s) :
5813 STR_LEN(o);
5814 Perl_sv_catpvf(aTHX_ sv, " <%s%.*s%s>",
5815 PL_colors[0],
5816 len, s,
5817 PL_colors[1]);
a3621e74
YO
5818 } else if (k == TRIE) {/*
5819 this isn't always safe, as Pl_regdata may not be for this regex yet
5820 (depending on where its called from) so its being moved to dumpuntil
5821 I32 n = ARG(o);
5822 reg_trie_data *trie=(reg_trie_data*)PL_regdata->data[n];
5823 Perl_sv_catpvf(aTHX_ sv, " (W:%d L:%d C:%d S:%d)",
5824 trie->wordcount,
5825 trie->charcount,
5826 trie->uniquecharcount,
5827 trie->laststate);
5828 */
5829 } else if (k == CURLY) {
cb434fcc 5830 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
cea2e8a9
GS
5831 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
5832 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
a0d0e21e 5833 }
2c2d71f5
JH
5834 else if (k == WHILEM && o->flags) /* Ordinal/of */
5835 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
9b155405 5836 else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP )
894356b3 5837 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
9b155405 5838 else if (k == LOGICAL)
04ebc1ab 5839 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
653099ff
GS
5840 else if (k == ANYOF) {
5841 int i, rangestart = -1;
2d03de9c 5842 const U8 flags = ANYOF_FLAGS(o);
0bd48802
AL
5843
5844 /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
5845 static const char * const anyofs[] = {
653099ff
GS
5846 "\\w",
5847 "\\W",
5848 "\\s",
5849 "\\S",
5850 "\\d",
5851 "\\D",
5852 "[:alnum:]",
5853 "[:^alnum:]",
5854 "[:alpha:]",
5855 "[:^alpha:]",
5856 "[:ascii:]",
5857 "[:^ascii:]",
5858 "[:ctrl:]",
5859 "[:^ctrl:]",
5860 "[:graph:]",
5861 "[:^graph:]",
5862 "[:lower:]",
5863 "[:^lower:]",
5864 "[:print:]",
5865 "[:^print:]",
5866 "[:punct:]",
5867 "[:^punct:]",
5868 "[:upper:]",
aaa51d5e 5869 "[:^upper:]",
653099ff 5870 "[:xdigit:]",
aaa51d5e
JF
5871 "[:^xdigit:]",
5872 "[:space:]",
5873 "[:^space:]",
5874 "[:blank:]",
5875 "[:^blank:]"
653099ff
GS
5876 };
5877
19860706 5878 if (flags & ANYOF_LOCALE)
396482e1 5879 sv_catpvs(sv, "{loc}");
19860706 5880 if (flags & ANYOF_FOLD)
396482e1 5881 sv_catpvs(sv, "{i}");
653099ff 5882 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
19860706 5883 if (flags & ANYOF_INVERT)
396482e1 5884 sv_catpvs(sv, "^");
ffc61ed2
JH
5885 for (i = 0; i <= 256; i++) {
5886 if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
5887 if (rangestart == -1)
5888 rangestart = i;
5889 } else if (rangestart != -1) {
5890 if (i <= rangestart + 3)
5891 for (; rangestart < i; rangestart++)
653099ff 5892 put_byte(sv, rangestart);
ffc61ed2
JH
5893 else {
5894 put_byte(sv, rangestart);
396482e1 5895 sv_catpvs(sv, "-");
ffc61ed2 5896 put_byte(sv, i - 1);
653099ff 5897 }
ffc61ed2 5898 rangestart = -1;
653099ff 5899 }
847a199f 5900 }
ffc61ed2
JH
5901
5902 if (o->flags & ANYOF_CLASS)
5903 for (i = 0; i < sizeof(anyofs)/sizeof(char*); i++)
5904 if (ANYOF_CLASS_TEST(o,i))
5905 sv_catpv(sv, anyofs[i]);
5906
5907 if (flags & ANYOF_UNICODE)
396482e1 5908 sv_catpvs(sv, "{unicode}");
1aa99e6b 5909 else if (flags & ANYOF_UNICODE_ALL)
396482e1 5910 sv_catpvs(sv, "{unicode_all}");
ffc61ed2
JH
5911
5912 {
5913 SV *lv;
2d03de9c 5914 SV * const sw = regclass_swash(o, FALSE, &lv, 0);
b81d288d 5915
ffc61ed2
JH
5916 if (lv) {
5917 if (sw) {
89ebb4a3 5918 U8 s[UTF8_MAXBYTES_CASE+1];
b81d288d 5919
ffc61ed2 5920 for (i = 0; i <= 256; i++) { /* just the first 256 */
1df70142 5921 uvchr_to_utf8(s, i);
ffc61ed2 5922
3568d838 5923 if (i < 256 && swash_fetch(sw, s, TRUE)) {
ffc61ed2
JH
5924 if (rangestart == -1)
5925 rangestart = i;
5926 } else if (rangestart != -1) {
ffc61ed2
JH
5927 if (i <= rangestart + 3)
5928 for (; rangestart < i; rangestart++) {
2d03de9c
AL
5929 const U8 * const e = uvchr_to_utf8(s,rangestart);
5930 U8 *p;
5931 for(p = s; p < e; p++)
ffc61ed2
JH
5932 put_byte(sv, *p);
5933 }
5934 else {
2d03de9c
AL
5935 const U8 *e = uvchr_to_utf8(s,rangestart);
5936 U8 *p;
5937 for (p = s; p < e; p++)
ffc61ed2 5938 put_byte(sv, *p);
396482e1 5939 sv_catpvs(sv, "-");
2d03de9c
AL
5940 e = uvchr_to_utf8(s, i-1);
5941 for (p = s; p < e; p++)
1df70142 5942 put_byte(sv, *p);
ffc61ed2
JH
5943 }
5944 rangestart = -1;
5945 }
19860706 5946 }
ffc61ed2 5947
396482e1 5948 sv_catpvs(sv, "..."); /* et cetera */
19860706 5949 }
fde631ed 5950
ffc61ed2 5951 {
2e0de35c 5952 char *s = savesvpv(lv);
c445ea15 5953 char * const origs = s;
b81d288d 5954
ffc61ed2 5955 while(*s && *s != '\n') s++;
b81d288d 5956
ffc61ed2 5957 if (*s == '\n') {
2d03de9c 5958 const char * const t = ++s;
ffc61ed2
JH
5959
5960 while (*s) {
5961 if (*s == '\n')
5962 *s = ' ';
5963 s++;
5964 }
5965 if (s[-1] == ' ')
5966 s[-1] = 0;
5967
5968 sv_catpv(sv, t);
fde631ed 5969 }
b81d288d 5970
ffc61ed2 5971 Safefree(origs);
fde631ed
JH
5972 }
5973 }
653099ff 5974 }
ffc61ed2 5975
653099ff
GS
5976 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
5977 }
9b155405 5978 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
cea2e8a9 5979 Perl_sv_catpvf(aTHX_ sv, "[-%d]", o->flags);
65e66c80
SP
5980#else
5981 PERL_UNUSED_ARG(sv);
5982 PERL_UNUSED_ARG(o);
17c3b450 5983#endif /* DEBUGGING */
35ff7856 5984}
a687059c 5985
cad2e5aa
JH
5986SV *
5987Perl_re_intuit_string(pTHX_ regexp *prog)
5988{ /* Assume that RE_INTUIT is set */
97aff369 5989 dVAR;
a3621e74
YO
5990 GET_RE_DEBUG_FLAGS_DECL;
5991 DEBUG_COMPILE_r(
cfd0369c 5992 {
2d03de9c 5993 const char * const s = SvPV_nolen_const(prog->check_substr
cfd0369c 5994 ? prog->check_substr : prog->check_utf8);
cad2e5aa
JH
5995
5996 if (!PL_colorset) reginitcolors();
5997 PerlIO_printf(Perl_debug_log,
a0288114 5998 "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
33b8afdf
JH
5999 PL_colors[4],
6000 prog->check_substr ? "" : "utf8 ",
6001 PL_colors[5],PL_colors[0],
cad2e5aa
JH
6002 s,
6003 PL_colors[1],
6004 (strlen(s) > 60 ? "..." : ""));
6005 } );
6006
33b8afdf 6007 return prog->check_substr ? prog->check_substr : prog->check_utf8;
cad2e5aa
JH
6008}
6009
2b69d0c2 6010void
864dbfa3 6011Perl_pregfree(pTHX_ struct regexp *r)
a687059c 6012{
27da23d5 6013 dVAR;
9e55ce06 6014#ifdef DEBUGGING
c445ea15
AL
6015 SV * const dsv = PERL_DEBUG_PAD_ZERO(0);
6016 SV * const re_debug_flags=get_sv(RE_DEBUG_FLAGS,0);
9e55ce06 6017#endif
7821416a 6018
a3621e74 6019
7821416a
IZ
6020 if (!r || (--r->refcnt > 0))
6021 return;
a3621e74 6022 DEBUG_r(if (re_debug_flags && (SvIV(re_debug_flags) & RE_DEBUG_COMPILE)) {
c445ea15 6023 const char * const s = (r->reganch & ROPT_UTF8)
e1ec3a88 6024 ? pv_uni_display(dsv, (U8*)r->precomp, r->prelen, 60, UNI_DISPLAY_REGEX)
9f369894 6025 : pv_display(dsv, r->precomp, r->prelen, 0, 60);
e1ec3a88 6026 const int len = SvCUR(dsv);
9e55ce06
JH
6027 if (!PL_colorset)
6028 reginitcolors();
6029 PerlIO_printf(Perl_debug_log,
a3621e74 6030 "%sFreeing REx:%s %s%*.*s%s%s\n",
9e55ce06
JH
6031 PL_colors[4],PL_colors[5],PL_colors[0],
6032 len, len, s,
6033 PL_colors[1],
6034 len > 60 ? "..." : "");
6035 });
cad2e5aa 6036
43c5f42d
NC
6037 /* gcov results gave these as non-null 100% of the time, so there's no
6038 optimisation in checking them before calling Safefree */
6039 Safefree(r->precomp);
6040 Safefree(r->offsets); /* 20010421 MJD */
ed252734 6041 RX_MATCH_COPY_FREE(r);
f8c7b90f 6042#ifdef PERL_OLD_COPY_ON_WRITE
ed252734
NC
6043 if (r->saved_copy)
6044 SvREFCNT_dec(r->saved_copy);
6045#endif
a193d654
GS
6046 if (r->substrs) {
6047 if (r->anchored_substr)
6048 SvREFCNT_dec(r->anchored_substr);
33b8afdf
JH
6049 if (r->anchored_utf8)
6050 SvREFCNT_dec(r->anchored_utf8);
a193d654
GS
6051 if (r->float_substr)
6052 SvREFCNT_dec(r->float_substr);
33b8afdf
JH
6053 if (r->float_utf8)
6054 SvREFCNT_dec(r->float_utf8);
2779dcf1 6055 Safefree(r->substrs);
a193d654 6056 }
c277df42
IZ
6057 if (r->data) {
6058 int n = r->data->count;
f3548bdc
DM
6059 PAD* new_comppad = NULL;
6060 PAD* old_comppad;
4026c95a 6061 PADOFFSET refcnt;
dfad63ad 6062
c277df42 6063 while (--n >= 0) {
261faec3 6064 /* If you add a ->what type here, update the comment in regcomp.h */
c277df42
IZ
6065 switch (r->data->what[n]) {
6066 case 's':
6067 SvREFCNT_dec((SV*)r->data->data[n]);
6068 break;
653099ff
GS
6069 case 'f':
6070 Safefree(r->data->data[n]);
6071 break;
dfad63ad
HS
6072 case 'p':
6073 new_comppad = (AV*)r->data->data[n];
6074 break;
c277df42 6075 case 'o':
dfad63ad 6076 if (new_comppad == NULL)
cea2e8a9 6077 Perl_croak(aTHX_ "panic: pregfree comppad");
f3548bdc
DM
6078 PAD_SAVE_LOCAL(old_comppad,
6079 /* Watch out for global destruction's random ordering. */
c445ea15 6080 (SvTYPE(new_comppad) == SVt_PVAV) ? new_comppad : NULL
f3548bdc 6081 );
b34c0dd4 6082 OP_REFCNT_LOCK;
4026c95a
SH
6083 refcnt = OpREFCNT_dec((OP_4tree*)r->data->data[n]);
6084 OP_REFCNT_UNLOCK;
6085 if (!refcnt)
9b978d73 6086 op_free((OP_4tree*)r->data->data[n]);
9b978d73 6087
f3548bdc 6088 PAD_RESTORE_LOCAL(old_comppad);
dfad63ad
HS
6089 SvREFCNT_dec((SV*)new_comppad);
6090 new_comppad = NULL;
c277df42
IZ
6091 break;
6092 case 'n':
9e55ce06 6093 break;
a3621e74
YO
6094 case 't':
6095 {
c445ea15 6096 reg_trie_data * const trie=(reg_trie_data*)r->data->data[n];
a3621e74
YO
6097 U32 refcount;
6098 OP_REFCNT_LOCK;
e27afef8 6099 refcount = --trie->refcount;
a3621e74
YO
6100 OP_REFCNT_UNLOCK;
6101 if ( !refcount ) {
43c5f42d 6102 Safefree(trie->charmap);
a3621e74
YO
6103 if (trie->widecharmap)
6104 SvREFCNT_dec((SV*)trie->widecharmap);
43c5f42d
NC
6105 Safefree(trie->states);
6106 Safefree(trie->trans);
a3621e74
YO
6107#ifdef DEBUGGING
6108 if (trie->words)
6109 SvREFCNT_dec((SV*)trie->words);
6110 if (trie->revcharmap)
6111 SvREFCNT_dec((SV*)trie->revcharmap);
6112#endif
6113 Safefree(r->data->data[n]); /* do this last!!!! */
6114 }
6115 break;
6116 }
c277df42 6117 default:
830247a4 6118 Perl_croak(aTHX_ "panic: regfree data code '%c'", r->data->what[n]);
c277df42
IZ
6119 }
6120 }
6121 Safefree(r->data->what);
6122 Safefree(r->data);
a0d0e21e
LW
6123 }
6124 Safefree(r->startp);
6125 Safefree(r->endp);
6126 Safefree(r);
a687059c 6127}
c277df42
IZ
6128
6129/*
6130 - regnext - dig the "next" pointer out of a node
c277df42
IZ
6131 */
6132regnode *
864dbfa3 6133Perl_regnext(pTHX_ register regnode *p)
c277df42 6134{
97aff369 6135 dVAR;
c277df42
IZ
6136 register I32 offset;
6137
3280af22 6138 if (p == &PL_regdummy)
c277df42
IZ
6139 return(NULL);
6140
6141 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
6142 if (offset == 0)
6143 return(NULL);
6144
c277df42 6145 return(p+offset);
c277df42
IZ
6146}
6147
01f988be 6148STATIC void
cea2e8a9 6149S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
c277df42
IZ
6150{
6151 va_list args;
6152 STRLEN l1 = strlen(pat1);
6153 STRLEN l2 = strlen(pat2);
6154 char buf[512];
06bf62c7 6155 SV *msv;
73d840c0 6156 const char *message;
c277df42
IZ
6157
6158 if (l1 > 510)
6159 l1 = 510;
6160 if (l1 + l2 > 510)
6161 l2 = 510 - l1;
6162 Copy(pat1, buf, l1 , char);
6163 Copy(pat2, buf + l1, l2 , char);
3b818b81
GS
6164 buf[l1 + l2] = '\n';
6165 buf[l1 + l2 + 1] = '\0';
8736538c
AS
6166#ifdef I_STDARG
6167 /* ANSI variant takes additional second argument */
c277df42 6168 va_start(args, pat2);
8736538c
AS
6169#else
6170 va_start(args);
6171#endif
5a844595 6172 msv = vmess(buf, &args);
c277df42 6173 va_end(args);
cfd0369c 6174 message = SvPV_const(msv,l1);
c277df42
IZ
6175 if (l1 > 512)
6176 l1 = 512;
6177 Copy(message, buf, l1 , char);
197cf9b9 6178 buf[l1-1] = '\0'; /* Overwrite \n */
cea2e8a9 6179 Perl_croak(aTHX_ "%s", buf);
c277df42 6180}
a0ed51b3
LW
6181
6182/* XXX Here's a total kludge. But we need to re-enter for swash routines. */
6183
6184void
864dbfa3 6185Perl_save_re_context(pTHX)
b81d288d 6186{
97aff369 6187 dVAR;
830247a4 6188 SAVEI32(PL_reg_flags); /* from regexec.c */
a0ed51b3 6189 SAVEPPTR(PL_bostr);
a0ed51b3
LW
6190 SAVEPPTR(PL_reginput); /* String-input pointer. */
6191 SAVEPPTR(PL_regbol); /* Beginning of input, for ^ check. */
6192 SAVEPPTR(PL_regeol); /* End of input, for $ check. */
7766f137
GS
6193 SAVEVPTR(PL_regstartp); /* Pointer to startp array. */
6194 SAVEVPTR(PL_regendp); /* Ditto for endp. */
6195 SAVEVPTR(PL_reglastparen); /* Similarly for lastparen. */
a5db57d6 6196 SAVEVPTR(PL_reglastcloseparen); /* Similarly for lastcloseparen. */
a0ed51b3 6197 SAVEPPTR(PL_regtill); /* How far we are required to go. */
b81d288d 6198 SAVEGENERICPV(PL_reg_start_tmp); /* from regexec.c */
a0ed51b3 6199 PL_reg_start_tmp = 0;
a0ed51b3
LW
6200 SAVEI32(PL_reg_start_tmpl); /* from regexec.c */
6201 PL_reg_start_tmpl = 0;
7766f137 6202 SAVEVPTR(PL_regdata);
a0ed51b3
LW
6203 SAVEI32(PL_reg_eval_set); /* from regexec.c */
6204 SAVEI32(PL_regnarrate); /* from regexec.c */
7766f137 6205 SAVEVPTR(PL_regprogram); /* from regexec.c */
a0ed51b3 6206 SAVEINT(PL_regindent); /* from regexec.c */
7766f137
GS
6207 SAVEVPTR(PL_regcc); /* from regexec.c */
6208 SAVEVPTR(PL_curcop);
7766f137
GS
6209 SAVEVPTR(PL_reg_call_cc); /* from regexec.c */
6210 SAVEVPTR(PL_reg_re); /* from regexec.c */
54b6e2fa
IZ
6211 SAVEPPTR(PL_reg_ganch); /* from regexec.c */
6212 SAVESPTR(PL_reg_sv); /* from regexec.c */
9febdf04 6213 SAVEBOOL(PL_reg_match_utf8); /* from regexec.c */
7766f137 6214 SAVEVPTR(PL_reg_magic); /* from regexec.c */
54b6e2fa 6215 SAVEI32(PL_reg_oldpos); /* from regexec.c */
7766f137
GS
6216 SAVEVPTR(PL_reg_oldcurpm); /* from regexec.c */
6217 SAVEVPTR(PL_reg_curpm); /* from regexec.c */
a5db57d6 6218 SAVEPPTR(PL_reg_oldsaved); /* old saved substr during match */
c445ea15 6219 PL_reg_oldsaved = NULL;
a5db57d6
GS
6220 SAVEI32(PL_reg_oldsavedlen); /* old length of saved substr during match */
6221 PL_reg_oldsavedlen = 0;
f8c7b90f 6222#ifdef PERL_OLD_COPY_ON_WRITE
ed252734 6223 SAVESPTR(PL_nrs);
c445ea15 6224 PL_nrs = NULL;
ed252734 6225#endif
a5db57d6
GS
6226 SAVEI32(PL_reg_maxiter); /* max wait until caching pos */
6227 PL_reg_maxiter = 0;
6228 SAVEI32(PL_reg_leftiter); /* wait until caching pos */
6229 PL_reg_leftiter = 0;
6230 SAVEGENERICPV(PL_reg_poscache); /* cache of pos of WHILEM */
c445ea15 6231 PL_reg_poscache = NULL;
a5db57d6
GS
6232 SAVEI32(PL_reg_poscache_size); /* size of pos cache of WHILEM */
6233 PL_reg_poscache_size = 0;
6234 SAVEPPTR(PL_regprecomp); /* uncompiled string. */
5fb7366e 6235 SAVEI32(PL_regnpar); /* () count. */
e49a9654 6236 SAVEI32(PL_regsize); /* from regexec.c */
ada6e8a9 6237
c445ea15
AL
6238 /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
6239 if (PL_curpm) {
6240 const REGEXP * const rx = PM_GETRE(PL_curpm);
6241 if (rx) {
1df70142 6242 U32 i;
ada6e8a9 6243 for (i = 1; i <= rx->nparens; i++) {
1df70142 6244 char digits[TYPE_CHARS(long)];
e5105eda 6245 const STRLEN len = my_sprintf(digits, "%lu", (long)i);
c445ea15
AL
6246 GV * const mgv = gv_fetchpvn_flags(digits, len, 0, SVt_PV);
6247 if (mgv)
ada6e8a9
AMS
6248 save_scalar(mgv);
6249 }
6250 }
6251 }
6252
54b6e2fa 6253#ifdef DEBUGGING
b81d288d 6254 SAVEPPTR(PL_reg_starttry); /* from regexec.c */
54b6e2fa 6255#endif
a0ed51b3 6256}
51371543 6257
51371543 6258static void
acfe0abc 6259clear_re(pTHX_ void *r)
51371543 6260{
97aff369 6261 dVAR;
51371543
GS
6262 ReREFCNT_dec((regexp *)r);
6263}
ffbc6a93 6264
a28509cc
AL
6265#ifdef DEBUGGING
6266
6267STATIC void
6268S_put_byte(pTHX_ SV *sv, int c)
6269{
6270 if (isCNTRL(c) || c == 255 || !isPRINT(c))
6271 Perl_sv_catpvf(aTHX_ sv, "\\%o", c);
6272 else if (c == '-' || c == ']' || c == '\\' || c == '^')
6273 Perl_sv_catpvf(aTHX_ sv, "\\%c", c);
6274 else
6275 Perl_sv_catpvf(aTHX_ sv, "%c", c);
6276}
6277
6278
6279STATIC regnode *
6280S_dumpuntil(pTHX_ regnode *start, regnode *node, regnode *last, SV* sv, I32 l)
6281{
97aff369 6282 dVAR;
a28509cc
AL
6283 register U8 op = EXACT; /* Arbitrary non-END op. */
6284 register regnode *next;
6285
6286 while (op != END && (!last || node < last)) {
6287 /* While that wasn't END last time... */
6288
6289 NODE_ALIGN(node);
6290 op = OP(node);
6291 if (op == CLOSE)
6292 l--;
6293 next = regnext(node);
6294 /* Where, what. */
6295 if (OP(node) == OPTIMIZED)
6296 goto after_print;
6297 regprop(sv, node);
6298 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
6299 (int)(2*l + 1), "", SvPVX_const(sv));
6300 if (next == NULL) /* Next ptr. */
6301 PerlIO_printf(Perl_debug_log, "(0)");
6302 else
6303 PerlIO_printf(Perl_debug_log, "(%"IVdf")", (IV)(next - start));
6304 (void)PerlIO_putc(Perl_debug_log, '\n');
6305 after_print:
6306 if (PL_regkind[(U8)op] == BRANCHJ) {
6307 register regnode *nnode = (OP(next) == LONGJMP
6308 ? regnext(next)
6309 : next);
6310 if (last && nnode > last)
6311 nnode = last;
6312 node = dumpuntil(start, NEXTOPER(NEXTOPER(node)), nnode, sv, l + 1);
6313 }
6314 else if (PL_regkind[(U8)op] == BRANCH) {
6315 node = dumpuntil(start, NEXTOPER(node), next, sv, l + 1);
6316 }
6317 else if ( PL_regkind[(U8)op] == TRIE ) {
6318 const I32 n = ARG(node);
6319 const reg_trie_data * const trie = (reg_trie_data*)PL_regdata->data[n];
6320 const I32 arry_len = av_len(trie->words)+1;
6321 I32 word_idx;
6322 PerlIO_printf(Perl_debug_log,
6323 "%*s[Words:%d Chars Stored:%d Unique Chars:%d States:%"IVdf"%s]\n",
6324 (int)(2*(l+3)),
6325 "",
6326 trie->wordcount,
6327 (int)trie->charcount,
6328 trie->uniquecharcount,
6329 (IV)trie->laststate-1,
6330 node->flags ? " EVAL mode" : "");
6331
6332 for (word_idx=0; word_idx < arry_len; word_idx++) {
6333 SV **elem_ptr=av_fetch(trie->words,word_idx,0);
6334 if (elem_ptr) {
6335 PerlIO_printf(Perl_debug_log, "%*s<%s%s%s>\n",
6336 (int)(2*(l+4)), "",
6337 PL_colors[0],
cfd0369c 6338 SvPV_nolen_const(*elem_ptr),
a28509cc
AL
6339 PL_colors[1]
6340 );
6341 /*
6342 if (next == NULL)
6343 PerlIO_printf(Perl_debug_log, "(0)\n");
6344 else
6345 PerlIO_printf(Perl_debug_log, "(%"IVdf")\n", (IV)(next - start));
6346 */
6347 }
6348
6349 }
6350
6351 node = NEXTOPER(node);
6352 node += regarglen[(U8)op];
6353
6354 }
6355 else if ( op == CURLY) { /* "next" might be very big: optimizer */
6356 node = dumpuntil(start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
6357 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1, sv, l + 1);
6358 }
6359 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
6360 node = dumpuntil(start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
6361 next, sv, l + 1);
6362 }
6363 else if ( op == PLUS || op == STAR) {
6364 node = dumpuntil(start, NEXTOPER(node), NEXTOPER(node) + 1, sv, l + 1);
6365 }
6366 else if (op == ANYOF) {
6367 /* arglen 1 + class block */
6368 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_LARGE)
6369 ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
6370 node = NEXTOPER(node);
6371 }
6372 else if (PL_regkind[(U8)op] == EXACT) {
6373 /* Literal string, where present. */
6374 node += NODE_SZ_STR(node) - 1;
6375 node = NEXTOPER(node);
6376 }
6377 else {
6378 node = NEXTOPER(node);
6379 node += regarglen[(U8)op];
6380 }
6381 if (op == CURLYX || op == OPEN)
6382 l++;
6383 else if (op == WHILEM)
6384 l--;
6385 }
6386 return node;
6387}
6388
6389#endif /* DEBUGGING */
6390
241d1a3b
NC
6391/*
6392 * Local variables:
6393 * c-indentation-style: bsd
6394 * c-basic-offset: 4
6395 * indent-tabs-mode: t
6396 * End:
6397 *
37442d52
RGS
6398 * ex: set ts=8 sts=4 sw=4 noet:
6399 */