This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Serialise changes to %^H onto the current COP. Return the compile time
[perl5.git] / regcomp.c
CommitLineData
a0d0e21e
LW
1/* regcomp.c
2 */
3
4/*
5 * "A fair jaw-cracker dwarf-language must be." --Samwise Gamgee
6 */
7
61296642
DM
8/* This file contains functions for compiling a regular expression. See
9 * also regexec.c which funnily enough, contains functions for executing
166f8a29 10 * a regular expression.
e4a054ea
DM
11 *
12 * This file is also copied at build time to ext/re/re_comp.c, where
13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14 * This causes the main functions to be compiled under new names and with
15 * debugging support added, which makes "use re 'debug'" work.
166f8a29
DM
16 */
17
a687059c
LW
18/* NOTE: this is derived from Henry Spencer's regexp code, and should not
19 * confused with the original package (see point 3 below). Thanks, Henry!
20 */
21
22/* Additional note: this code is very heavily munged from Henry's version
23 * in places. In some spots I've traded clarity for efficiency, so don't
24 * blame Henry for some of the lack of readability.
25 */
26
e50aee73
AD
27/* The names of the functions have been changed from regcomp and
28 * regexec to pregcomp and pregexec in order to avoid conflicts
29 * with the POSIX routines of the same names.
30*/
31
b9d5759e
AD
32#ifdef PERL_EXT_RE_BUILD
33/* need to replace pregcomp et al, so enable that */
34# ifndef PERL_IN_XSUB_RE
35# define PERL_IN_XSUB_RE
36# endif
37/* need access to debugger hooks */
cad2e5aa 38# if defined(PERL_EXT_RE_DEBUG) && !defined(DEBUGGING)
b9d5759e
AD
39# define DEBUGGING
40# endif
41#endif
42
43#ifdef PERL_IN_XSUB_RE
d06ea78c 44/* We *really* need to overwrite these symbols: */
56953603
IZ
45# define Perl_pregcomp my_regcomp
46# define Perl_regdump my_regdump
47# define Perl_regprop my_regprop
d06ea78c 48# define Perl_pregfree my_regfree
cad2e5aa
JH
49# define Perl_re_intuit_string my_re_intuit_string
50/* *These* symbols are masked to allow static link. */
d06ea78c 51# define Perl_regnext my_regnext
f0b8d043 52# define Perl_save_re_context my_save_re_context
b81d288d 53# define Perl_reginitcolors my_reginitcolors
c5be433b
GS
54
55# define PERL_NO_GET_CONTEXT
b81d288d 56#endif
56953603 57
a687059c 58/*
e50aee73 59 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
60 *
61 * Copyright (c) 1986 by University of Toronto.
62 * Written by Henry Spencer. Not derived from licensed software.
63 *
64 * Permission is granted to anyone to use this software for any
65 * purpose on any computer system, and to redistribute it freely,
66 * subject to the following restrictions:
67 *
68 * 1. The author is not responsible for the consequences of use of
69 * this software, no matter how awful, even if they arise
70 * from defects in it.
71 *
72 * 2. The origin of this software must not be misrepresented, either
73 * by explicit claim or by omission.
74 *
75 * 3. Altered versions must be plainly marked as such, and must not
76 * be misrepresented as being the original software.
77 *
78 *
79 **** Alterations to Henry's code are...
80 ****
4bb101f2 81 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
b94e2f88 82 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, by Larry Wall and others
a687059c 83 ****
9ef589d8
LW
84 **** You may distribute under the terms of either the GNU General Public
85 **** License or the Artistic License, as specified in the README file.
86
a687059c
LW
87 *
88 * Beware that some of this code is subtly aware of the way operator
89 * precedence is structured in regular expressions. Serious changes in
90 * regular-expression syntax might require a total rethink.
91 */
92#include "EXTERN.h"
864dbfa3 93#define PERL_IN_REGCOMP_C
a687059c 94#include "perl.h"
d06ea78c 95
acfe0abc 96#ifndef PERL_IN_XSUB_RE
d06ea78c
GS
97# include "INTERN.h"
98#endif
c277df42
IZ
99
100#define REG_COMP_C
a687059c
LW
101#include "regcomp.h"
102
d4cce5f1 103#ifdef op
11343788 104#undef op
d4cce5f1 105#endif /* op */
11343788 106
fe14fcc3 107#ifdef MSDOS
7e4e8c89 108# if defined(BUGGY_MSC6)
fe14fcc3 109 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
7e4e8c89 110# pragma optimize("a",off)
fe14fcc3 111 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
7e4e8c89
NC
112# pragma optimize("w",on )
113# endif /* BUGGY_MSC6 */
fe14fcc3
LW
114#endif /* MSDOS */
115
a687059c
LW
116#ifndef STATIC
117#define STATIC static
118#endif
119
830247a4 120typedef struct RExC_state_t {
e2509266 121 U32 flags; /* are we folding, multilining? */
830247a4
IZ
122 char *precomp; /* uncompiled string. */
123 regexp *rx;
fac92740 124 char *start; /* Start of input for compile */
830247a4
IZ
125 char *end; /* End of input for compile */
126 char *parse; /* Input-scan pointer. */
127 I32 whilem_seen; /* number of WHILEM in this expr */
fac92740 128 regnode *emit_start; /* Start of emitted-code area */
ffc61ed2 129 regnode *emit; /* Code-emit pointer; &regdummy = don't = compiling */
830247a4
IZ
130 I32 naughty; /* How bad is this pattern? */
131 I32 sawback; /* Did we see \1, ...? */
132 U32 seen;
133 I32 size; /* Code size. */
134 I32 npar; /* () count. */
135 I32 extralen;
136 I32 seen_zerolen;
137 I32 seen_evals;
1aa99e6b 138 I32 utf8;
830247a4
IZ
139#if ADD_TO_REGEXEC
140 char *starttry; /* -Dr: where regtry was called. */
141#define RExC_starttry (pRExC_state->starttry)
142#endif
143} RExC_state_t;
144
e2509266 145#define RExC_flags (pRExC_state->flags)
830247a4
IZ
146#define RExC_precomp (pRExC_state->precomp)
147#define RExC_rx (pRExC_state->rx)
fac92740 148#define RExC_start (pRExC_state->start)
830247a4
IZ
149#define RExC_end (pRExC_state->end)
150#define RExC_parse (pRExC_state->parse)
151#define RExC_whilem_seen (pRExC_state->whilem_seen)
fac92740 152#define RExC_offsets (pRExC_state->rx->offsets) /* I am not like the others */
830247a4 153#define RExC_emit (pRExC_state->emit)
fac92740 154#define RExC_emit_start (pRExC_state->emit_start)
830247a4
IZ
155#define RExC_naughty (pRExC_state->naughty)
156#define RExC_sawback (pRExC_state->sawback)
157#define RExC_seen (pRExC_state->seen)
158#define RExC_size (pRExC_state->size)
159#define RExC_npar (pRExC_state->npar)
160#define RExC_extralen (pRExC_state->extralen)
161#define RExC_seen_zerolen (pRExC_state->seen_zerolen)
162#define RExC_seen_evals (pRExC_state->seen_evals)
1aa99e6b 163#define RExC_utf8 (pRExC_state->utf8)
830247a4 164
a687059c
LW
165#define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
166#define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
167 ((*s) == '{' && regcurly(s)))
a687059c 168
35c8bce7
LW
169#ifdef SPSTART
170#undef SPSTART /* dratted cpp namespace... */
171#endif
a687059c
LW
172/*
173 * Flags to be passed up and down.
174 */
a687059c 175#define WORST 0 /* Worst case. */
821b33a5 176#define HASWIDTH 0x1 /* Known to match non-null strings. */
a0d0e21e
LW
177#define SIMPLE 0x2 /* Simple enough to be STAR/PLUS operand. */
178#define SPSTART 0x4 /* Starts with * or +. */
179#define TRYAGAIN 0x8 /* Weeded out a declaration. */
a687059c 180
2c2d71f5
JH
181/* Length of a variant. */
182
183typedef struct scan_data_t {
184 I32 len_min;
185 I32 len_delta;
186 I32 pos_min;
187 I32 pos_delta;
188 SV *last_found;
189 I32 last_end; /* min value, <0 unless valid. */
190 I32 last_start_min;
191 I32 last_start_max;
192 SV **longest; /* Either &l_fixed, or &l_float. */
193 SV *longest_fixed;
194 I32 offset_fixed;
195 SV *longest_float;
196 I32 offset_float_min;
197 I32 offset_float_max;
198 I32 flags;
199 I32 whilem_c;
cb434fcc 200 I32 *last_closep;
653099ff 201 struct regnode_charclass_class *start_class;
2c2d71f5
JH
202} scan_data_t;
203
a687059c 204/*
e50aee73 205 * Forward declarations for pregcomp()'s friends.
a687059c 206 */
a0d0e21e 207
27da23d5
JH
208static const scan_data_t zero_scan_data =
209 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
c277df42
IZ
210
211#define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
212#define SF_BEFORE_SEOL 0x1
213#define SF_BEFORE_MEOL 0x2
214#define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
215#define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
216
09b7f37c
CB
217#ifdef NO_UNARY_PLUS
218# define SF_FIX_SHIFT_EOL (0+2)
219# define SF_FL_SHIFT_EOL (0+4)
220#else
221# define SF_FIX_SHIFT_EOL (+2)
222# define SF_FL_SHIFT_EOL (+4)
223#endif
c277df42
IZ
224
225#define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
226#define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
227
228#define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
229#define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
230#define SF_IS_INF 0x40
231#define SF_HAS_PAR 0x80
232#define SF_IN_PAR 0x100
233#define SF_HAS_EVAL 0x200
4bfe0158 234#define SCF_DO_SUBSTR 0x400
653099ff
GS
235#define SCF_DO_STCLASS_AND 0x0800
236#define SCF_DO_STCLASS_OR 0x1000
237#define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
e1901655 238#define SCF_WHILEM_VISITED_POS 0x2000
c277df42 239
eb160463 240#define UTF (RExC_utf8 != 0)
e2509266
JH
241#define LOC ((RExC_flags & PMf_LOCALE) != 0)
242#define FOLD ((RExC_flags & PMf_FOLD) != 0)
a0ed51b3 243
ffc61ed2 244#define OOB_UNICODE 12345678
93733859 245#define OOB_NAMEDCLASS -1
b8c5462f 246
a0ed51b3
LW
247#define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
248#define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
249
8615cb43 250
b45f050a
JF
251/* length of regex to show in messages that don't mark a position within */
252#define RegexLengthToShowInErrorMessages 127
253
254/*
255 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
256 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
257 * op/pragma/warn/regcomp.
258 */
7253e4e3
RK
259#define MARKER1 "<-- HERE" /* marker as it appears in the description */
260#define MARKER2 " <-- HERE " /* marker as it appears within the regex */
b81d288d 261
7253e4e3 262#define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
b45f050a
JF
263
264/*
265 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
266 * arg. Show regex, up to a maximum length. If it's too long, chop and add
267 * "...".
268 */
ccb2c380 269#define FAIL(msg) STMT_START { \
bfed75c6 270 const char *ellipses = ""; \
ccb2c380
MP
271 IV len = RExC_end - RExC_precomp; \
272 \
273 if (!SIZE_ONLY) \
274 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
275 if (len > RegexLengthToShowInErrorMessages) { \
276 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
277 len = RegexLengthToShowInErrorMessages - 10; \
278 ellipses = "..."; \
279 } \
280 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
281 msg, (int)len, RExC_precomp, ellipses); \
282} STMT_END
8615cb43 283
b45f050a 284/*
b45f050a
JF
285 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
286 */
ccb2c380 287#define Simple_vFAIL(m) STMT_START { \
a28509cc 288 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
289 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
290 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
291} STMT_END
b45f050a
JF
292
293/*
294 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
295 */
ccb2c380
MP
296#define vFAIL(m) STMT_START { \
297 if (!SIZE_ONLY) \
298 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
299 Simple_vFAIL(m); \
300} STMT_END
b45f050a
JF
301
302/*
303 * Like Simple_vFAIL(), but accepts two arguments.
304 */
ccb2c380 305#define Simple_vFAIL2(m,a1) STMT_START { \
a28509cc 306 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
307 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
308 (int)offset, RExC_precomp, RExC_precomp + offset); \
309} STMT_END
b45f050a
JF
310
311/*
312 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
313 */
ccb2c380
MP
314#define vFAIL2(m,a1) STMT_START { \
315 if (!SIZE_ONLY) \
316 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
317 Simple_vFAIL2(m, a1); \
318} STMT_END
b45f050a
JF
319
320
321/*
322 * Like Simple_vFAIL(), but accepts three arguments.
323 */
ccb2c380 324#define Simple_vFAIL3(m, a1, a2) STMT_START { \
a28509cc 325 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
326 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
327 (int)offset, RExC_precomp, RExC_precomp + offset); \
328} STMT_END
b45f050a
JF
329
330/*
331 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
332 */
ccb2c380
MP
333#define vFAIL3(m,a1,a2) STMT_START { \
334 if (!SIZE_ONLY) \
335 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
336 Simple_vFAIL3(m, a1, a2); \
337} STMT_END
b45f050a
JF
338
339/*
340 * Like Simple_vFAIL(), but accepts four arguments.
341 */
ccb2c380 342#define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
a28509cc 343 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
344 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
345 (int)offset, RExC_precomp, RExC_precomp + offset); \
346} STMT_END
b45f050a 347
ccb2c380 348#define vWARN(loc,m) STMT_START { \
a28509cc 349 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
350 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \
351 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
352} STMT_END
353
354#define vWARNdep(loc,m) STMT_START { \
a28509cc 355 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
356 Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
357 "%s" REPORT_LOCATION, \
358 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
359} STMT_END
360
361
362#define vWARN2(loc, m, a1) STMT_START { \
a28509cc 363 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
364 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
365 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
366} STMT_END
367
368#define vWARN3(loc, m, a1, a2) STMT_START { \
a28509cc 369 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
370 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
371 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
372} STMT_END
373
374#define vWARN4(loc, m, a1, a2, a3) STMT_START { \
a28509cc 375 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
376 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
377 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
378} STMT_END
379
380#define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
a28509cc 381 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
382 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
383 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
384} STMT_END
9d1d55b5 385
8615cb43 386
cd439c50 387/* Allow for side effects in s */
ccb2c380
MP
388#define REGC(c,s) STMT_START { \
389 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
390} STMT_END
cd439c50 391
fac92740
MJD
392/* Macros for recording node offsets. 20001227 mjd@plover.com
393 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
394 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
395 * Element 0 holds the number n.
396 */
397
398#define MJD_OFFSET_DEBUG(x)
a3621e74 399/* #define MJD_OFFSET_DEBUG(x) DEBUG_r(Perl_warn_nocontext x) */
ccb2c380
MP
400
401
402#define Set_Node_Offset_To_R(node,byte) STMT_START { \
403 if (! SIZE_ONLY) { \
404 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
405 __LINE__, (node), (byte))); \
406 if((node) < 0) { \
551405c4 407 Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
ccb2c380
MP
408 } else { \
409 RExC_offsets[2*(node)-1] = (byte); \
410 } \
411 } \
412} STMT_END
413
414#define Set_Node_Offset(node,byte) \
415 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
416#define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
417
418#define Set_Node_Length_To_R(node,len) STMT_START { \
419 if (! SIZE_ONLY) { \
420 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
551405c4 421 __LINE__, (int)(node), (int)(len))); \
ccb2c380 422 if((node) < 0) { \
551405c4 423 Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
ccb2c380
MP
424 } else { \
425 RExC_offsets[2*(node)] = (len); \
426 } \
427 } \
428} STMT_END
429
430#define Set_Node_Length(node,len) \
431 Set_Node_Length_To_R((node)-RExC_emit_start, len)
432#define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
433#define Set_Node_Cur_Length(node) \
434 Set_Node_Length(node, RExC_parse - parse_start)
fac92740
MJD
435
436/* Get offsets and lengths */
437#define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
438#define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
439
acfe0abc 440static void clear_re(pTHX_ void *r);
4327152a 441
653099ff
GS
442/* Mark that we cannot extend a found fixed substring at this point.
443 Updata the longest found anchored substring and the longest found
444 floating substrings if needed. */
445
4327152a 446STATIC void
830247a4 447S_scan_commit(pTHX_ RExC_state_t *pRExC_state, scan_data_t *data)
c277df42 448{
e1ec3a88
AL
449 const STRLEN l = CHR_SVLEN(data->last_found);
450 const STRLEN old_l = CHR_SVLEN(*data->longest);
b81d288d 451
c277df42 452 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
6b43b216 453 SvSetMagicSV(*data->longest, data->last_found);
c277df42
IZ
454 if (*data->longest == data->longest_fixed) {
455 data->offset_fixed = l ? data->last_start_min : data->pos_min;
456 if (data->flags & SF_BEFORE_EOL)
b81d288d 457 data->flags
c277df42
IZ
458 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
459 else
460 data->flags &= ~SF_FIX_BEFORE_EOL;
a0ed51b3
LW
461 }
462 else {
c277df42 463 data->offset_float_min = l ? data->last_start_min : data->pos_min;
b81d288d
AB
464 data->offset_float_max = (l
465 ? data->last_start_max
c277df42 466 : data->pos_min + data->pos_delta);
9051bda5
HS
467 if ((U32)data->offset_float_max > (U32)I32_MAX)
468 data->offset_float_max = I32_MAX;
c277df42 469 if (data->flags & SF_BEFORE_EOL)
b81d288d 470 data->flags
c277df42
IZ
471 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
472 else
473 data->flags &= ~SF_FL_BEFORE_EOL;
474 }
475 }
476 SvCUR_set(data->last_found, 0);
0eda9292 477 {
a28509cc
AL
478 SV * const sv = data->last_found;
479 MAGIC * const mg =
0eda9292 480 SvUTF8(sv) && SvMAGICAL(sv) ? mg_find(sv, PERL_MAGIC_utf8) : NULL;
45f47268 481 if (mg)
0eda9292
JH
482 mg->mg_len = 0;
483 }
c277df42
IZ
484 data->last_end = -1;
485 data->flags &= ~SF_BEFORE_EOL;
486}
487
653099ff
GS
488/* Can match anything (initialization) */
489STATIC void
5f66b61c 490S_cl_anything(RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 491{
653099ff 492 ANYOF_CLASS_ZERO(cl);
f8bef550 493 ANYOF_BITMAP_SETALL(cl);
1aa99e6b 494 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
653099ff
GS
495 if (LOC)
496 cl->flags |= ANYOF_LOCALE;
497}
498
499/* Can match anything (initialization) */
500STATIC int
5f66b61c 501S_cl_is_anything(const struct regnode_charclass_class *cl)
653099ff
GS
502{
503 int value;
504
aaa51d5e 505 for (value = 0; value <= ANYOF_MAX; value += 2)
653099ff
GS
506 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
507 return 1;
1aa99e6b
IH
508 if (!(cl->flags & ANYOF_UNICODE_ALL))
509 return 0;
f8bef550
NC
510 if (!ANYOF_BITMAP_TESTALLSET(cl))
511 return 0;
653099ff
GS
512 return 1;
513}
514
515/* Can match anything (initialization) */
516STATIC void
5f66b61c 517S_cl_init(RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 518{
8ecf7187 519 Zero(cl, 1, struct regnode_charclass_class);
653099ff 520 cl->type = ANYOF;
830247a4 521 cl_anything(pRExC_state, cl);
653099ff
GS
522}
523
524STATIC void
5f66b61c 525S_cl_init_zero(RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 526{
8ecf7187 527 Zero(cl, 1, struct regnode_charclass_class);
653099ff 528 cl->type = ANYOF;
830247a4 529 cl_anything(pRExC_state, cl);
653099ff
GS
530 if (LOC)
531 cl->flags |= ANYOF_LOCALE;
532}
533
534/* 'And' a given class with another one. Can create false positives */
535/* We assume that cl is not inverted */
536STATIC void
5f66b61c 537S_cl_and(struct regnode_charclass_class *cl,
a28509cc 538 const struct regnode_charclass_class *and_with)
653099ff 539{
653099ff
GS
540 if (!(and_with->flags & ANYOF_CLASS)
541 && !(cl->flags & ANYOF_CLASS)
542 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
543 && !(and_with->flags & ANYOF_FOLD)
544 && !(cl->flags & ANYOF_FOLD)) {
545 int i;
546
547 if (and_with->flags & ANYOF_INVERT)
548 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
549 cl->bitmap[i] &= ~and_with->bitmap[i];
550 else
551 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
552 cl->bitmap[i] &= and_with->bitmap[i];
553 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
554 if (!(and_with->flags & ANYOF_EOS))
555 cl->flags &= ~ANYOF_EOS;
1aa99e6b 556
14ebb1a2
JH
557 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE &&
558 !(and_with->flags & ANYOF_INVERT)) {
1aa99e6b
IH
559 cl->flags &= ~ANYOF_UNICODE_ALL;
560 cl->flags |= ANYOF_UNICODE;
561 ARG_SET(cl, ARG(and_with));
562 }
14ebb1a2
JH
563 if (!(and_with->flags & ANYOF_UNICODE_ALL) &&
564 !(and_with->flags & ANYOF_INVERT))
1aa99e6b 565 cl->flags &= ~ANYOF_UNICODE_ALL;
14ebb1a2
JH
566 if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)) &&
567 !(and_with->flags & ANYOF_INVERT))
1aa99e6b 568 cl->flags &= ~ANYOF_UNICODE;
653099ff
GS
569}
570
571/* 'OR' a given class with another one. Can create false positives */
572/* We assume that cl is not inverted */
573STATIC void
5f66b61c 574S_cl_or(RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
653099ff 575{
653099ff
GS
576 if (or_with->flags & ANYOF_INVERT) {
577 /* We do not use
578 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
579 * <= (B1 | !B2) | (CL1 | !CL2)
580 * which is wasteful if CL2 is small, but we ignore CL2:
581 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
582 * XXXX Can we handle case-fold? Unclear:
583 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
584 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
585 */
586 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
587 && !(or_with->flags & ANYOF_FOLD)
588 && !(cl->flags & ANYOF_FOLD) ) {
589 int i;
590
591 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
592 cl->bitmap[i] |= ~or_with->bitmap[i];
593 } /* XXXX: logic is complicated otherwise */
594 else {
830247a4 595 cl_anything(pRExC_state, cl);
653099ff
GS
596 }
597 } else {
598 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
599 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
b81d288d 600 && (!(or_with->flags & ANYOF_FOLD)
653099ff
GS
601 || (cl->flags & ANYOF_FOLD)) ) {
602 int i;
603
604 /* OR char bitmap and class bitmap separately */
605 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
606 cl->bitmap[i] |= or_with->bitmap[i];
607 if (or_with->flags & ANYOF_CLASS) {
608 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
609 cl->classflags[i] |= or_with->classflags[i];
610 cl->flags |= ANYOF_CLASS;
611 }
612 }
613 else { /* XXXX: logic is complicated, leave it along for a moment. */
830247a4 614 cl_anything(pRExC_state, cl);
653099ff
GS
615 }
616 }
617 if (or_with->flags & ANYOF_EOS)
618 cl->flags |= ANYOF_EOS;
1aa99e6b
IH
619
620 if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
621 ARG(cl) != ARG(or_with)) {
622 cl->flags |= ANYOF_UNICODE_ALL;
623 cl->flags &= ~ANYOF_UNICODE;
624 }
625 if (or_with->flags & ANYOF_UNICODE_ALL) {
626 cl->flags |= ANYOF_UNICODE_ALL;
627 cl->flags &= ~ANYOF_UNICODE;
628 }
653099ff
GS
629}
630
5d1c421c 631/*
a3621e74
YO
632
633 make_trie(startbranch,first,last,tail,flags)
634 startbranch: the first branch in the whole branch sequence
635 first : start branch of sequence of branch-exact nodes.
636 May be the same as startbranch
637 last : Thing following the last branch.
638 May be the same as tail.
639 tail : item following the branch sequence
640 flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/
641
642Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
643
644A trie is an N'ary tree where the branches are determined by digital
645decomposition of the key. IE, at the root node you look up the 1st character and
646follow that branch repeat until you find the end of the branches. Nodes can be
647marked as "accepting" meaning they represent a complete word. Eg:
648
649 /he|she|his|hers/
650
651would convert into the following structure. Numbers represent states, letters
652following numbers represent valid transitions on the letter from that state, if
653the number is in square brackets it represents an accepting state, otherwise it
654will be in parenthesis.
655
656 +-h->+-e->[3]-+-r->(8)-+-s->[9]
657 | |
658 | (2)
659 | |
660 (1) +-i->(6)-+-s->[7]
661 |
662 +-s->(3)-+-h->(4)-+-e->[5]
663
664 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
665
666This shows that when matching against the string 'hers' we will begin at state 1
667read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
668then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
669is also accepting. Thus we know that we can match both 'he' and 'hers' with a
670single traverse. We store a mapping from accepting to state to which word was
671matched, and then when we have multiple possibilities we try to complete the
672rest of the regex in the order in which they occured in the alternation.
673
674The only prior NFA like behaviour that would be changed by the TRIE support is
675the silent ignoring of duplicate alternations which are of the form:
676
677 / (DUPE|DUPE) X? (?{ ... }) Y /x
678
679Thus EVAL blocks follwing a trie may be called a different number of times with
680and without the optimisation. With the optimisations dupes will be silently
681ignored. This inconsistant behaviour of EVAL type nodes is well established as
682the following demonstrates:
683
684 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
685
686which prints out 'word' three times, but
687
688 'words'=~/(word|word|word)(?{ print $1 })S/
689
690which doesnt print it out at all. This is due to other optimisations kicking in.
691
692Example of what happens on a structural level:
693
694The regexp /(ac|ad|ab)+/ will produce the folowing debug output:
695
696 1: CURLYM[1] {1,32767}(18)
697 5: BRANCH(8)
698 6: EXACT <ac>(16)
699 8: BRANCH(11)
700 9: EXACT <ad>(16)
701 11: BRANCH(14)
702 12: EXACT <ab>(16)
703 16: SUCCEED(0)
704 17: NOTHING(18)
705 18: END(0)
706
707This would be optimizable with startbranch=5, first=5, last=16, tail=16
708and should turn into:
709
710 1: CURLYM[1] {1,32767}(18)
711 5: TRIE(16)
712 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
713 <ac>
714 <ad>
715 <ab>
716 16: SUCCEED(0)
717 17: NOTHING(18)
718 18: END(0)
719
720Cases where tail != last would be like /(?foo|bar)baz/:
721
722 1: BRANCH(4)
723 2: EXACT <foo>(8)
724 4: BRANCH(7)
725 5: EXACT <bar>(8)
726 7: TAIL(8)
727 8: EXACT <baz>(10)
728 10: END(0)
729
730which would be optimizable with startbranch=1, first=1, last=7, tail=8
731and would end up looking like:
732
733 1: TRIE(8)
734 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
735 <foo>
736 <bar>
737 7: TAIL(8)
738 8: EXACT <baz>(10)
739 10: END(0)
740
741*/
742
743#define TRIE_DEBUG_CHAR \
744 DEBUG_TRIE_COMPILE_r({ \
745 SV *tmp; \
746 if ( UTF ) { \
6136c704 747 tmp = newSVpvs( "" ); \
a3621e74
YO
748 pv_uni_display( tmp, uc, len, 60, UNI_DISPLAY_REGEX ); \
749 } else { \
e4584336 750 tmp = Perl_newSVpvf_nocontext( "%c", (int)uvc ); \
a3621e74
YO
751 } \
752 av_push( trie->revcharmap, tmp ); \
753 })
754
755#define TRIE_READ_CHAR STMT_START { \
756 if ( UTF ) { \
757 if ( folder ) { \
758 if ( foldlen > 0 ) { \
759 uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags ); \
760 foldlen -= len; \
761 scan += len; \
762 len = 0; \
763 } else { \
e1ec3a88 764 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
a3621e74
YO
765 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
766 foldlen -= UNISKIP( uvc ); \
767 scan = foldbuf + UNISKIP( uvc ); \
768 } \
769 } else { \
e1ec3a88 770 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
a3621e74
YO
771 } \
772 } else { \
773 uvc = (U32)*uc; \
774 len = 1; \
775 } \
776} STMT_END
777
778
779#define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
780#define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
781#define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
782#define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
783
784#define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
785 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
786 TRIE_LIST_LEN( state ) *= 2; \
787 Renew( trie->states[ state ].trans.list, \
788 TRIE_LIST_LEN( state ), reg_trie_trans_le ); \
789 } \
790 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
791 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
792 TRIE_LIST_CUR( state )++; \
793} STMT_END
794
795#define TRIE_LIST_NEW(state) STMT_START { \
a02a5408 796 Newxz( trie->states[ state ].trans.list, \
a3621e74
YO
797 4, reg_trie_trans_le ); \
798 TRIE_LIST_CUR( state ) = 1; \
799 TRIE_LIST_LEN( state ) = 4; \
800} STMT_END
801
802STATIC I32
803S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 flags)
804{
27da23d5 805 dVAR;
a3621e74
YO
806 /* first pass, loop through and scan words */
807 reg_trie_data *trie;
808 regnode *cur;
e1ec3a88 809 const U32 uniflags = ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY;
a3621e74
YO
810 STRLEN len = 0;
811 UV uvc = 0;
812 U16 curword = 0;
813 U32 next_alloc = 0;
814 /* we just use folder as a flag in utf8 */
e1ec3a88 815 const U8 * const folder = ( flags == EXACTF
a3621e74
YO
816 ? PL_fold
817 : ( flags == EXACTFL
818 ? PL_fold_locale
819 : NULL
820 )
821 );
822
e1ec3a88 823 const U32 data_slot = add_data( pRExC_state, 1, "t" );
a3621e74
YO
824 SV *re_trie_maxbuff;
825
826 GET_RE_DEBUG_FLAGS_DECL;
827
a02a5408 828 Newxz( trie, 1, reg_trie_data );
a3621e74
YO
829 trie->refcount = 1;
830 RExC_rx->data->data[ data_slot ] = (void*)trie;
a02a5408 831 Newxz( trie->charmap, 256, U16 );
a3621e74
YO
832 DEBUG_r({
833 trie->words = newAV();
834 trie->revcharmap = newAV();
835 });
836
837
0111c4fd 838 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
a3621e74 839 if (!SvIOK(re_trie_maxbuff)) {
0111c4fd 840 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
a3621e74
YO
841 }
842
843 /* -- First loop and Setup --
844
845 We first traverse the branches and scan each word to determine if it
846 contains widechars, and how many unique chars there are, this is
847 important as we have to build a table with at least as many columns as we
848 have unique chars.
849
850 We use an array of integers to represent the character codes 0..255
851 (trie->charmap) and we use a an HV* to store unicode characters. We use the
852 native representation of the character value as the key and IV's for the
853 coded index.
854
855 *TODO* If we keep track of how many times each character is used we can
856 remap the columns so that the table compression later on is more
857 efficient in terms of memory by ensuring most common value is in the
858 middle and the least common are on the outside. IMO this would be better
859 than a most to least common mapping as theres a decent chance the most
860 common letter will share a node with the least common, meaning the node
861 will not be compressable. With a middle is most common approach the worst
862 case is when we have the least common nodes twice.
863
864 */
865
866
867 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
c445ea15 868 regnode * const noper = NEXTOPER( cur );
e1ec3a88 869 const U8 *uc = (U8*)STRING( noper );
a28509cc 870 const U8 * const e = uc + STR_LEN( noper );
a3621e74
YO
871 STRLEN foldlen = 0;
872 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2af232bd 873 const U8 *scan = (U8*)NULL;
a3621e74
YO
874
875 for ( ; uc < e ; uc += len ) {
876 trie->charcount++;
877 TRIE_READ_CHAR;
878 if ( uvc < 256 ) {
879 if ( !trie->charmap[ uvc ] ) {
880 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
881 if ( folder )
882 trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
883 TRIE_DEBUG_CHAR;
884 }
885 } else {
886 SV** svpp;
887 if ( !trie->widecharmap )
888 trie->widecharmap = newHV();
889
890 svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 1 );
891
892 if ( !svpp )
e4584336 893 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
a3621e74
YO
894
895 if ( !SvTRUE( *svpp ) ) {
896 sv_setiv( *svpp, ++trie->uniquecharcount );
897 TRIE_DEBUG_CHAR;
898 }
899 }
900 }
901 trie->wordcount++;
902 } /* end first pass */
903 DEBUG_TRIE_COMPILE_r(
904 PerlIO_printf( Perl_debug_log, "TRIE(%s): W:%d C:%d Uq:%d \n",
905 ( trie->widecharmap ? "UTF8" : "NATIVE" ), trie->wordcount,
5d7488b2 906 (int)trie->charcount, trie->uniquecharcount )
a3621e74
YO
907 );
908
909
910 /*
911 We now know what we are dealing with in terms of unique chars and
912 string sizes so we can calculate how much memory a naive
0111c4fd
RGS
913 representation using a flat table will take. If it's over a reasonable
914 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
a3621e74
YO
915 conservative but potentially much slower representation using an array
916 of lists.
917
918 At the end we convert both representations into the same compressed
919 form that will be used in regexec.c for matching with. The latter
920 is a form that cannot be used to construct with but has memory
921 properties similar to the list form and access properties similar
922 to the table form making it both suitable for fast searches and
923 small enough that its feasable to store for the duration of a program.
924
925 See the comment in the code where the compressed table is produced
926 inplace from the flat tabe representation for an explanation of how
927 the compression works.
928
929 */
930
931
932 if ( (IV)( ( trie->charcount + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
933 /*
934 Second Pass -- Array Of Lists Representation
935
936 Each state will be represented by a list of charid:state records
937 (reg_trie_trans_le) the first such element holds the CUR and LEN
938 points of the allocated array. (See defines above).
939
940 We build the initial structure using the lists, and then convert
941 it into the compressed table form which allows faster lookups
942 (but cant be modified once converted).
943
944
945 */
946
947
948 STRLEN transcount = 1;
949
a02a5408 950 Newxz( trie->states, trie->charcount + 2, reg_trie_state );
a3621e74
YO
951 TRIE_LIST_NEW(1);
952 next_alloc = 2;
953
954 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
955
c445ea15
AL
956 regnode * const noper = NEXTOPER( cur );
957 U8 *uc = (U8*)STRING( noper );
958 const U8 * const e = uc + STR_LEN( noper );
959 U32 state = 1; /* required init */
960 U16 charid = 0; /* sanity init */
961 U8 *scan = (U8*)NULL; /* sanity init */
962 STRLEN foldlen = 0; /* required init */
963 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
964
965 for ( ; uc < e ; uc += len ) {
966
967 TRIE_READ_CHAR;
968
969 if ( uvc < 256 ) {
970 charid = trie->charmap[ uvc ];
971 } else {
972 SV** const svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 0);
973 if ( !svpp ) {
974 charid = 0;
975 } else {
976 charid=(U16)SvIV( *svpp );
977 }
978 }
979 if ( charid ) {
a3621e74 980
c445ea15
AL
981 U16 check;
982 U32 newstate = 0;
a3621e74 983
c445ea15
AL
984 charid--;
985 if ( !trie->states[ state ].trans.list ) {
986 TRIE_LIST_NEW( state );
987 }
988 for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
989 if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
990 newstate = TRIE_LIST_ITEM( state, check ).newstate;
991 break;
992 }
993 }
994 if ( ! newstate ) {
995 newstate = next_alloc++;
996 TRIE_LIST_PUSH( state, charid, newstate );
997 transcount++;
998 }
999 state = newstate;
1000 } else {
1001 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
a28509cc 1002 }
c445ea15
AL
1003 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1004 }
a3621e74 1005
c445ea15
AL
1006 if ( !trie->states[ state ].wordnum ) {
1007 /* we havent inserted this word into the structure yet. */
1008 trie->states[ state ].wordnum = ++curword;
a3621e74 1009
c445ea15
AL
1010 DEBUG_r({
1011 /* store the word for dumping */
1012 SV* tmp = newSVpvn( STRING( noper ), STR_LEN( noper ) );
1013 if ( UTF ) SvUTF8_on( tmp );
1014 av_push( trie->words, tmp );
1015 });
a3621e74 1016
c445ea15 1017 } else {
bb263b4e 1018 /*EMPTY*/; /* It's a dupe. So ignore it. */
c445ea15 1019 }
a3621e74
YO
1020
1021 } /* end second pass */
1022
1023 trie->laststate = next_alloc;
1024 Renew( trie->states, next_alloc, reg_trie_state );
1025
1026 DEBUG_TRIE_COMPILE_MORE_r({
1027 U32 state;
a3621e74 1028
a28509cc 1029 /* print out the table precompression. */
a3621e74
YO
1030
1031 PerlIO_printf( Perl_debug_log, "\nState :Word | Transition Data\n" );
1032 PerlIO_printf( Perl_debug_log, "------:-----+-----------------" );
1033
1034 for( state=1 ; state < next_alloc ; state ++ ) {
a28509cc 1035 U16 charid;
a3621e74 1036
e4584336 1037 PerlIO_printf( Perl_debug_log, "\n %04"UVXf" :", (UV)state );
a3621e74
YO
1038 if ( ! trie->states[ state ].wordnum ) {
1039 PerlIO_printf( Perl_debug_log, "%5s| ","");
1040 } else {
e4584336 1041 PerlIO_printf( Perl_debug_log, "W%04x| ",
a3621e74
YO
1042 trie->states[ state ].wordnum
1043 );
1044 }
1045 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
1046 SV **tmp = av_fetch( trie->revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
e4584336 1047 PerlIO_printf( Perl_debug_log, "%s:%3X=%04"UVXf" | ",
cfd0369c 1048 SvPV_nolen_const( *tmp ),
a3621e74 1049 TRIE_LIST_ITEM(state,charid).forid,
e4584336 1050 (UV)TRIE_LIST_ITEM(state,charid).newstate
a3621e74
YO
1051 );
1052 }
1053
1054 }
1055 PerlIO_printf( Perl_debug_log, "\n\n" );
1056 });
1057
a02a5408 1058 Newxz( trie->trans, transcount ,reg_trie_trans );
a3621e74
YO
1059 {
1060 U32 state;
a3621e74
YO
1061 U32 tp = 0;
1062 U32 zp = 0;
1063
1064
1065 for( state=1 ; state < next_alloc ; state ++ ) {
1066 U32 base=0;
1067
1068 /*
1069 DEBUG_TRIE_COMPILE_MORE_r(
1070 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1071 );
1072 */
1073
1074 if (trie->states[state].trans.list) {
1075 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1076 U16 maxid=minid;
a28509cc 1077 U16 idx;
a3621e74
YO
1078
1079 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
c445ea15
AL
1080 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1081 if ( forid < minid ) {
1082 minid=forid;
1083 } else if ( forid > maxid ) {
1084 maxid=forid;
1085 }
a3621e74
YO
1086 }
1087 if ( transcount < tp + maxid - minid + 1) {
1088 transcount *= 2;
1089 Renew( trie->trans, transcount, reg_trie_trans );
1090 Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1091 }
1092 base = trie->uniquecharcount + tp - minid;
1093 if ( maxid == minid ) {
1094 U32 set = 0;
1095 for ( ; zp < tp ; zp++ ) {
1096 if ( ! trie->trans[ zp ].next ) {
1097 base = trie->uniquecharcount + zp - minid;
1098 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1099 trie->trans[ zp ].check = state;
1100 set = 1;
1101 break;
1102 }
1103 }
1104 if ( !set ) {
1105 trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1106 trie->trans[ tp ].check = state;
1107 tp++;
1108 zp = tp;
1109 }
1110 } else {
1111 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
c445ea15 1112 const U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
a3621e74
YO
1113 trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1114 trie->trans[ tid ].check = state;
1115 }
1116 tp += ( maxid - minid + 1 );
1117 }
1118 Safefree(trie->states[ state ].trans.list);
1119 }
1120 /*
1121 DEBUG_TRIE_COMPILE_MORE_r(
1122 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1123 );
1124 */
1125 trie->states[ state ].trans.base=base;
1126 }
cc601c31 1127 trie->lasttrans = tp + 1;
a3621e74
YO
1128 }
1129 } else {
1130 /*
1131 Second Pass -- Flat Table Representation.
1132
1133 we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1134 We know that we will need Charcount+1 trans at most to store the data
1135 (one row per char at worst case) So we preallocate both structures
1136 assuming worst case.
1137
1138 We then construct the trie using only the .next slots of the entry
1139 structs.
1140
1141 We use the .check field of the first entry of the node temporarily to
1142 make compression both faster and easier by keeping track of how many non
1143 zero fields are in the node.
1144
1145 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1146 transition.
1147
1148 There are two terms at use here: state as a TRIE_NODEIDX() which is a
1149 number representing the first entry of the node, and state as a
1150 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1151 TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1152 are 2 entrys per node. eg:
1153
1154 A B A B
1155 1. 2 4 1. 3 7
1156 2. 0 3 3. 0 5
1157 3. 0 0 5. 0 0
1158 4. 0 0 7. 0 0
1159
1160 The table is internally in the right hand, idx form. However as we also
1161 have to deal with the states array which is indexed by nodenum we have to
1162 use TRIE_NODENUM() to convert.
1163
1164 */
1165
a02a5408 1166 Newxz( trie->trans, ( trie->charcount + 1 ) * trie->uniquecharcount + 1,
a3621e74 1167 reg_trie_trans );
a02a5408 1168 Newxz( trie->states, trie->charcount + 2, reg_trie_state );
a3621e74
YO
1169 next_alloc = trie->uniquecharcount + 1;
1170
1171 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1172
c445ea15 1173 regnode * const noper = NEXTOPER( cur );
a28509cc
AL
1174 const U8 *uc = (U8*)STRING( noper );
1175 const U8 * const e = uc + STR_LEN( noper );
a3621e74
YO
1176
1177 U32 state = 1; /* required init */
1178
1179 U16 charid = 0; /* sanity init */
1180 U32 accept_state = 0; /* sanity init */
1181 U8 *scan = (U8*)NULL; /* sanity init */
1182
1183 STRLEN foldlen = 0; /* required init */
1184 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1185
1186
1187 for ( ; uc < e ; uc += len ) {
1188
1189 TRIE_READ_CHAR;
1190
1191 if ( uvc < 256 ) {
1192 charid = trie->charmap[ uvc ];
1193 } else {
c445ea15
AL
1194 SV* const * const svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 0);
1195 charid = svpp ? (U16)SvIV(*svpp) : 0;
a3621e74
YO
1196 }
1197 if ( charid ) {
1198 charid--;
1199 if ( !trie->trans[ state + charid ].next ) {
1200 trie->trans[ state + charid ].next = next_alloc;
1201 trie->trans[ state ].check++;
1202 next_alloc += trie->uniquecharcount;
1203 }
1204 state = trie->trans[ state + charid ].next;
1205 } else {
e4584336 1206 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
a3621e74
YO
1207 }
1208 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1209 }
1210
1211 accept_state = TRIE_NODENUM( state );
1212 if ( !trie->states[ accept_state ].wordnum ) {
1213 /* we havent inserted this word into the structure yet. */
1214 trie->states[ accept_state ].wordnum = ++curword;
1215
1216 DEBUG_r({
1217 /* store the word for dumping */
1218 SV* tmp = newSVpvn( STRING( noper ), STR_LEN( noper ) );
1219 if ( UTF ) SvUTF8_on( tmp );
1220 av_push( trie->words, tmp );
1221 });
1222
1223 } else {
bb263b4e 1224 /*EMPTY*/; /* Its a dupe. So ignore it. */
a3621e74
YO
1225 }
1226
1227 } /* end second pass */
1228
1229 DEBUG_TRIE_COMPILE_MORE_r({
1230 /*
1231 print out the table precompression so that we can do a visual check
1232 that they are identical.
1233 */
1234 U32 state;
1235 U16 charid;
1236 PerlIO_printf( Perl_debug_log, "\nChar : " );
1237
1238 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1239 SV **tmp = av_fetch( trie->revcharmap, charid, 0);
1240 if ( tmp ) {
cfd0369c 1241 PerlIO_printf( Perl_debug_log, "%4.4s ", SvPV_nolen_const( *tmp ) );
a3621e74
YO
1242 }
1243 }
1244
1245 PerlIO_printf( Perl_debug_log, "\nState+-" );
1246
1247 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
1248 PerlIO_printf( Perl_debug_log, "%4s-", "----" );
1249 }
1250
1251 PerlIO_printf( Perl_debug_log, "\n" );
1252
1253 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1254
e4584336 1255 PerlIO_printf( Perl_debug_log, "%04"UVXf" : ", (UV)TRIE_NODENUM( state ) );
a3621e74
YO
1256
1257 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
e4584336
RB
1258 PerlIO_printf( Perl_debug_log, "%04"UVXf" ",
1259 (UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next ) );
a3621e74
YO
1260 }
1261 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
e4584336 1262 PerlIO_printf( Perl_debug_log, " (%04"UVXf")\n", (UV)trie->trans[ state ].check );
a3621e74 1263 } else {
e4584336 1264 PerlIO_printf( Perl_debug_log, " (%04"UVXf") W%04X\n", (UV)trie->trans[ state ].check,
a3621e74
YO
1265 trie->states[ TRIE_NODENUM( state ) ].wordnum );
1266 }
1267 }
1268 PerlIO_printf( Perl_debug_log, "\n\n" );
1269 });
1270 {
1271 /*
1272 * Inplace compress the table.*
1273
1274 For sparse data sets the table constructed by the trie algorithm will
1275 be mostly 0/FAIL transitions or to put it another way mostly empty.
1276 (Note that leaf nodes will not contain any transitions.)
1277
1278 This algorithm compresses the tables by eliminating most such
1279 transitions, at the cost of a modest bit of extra work during lookup:
1280
1281 - Each states[] entry contains a .base field which indicates the
1282 index in the state[] array wheres its transition data is stored.
1283
1284 - If .base is 0 there are no valid transitions from that node.
1285
1286 - If .base is nonzero then charid is added to it to find an entry in
1287 the trans array.
1288
1289 -If trans[states[state].base+charid].check!=state then the
1290 transition is taken to be a 0/Fail transition. Thus if there are fail
1291 transitions at the front of the node then the .base offset will point
1292 somewhere inside the previous nodes data (or maybe even into a node
1293 even earlier), but the .check field determines if the transition is
1294 valid.
1295
1296 The following process inplace converts the table to the compressed
1297 table: We first do not compress the root node 1,and mark its all its
1298 .check pointers as 1 and set its .base pointer as 1 as well. This
1299 allows to do a DFA construction from the compressed table later, and
1300 ensures that any .base pointers we calculate later are greater than
1301 0.
1302
1303 - We set 'pos' to indicate the first entry of the second node.
1304
1305 - We then iterate over the columns of the node, finding the first and
1306 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
1307 and set the .check pointers accordingly, and advance pos
1308 appropriately and repreat for the next node. Note that when we copy
1309 the next pointers we have to convert them from the original
1310 NODEIDX form to NODENUM form as the former is not valid post
1311 compression.
1312
1313 - If a node has no transitions used we mark its base as 0 and do not
1314 advance the pos pointer.
1315
1316 - If a node only has one transition we use a second pointer into the
1317 structure to fill in allocated fail transitions from other states.
1318 This pointer is independent of the main pointer and scans forward
1319 looking for null transitions that are allocated to a state. When it
1320 finds one it writes the single transition into the "hole". If the
1321 pointer doesnt find one the single transition is appeneded as normal.
1322
1323 - Once compressed we can Renew/realloc the structures to release the
1324 excess space.
1325
1326 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
1327 specifically Fig 3.47 and the associated pseudocode.
1328
1329 demq
1330 */
a3b680e6 1331 const U32 laststate = TRIE_NODENUM( next_alloc );
a28509cc 1332 U32 state, charid;
a3621e74
YO
1333 U32 pos = 0, zp=0;
1334 trie->laststate = laststate;
1335
1336 for ( state = 1 ; state < laststate ; state++ ) {
1337 U8 flag = 0;
a28509cc
AL
1338 const U32 stateidx = TRIE_NODEIDX( state );
1339 const U32 o_used = trie->trans[ stateidx ].check;
1340 U32 used = trie->trans[ stateidx ].check;
a3621e74
YO
1341 trie->trans[ stateidx ].check = 0;
1342
1343 for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
1344 if ( flag || trie->trans[ stateidx + charid ].next ) {
1345 if ( trie->trans[ stateidx + charid ].next ) {
1346 if (o_used == 1) {
1347 for ( ; zp < pos ; zp++ ) {
1348 if ( ! trie->trans[ zp ].next ) {
1349 break;
1350 }
1351 }
1352 trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
1353 trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1354 trie->trans[ zp ].check = state;
1355 if ( ++zp > pos ) pos = zp;
1356 break;
1357 }
1358 used--;
1359 }
1360 if ( !flag ) {
1361 flag = 1;
1362 trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
1363 }
1364 trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1365 trie->trans[ pos ].check = state;
1366 pos++;
1367 }
1368 }
1369 }
cc601c31 1370 trie->lasttrans = pos + 1;
a3621e74
YO
1371 Renew( trie->states, laststate + 1, reg_trie_state);
1372 DEBUG_TRIE_COMPILE_MORE_r(
e4584336
RB
1373 PerlIO_printf( Perl_debug_log,
1374 " Alloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
5d7488b2
AL
1375 (int)( ( trie->charcount + 1 ) * trie->uniquecharcount + 1 ),
1376 (IV)next_alloc,
1377 (IV)pos,
a3621e74
YO
1378 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
1379 );
1380
1381 } /* end table compress */
1382 }
cc601c31
YO
1383 /* resize the trans array to remove unused space */
1384 Renew( trie->trans, trie->lasttrans, reg_trie_trans);
a3621e74
YO
1385
1386 DEBUG_TRIE_COMPILE_r({
1387 U32 state;
1388 /*
1389 Now we print it out again, in a slightly different form as there is additional
1390 info we want to be able to see when its compressed. They are close enough for
1391 visual comparison though.
1392 */
1393 PerlIO_printf( Perl_debug_log, "\nChar : %-6s%-6s%-4s ","Match","Base","Ofs" );
1394
1395 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
1396 SV **tmp = av_fetch( trie->revcharmap, state, 0);
1397 if ( tmp ) {
cfd0369c 1398 PerlIO_printf( Perl_debug_log, "%4.4s ", SvPV_nolen_const( *tmp ) );
a3621e74
YO
1399 }
1400 }
1401 PerlIO_printf( Perl_debug_log, "\n-----:-----------------------");
cc601c31 1402
a3621e74
YO
1403 for( state = 0 ; state < trie->uniquecharcount ; state++ )
1404 PerlIO_printf( Perl_debug_log, "-----");
1405 PerlIO_printf( Perl_debug_log, "\n");
cc601c31 1406
a3621e74 1407 for( state = 1 ; state < trie->laststate ; state++ ) {
a28509cc 1408 const U32 base = trie->states[ state ].trans.base;
a3621e74 1409
e4584336 1410 PerlIO_printf( Perl_debug_log, "#%04"UVXf" ", (UV)state);
a3621e74
YO
1411
1412 if ( trie->states[ state ].wordnum ) {
1413 PerlIO_printf( Perl_debug_log, " W%04X", trie->states[ state ].wordnum );
1414 } else {
1415 PerlIO_printf( Perl_debug_log, "%6s", "" );
1416 }
1417
e4584336 1418 PerlIO_printf( Perl_debug_log, " @%04"UVXf" ", (UV)base );
a3621e74
YO
1419
1420 if ( base ) {
1421 U32 ofs = 0;
1422
cc601c31
YO
1423 while( ( base + ofs < trie->uniquecharcount ) ||
1424 ( base + ofs - trie->uniquecharcount < trie->lasttrans
1425 && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
a3621e74
YO
1426 ofs++;
1427
e4584336 1428 PerlIO_printf( Perl_debug_log, "+%02"UVXf"[ ", (UV)ofs);
a3621e74
YO
1429
1430 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
cc601c31
YO
1431 if ( ( base + ofs >= trie->uniquecharcount ) &&
1432 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
a3621e74
YO
1433 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1434 {
e4584336
RB
1435 PerlIO_printf( Perl_debug_log, "%04"UVXf" ",
1436 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
a3621e74
YO
1437 } else {
1438 PerlIO_printf( Perl_debug_log, "%4s "," 0" );
1439 }
1440 }
1441
e4584336 1442 PerlIO_printf( Perl_debug_log, "]");
a3621e74
YO
1443
1444 }
1445 PerlIO_printf( Perl_debug_log, "\n" );
1446 }
1447 });
1448
1449 {
1450 /* now finally we "stitch in" the new TRIE node
1451 This means we convert either the first branch or the first Exact,
1452 depending on whether the thing following (in 'last') is a branch
1453 or not and whther first is the startbranch (ie is it a sub part of
1454 the alternation or is it the whole thing.)
1455 Assuming its a sub part we conver the EXACT otherwise we convert
1456 the whole branch sequence, including the first.
1457 */
1458 regnode *convert;
1459
1460
1461
1462
1463 if ( first == startbranch && OP( last ) != BRANCH ) {
1464 convert = first;
1465 } else {
1466 convert = NEXTOPER( first );
1467 NEXT_OFF( first ) = (U16)(last - first);
1468 }
1469
1470 OP( convert ) = TRIE + (U8)( flags - EXACT );
1471 NEXT_OFF( convert ) = (U16)(tail - convert);
1472 ARG_SET( convert, data_slot );
1473
1474 /* tells us if we need to handle accept buffers specially */
1475 convert->flags = ( RExC_seen_evals ? 1 : 0 );
1476
1477
1478 /* needed for dumping*/
1479 DEBUG_r({
1480 regnode *optimize = convert + NODE_STEP_REGNODE + regarglen[ TRIE ];
1481 /* We now need to mark all of the space originally used by the
1482 branches as optimized away. This keeps the dumpuntil from
1483 throwing a wobbly as it doesnt use regnext() to traverse the
1484 opcodes.
1485 */
1486 while( optimize < last ) {
1487 OP( optimize ) = OPTIMIZED;
1488 optimize++;
1489 }
1490 });
1491 } /* end node insert */
1492 return 1;
1493}
1494
1495
1496
1497/*
5d1c421c
JH
1498 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
1499 * These need to be revisited when a newer toolchain becomes available.
1500 */
1501#if defined(__sparc64__) && defined(__GNUC__)
1502# if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
1503# undef SPARC64_GCC_WORKAROUND
1504# define SPARC64_GCC_WORKAROUND 1
1505# endif
1506#endif
1507
653099ff
GS
1508/* REx optimizer. Converts nodes into quickier variants "in place".
1509 Finds fixed substrings. */
1510
a0288114 1511/* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
c277df42
IZ
1512 to the position after last scanned or to NULL. */
1513
a3621e74 1514
76e3520e 1515STATIC I32
9a957fbc
AL
1516S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap,
1517 regnode *last, scan_data_t *data, U32 flags, U32 depth)
c277df42
IZ
1518 /* scanp: Start here (read-write). */
1519 /* deltap: Write maxlen-minlen here. */
1520 /* last: Stop before this one. */
1521{
97aff369 1522 dVAR;
c277df42
IZ
1523 I32 min = 0, pars = 0, code;
1524 regnode *scan = *scanp, *next;
1525 I32 delta = 0;
1526 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
aca2d497 1527 int is_inf_internal = 0; /* The studied chunk is infinite */
c277df42
IZ
1528 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
1529 scan_data_t data_fake;
653099ff 1530 struct regnode_charclass_class and_with; /* Valid if flags & SCF_DO_STCLASS_OR */
a3621e74
YO
1531 SV *re_trie_maxbuff = NULL;
1532
1533 GET_RE_DEBUG_FLAGS_DECL;
b81d288d 1534
c277df42
IZ
1535 while (scan && OP(scan) != END && scan < last) {
1536 /* Peephole optimizer: */
a3621e74 1537 DEBUG_OPTIMISE_r({
c445ea15 1538 SV * const mysv=sv_newmortal();
a3621e74 1539 regprop( mysv, scan);
e4584336 1540 PerlIO_printf(Perl_debug_log, "%*speep: %s (0x%08"UVXf")\n",
cfd0369c 1541 (int)depth*2, "", SvPV_nolen_const(mysv), PTR2UV(scan));
a3621e74 1542 });
c277df42 1543
22c35a8c 1544 if (PL_regkind[(U8)OP(scan)] == EXACT) {
653099ff 1545 /* Merge several consecutive EXACTish nodes into one. */
c277df42
IZ
1546 regnode *n = regnext(scan);
1547 U32 stringok = 1;
1548#ifdef DEBUGGING
1549 regnode *stop = scan;
b81d288d 1550#endif
c277df42 1551
cd439c50 1552 next = scan + NODE_SZ_STR(scan);
c277df42
IZ
1553 /* Skip NOTHING, merge EXACT*. */
1554 while (n &&
b81d288d 1555 ( PL_regkind[(U8)OP(n)] == NOTHING ||
c277df42
IZ
1556 (stringok && (OP(n) == OP(scan))))
1557 && NEXT_OFF(n)
1558 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
1559 if (OP(n) == TAIL || n > next)
1560 stringok = 0;
22c35a8c 1561 if (PL_regkind[(U8)OP(n)] == NOTHING) {
c277df42
IZ
1562 NEXT_OFF(scan) += NEXT_OFF(n);
1563 next = n + NODE_STEP_REGNODE;
1564#ifdef DEBUGGING
1565 if (stringok)
1566 stop = n;
b81d288d 1567#endif
c277df42 1568 n = regnext(n);
a0ed51b3 1569 }
f49d4d0f 1570 else if (stringok) {
a3b680e6 1571 const int oldl = STR_LEN(scan);
c445ea15 1572 regnode * const nnext = regnext(n);
f49d4d0f 1573
b81d288d 1574 if (oldl + STR_LEN(n) > U8_MAX)
c277df42
IZ
1575 break;
1576 NEXT_OFF(scan) += NEXT_OFF(n);
cd439c50
IZ
1577 STR_LEN(scan) += STR_LEN(n);
1578 next = n + NODE_SZ_STR(n);
c277df42 1579 /* Now we can overwrite *n : */
f49d4d0f 1580 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
c277df42 1581#ifdef DEBUGGING
f49d4d0f 1582 stop = next - 1;
b81d288d 1583#endif
c277df42
IZ
1584 n = nnext;
1585 }
1586 }
61a36c01 1587
a3621e74 1588 if (UTF && ( OP(scan) == EXACTF ) && ( STR_LEN(scan) >= 6 ) ) {
61a36c01
JH
1589/*
1590 Two problematic code points in Unicode casefolding of EXACT nodes:
1591
1592 U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
1593 U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
1594
1595 which casefold to
1596
1597 Unicode UTF-8
1598
1599 U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
1600 U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
1601
1602 This means that in case-insensitive matching (or "loose matching",
1603 as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
1604 length of the above casefolded versions) can match a target string
1605 of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
1606 This would rather mess up the minimum length computation.
1607
1608 What we'll do is to look for the tail four bytes, and then peek
1609 at the preceding two bytes to see whether we need to decrease
1610 the minimum length by four (six minus two).
1611
1612 Thanks to the design of UTF-8, there cannot be false matches:
1613 A sequence of valid UTF-8 bytes cannot be a subsequence of
1614 another valid sequence of UTF-8 bytes.
1615
1616*/
c445ea15
AL
1617 char * const s0 = STRING(scan), *s, *t;
1618 char * const s1 = s0 + STR_LEN(scan) - 1;
1619 char * const s2 = s1 - 4;
d4c19fe8 1620 const char t0[] = "\xcc\x88\xcc\x81";
a28509cc 1621 const char * const t1 = t0 + 3;
2af232bd 1622
61a36c01
JH
1623 for (s = s0 + 2;
1624 s < s2 && (t = ninstr(s, s1, t0, t1));
1625 s = t + 4) {
1626 if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
1627 ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
1628 min -= 4;
1629 }
1630 }
1631
c277df42
IZ
1632#ifdef DEBUGGING
1633 /* Allow dumping */
cd439c50 1634 n = scan + NODE_SZ_STR(scan);
c277df42 1635 while (n <= stop) {
22c35a8c 1636 if (PL_regkind[(U8)OP(n)] != NOTHING || OP(n) == NOTHING) {
c277df42
IZ
1637 OP(n) = OPTIMIZED;
1638 NEXT_OFF(n) = 0;
1639 }
1640 n++;
1641 }
653099ff 1642#endif
c277df42 1643 }
a3621e74
YO
1644
1645
1646
653099ff
GS
1647 /* Follow the next-chain of the current node and optimize
1648 away all the NOTHINGs from it. */
c277df42 1649 if (OP(scan) != CURLYX) {
a3b680e6 1650 const int max = (reg_off_by_arg[OP(scan)]
048cfca1
GS
1651 ? I32_MAX
1652 /* I32 may be smaller than U16 on CRAYs! */
1653 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
c277df42
IZ
1654 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
1655 int noff;
1656 regnode *n = scan;
b81d288d 1657
c277df42
IZ
1658 /* Skip NOTHING and LONGJMP. */
1659 while ((n = regnext(n))
22c35a8c 1660 && ((PL_regkind[(U8)OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
c277df42
IZ
1661 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
1662 && off + noff < max)
1663 off += noff;
1664 if (reg_off_by_arg[OP(scan)])
1665 ARG(scan) = off;
b81d288d 1666 else
c277df42
IZ
1667 NEXT_OFF(scan) = off;
1668 }
a3621e74 1669
653099ff
GS
1670 /* The principal pseudo-switch. Cannot be a switch, since we
1671 look into several different things. */
b81d288d 1672 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
c277df42
IZ
1673 || OP(scan) == IFTHEN || OP(scan) == SUSPEND) {
1674 next = regnext(scan);
1675 code = OP(scan);
a3621e74 1676 /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
b81d288d
AB
1677
1678 if (OP(next) == code || code == IFTHEN || code == SUSPEND) {
c277df42 1679 I32 max1 = 0, min1 = I32_MAX, num = 0;
653099ff 1680 struct regnode_charclass_class accum;
d4c19fe8 1681 regnode * const startbranch=scan;
c277df42 1682
653099ff 1683 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
830247a4 1684 scan_commit(pRExC_state, data); /* Cannot merge strings after this. */
653099ff 1685 if (flags & SCF_DO_STCLASS)
830247a4 1686 cl_init_zero(pRExC_state, &accum);
a3621e74 1687
c277df42 1688 while (OP(scan) == code) {
830247a4 1689 I32 deltanext, minnext, f = 0, fake;
653099ff 1690 struct regnode_charclass_class this_class;
c277df42
IZ
1691
1692 num++;
1693 data_fake.flags = 0;
b81d288d 1694 if (data) {
2c2d71f5 1695 data_fake.whilem_c = data->whilem_c;
cb434fcc
IZ
1696 data_fake.last_closep = data->last_closep;
1697 }
1698 else
1699 data_fake.last_closep = &fake;
c277df42
IZ
1700 next = regnext(scan);
1701 scan = NEXTOPER(scan);
1702 if (code != BRANCH)
1703 scan = NEXTOPER(scan);
653099ff 1704 if (flags & SCF_DO_STCLASS) {
830247a4 1705 cl_init(pRExC_state, &this_class);
653099ff
GS
1706 data_fake.start_class = &this_class;
1707 f = SCF_DO_STCLASS_AND;
b81d288d 1708 }
e1901655
IZ
1709 if (flags & SCF_WHILEM_VISITED_POS)
1710 f |= SCF_WHILEM_VISITED_POS;
a3621e74 1711
653099ff 1712 /* we suppose the run is continuous, last=next...*/
830247a4 1713 minnext = study_chunk(pRExC_state, &scan, &deltanext,
a3621e74 1714 next, &data_fake, f,depth+1);
b81d288d 1715 if (min1 > minnext)
c277df42
IZ
1716 min1 = minnext;
1717 if (max1 < minnext + deltanext)
1718 max1 = minnext + deltanext;
1719 if (deltanext == I32_MAX)
aca2d497 1720 is_inf = is_inf_internal = 1;
c277df42
IZ
1721 scan = next;
1722 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
1723 pars++;
405ff068 1724 if (data && (data_fake.flags & SF_HAS_EVAL))
c277df42 1725 data->flags |= SF_HAS_EVAL;
2c2d71f5
JH
1726 if (data)
1727 data->whilem_c = data_fake.whilem_c;
653099ff 1728 if (flags & SCF_DO_STCLASS)
830247a4 1729 cl_or(pRExC_state, &accum, &this_class);
b81d288d 1730 if (code == SUSPEND)
c277df42
IZ
1731 break;
1732 }
1733 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
1734 min1 = 0;
1735 if (flags & SCF_DO_SUBSTR) {
1736 data->pos_min += min1;
1737 data->pos_delta += max1 - min1;
1738 if (max1 != min1 || is_inf)
1739 data->longest = &(data->longest_float);
1740 }
1741 min += min1;
1742 delta += max1 - min1;
653099ff 1743 if (flags & SCF_DO_STCLASS_OR) {
830247a4 1744 cl_or(pRExC_state, data->start_class, &accum);
653099ff
GS
1745 if (min1) {
1746 cl_and(data->start_class, &and_with);
1747 flags &= ~SCF_DO_STCLASS;
1748 }
1749 }
1750 else if (flags & SCF_DO_STCLASS_AND) {
de0c8cb8
GS
1751 if (min1) {
1752 cl_and(data->start_class, &accum);
653099ff 1753 flags &= ~SCF_DO_STCLASS;
de0c8cb8
GS
1754 }
1755 else {
b81d288d 1756 /* Switch to OR mode: cache the old value of
de0c8cb8
GS
1757 * data->start_class */
1758 StructCopy(data->start_class, &and_with,
1759 struct regnode_charclass_class);
1760 flags &= ~SCF_DO_STCLASS_AND;
1761 StructCopy(&accum, data->start_class,
1762 struct regnode_charclass_class);
1763 flags |= SCF_DO_STCLASS_OR;
1764 data->start_class->flags |= ANYOF_EOS;
1765 }
653099ff 1766 }
a3621e74
YO
1767
1768 /* demq.
1769
1770 Assuming this was/is a branch we are dealing with: 'scan' now
1771 points at the item that follows the branch sequence, whatever
1772 it is. We now start at the beginning of the sequence and look
1773 for subsequences of
1774
1775 BRANCH->EXACT=>X
1776 BRANCH->EXACT=>X
1777
1778 which would be constructed from a pattern like /A|LIST|OF|WORDS/
1779
1780 If we can find such a subseqence we need to turn the first
1781 element into a trie and then add the subsequent branch exact
1782 strings to the trie.
1783
1784 We have two cases
1785
1786 1. patterns where the whole set of branch can be converted to a trie,
1787
1788 2. patterns where only a subset of the alternations can be
1789 converted to a trie.
1790
1791 In case 1 we can replace the whole set with a single regop
1792 for the trie. In case 2 we need to keep the start and end
1793 branchs so
1794
1795 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
1796 becomes BRANCH TRIE; BRANCH X;
1797
1798 Hypthetically when we know the regex isnt anchored we can
1799 turn a case 1 into a DFA and let it rip... Every time it finds a match
1800 it would just call its tail, no WHILEM/CURLY needed.
1801
1802 */
0111c4fd
RGS
1803 if (DO_TRIE) {
1804 if (!re_trie_maxbuff) {
1805 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
1806 if (!SvIOK(re_trie_maxbuff))
1807 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
1808 }
a3621e74
YO
1809 if ( SvIV(re_trie_maxbuff)>=0 && OP( startbranch )==BRANCH ) {
1810 regnode *cur;
1811 regnode *first = (regnode *)NULL;
1812 regnode *last = (regnode *)NULL;
1813 regnode *tail = scan;
1814 U8 optype = 0;
1815 U32 count=0;
1816
1817#ifdef DEBUGGING
c445ea15 1818 SV * const mysv = sv_newmortal(); /* for dumping */
a3621e74
YO
1819#endif
1820 /* var tail is used because there may be a TAIL
1821 regop in the way. Ie, the exacts will point to the
1822 thing following the TAIL, but the last branch will
1823 point at the TAIL. So we advance tail. If we
1824 have nested (?:) we may have to move through several
1825 tails.
1826 */
1827
1828 while ( OP( tail ) == TAIL ) {
1829 /* this is the TAIL generated by (?:) */
1830 tail = regnext( tail );
1831 }
1832
1833 DEBUG_OPTIMISE_r({
1834 regprop( mysv, tail );
1835 PerlIO_printf( Perl_debug_log, "%*s%s%s%s\n",
cfd0369c 1836 (int)depth * 2 + 2, "", "Tail node is:", SvPV_nolen_const( mysv ),
a3621e74
YO
1837 (RExC_seen_evals) ? "[EVAL]" : ""
1838 );
1839 });
1840 /*
1841
1842 step through the branches, cur represents each
1843 branch, noper is the first thing to be matched
1844 as part of that branch and noper_next is the
1845 regnext() of that node. if noper is an EXACT
1846 and noper_next is the same as scan (our current
1847 position in the regex) then the EXACT branch is
1848 a possible optimization target. Once we have
1849 two or more consequetive such branches we can
1850 create a trie of the EXACT's contents and stich
1851 it in place. If the sequence represents all of
1852 the branches we eliminate the whole thing and
1853 replace it with a single TRIE. If it is a
1854 subsequence then we need to stitch it in. This
1855 means the first branch has to remain, and needs
1856 to be repointed at the item on the branch chain
1857 following the last branch optimized. This could
1858 be either a BRANCH, in which case the
1859 subsequence is internal, or it could be the
1860 item following the branch sequence in which
1861 case the subsequence is at the end.
1862
1863 */
1864
1865 /* dont use tail as the end marker for this traverse */
1866 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
aec46f14
AL
1867 regnode * const noper = NEXTOPER( cur );
1868 regnode * const noper_next = regnext( noper );
a3621e74 1869
a3621e74
YO
1870 DEBUG_OPTIMISE_r({
1871 regprop( mysv, cur);
1872 PerlIO_printf( Perl_debug_log, "%*s%s",
cfd0369c 1873 (int)depth * 2 + 2," ", SvPV_nolen_const( mysv ) );
a3621e74
YO
1874
1875 regprop( mysv, noper);
1876 PerlIO_printf( Perl_debug_log, " -> %s",
cfd0369c 1877 SvPV_nolen_const(mysv));
a3621e74
YO
1878
1879 if ( noper_next ) {
1880 regprop( mysv, noper_next );
1881 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
cfd0369c 1882 SvPV_nolen_const(mysv));
a3621e74
YO
1883 }
1884 PerlIO_printf( Perl_debug_log, "0x%p,0x%p,0x%p)\n",
1885 first, last, cur );
1886 });
1887 if ( ( first ? OP( noper ) == optype
1888 : PL_regkind[ (U8)OP( noper ) ] == EXACT )
1889 && noper_next == tail && count<U16_MAX)
1890 {
1891 count++;
1892 if ( !first ) {
1893 first = cur;
1894 optype = OP( noper );
1895 } else {
1896 DEBUG_OPTIMISE_r(
1897 if (!last ) {
1898 regprop( mysv, first);
1899 PerlIO_printf( Perl_debug_log, "%*s%s",
cfd0369c 1900 (int)depth * 2 + 2, "F:", SvPV_nolen_const( mysv ) );
a3621e74
YO
1901 regprop( mysv, NEXTOPER(first) );
1902 PerlIO_printf( Perl_debug_log, " -> %s\n",
cfd0369c 1903 SvPV_nolen_const( mysv ) );
a3621e74
YO
1904 }
1905 );
1906 last = cur;
1907 DEBUG_OPTIMISE_r({
1908 regprop( mysv, cur);
1909 PerlIO_printf( Perl_debug_log, "%*s%s",
cfd0369c 1910 (int)depth * 2 + 2, "N:", SvPV_nolen_const( mysv ) );
a3621e74
YO
1911 regprop( mysv, noper );
1912 PerlIO_printf( Perl_debug_log, " -> %s\n",
cfd0369c 1913 SvPV_nolen_const( mysv ) );
a3621e74
YO
1914 });
1915 }
1916 } else {
1917 if ( last ) {
1918 DEBUG_OPTIMISE_r(
1919 PerlIO_printf( Perl_debug_log, "%*s%s\n",
e4584336 1920 (int)depth * 2 + 2, "E:", "**END**" );
a3621e74
YO
1921 );
1922 make_trie( pRExC_state, startbranch, first, cur, tail, optype );
1923 }
1924 if ( PL_regkind[ (U8)OP( noper ) ] == EXACT
1925 && noper_next == tail )
1926 {
1927 count = 1;
1928 first = cur;
1929 optype = OP( noper );
1930 } else {
1931 count = 0;
1932 first = NULL;
1933 optype = 0;
1934 }
1935 last = NULL;
1936 }
1937 }
1938 DEBUG_OPTIMISE_r({
1939 regprop( mysv, cur);
1940 PerlIO_printf( Perl_debug_log,
e4584336 1941 "%*s%s\t(0x%p,0x%p,0x%p)\n", (int)depth * 2 + 2,
cfd0369c 1942 " ", SvPV_nolen_const( mysv ), first, last, cur);
a3621e74
YO
1943
1944 });
1945 if ( last ) {
1946 DEBUG_OPTIMISE_r(
1947 PerlIO_printf( Perl_debug_log, "%*s%s\n",
e4584336 1948 (int)depth * 2 + 2, "E:", "==END==" );
a3621e74
YO
1949 );
1950 make_trie( pRExC_state, startbranch, first, scan, tail, optype );
1951 }
1952 }
1953 }
a0ed51b3 1954 }
a3621e74 1955 else if ( code == BRANCHJ ) { /* single branch is optimized. */
c277df42 1956 scan = NEXTOPER(NEXTOPER(scan));
a3621e74 1957 } else /* single branch is optimized. */
c277df42
IZ
1958 scan = NEXTOPER(scan);
1959 continue;
a0ed51b3
LW
1960 }
1961 else if (OP(scan) == EXACT) {
cd439c50 1962 I32 l = STR_LEN(scan);
c445ea15 1963 UV uc;
a0ed51b3 1964 if (UTF) {
a3b680e6 1965 const U8 * const s = (U8*)STRING(scan);
1aa99e6b 1966 l = utf8_length(s, s + l);
9041c2e3 1967 uc = utf8_to_uvchr(s, NULL);
c445ea15
AL
1968 } else {
1969 uc = *((U8*)STRING(scan));
a0ed51b3
LW
1970 }
1971 min += l;
c277df42 1972 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
c277df42
IZ
1973 /* The code below prefers earlier match for fixed
1974 offset, later match for variable offset. */
1975 if (data->last_end == -1) { /* Update the start info. */
1976 data->last_start_min = data->pos_min;
1977 data->last_start_max = is_inf
b81d288d 1978 ? I32_MAX : data->pos_min + data->pos_delta;
c277df42 1979 }
cd439c50 1980 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
45f47268
NC
1981 if (UTF)
1982 SvUTF8_on(data->last_found);
0eda9292 1983 {
9a957fbc 1984 SV * const sv = data->last_found;
a28509cc 1985 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
0eda9292
JH
1986 mg_find(sv, PERL_MAGIC_utf8) : NULL;
1987 if (mg && mg->mg_len >= 0)
5e43f467
JH
1988 mg->mg_len += utf8_length((U8*)STRING(scan),
1989 (U8*)STRING(scan)+STR_LEN(scan));
0eda9292 1990 }
c277df42
IZ
1991 data->last_end = data->pos_min + l;
1992 data->pos_min += l; /* As in the first entry. */
1993 data->flags &= ~SF_BEFORE_EOL;
1994 }
653099ff
GS
1995 if (flags & SCF_DO_STCLASS_AND) {
1996 /* Check whether it is compatible with what we know already! */
1997 int compat = 1;
1998
1aa99e6b 1999 if (uc >= 0x100 ||
516a5887 2000 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
1aa99e6b 2001 && !ANYOF_BITMAP_TEST(data->start_class, uc)
653099ff 2002 && (!(data->start_class->flags & ANYOF_FOLD)
1aa99e6b 2003 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
516a5887 2004 )
653099ff
GS
2005 compat = 0;
2006 ANYOF_CLASS_ZERO(data->start_class);
2007 ANYOF_BITMAP_ZERO(data->start_class);
2008 if (compat)
1aa99e6b 2009 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff 2010 data->start_class->flags &= ~ANYOF_EOS;
9b877dbb
IH
2011 if (uc < 0x100)
2012 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
653099ff
GS
2013 }
2014 else if (flags & SCF_DO_STCLASS_OR) {
2015 /* false positive possible if the class is case-folded */
1aa99e6b 2016 if (uc < 0x100)
9b877dbb
IH
2017 ANYOF_BITMAP_SET(data->start_class, uc);
2018 else
2019 data->start_class->flags |= ANYOF_UNICODE_ALL;
653099ff
GS
2020 data->start_class->flags &= ~ANYOF_EOS;
2021 cl_and(data->start_class, &and_with);
2022 }
2023 flags &= ~SCF_DO_STCLASS;
a0ed51b3 2024 }
653099ff 2025 else if (PL_regkind[(U8)OP(scan)] == EXACT) { /* But OP != EXACT! */
cd439c50 2026 I32 l = STR_LEN(scan);
1aa99e6b 2027 UV uc = *((U8*)STRING(scan));
653099ff
GS
2028
2029 /* Search for fixed substrings supports EXACT only. */
b81d288d 2030 if (flags & SCF_DO_SUBSTR)
830247a4 2031 scan_commit(pRExC_state, data);
a0ed51b3 2032 if (UTF) {
6136c704 2033 const U8 * const s = (U8 *)STRING(scan);
1aa99e6b 2034 l = utf8_length(s, s + l);
9041c2e3 2035 uc = utf8_to_uvchr(s, NULL);
a0ed51b3
LW
2036 }
2037 min += l;
c277df42 2038 if (data && (flags & SCF_DO_SUBSTR))
a0ed51b3 2039 data->pos_min += l;
653099ff
GS
2040 if (flags & SCF_DO_STCLASS_AND) {
2041 /* Check whether it is compatible with what we know already! */
2042 int compat = 1;
2043
1aa99e6b 2044 if (uc >= 0x100 ||
516a5887 2045 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
1aa99e6b 2046 && !ANYOF_BITMAP_TEST(data->start_class, uc)
516a5887 2047 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
653099ff
GS
2048 compat = 0;
2049 ANYOF_CLASS_ZERO(data->start_class);
2050 ANYOF_BITMAP_ZERO(data->start_class);
2051 if (compat) {
1aa99e6b 2052 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff
GS
2053 data->start_class->flags &= ~ANYOF_EOS;
2054 data->start_class->flags |= ANYOF_FOLD;
2055 if (OP(scan) == EXACTFL)
2056 data->start_class->flags |= ANYOF_LOCALE;
2057 }
2058 }
2059 else if (flags & SCF_DO_STCLASS_OR) {
2060 if (data->start_class->flags & ANYOF_FOLD) {
2061 /* false positive possible if the class is case-folded.
2062 Assume that the locale settings are the same... */
1aa99e6b
IH
2063 if (uc < 0x100)
2064 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff
GS
2065 data->start_class->flags &= ~ANYOF_EOS;
2066 }
2067 cl_and(data->start_class, &and_with);
2068 }
2069 flags &= ~SCF_DO_STCLASS;
a0ed51b3 2070 }
bfed75c6 2071 else if (strchr((const char*)PL_varies,OP(scan))) {
9c5ffd7c 2072 I32 mincount, maxcount, minnext, deltanext, fl = 0;
aa7a4b56 2073 I32 f = flags, pos_before = 0;
d4c19fe8 2074 regnode * const oscan = scan;
653099ff
GS
2075 struct regnode_charclass_class this_class;
2076 struct regnode_charclass_class *oclass = NULL;
727f22e3 2077 I32 next_is_eval = 0;
653099ff 2078
22c35a8c 2079 switch (PL_regkind[(U8)OP(scan)]) {
653099ff 2080 case WHILEM: /* End of (?:...)* . */
c277df42
IZ
2081 scan = NEXTOPER(scan);
2082 goto finish;
2083 case PLUS:
653099ff 2084 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
c277df42 2085 next = NEXTOPER(scan);
653099ff 2086 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
b81d288d
AB
2087 mincount = 1;
2088 maxcount = REG_INFTY;
c277df42
IZ
2089 next = regnext(scan);
2090 scan = NEXTOPER(scan);
2091 goto do_curly;
2092 }
2093 }
2094 if (flags & SCF_DO_SUBSTR)
2095 data->pos_min++;
2096 min++;
2097 /* Fall through. */
2098 case STAR:
653099ff
GS
2099 if (flags & SCF_DO_STCLASS) {
2100 mincount = 0;
b81d288d 2101 maxcount = REG_INFTY;
653099ff
GS
2102 next = regnext(scan);
2103 scan = NEXTOPER(scan);
2104 goto do_curly;
2105 }
b81d288d 2106 is_inf = is_inf_internal = 1;
c277df42
IZ
2107 scan = regnext(scan);
2108 if (flags & SCF_DO_SUBSTR) {
830247a4 2109 scan_commit(pRExC_state, data); /* Cannot extend fixed substrings */
c277df42
IZ
2110 data->longest = &(data->longest_float);
2111 }
2112 goto optimize_curly_tail;
2113 case CURLY:
b81d288d 2114 mincount = ARG1(scan);
c277df42
IZ
2115 maxcount = ARG2(scan);
2116 next = regnext(scan);
cb434fcc
IZ
2117 if (OP(scan) == CURLYX) {
2118 I32 lp = (data ? *(data->last_closep) : 0);
a3621e74 2119 scan->flags = ((lp <= U8_MAX) ? (U8)lp : U8_MAX);
cb434fcc 2120 }
c277df42 2121 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
727f22e3 2122 next_is_eval = (OP(scan) == EVAL);
c277df42
IZ
2123 do_curly:
2124 if (flags & SCF_DO_SUBSTR) {
830247a4 2125 if (mincount == 0) scan_commit(pRExC_state,data); /* Cannot extend fixed substrings */
c277df42
IZ
2126 pos_before = data->pos_min;
2127 }
2128 if (data) {
2129 fl = data->flags;
2130 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
2131 if (is_inf)
2132 data->flags |= SF_IS_INF;
2133 }
653099ff 2134 if (flags & SCF_DO_STCLASS) {
830247a4 2135 cl_init(pRExC_state, &this_class);
653099ff
GS
2136 oclass = data->start_class;
2137 data->start_class = &this_class;
2138 f |= SCF_DO_STCLASS_AND;
2139 f &= ~SCF_DO_STCLASS_OR;
2140 }
e1901655
IZ
2141 /* These are the cases when once a subexpression
2142 fails at a particular position, it cannot succeed
2143 even after backtracking at the enclosing scope.
b81d288d 2144
e1901655
IZ
2145 XXXX what if minimal match and we are at the
2146 initial run of {n,m}? */
2147 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
2148 f &= ~SCF_WHILEM_VISITED_POS;
653099ff 2149
c277df42 2150 /* This will finish on WHILEM, setting scan, or on NULL: */
b81d288d 2151 minnext = study_chunk(pRExC_state, &scan, &deltanext, last, data,
a3621e74
YO
2152 (mincount == 0
2153 ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
653099ff
GS
2154
2155 if (flags & SCF_DO_STCLASS)
2156 data->start_class = oclass;
2157 if (mincount == 0 || minnext == 0) {
2158 if (flags & SCF_DO_STCLASS_OR) {
830247a4 2159 cl_or(pRExC_state, data->start_class, &this_class);
653099ff
GS
2160 }
2161 else if (flags & SCF_DO_STCLASS_AND) {
b81d288d 2162 /* Switch to OR mode: cache the old value of
653099ff
GS
2163 * data->start_class */
2164 StructCopy(data->start_class, &and_with,
2165 struct regnode_charclass_class);
2166 flags &= ~SCF_DO_STCLASS_AND;
2167 StructCopy(&this_class, data->start_class,
2168 struct regnode_charclass_class);
2169 flags |= SCF_DO_STCLASS_OR;
2170 data->start_class->flags |= ANYOF_EOS;
2171 }
2172 } else { /* Non-zero len */
2173 if (flags & SCF_DO_STCLASS_OR) {
830247a4 2174 cl_or(pRExC_state, data->start_class, &this_class);
653099ff
GS
2175 cl_and(data->start_class, &and_with);
2176 }
2177 else if (flags & SCF_DO_STCLASS_AND)
2178 cl_and(data->start_class, &this_class);
2179 flags &= ~SCF_DO_STCLASS;
2180 }
c277df42
IZ
2181 if (!scan) /* It was not CURLYX, but CURLY. */
2182 scan = next;
041457d9
DM
2183 if ( /* ? quantifier ok, except for (?{ ... }) */
2184 (next_is_eval || !(mincount == 0 && maxcount == 1))
84037bb0 2185 && (minnext == 0) && (deltanext == 0)
99799961 2186 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
041457d9
DM
2187 && maxcount <= REG_INFTY/3 /* Complement check for big count */
2188 && ckWARN(WARN_REGEXP))
b45f050a 2189 {
830247a4 2190 vWARN(RExC_parse,
b45f050a
JF
2191 "Quantifier unexpected on zero-length expression");
2192 }
2193
c277df42 2194 min += minnext * mincount;
b81d288d 2195 is_inf_internal |= ((maxcount == REG_INFTY
155aba94
GS
2196 && (minnext + deltanext) > 0)
2197 || deltanext == I32_MAX);
aca2d497 2198 is_inf |= is_inf_internal;
c277df42
IZ
2199 delta += (minnext + deltanext) * maxcount - minnext * mincount;
2200
2201 /* Try powerful optimization CURLYX => CURLYN. */
b81d288d 2202 if ( OP(oscan) == CURLYX && data
c277df42
IZ
2203 && data->flags & SF_IN_PAR
2204 && !(data->flags & SF_HAS_EVAL)
2205 && !deltanext && minnext == 1 ) {
2206 /* Try to optimize to CURLYN. */
2207 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
d4c19fe8 2208 regnode * const nxt1 = nxt;
497b47a8
JH
2209#ifdef DEBUGGING
2210 regnode *nxt2;
2211#endif
c277df42
IZ
2212
2213 /* Skip open. */
2214 nxt = regnext(nxt);
bfed75c6 2215 if (!strchr((const char*)PL_simple,OP(nxt))
22c35a8c 2216 && !(PL_regkind[(U8)OP(nxt)] == EXACT
b81d288d 2217 && STR_LEN(nxt) == 1))
c277df42 2218 goto nogo;
497b47a8 2219#ifdef DEBUGGING
c277df42 2220 nxt2 = nxt;
497b47a8 2221#endif
c277df42 2222 nxt = regnext(nxt);
b81d288d 2223 if (OP(nxt) != CLOSE)
c277df42
IZ
2224 goto nogo;
2225 /* Now we know that nxt2 is the only contents: */
eb160463 2226 oscan->flags = (U8)ARG(nxt);
c277df42
IZ
2227 OP(oscan) = CURLYN;
2228 OP(nxt1) = NOTHING; /* was OPEN. */
2229#ifdef DEBUGGING
2230 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
2231 NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
2232 NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
2233 OP(nxt) = OPTIMIZED; /* was CLOSE. */
2234 OP(nxt + 1) = OPTIMIZED; /* was count. */
2235 NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
b81d288d 2236#endif
c277df42 2237 }
c277df42
IZ
2238 nogo:
2239
2240 /* Try optimization CURLYX => CURLYM. */
b81d288d 2241 if ( OP(oscan) == CURLYX && data
c277df42 2242 && !(data->flags & SF_HAS_PAR)
c277df42 2243 && !(data->flags & SF_HAS_EVAL)
0e788c72
HS
2244 && !deltanext /* atom is fixed width */
2245 && minnext != 0 /* CURLYM can't handle zero width */
2246 ) {
c277df42
IZ
2247 /* XXXX How to optimize if data == 0? */
2248 /* Optimize to a simpler form. */
2249 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
2250 regnode *nxt2;
2251
2252 OP(oscan) = CURLYM;
2253 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
b81d288d 2254 && (OP(nxt2) != WHILEM))
c277df42
IZ
2255 nxt = nxt2;
2256 OP(nxt2) = SUCCEED; /* Whas WHILEM */
c277df42
IZ
2257 /* Need to optimize away parenths. */
2258 if (data->flags & SF_IN_PAR) {
2259 /* Set the parenth number. */
2260 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
2261
b81d288d 2262 if (OP(nxt) != CLOSE)
b45f050a 2263 FAIL("Panic opt close");
eb160463 2264 oscan->flags = (U8)ARG(nxt);
c277df42
IZ
2265 OP(nxt1) = OPTIMIZED; /* was OPEN. */
2266 OP(nxt) = OPTIMIZED; /* was CLOSE. */
2267#ifdef DEBUGGING
2268 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
2269 OP(nxt + 1) = OPTIMIZED; /* was count. */
2270 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
2271 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
b81d288d 2272#endif
c277df42
IZ
2273#if 0
2274 while ( nxt1 && (OP(nxt1) != WHILEM)) {
2275 regnode *nnxt = regnext(nxt1);
b81d288d 2276
c277df42
IZ
2277 if (nnxt == nxt) {
2278 if (reg_off_by_arg[OP(nxt1)])
2279 ARG_SET(nxt1, nxt2 - nxt1);
2280 else if (nxt2 - nxt1 < U16_MAX)
2281 NEXT_OFF(nxt1) = nxt2 - nxt1;
2282 else
2283 OP(nxt) = NOTHING; /* Cannot beautify */
2284 }
2285 nxt1 = nnxt;
2286 }
2287#endif
2288 /* Optimize again: */
b81d288d 2289 study_chunk(pRExC_state, &nxt1, &deltanext, nxt,
a3621e74 2290 NULL, 0,depth+1);
a0ed51b3
LW
2291 }
2292 else
c277df42 2293 oscan->flags = 0;
c277df42 2294 }
e1901655
IZ
2295 else if ((OP(oscan) == CURLYX)
2296 && (flags & SCF_WHILEM_VISITED_POS)
2297 /* See the comment on a similar expression above.
2298 However, this time it not a subexpression
2299 we care about, but the expression itself. */
2300 && (maxcount == REG_INFTY)
2301 && data && ++data->whilem_c < 16) {
2302 /* This stays as CURLYX, we can put the count/of pair. */
2c2d71f5
JH
2303 /* Find WHILEM (as in regexec.c) */
2304 regnode *nxt = oscan + NEXT_OFF(oscan);
2305
2306 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
2307 nxt += ARG(nxt);
eb160463
GS
2308 PREVOPER(nxt)->flags = (U8)(data->whilem_c
2309 | (RExC_whilem_seen << 4)); /* On WHILEM */
2c2d71f5 2310 }
b81d288d 2311 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
c277df42
IZ
2312 pars++;
2313 if (flags & SCF_DO_SUBSTR) {
c445ea15 2314 SV *last_str = NULL;
c277df42
IZ
2315 int counted = mincount != 0;
2316
2317 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
5d1c421c
JH
2318#if defined(SPARC64_GCC_WORKAROUND)
2319 I32 b = 0;
2320 STRLEN l = 0;
cfd0369c 2321 const char *s = NULL;
5d1c421c
JH
2322 I32 old = 0;
2323
2324 if (pos_before >= data->last_start_min)
2325 b = pos_before;
2326 else
2327 b = data->last_start_min;
2328
2329 l = 0;
cfd0369c 2330 s = SvPV_const(data->last_found, l);
5d1c421c
JH
2331 old = b - data->last_start_min;
2332
2333#else
b81d288d 2334 I32 b = pos_before >= data->last_start_min
c277df42
IZ
2335 ? pos_before : data->last_start_min;
2336 STRLEN l;
d4c19fe8 2337 const char * const s = SvPV_const(data->last_found, l);
a0ed51b3 2338 I32 old = b - data->last_start_min;
5d1c421c 2339#endif
a0ed51b3
LW
2340
2341 if (UTF)
2342 old = utf8_hop((U8*)s, old) - (U8*)s;
c277df42 2343
a0ed51b3 2344 l -= old;
c277df42 2345 /* Get the added string: */
79cb57f6 2346 last_str = newSVpvn(s + old, l);
0e933229
IH
2347 if (UTF)
2348 SvUTF8_on(last_str);
c277df42
IZ
2349 if (deltanext == 0 && pos_before == b) {
2350 /* What was added is a constant string */
2351 if (mincount > 1) {
2352 SvGROW(last_str, (mincount * l) + 1);
b81d288d 2353 repeatcpy(SvPVX(last_str) + l,
3f7c398e 2354 SvPVX_const(last_str), l, mincount - 1);
b162af07 2355 SvCUR_set(last_str, SvCUR(last_str) * mincount);
c277df42 2356 /* Add additional parts. */
b81d288d 2357 SvCUR_set(data->last_found,
c277df42
IZ
2358 SvCUR(data->last_found) - l);
2359 sv_catsv(data->last_found, last_str);
0eda9292
JH
2360 {
2361 SV * sv = data->last_found;
2362 MAGIC *mg =
2363 SvUTF8(sv) && SvMAGICAL(sv) ?
2364 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2365 if (mg && mg->mg_len >= 0)
2366 mg->mg_len += CHR_SVLEN(last_str);
2367 }
c277df42
IZ
2368 data->last_end += l * (mincount - 1);
2369 }
2a8d9689
HS
2370 } else {
2371 /* start offset must point into the last copy */
2372 data->last_start_min += minnext * (mincount - 1);
c152dc43
HS
2373 data->last_start_max += is_inf ? I32_MAX
2374 : (maxcount - 1) * (minnext + data->pos_delta);
c277df42
IZ
2375 }
2376 }
2377 /* It is counted once already... */
2378 data->pos_min += minnext * (mincount - counted);
2379 data->pos_delta += - counted * deltanext +
2380 (minnext + deltanext) * maxcount - minnext * mincount;
2381 if (mincount != maxcount) {
653099ff
GS
2382 /* Cannot extend fixed substrings found inside
2383 the group. */
830247a4 2384 scan_commit(pRExC_state,data);
c277df42 2385 if (mincount && last_str) {
d4c19fe8
AL
2386 SV * const sv = data->last_found;
2387 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
45f47268
NC
2388 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2389
2390 if (mg)
2391 mg->mg_len = -1;
2392 sv_setsv(sv, last_str);
c277df42 2393 data->last_end = data->pos_min;
b81d288d 2394 data->last_start_min =
a0ed51b3 2395 data->pos_min - CHR_SVLEN(last_str);
b81d288d
AB
2396 data->last_start_max = is_inf
2397 ? I32_MAX
c277df42 2398 : data->pos_min + data->pos_delta
a0ed51b3 2399 - CHR_SVLEN(last_str);
c277df42
IZ
2400 }
2401 data->longest = &(data->longest_float);
2402 }
aca2d497 2403 SvREFCNT_dec(last_str);
c277df42 2404 }
405ff068 2405 if (data && (fl & SF_HAS_EVAL))
c277df42
IZ
2406 data->flags |= SF_HAS_EVAL;
2407 optimize_curly_tail:
c277df42 2408 if (OP(oscan) != CURLYX) {
22c35a8c 2409 while (PL_regkind[(U8)OP(next = regnext(oscan))] == NOTHING
c277df42
IZ
2410 && NEXT_OFF(next))
2411 NEXT_OFF(oscan) += NEXT_OFF(next);
2412 }
c277df42 2413 continue;
653099ff 2414 default: /* REF and CLUMP only? */
c277df42 2415 if (flags & SCF_DO_SUBSTR) {
830247a4 2416 scan_commit(pRExC_state,data); /* Cannot expect anything... */
c277df42
IZ
2417 data->longest = &(data->longest_float);
2418 }
aca2d497 2419 is_inf = is_inf_internal = 1;
653099ff 2420 if (flags & SCF_DO_STCLASS_OR)
830247a4 2421 cl_anything(pRExC_state, data->start_class);
653099ff 2422 flags &= ~SCF_DO_STCLASS;
c277df42
IZ
2423 break;
2424 }
a0ed51b3 2425 }
bfed75c6 2426 else if (strchr((const char*)PL_simple,OP(scan))) {
9c5ffd7c 2427 int value = 0;
653099ff 2428
c277df42 2429 if (flags & SCF_DO_SUBSTR) {
830247a4 2430 scan_commit(pRExC_state,data);
c277df42
IZ
2431 data->pos_min++;
2432 }
2433 min++;
653099ff
GS
2434 if (flags & SCF_DO_STCLASS) {
2435 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
2436
2437 /* Some of the logic below assumes that switching
2438 locale on will only add false positives. */
2439 switch (PL_regkind[(U8)OP(scan)]) {
653099ff 2440 case SANY:
653099ff
GS
2441 default:
2442 do_default:
2443 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
2444 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
830247a4 2445 cl_anything(pRExC_state, data->start_class);
653099ff
GS
2446 break;
2447 case REG_ANY:
2448 if (OP(scan) == SANY)
2449 goto do_default;
2450 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
2451 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
2452 || (data->start_class->flags & ANYOF_CLASS));
830247a4 2453 cl_anything(pRExC_state, data->start_class);
653099ff
GS
2454 }
2455 if (flags & SCF_DO_STCLASS_AND || !value)
2456 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
2457 break;
2458 case ANYOF:
2459 if (flags & SCF_DO_STCLASS_AND)
2460 cl_and(data->start_class,
2461 (struct regnode_charclass_class*)scan);
2462 else
830247a4 2463 cl_or(pRExC_state, data->start_class,
653099ff
GS
2464 (struct regnode_charclass_class*)scan);
2465 break;
2466 case ALNUM:
2467 if (flags & SCF_DO_STCLASS_AND) {
2468 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2469 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
2470 for (value = 0; value < 256; value++)
2471 if (!isALNUM(value))
2472 ANYOF_BITMAP_CLEAR(data->start_class, value);
2473 }
2474 }
2475 else {
2476 if (data->start_class->flags & ANYOF_LOCALE)
2477 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
2478 else {
2479 for (value = 0; value < 256; value++)
2480 if (isALNUM(value))
b81d288d 2481 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
2482 }
2483 }
2484 break;
2485 case ALNUML:
2486 if (flags & SCF_DO_STCLASS_AND) {
2487 if (data->start_class->flags & ANYOF_LOCALE)
2488 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
2489 }
2490 else {
2491 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
2492 data->start_class->flags |= ANYOF_LOCALE;
2493 }
2494 break;
2495 case NALNUM:
2496 if (flags & SCF_DO_STCLASS_AND) {
2497 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2498 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
2499 for (value = 0; value < 256; value++)
2500 if (isALNUM(value))
2501 ANYOF_BITMAP_CLEAR(data->start_class, value);
2502 }
2503 }
2504 else {
2505 if (data->start_class->flags & ANYOF_LOCALE)
2506 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
2507 else {
2508 for (value = 0; value < 256; value++)
2509 if (!isALNUM(value))
b81d288d 2510 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
2511 }
2512 }
2513 break;
2514 case NALNUML:
2515 if (flags & SCF_DO_STCLASS_AND) {
2516 if (data->start_class->flags & ANYOF_LOCALE)
2517 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
2518 }
2519 else {
2520 data->start_class->flags |= ANYOF_LOCALE;
2521 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
2522 }
2523 break;
2524 case SPACE:
2525 if (flags & SCF_DO_STCLASS_AND) {
2526 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2527 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
2528 for (value = 0; value < 256; value++)
2529 if (!isSPACE(value))
2530 ANYOF_BITMAP_CLEAR(data->start_class, value);
2531 }
2532 }
2533 else {
2534 if (data->start_class->flags & ANYOF_LOCALE)
2535 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
2536 else {
2537 for (value = 0; value < 256; value++)
2538 if (isSPACE(value))
b81d288d 2539 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
2540 }
2541 }
2542 break;
2543 case SPACEL:
2544 if (flags & SCF_DO_STCLASS_AND) {
2545 if (data->start_class->flags & ANYOF_LOCALE)
2546 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
2547 }
2548 else {
2549 data->start_class->flags |= ANYOF_LOCALE;
2550 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
2551 }
2552 break;
2553 case NSPACE:
2554 if (flags & SCF_DO_STCLASS_AND) {
2555 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2556 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
2557 for (value = 0; value < 256; value++)
2558 if (isSPACE(value))
2559 ANYOF_BITMAP_CLEAR(data->start_class, value);
2560 }
2561 }
2562 else {
2563 if (data->start_class->flags & ANYOF_LOCALE)
2564 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
2565 else {
2566 for (value = 0; value < 256; value++)
2567 if (!isSPACE(value))
b81d288d 2568 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
2569 }
2570 }
2571 break;
2572 case NSPACEL:
2573 if (flags & SCF_DO_STCLASS_AND) {
2574 if (data->start_class->flags & ANYOF_LOCALE) {
2575 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
2576 for (value = 0; value < 256; value++)
2577 if (!isSPACE(value))
2578 ANYOF_BITMAP_CLEAR(data->start_class, value);
2579 }
2580 }
2581 else {
2582 data->start_class->flags |= ANYOF_LOCALE;
2583 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
2584 }
2585 break;
2586 case DIGIT:
2587 if (flags & SCF_DO_STCLASS_AND) {
2588 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
2589 for (value = 0; value < 256; value++)
2590 if (!isDIGIT(value))
2591 ANYOF_BITMAP_CLEAR(data->start_class, value);
2592 }
2593 else {
2594 if (data->start_class->flags & ANYOF_LOCALE)
2595 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
2596 else {
2597 for (value = 0; value < 256; value++)
2598 if (isDIGIT(value))
b81d288d 2599 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
2600 }
2601 }
2602 break;
2603 case NDIGIT:
2604 if (flags & SCF_DO_STCLASS_AND) {
2605 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
2606 for (value = 0; value < 256; value++)
2607 if (isDIGIT(value))
2608 ANYOF_BITMAP_CLEAR(data->start_class, value);
2609 }
2610 else {
2611 if (data->start_class->flags & ANYOF_LOCALE)
2612 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
2613 else {
2614 for (value = 0; value < 256; value++)
2615 if (!isDIGIT(value))
b81d288d 2616 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
2617 }
2618 }
2619 break;
2620 }
2621 if (flags & SCF_DO_STCLASS_OR)
2622 cl_and(data->start_class, &and_with);
2623 flags &= ~SCF_DO_STCLASS;
2624 }
a0ed51b3 2625 }
22c35a8c 2626 else if (PL_regkind[(U8)OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
c277df42
IZ
2627 data->flags |= (OP(scan) == MEOL
2628 ? SF_BEFORE_MEOL
2629 : SF_BEFORE_SEOL);
a0ed51b3 2630 }
653099ff
GS
2631 else if ( PL_regkind[(U8)OP(scan)] == BRANCHJ
2632 /* Lookbehind, or need to calculate parens/evals/stclass: */
2633 && (scan->flags || data || (flags & SCF_DO_STCLASS))
c277df42 2634 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
653099ff 2635 /* Lookahead/lookbehind */
cb434fcc 2636 I32 deltanext, minnext, fake = 0;
c277df42 2637 regnode *nscan;
653099ff
GS
2638 struct regnode_charclass_class intrnl;
2639 int f = 0;
c277df42
IZ
2640
2641 data_fake.flags = 0;
b81d288d 2642 if (data) {
2c2d71f5 2643 data_fake.whilem_c = data->whilem_c;
cb434fcc
IZ
2644 data_fake.last_closep = data->last_closep;
2645 }
2646 else
2647 data_fake.last_closep = &fake;
653099ff
GS
2648 if ( flags & SCF_DO_STCLASS && !scan->flags
2649 && OP(scan) == IFMATCH ) { /* Lookahead */
830247a4 2650 cl_init(pRExC_state, &intrnl);
653099ff 2651 data_fake.start_class = &intrnl;
e1901655 2652 f |= SCF_DO_STCLASS_AND;
653099ff 2653 }
e1901655
IZ
2654 if (flags & SCF_WHILEM_VISITED_POS)
2655 f |= SCF_WHILEM_VISITED_POS;
c277df42
IZ
2656 next = regnext(scan);
2657 nscan = NEXTOPER(NEXTOPER(scan));
a3621e74 2658 minnext = study_chunk(pRExC_state, &nscan, &deltanext, last, &data_fake, f,depth+1);
c277df42
IZ
2659 if (scan->flags) {
2660 if (deltanext) {
9baa0206 2661 vFAIL("Variable length lookbehind not implemented");
a0ed51b3
LW
2662 }
2663 else if (minnext > U8_MAX) {
9baa0206 2664 vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
c277df42 2665 }
eb160463 2666 scan->flags = (U8)minnext;
c277df42
IZ
2667 }
2668 if (data && data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
2669 pars++;
405ff068 2670 if (data && (data_fake.flags & SF_HAS_EVAL))
c277df42 2671 data->flags |= SF_HAS_EVAL;
2c2d71f5
JH
2672 if (data)
2673 data->whilem_c = data_fake.whilem_c;
e1901655 2674 if (f & SCF_DO_STCLASS_AND) {
a28509cc 2675 const int was = (data->start_class->flags & ANYOF_EOS);
653099ff
GS
2676
2677 cl_and(data->start_class, &intrnl);
2678 if (was)
2679 data->start_class->flags |= ANYOF_EOS;
2680 }
a0ed51b3
LW
2681 }
2682 else if (OP(scan) == OPEN) {
c277df42 2683 pars++;
a0ed51b3 2684 }
cb434fcc 2685 else if (OP(scan) == CLOSE) {
eb160463 2686 if ((I32)ARG(scan) == is_par) {
cb434fcc 2687 next = regnext(scan);
c277df42 2688
cb434fcc
IZ
2689 if ( next && (OP(next) != WHILEM) && next < last)
2690 is_par = 0; /* Disable optimization */
2691 }
2692 if (data)
2693 *(data->last_closep) = ARG(scan);
a0ed51b3
LW
2694 }
2695 else if (OP(scan) == EVAL) {
c277df42
IZ
2696 if (data)
2697 data->flags |= SF_HAS_EVAL;
2698 }
96776eda 2699 else if (OP(scan) == LOGICAL && scan->flags == 2) { /* Embedded follows */
0f5d15d6 2700 if (flags & SCF_DO_SUBSTR) {
830247a4 2701 scan_commit(pRExC_state,data);
0f5d15d6
IZ
2702 data->longest = &(data->longest_float);
2703 }
2704 is_inf = is_inf_internal = 1;
653099ff 2705 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
830247a4 2706 cl_anything(pRExC_state, data->start_class);
96776eda 2707 flags &= ~SCF_DO_STCLASS;
0f5d15d6 2708 }
c277df42
IZ
2709 /* Else: zero-length, ignore. */
2710 scan = regnext(scan);
2711 }
2712
2713 finish:
2714 *scanp = scan;
aca2d497 2715 *deltap = is_inf_internal ? I32_MAX : delta;
b81d288d 2716 if (flags & SCF_DO_SUBSTR && is_inf)
c277df42
IZ
2717 data->pos_delta = I32_MAX - data->pos_min;
2718 if (is_par > U8_MAX)
2719 is_par = 0;
2720 if (is_par && pars==1 && data) {
2721 data->flags |= SF_IN_PAR;
2722 data->flags &= ~SF_HAS_PAR;
a0ed51b3
LW
2723 }
2724 else if (pars && data) {
c277df42
IZ
2725 data->flags |= SF_HAS_PAR;
2726 data->flags &= ~SF_IN_PAR;
2727 }
653099ff
GS
2728 if (flags & SCF_DO_STCLASS_OR)
2729 cl_and(data->start_class, &and_with);
c277df42
IZ
2730 return min;
2731}
2732
76e3520e 2733STATIC I32
5f66b61c 2734S_add_data(RExC_state_t *pRExC_state, I32 n, const char *s)
c277df42 2735{
830247a4 2736 if (RExC_rx->data) {
b81d288d
AB
2737 Renewc(RExC_rx->data,
2738 sizeof(*RExC_rx->data) + sizeof(void*) * (RExC_rx->data->count + n - 1),
c277df42 2739 char, struct reg_data);
830247a4
IZ
2740 Renew(RExC_rx->data->what, RExC_rx->data->count + n, U8);
2741 RExC_rx->data->count += n;
a0ed51b3
LW
2742 }
2743 else {
a02a5408 2744 Newxc(RExC_rx->data, sizeof(*RExC_rx->data) + sizeof(void*) * (n - 1),
c277df42 2745 char, struct reg_data);
a02a5408 2746 Newx(RExC_rx->data->what, n, U8);
830247a4 2747 RExC_rx->data->count = n;
c277df42 2748 }
830247a4
IZ
2749 Copy(s, RExC_rx->data->what + RExC_rx->data->count - n, n, U8);
2750 return RExC_rx->data->count - n;
c277df42
IZ
2751}
2752
d88dccdf 2753void
864dbfa3 2754Perl_reginitcolors(pTHX)
d88dccdf 2755{
97aff369 2756 dVAR;
1df70142 2757 const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
d88dccdf 2758 if (s) {
1df70142
AL
2759 char *t = savepv(s);
2760 int i = 0;
2761 PL_colors[0] = t;
d88dccdf 2762 while (++i < 6) {
1df70142
AL
2763 t = strchr(t, '\t');
2764 if (t) {
2765 *t = '\0';
2766 PL_colors[i] = ++t;
d88dccdf
IZ
2767 }
2768 else
1df70142 2769 PL_colors[i] = t = (char *)"";
d88dccdf
IZ
2770 }
2771 } else {
1df70142 2772 int i = 0;
b81d288d 2773 while (i < 6)
06b5626a 2774 PL_colors[i++] = (char *)"";
d88dccdf
IZ
2775 }
2776 PL_colorset = 1;
2777}
2778
8615cb43 2779
a687059c 2780/*
e50aee73 2781 - pregcomp - compile a regular expression into internal code
a687059c
LW
2782 *
2783 * We can't allocate space until we know how big the compiled form will be,
2784 * but we can't compile it (and thus know how big it is) until we've got a
2785 * place to put the code. So we cheat: we compile it twice, once with code
2786 * generation turned off and size counting turned on, and once "for real".
2787 * This also means that we don't allocate space until we are sure that the
2788 * thing really will compile successfully, and we never have to move the
2789 * code and thus invalidate pointers into it. (Note that it has to be in
2790 * one piece because free() must be able to free it all.) [NB: not true in perl]
2791 *
2792 * Beware that the optimization-preparation code in here knows about some
2793 * of the structure of the compiled regexp. [I'll say.]
2794 */
2795regexp *
864dbfa3 2796Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
a687059c 2797{
97aff369 2798 dVAR;
a0d0e21e 2799 register regexp *r;
c277df42 2800 regnode *scan;
c277df42 2801 regnode *first;
a0d0e21e 2802 I32 flags;
a0d0e21e
LW
2803 I32 minlen = 0;
2804 I32 sawplus = 0;
2805 I32 sawopen = 0;
2c2d71f5 2806 scan_data_t data;
830247a4
IZ
2807 RExC_state_t RExC_state;
2808 RExC_state_t *pRExC_state = &RExC_state;
a0d0e21e 2809
a3621e74
YO
2810 GET_RE_DEBUG_FLAGS_DECL;
2811
a0d0e21e 2812 if (exp == NULL)
c277df42 2813 FAIL("NULL regexp argument");
a0d0e21e 2814
a5961de5 2815 RExC_utf8 = pm->op_pmdynflags & PMdf_CMP_UTF8;
a0ed51b3 2816
5cfc7842 2817 RExC_precomp = exp;
a3621e74
YO
2818 DEBUG_r(if (!PL_colorset) reginitcolors());
2819 DEBUG_COMPILE_r({
2820 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s \"%s%*s%s\"\n",
a5961de5
JH
2821 PL_colors[4],PL_colors[5],PL_colors[0],
2822 (int)(xend - exp), RExC_precomp, PL_colors[1]);
2823 });
e2509266 2824 RExC_flags = pm->op_pmflags;
830247a4 2825 RExC_sawback = 0;
bbce6d69 2826
830247a4
IZ
2827 RExC_seen = 0;
2828 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
2829 RExC_seen_evals = 0;
2830 RExC_extralen = 0;
c277df42 2831
bbce6d69 2832 /* First pass: determine size, legality. */
830247a4 2833 RExC_parse = exp;
fac92740 2834 RExC_start = exp;
830247a4
IZ
2835 RExC_end = xend;
2836 RExC_naughty = 0;
2837 RExC_npar = 1;
2838 RExC_size = 0L;
2839 RExC_emit = &PL_regdummy;
2840 RExC_whilem_seen = 0;
85ddcde9
JH
2841#if 0 /* REGC() is (currently) a NOP at the first pass.
2842 * Clever compilers notice this and complain. --jhi */
830247a4 2843 REGC((U8)REG_MAGIC, (char*)RExC_emit);
85ddcde9 2844#endif
830247a4 2845 if (reg(pRExC_state, 0, &flags) == NULL) {
c445ea15 2846 RExC_precomp = NULL;
a0d0e21e
LW
2847 return(NULL);
2848 }
a3621e74 2849 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "size %"IVdf" ", (IV)RExC_size));
c277df42 2850
c277df42
IZ
2851 /* Small enough for pointer-storage convention?
2852 If extralen==0, this means that we will not need long jumps. */
830247a4
IZ
2853 if (RExC_size >= 0x10000L && RExC_extralen)
2854 RExC_size += RExC_extralen;
c277df42 2855 else
830247a4
IZ
2856 RExC_extralen = 0;
2857 if (RExC_whilem_seen > 15)
2858 RExC_whilem_seen = 15;
a0d0e21e 2859
bbce6d69 2860 /* Allocate space and initialize. */
a02a5408 2861 Newxc(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode),
c277df42 2862 char, regexp);
a0d0e21e 2863 if (r == NULL)
b45f050a
JF
2864 FAIL("Regexp out of space");
2865
0f79a09d
GS
2866#ifdef DEBUGGING
2867 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
830247a4 2868 Zero(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode), char);
0f79a09d 2869#endif
c277df42 2870 r->refcnt = 1;
bbce6d69 2871 r->prelen = xend - exp;
5cfc7842 2872 r->precomp = savepvn(RExC_precomp, r->prelen);
cf93c79d 2873 r->subbeg = NULL;
f8c7b90f 2874#ifdef PERL_OLD_COPY_ON_WRITE
c445ea15 2875 r->saved_copy = NULL;
ed252734 2876#endif
cf93c79d 2877 r->reganch = pm->op_pmflags & PMf_COMPILETIME;
830247a4 2878 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
38d1b06f 2879 r->lastparen = 0; /* mg.c reads this. */
4327152a
IZ
2880
2881 r->substrs = 0; /* Useful during FAIL. */
2882 r->startp = 0; /* Useful during FAIL. */
2883 r->endp = 0; /* Useful during FAIL. */
2884
a02a5408 2885 Newxz(r->offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
fac92740 2886 if (r->offsets) {
2af232bd 2887 r->offsets[0] = RExC_size;
fac92740 2888 }
a3621e74 2889 DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
2af232bd
SS
2890 "%s %"UVuf" bytes for offset annotations.\n",
2891 r->offsets ? "Got" : "Couldn't get",
392fbf5d 2892 (UV)((2*RExC_size+1) * sizeof(U32))));
fac92740 2893
830247a4 2894 RExC_rx = r;
bbce6d69 2895
2896 /* Second pass: emit code. */
e2509266 2897 RExC_flags = pm->op_pmflags; /* don't let top level (?i) bleed */
830247a4
IZ
2898 RExC_parse = exp;
2899 RExC_end = xend;
2900 RExC_naughty = 0;
2901 RExC_npar = 1;
fac92740 2902 RExC_emit_start = r->program;
830247a4 2903 RExC_emit = r->program;
2cd61cdb 2904 /* Store the count of eval-groups for security checks: */
eb160463 2905 RExC_emit->next_off = (U16)((RExC_seen_evals > U16_MAX) ? U16_MAX : RExC_seen_evals);
830247a4 2906 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
c277df42 2907 r->data = 0;
830247a4 2908 if (reg(pRExC_state, 0, &flags) == NULL)
a0d0e21e
LW
2909 return(NULL);
2910
a3621e74 2911
a0d0e21e 2912 /* Dig out information for optimizations. */
cf93c79d 2913 r->reganch = pm->op_pmflags & PMf_COMPILETIME; /* Again? */
e2509266 2914 pm->op_pmflags = RExC_flags;
a0ed51b3 2915 if (UTF)
5ff6fc6d 2916 r->reganch |= ROPT_UTF8; /* Unicode in it? */
c277df42 2917 r->regstclass = NULL;
830247a4 2918 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
a0ed51b3 2919 r->reganch |= ROPT_NAUGHTY;
c277df42 2920 scan = r->program + 1; /* First BRANCH. */
2779dcf1
IZ
2921
2922 /* XXXX To minimize changes to RE engine we always allocate
2923 3-units-long substrs field. */
a02a5408 2924 Newxz(r->substrs, 1, struct reg_substr_data);
2779dcf1 2925
2c2d71f5 2926 StructCopy(&zero_scan_data, &data, scan_data_t);
653099ff 2927 /* XXXX Should not we check for something else? Usually it is OPEN1... */
c277df42 2928 if (OP(scan) != BRANCH) { /* Only one top-level choice. */
c277df42 2929 I32 fake;
c5254dd6 2930 STRLEN longest_float_length, longest_fixed_length;
653099ff
GS
2931 struct regnode_charclass_class ch_class;
2932 int stclass_flag;
cb434fcc 2933 I32 last_close = 0;
a0d0e21e
LW
2934
2935 first = scan;
c277df42 2936 /* Skip introductions and multiplicators >= 1. */
a0d0e21e 2937 while ((OP(first) == OPEN && (sawopen = 1)) ||
653099ff 2938 /* An OR of *one* alternative - should not happen now. */
a0d0e21e
LW
2939 (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) ||
2940 (OP(first) == PLUS) ||
2941 (OP(first) == MINMOD) ||
653099ff 2942 /* An {n,m} with n>0 */
22c35a8c 2943 (PL_regkind[(U8)OP(first)] == CURLY && ARG1(first) > 0) ) {
a0d0e21e
LW
2944 if (OP(first) == PLUS)
2945 sawplus = 1;
2946 else
2947 first += regarglen[(U8)OP(first)];
2948 first = NEXTOPER(first);
a687059c
LW
2949 }
2950
a0d0e21e
LW
2951 /* Starting-point info. */
2952 again:
653099ff 2953 if (PL_regkind[(U8)OP(first)] == EXACT) {
1aa99e6b 2954 if (OP(first) == EXACT)
bb263b4e 2955 /*EMPTY*/; /* Empty, get anchored substr later. */
1aa99e6b 2956 else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
b3c9acc1
IZ
2957 r->regstclass = first;
2958 }
bfed75c6 2959 else if (strchr((const char*)PL_simple,OP(first)))
a0d0e21e 2960 r->regstclass = first;
22c35a8c
GS
2961 else if (PL_regkind[(U8)OP(first)] == BOUND ||
2962 PL_regkind[(U8)OP(first)] == NBOUND)
a0d0e21e 2963 r->regstclass = first;
22c35a8c 2964 else if (PL_regkind[(U8)OP(first)] == BOL) {
cad2e5aa
JH
2965 r->reganch |= (OP(first) == MBOL
2966 ? ROPT_ANCH_MBOL
2967 : (OP(first) == SBOL
2968 ? ROPT_ANCH_SBOL
2969 : ROPT_ANCH_BOL));
a0d0e21e 2970 first = NEXTOPER(first);
774d564b 2971 goto again;
2972 }
2973 else if (OP(first) == GPOS) {
2974 r->reganch |= ROPT_ANCH_GPOS;
2975 first = NEXTOPER(first);
2976 goto again;
a0d0e21e 2977 }
e09294f4 2978 else if (!sawopen && (OP(first) == STAR &&
22c35a8c 2979 PL_regkind[(U8)OP(NEXTOPER(first))] == REG_ANY) &&
a0d0e21e
LW
2980 !(r->reganch & ROPT_ANCH) )
2981 {
2982 /* turn .* into ^.* with an implied $*=1 */
1df70142
AL
2983 const int type =
2984 (OP(NEXTOPER(first)) == REG_ANY)
2985 ? ROPT_ANCH_MBOL
2986 : ROPT_ANCH_SBOL;
cad2e5aa 2987 r->reganch |= type | ROPT_IMPLICIT;
a0d0e21e 2988 first = NEXTOPER(first);
774d564b 2989 goto again;
a0d0e21e 2990 }
b81d288d 2991 if (sawplus && (!sawopen || !RExC_sawback)
830247a4 2992 && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
cad2e5aa
JH
2993 /* x+ must match at the 1st pos of run of x's */
2994 r->reganch |= ROPT_SKIP;
a0d0e21e 2995
c277df42 2996 /* Scan is after the zeroth branch, first is atomic matcher. */
a3621e74 2997 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
b900a521 2998 (IV)(first - scan + 1)));
a0d0e21e
LW
2999 /*
3000 * If there's something expensive in the r.e., find the
3001 * longest literal string that must appear and make it the
3002 * regmust. Resolve ties in favor of later strings, since
3003 * the regstart check works with the beginning of the r.e.
3004 * and avoiding duplication strengthens checking. Not a
3005 * strong reason, but sufficient in the absence of others.
3006 * [Now we resolve ties in favor of the earlier string if
c277df42 3007 * it happens that c_offset_min has been invalidated, since the
a0d0e21e
LW
3008 * earlier string may buy us something the later one won't.]
3009 */
a0d0e21e 3010 minlen = 0;
a687059c 3011
396482e1
GA
3012 data.longest_fixed = newSVpvs("");
3013 data.longest_float = newSVpvs("");
3014 data.last_found = newSVpvs("");
c277df42
IZ
3015 data.longest = &(data.longest_fixed);
3016 first = scan;
653099ff 3017 if (!r->regstclass) {
830247a4 3018 cl_init(pRExC_state, &ch_class);
653099ff
GS
3019 data.start_class = &ch_class;
3020 stclass_flag = SCF_DO_STCLASS_AND;
3021 } else /* XXXX Check for BOUND? */
3022 stclass_flag = 0;
cb434fcc 3023 data.last_closep = &last_close;
653099ff 3024
830247a4 3025 minlen = study_chunk(pRExC_state, &first, &fake, scan + RExC_size, /* Up to end */
a3621e74 3026 &data, SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
830247a4 3027 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
b81d288d 3028 && data.last_start_min == 0 && data.last_end > 0
830247a4
IZ
3029 && !RExC_seen_zerolen
3030 && (!(RExC_seen & REG_SEEN_GPOS) || (r->reganch & ROPT_ANCH_GPOS)))
c277df42 3031 r->reganch |= ROPT_CHECK_ALL;
830247a4 3032 scan_commit(pRExC_state, &data);
c277df42
IZ
3033 SvREFCNT_dec(data.last_found);
3034
a0ed51b3 3035 longest_float_length = CHR_SVLEN(data.longest_float);
c5254dd6 3036 if (longest_float_length
c277df42
IZ
3037 || (data.flags & SF_FL_BEFORE_EOL
3038 && (!(data.flags & SF_FL_BEFORE_MEOL)
e2509266 3039 || (RExC_flags & PMf_MULTILINE)))) {
cf93c79d
IZ
3040 int t;
3041
a0ed51b3 3042 if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
aca2d497
IZ
3043 && data.offset_fixed == data.offset_float_min
3044 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
3045 goto remove_float; /* As in (a)+. */
3046
33b8afdf
JH
3047 if (SvUTF8(data.longest_float)) {
3048 r->float_utf8 = data.longest_float;
c445ea15 3049 r->float_substr = NULL;
33b8afdf
JH
3050 } else {
3051 r->float_substr = data.longest_float;
c445ea15 3052 r->float_utf8 = NULL;
33b8afdf 3053 }
c277df42
IZ
3054 r->float_min_offset = data.offset_float_min;
3055 r->float_max_offset = data.offset_float_max;
cf93c79d
IZ
3056 t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
3057 && (!(data.flags & SF_FL_BEFORE_MEOL)
e2509266 3058 || (RExC_flags & PMf_MULTILINE)));
33b8afdf 3059 fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
a0ed51b3
LW
3060 }
3061 else {
aca2d497 3062 remove_float:
c445ea15 3063 r->float_substr = r->float_utf8 = NULL;
c277df42 3064 SvREFCNT_dec(data.longest_float);
c5254dd6 3065 longest_float_length = 0;
a0d0e21e 3066 }
c277df42 3067
a0ed51b3 3068 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
c5254dd6 3069 if (longest_fixed_length
c277df42
IZ
3070 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
3071 && (!(data.flags & SF_FIX_BEFORE_MEOL)
e2509266 3072 || (RExC_flags & PMf_MULTILINE)))) {
cf93c79d
IZ
3073 int t;
3074
33b8afdf
JH
3075 if (SvUTF8(data.longest_fixed)) {
3076 r->anchored_utf8 = data.longest_fixed;
c445ea15 3077 r->anchored_substr = NULL;
33b8afdf
JH
3078 } else {
3079 r->anchored_substr = data.longest_fixed;
c445ea15 3080 r->anchored_utf8 = NULL;
33b8afdf 3081 }
c277df42 3082 r->anchored_offset = data.offset_fixed;
cf93c79d
IZ
3083 t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
3084 && (!(data.flags & SF_FIX_BEFORE_MEOL)
e2509266 3085 || (RExC_flags & PMf_MULTILINE)));
33b8afdf 3086 fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
a0ed51b3
LW
3087 }
3088 else {
c445ea15 3089 r->anchored_substr = r->anchored_utf8 = NULL;
c277df42 3090 SvREFCNT_dec(data.longest_fixed);
c5254dd6 3091 longest_fixed_length = 0;
a0d0e21e 3092 }
b81d288d 3093 if (r->regstclass
ffc61ed2 3094 && (OP(r->regstclass) == REG_ANY || OP(r->regstclass) == SANY))
653099ff 3095 r->regstclass = NULL;
33b8afdf
JH
3096 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
3097 && stclass_flag
653099ff 3098 && !(data.start_class->flags & ANYOF_EOS)
eb160463
GS
3099 && !cl_is_anything(data.start_class))
3100 {
1df70142 3101 const I32 n = add_data(pRExC_state, 1, "f");
653099ff 3102
a02a5408 3103 Newx(RExC_rx->data->data[n], 1,
653099ff
GS
3104 struct regnode_charclass_class);
3105 StructCopy(data.start_class,
830247a4 3106 (struct regnode_charclass_class*)RExC_rx->data->data[n],
653099ff 3107 struct regnode_charclass_class);
830247a4 3108 r->regstclass = (regnode*)RExC_rx->data->data[n];
653099ff 3109 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
ffc61ed2 3110 PL_regdata = r->data; /* for regprop() */
a3621e74 3111 DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
9c5ffd7c
JH
3112 regprop(sv, (regnode*)data.start_class);
3113 PerlIO_printf(Perl_debug_log,
a0288114 3114 "synthetic stclass \"%s\".\n",
3f7c398e 3115 SvPVX_const(sv));});
653099ff 3116 }
c277df42
IZ
3117
3118 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
c5254dd6 3119 if (longest_fixed_length > longest_float_length) {
c277df42 3120 r->check_substr = r->anchored_substr;
33b8afdf 3121 r->check_utf8 = r->anchored_utf8;
c277df42
IZ
3122 r->check_offset_min = r->check_offset_max = r->anchored_offset;
3123 if (r->reganch & ROPT_ANCH_SINGLE)
3124 r->reganch |= ROPT_NOSCAN;
a0ed51b3
LW
3125 }
3126 else {
c277df42 3127 r->check_substr = r->float_substr;
33b8afdf 3128 r->check_utf8 = r->float_utf8;
c277df42
IZ
3129 r->check_offset_min = data.offset_float_min;
3130 r->check_offset_max = data.offset_float_max;
a0d0e21e 3131 }
30382c73
IZ
3132 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
3133 This should be changed ASAP! */
33b8afdf 3134 if ((r->check_substr || r->check_utf8) && !(r->reganch & ROPT_ANCH_GPOS)) {
cad2e5aa 3135 r->reganch |= RE_USE_INTUIT;
33b8afdf 3136 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
cad2e5aa
JH
3137 r->reganch |= RE_INTUIT_TAIL;
3138 }
a0ed51b3
LW
3139 }
3140 else {
c277df42
IZ
3141 /* Several toplevels. Best we can is to set minlen. */
3142 I32 fake;
653099ff 3143 struct regnode_charclass_class ch_class;
cb434fcc 3144 I32 last_close = 0;
c277df42 3145
a3621e74 3146 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "\n"));
c277df42 3147 scan = r->program + 1;
830247a4 3148 cl_init(pRExC_state, &ch_class);
653099ff 3149 data.start_class = &ch_class;
cb434fcc 3150 data.last_closep = &last_close;
a3621e74 3151 minlen = study_chunk(pRExC_state, &scan, &fake, scan + RExC_size, &data, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
33b8afdf 3152 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
c445ea15 3153 = r->float_substr = r->float_utf8 = NULL;
653099ff 3154 if (!(data.start_class->flags & ANYOF_EOS)
eb160463
GS
3155 && !cl_is_anything(data.start_class))
3156 {
1df70142 3157 const I32 n = add_data(pRExC_state, 1, "f");
653099ff 3158
a02a5408 3159 Newx(RExC_rx->data->data[n], 1,
653099ff
GS
3160 struct regnode_charclass_class);
3161 StructCopy(data.start_class,
830247a4 3162 (struct regnode_charclass_class*)RExC_rx->data->data[n],
653099ff 3163 struct regnode_charclass_class);
830247a4 3164 r->regstclass = (regnode*)RExC_rx->data->data[n];
653099ff 3165 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
a3621e74 3166 DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
9c5ffd7c
JH
3167 regprop(sv, (regnode*)data.start_class);
3168 PerlIO_printf(Perl_debug_log,
a0288114 3169 "synthetic stclass \"%s\".\n",
3f7c398e 3170 SvPVX_const(sv));});
653099ff 3171 }
a0d0e21e
LW
3172 }
3173
a0d0e21e 3174 r->minlen = minlen;
b81d288d 3175 if (RExC_seen & REG_SEEN_GPOS)
c277df42 3176 r->reganch |= ROPT_GPOS_SEEN;
830247a4 3177 if (RExC_seen & REG_SEEN_LOOKBEHIND)
c277df42 3178 r->reganch |= ROPT_LOOKBEHIND_SEEN;
830247a4 3179 if (RExC_seen & REG_SEEN_EVAL)
ce862d02 3180 r->reganch |= ROPT_EVAL_SEEN;
f33976b4
DB
3181 if (RExC_seen & REG_SEEN_CANY)
3182 r->reganch |= ROPT_CANY_SEEN;
a02a5408
JC
3183 Newxz(r->startp, RExC_npar, I32);
3184 Newxz(r->endp, RExC_npar, I32);
ffc61ed2 3185 PL_regdata = r->data; /* for regprop() */
a3621e74 3186 DEBUG_COMPILE_r(regdump(r));
a0d0e21e 3187 return(r);
a687059c
LW
3188}
3189
3190/*
3191 - reg - regular expression, i.e. main body or parenthesized thing
3192 *
3193 * Caller must absorb opening parenthesis.
3194 *
3195 * Combining parenthesis handling with the base level of regular expression
3196 * is a trifle forced, but the need to tie the tails of the branches to what
3197 * follows makes it hard to avoid.
3198 */
76e3520e 3199STATIC regnode *
830247a4 3200S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp)
c277df42 3201 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
a687059c 3202{
27da23d5 3203 dVAR;
c277df42
IZ
3204 register regnode *ret; /* Will be the head of the group. */
3205 register regnode *br;
3206 register regnode *lastbr;
cbbf8932 3207 register regnode *ender = NULL;
a0d0e21e 3208 register I32 parno = 0;
cbbf8932
AL
3209 I32 flags;
3210 const I32 oregflags = RExC_flags;
6136c704
AL
3211 bool have_branch = 0;
3212 bool is_open = 0;
9d1d55b5
JP
3213
3214 /* for (?g), (?gc), and (?o) warnings; warning
3215 about (?c) will warn about (?g) -- japhy */
3216
6136c704
AL
3217#define WASTED_O 0x01
3218#define WASTED_G 0x02
3219#define WASTED_C 0x04
3220#define WASTED_GC (0x02|0x04)
cbbf8932 3221 I32 wastedflags = 0x00;
9d1d55b5 3222
fac92740 3223 char * parse_start = RExC_parse; /* MJD */
a28509cc 3224 char * const oregcomp_parse = RExC_parse;
a0d0e21e 3225
821b33a5 3226 *flagp = 0; /* Tentatively. */
a0d0e21e 3227
9d1d55b5 3228
a0d0e21e
LW
3229 /* Make an OPEN node, if parenthesized. */
3230 if (paren) {
fac92740 3231 if (*RExC_parse == '?') { /* (?...) */
2b36a5a0
JH
3232 U32 posflags = 0, negflags = 0;
3233 U32 *flagsp = &posflags;
6136c704 3234 bool is_logical = 0;
a28509cc 3235 const char * const seqstart = RExC_parse;
ca9dfc88 3236
830247a4
IZ
3237 RExC_parse++;
3238 paren = *RExC_parse++;
c277df42 3239 ret = NULL; /* For look-ahead/behind. */
a0d0e21e 3240 switch (paren) {
fac92740 3241 case '<': /* (?<...) */
830247a4 3242 RExC_seen |= REG_SEEN_LOOKBEHIND;
b81d288d 3243 if (*RExC_parse == '!')
c277df42 3244 paren = ',';
b81d288d 3245 if (*RExC_parse != '=' && *RExC_parse != '!')
c277df42 3246 goto unknown;
830247a4 3247 RExC_parse++;
fac92740
MJD
3248 case '=': /* (?=...) */
3249 case '!': /* (?!...) */
830247a4 3250 RExC_seen_zerolen++;
fac92740
MJD
3251 case ':': /* (?:...) */
3252 case '>': /* (?>...) */
a0d0e21e 3253 break;
fac92740
MJD
3254 case '$': /* (?$...) */
3255 case '@': /* (?@...) */
8615cb43 3256 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
a0d0e21e 3257 break;
fac92740 3258 case '#': /* (?#...) */
830247a4
IZ
3259 while (*RExC_parse && *RExC_parse != ')')
3260 RExC_parse++;
3261 if (*RExC_parse != ')')
c277df42 3262 FAIL("Sequence (?#... not terminated");
830247a4 3263 nextchar(pRExC_state);
a0d0e21e
LW
3264 *flagp = TRYAGAIN;
3265 return NULL;
fac92740 3266 case 'p': /* (?p...) */
9014280d 3267 if (SIZE_ONLY && ckWARN2(WARN_DEPRECATED, WARN_REGEXP))
f9373011 3268 vWARNdep(RExC_parse, "(?p{}) is deprecated - use (??{})");
8c8ad484 3269 /* FALL THROUGH*/
fac92740 3270 case '?': /* (??...) */
6136c704 3271 is_logical = 1;
438a3801
YST
3272 if (*RExC_parse != '{')
3273 goto unknown;
830247a4 3274 paren = *RExC_parse++;
0f5d15d6 3275 /* FALL THROUGH */
fac92740 3276 case '{': /* (?{...}) */
c277df42 3277 {
c277df42
IZ
3278 I32 count = 1, n = 0;
3279 char c;
830247a4 3280 char *s = RExC_parse;
c277df42 3281
830247a4
IZ
3282 RExC_seen_zerolen++;
3283 RExC_seen |= REG_SEEN_EVAL;
3284 while (count && (c = *RExC_parse)) {
6136c704
AL
3285 if (c == '\\') {
3286 if (RExC_parse[1])
3287 RExC_parse++;
3288 }
b81d288d 3289 else if (c == '{')
c277df42 3290 count++;
b81d288d 3291 else if (c == '}')
c277df42 3292 count--;
830247a4 3293 RExC_parse++;
c277df42 3294 }
6136c704 3295 if (*RExC_parse != ')') {
b81d288d 3296 RExC_parse = s;
b45f050a
JF
3297 vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
3298 }
c277df42 3299 if (!SIZE_ONLY) {
f3548bdc 3300 PAD *pad;
6136c704
AL
3301 OP_4tree *sop, *rop;
3302 SV * const sv = newSVpvn(s, RExC_parse - 1 - s);
c277df42 3303
569233ed
SB
3304 ENTER;
3305 Perl_save_re_context(aTHX);
f3548bdc 3306 rop = sv_compile_2op(sv, &sop, "re", &pad);
9b978d73
DM
3307 sop->op_private |= OPpREFCOUNTED;
3308 /* re_dup will OpREFCNT_inc */
3309 OpREFCNT_set(sop, 1);
569233ed 3310 LEAVE;
c277df42 3311
830247a4
IZ
3312 n = add_data(pRExC_state, 3, "nop");
3313 RExC_rx->data->data[n] = (void*)rop;
3314 RExC_rx->data->data[n+1] = (void*)sop;
f3548bdc 3315 RExC_rx->data->data[n+2] = (void*)pad;
c277df42 3316 SvREFCNT_dec(sv);
a0ed51b3 3317 }
e24b16f9 3318 else { /* First pass */
830247a4 3319 if (PL_reginterp_cnt < ++RExC_seen_evals
923e4eb5 3320 && IN_PERL_RUNTIME)
2cd61cdb
IZ
3321 /* No compiled RE interpolated, has runtime
3322 components ===> unsafe. */
3323 FAIL("Eval-group not allowed at runtime, use re 'eval'");
5b61d3f7 3324 if (PL_tainting && PL_tainted)
cc6b7395 3325 FAIL("Eval-group in insecure regular expression");
923e4eb5 3326 if (IN_PERL_COMPILETIME)
b5c19bd7 3327 PL_cv_has_eval = 1;
c277df42 3328 }
b5c19bd7 3329
830247a4 3330 nextchar(pRExC_state);
6136c704 3331 if (is_logical) {
830247a4 3332 ret = reg_node(pRExC_state, LOGICAL);
0f5d15d6
IZ
3333 if (!SIZE_ONLY)
3334 ret->flags = 2;
830247a4 3335 regtail(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
fac92740 3336 /* deal with the length of this later - MJD */
0f5d15d6
IZ
3337 return ret;
3338 }
ccb2c380
MP
3339 ret = reganode(pRExC_state, EVAL, n);
3340 Set_Node_Length(ret, RExC_parse - parse_start + 1);
3341 Set_Node_Offset(ret, parse_start);
3342 return ret;
c277df42 3343 }
fac92740 3344 case '(': /* (?(?{...})...) and (?(?=...)...) */
c277df42 3345 {
fac92740 3346 if (RExC_parse[0] == '?') { /* (?(?...)) */
b81d288d
AB
3347 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
3348 || RExC_parse[1] == '<'
830247a4 3349 || RExC_parse[1] == '{') { /* Lookahead or eval. */
c277df42
IZ
3350 I32 flag;
3351
830247a4 3352 ret = reg_node(pRExC_state, LOGICAL);
0f5d15d6
IZ
3353 if (!SIZE_ONLY)
3354 ret->flags = 1;
830247a4 3355 regtail(pRExC_state, ret, reg(pRExC_state, 1, &flag));
c277df42 3356 goto insert_if;
b81d288d 3357 }
a0ed51b3 3358 }
830247a4 3359 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
fac92740 3360 /* (?(1)...) */
6136c704 3361 char c;
830247a4 3362 parno = atoi(RExC_parse++);
c277df42 3363
830247a4
IZ
3364 while (isDIGIT(*RExC_parse))
3365 RExC_parse++;
fac92740 3366 ret = reganode(pRExC_state, GROUPP, parno);
2af232bd 3367
830247a4 3368 if ((c = *nextchar(pRExC_state)) != ')')
b45f050a 3369 vFAIL("Switch condition not recognized");
c277df42 3370 insert_if:
830247a4
IZ
3371 regtail(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
3372 br = regbranch(pRExC_state, &flags, 1);
c277df42 3373 if (br == NULL)
830247a4 3374 br = reganode(pRExC_state, LONGJMP, 0);
c277df42 3375 else
830247a4
IZ
3376 regtail(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
3377 c = *nextchar(pRExC_state);
d1b80229
IZ
3378 if (flags&HASWIDTH)
3379 *flagp |= HASWIDTH;
c277df42 3380 if (c == '|') {
830247a4
IZ
3381 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
3382 regbranch(pRExC_state, &flags, 1);
3383 regtail(pRExC_state, ret, lastbr);
d1b80229
IZ
3384 if (flags&HASWIDTH)
3385 *flagp |= HASWIDTH;
830247a4 3386 c = *nextchar(pRExC_state);
a0ed51b3
LW
3387 }
3388 else
c277df42
IZ
3389 lastbr = NULL;
3390 if (c != ')')
8615cb43 3391 vFAIL("Switch (?(condition)... contains too many branches");
830247a4
IZ
3392 ender = reg_node(pRExC_state, TAIL);
3393 regtail(pRExC_state, br, ender);
c277df42 3394 if (lastbr) {
830247a4
IZ
3395 regtail(pRExC_state, lastbr, ender);
3396 regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
a0ed51b3
LW
3397 }
3398 else
830247a4 3399 regtail(pRExC_state, ret, ender);
c277df42 3400 return ret;
a0ed51b3
LW
3401 }
3402 else {
830247a4 3403 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
c277df42
IZ
3404 }
3405 }
1b1626e4 3406 case 0:
830247a4 3407 RExC_parse--; /* for vFAIL to print correctly */
8615cb43 3408 vFAIL("Sequence (? incomplete");
1b1626e4 3409 break;
a0d0e21e 3410 default:
830247a4 3411 --RExC_parse;
fac92740 3412 parse_flags: /* (?i) */
830247a4 3413 while (*RExC_parse && strchr("iogcmsx", *RExC_parse)) {
9d1d55b5
JP
3414 /* (?g), (?gc) and (?o) are useless here
3415 and must be globally applied -- japhy */
3416
3417 if (*RExC_parse == 'o' || *RExC_parse == 'g') {
3418 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6136c704 3419 const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G;
9d1d55b5
JP
3420 if (! (wastedflags & wflagbit) ) {
3421 wastedflags |= wflagbit;
3422 vWARN5(
3423 RExC_parse + 1,
3424 "Useless (%s%c) - %suse /%c modifier",
3425 flagsp == &negflags ? "?-" : "?",
3426 *RExC_parse,
3427 flagsp == &negflags ? "don't " : "",
3428 *RExC_parse
3429 );
3430 }
3431 }
3432 }
3433 else if (*RExC_parse == 'c') {
3434 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6136c704
AL
3435 if (! (wastedflags & WASTED_C) ) {
3436 wastedflags |= WASTED_GC;
9d1d55b5
JP
3437 vWARN3(
3438 RExC_parse + 1,
3439 "Useless (%sc) - %suse /gc modifier",
3440 flagsp == &negflags ? "?-" : "?",
3441 flagsp == &negflags ? "don't " : ""
3442 );
3443 }
3444 }
3445 }
3446 else { pmflag(flagsp, *RExC_parse); }
3447
830247a4 3448 ++RExC_parse;
ca9dfc88 3449 }
830247a4 3450 if (*RExC_parse == '-') {
ca9dfc88 3451 flagsp = &negflags;
9d1d55b5 3452 wastedflags = 0; /* reset so (?g-c) warns twice */
830247a4 3453 ++RExC_parse;
ca9dfc88 3454 goto parse_flags;
48c036b1 3455 }
e2509266
JH
3456 RExC_flags |= posflags;
3457 RExC_flags &= ~negflags;
830247a4
IZ
3458 if (*RExC_parse == ':') {
3459 RExC_parse++;
ca9dfc88
IZ
3460 paren = ':';
3461 break;
3462 }
c277df42 3463 unknown:
830247a4
IZ
3464 if (*RExC_parse != ')') {
3465 RExC_parse++;
3466 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
b45f050a 3467 }
830247a4 3468 nextchar(pRExC_state);
a0d0e21e
LW
3469 *flagp = TRYAGAIN;
3470 return NULL;
3471 }
3472 }
fac92740 3473 else { /* (...) */
830247a4
IZ
3474 parno = RExC_npar;
3475 RExC_npar++;
3476 ret = reganode(pRExC_state, OPEN, parno);
fac92740
MJD
3477 Set_Node_Length(ret, 1); /* MJD */
3478 Set_Node_Offset(ret, RExC_parse); /* MJD */
6136c704 3479 is_open = 1;
a0d0e21e 3480 }
a0ed51b3 3481 }
fac92740 3482 else /* ! paren */
a0d0e21e
LW
3483 ret = NULL;
3484
3485 /* Pick up the branches, linking them together. */
fac92740 3486 parse_start = RExC_parse; /* MJD */
830247a4 3487 br = regbranch(pRExC_state, &flags, 1);
fac92740 3488 /* branch_len = (paren != 0); */
2af232bd 3489
a0d0e21e
LW
3490 if (br == NULL)
3491 return(NULL);
830247a4
IZ
3492 if (*RExC_parse == '|') {
3493 if (!SIZE_ONLY && RExC_extralen) {
3494 reginsert(pRExC_state, BRANCHJ, br);
a0ed51b3 3495 }
fac92740 3496 else { /* MJD */
830247a4 3497 reginsert(pRExC_state, BRANCH, br);
fac92740
MJD
3498 Set_Node_Length(br, paren != 0);
3499 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
3500 }
c277df42
IZ
3501 have_branch = 1;
3502 if (SIZE_ONLY)
830247a4 3503 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
a0ed51b3
LW
3504 }
3505 else if (paren == ':') {
c277df42
IZ
3506 *flagp |= flags&SIMPLE;
3507 }
6136c704 3508 if (is_open) { /* Starts with OPEN. */
830247a4 3509 regtail(pRExC_state, ret, br); /* OPEN -> first. */
a0ed51b3
LW
3510 }
3511 else if (paren != '?') /* Not Conditional */
a0d0e21e 3512 ret = br;
32a0ca98 3513 *flagp |= flags & (SPSTART | HASWIDTH);
c277df42 3514 lastbr = br;
830247a4
IZ
3515 while (*RExC_parse == '|') {
3516 if (!SIZE_ONLY && RExC_extralen) {
3517 ender = reganode(pRExC_state, LONGJMP,0);
3518 regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
c277df42
IZ
3519 }
3520 if (SIZE_ONLY)
830247a4
IZ
3521 RExC_extralen += 2; /* Account for LONGJMP. */
3522 nextchar(pRExC_state);
3523 br = regbranch(pRExC_state, &flags, 0);
2af232bd 3524
a687059c 3525 if (br == NULL)
a0d0e21e 3526 return(NULL);
830247a4 3527 regtail(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
c277df42 3528 lastbr = br;
821b33a5
IZ
3529 if (flags&HASWIDTH)
3530 *flagp |= HASWIDTH;
a687059c 3531 *flagp |= flags&SPSTART;
a0d0e21e
LW
3532 }
3533
c277df42
IZ
3534 if (have_branch || paren != ':') {
3535 /* Make a closing node, and hook it on the end. */
3536 switch (paren) {
3537 case ':':
830247a4 3538 ender = reg_node(pRExC_state, TAIL);
c277df42
IZ
3539 break;
3540 case 1:
830247a4 3541 ender = reganode(pRExC_state, CLOSE, parno);
fac92740
MJD
3542 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
3543 Set_Node_Length(ender,1); /* MJD */
c277df42
IZ
3544 break;
3545 case '<':
c277df42
IZ
3546 case ',':
3547 case '=':
3548 case '!':
c277df42 3549 *flagp &= ~HASWIDTH;
821b33a5
IZ
3550 /* FALL THROUGH */
3551 case '>':
830247a4 3552 ender = reg_node(pRExC_state, SUCCEED);
c277df42
IZ
3553 break;
3554 case 0:
830247a4 3555 ender = reg_node(pRExC_state, END);
c277df42
IZ
3556 break;
3557 }
830247a4 3558 regtail(pRExC_state, lastbr, ender);
a0d0e21e 3559
c277df42
IZ
3560 if (have_branch) {
3561 /* Hook the tails of the branches to the closing node. */
3562 for (br = ret; br != NULL; br = regnext(br)) {
830247a4 3563 regoptail(pRExC_state, br, ender);
c277df42
IZ
3564 }
3565 }
a0d0e21e 3566 }
c277df42
IZ
3567
3568 {
e1ec3a88
AL
3569 const char *p;
3570 static const char parens[] = "=!<,>";
c277df42
IZ
3571
3572 if (paren && (p = strchr(parens, paren))) {
eb160463 3573 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
c277df42
IZ
3574 int flag = (p - parens) > 1;
3575
3576 if (paren == '>')
3577 node = SUSPEND, flag = 0;
830247a4 3578 reginsert(pRExC_state, node,ret);
45948336
EP
3579 Set_Node_Cur_Length(ret);
3580 Set_Node_Offset(ret, parse_start + 1);
c277df42 3581 ret->flags = flag;
830247a4 3582 regtail(pRExC_state, ret, reg_node(pRExC_state, TAIL));
c277df42 3583 }
a0d0e21e
LW
3584 }
3585
3586 /* Check for proper termination. */
ce3e6498 3587 if (paren) {
e2509266 3588 RExC_flags = oregflags;
830247a4
IZ
3589 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
3590 RExC_parse = oregcomp_parse;
380a0633 3591 vFAIL("Unmatched (");
ce3e6498 3592 }
a0ed51b3 3593 }
830247a4
IZ
3594 else if (!paren && RExC_parse < RExC_end) {
3595 if (*RExC_parse == ')') {
3596 RExC_parse++;
380a0633 3597 vFAIL("Unmatched )");
a0ed51b3
LW
3598 }
3599 else
b45f050a 3600 FAIL("Junk on end of regexp"); /* "Can't happen". */
a0d0e21e
LW
3601 /* NOTREACHED */
3602 }
a687059c 3603
a0d0e21e 3604 return(ret);
a687059c
LW
3605}
3606
3607/*
3608 - regbranch - one alternative of an | operator
3609 *
3610 * Implements the concatenation operator.
3611 */
76e3520e 3612STATIC regnode *
830247a4 3613S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first)
a687059c 3614{
97aff369 3615 dVAR;
c277df42
IZ
3616 register regnode *ret;
3617 register regnode *chain = NULL;
3618 register regnode *latest;
3619 I32 flags = 0, c = 0;
a0d0e21e 3620
b81d288d 3621 if (first)
c277df42
IZ
3622 ret = NULL;
3623 else {
b81d288d 3624 if (!SIZE_ONLY && RExC_extralen)
830247a4 3625 ret = reganode(pRExC_state, BRANCHJ,0);
fac92740 3626 else {
830247a4 3627 ret = reg_node(pRExC_state, BRANCH);
fac92740
MJD
3628 Set_Node_Length(ret, 1);
3629 }
c277df42
IZ
3630 }
3631
b81d288d 3632 if (!first && SIZE_ONLY)
830247a4 3633 RExC_extralen += 1; /* BRANCHJ */
b81d288d 3634
c277df42 3635 *flagp = WORST; /* Tentatively. */
a0d0e21e 3636
830247a4
IZ
3637 RExC_parse--;
3638 nextchar(pRExC_state);
3639 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
a0d0e21e 3640 flags &= ~TRYAGAIN;
830247a4 3641 latest = regpiece(pRExC_state, &flags);
a0d0e21e
LW
3642 if (latest == NULL) {
3643 if (flags & TRYAGAIN)
3644 continue;
3645 return(NULL);
a0ed51b3
LW
3646 }
3647 else if (ret == NULL)
c277df42 3648 ret = latest;
a0d0e21e 3649 *flagp |= flags&HASWIDTH;
c277df42 3650 if (chain == NULL) /* First piece. */
a0d0e21e
LW
3651 *flagp |= flags&SPSTART;
3652 else {
830247a4
IZ
3653 RExC_naughty++;
3654 regtail(pRExC_state, chain, latest);
a687059c 3655 }
a0d0e21e 3656 chain = latest;
c277df42
IZ
3657 c++;
3658 }
3659 if (chain == NULL) { /* Loop ran zero times. */
830247a4 3660 chain = reg_node(pRExC_state, NOTHING);
c277df42
IZ
3661 if (ret == NULL)
3662 ret = chain;
3663 }
3664 if (c == 1) {
3665 *flagp |= flags&SIMPLE;
a0d0e21e 3666 }
a687059c 3667
d4c19fe8 3668 return ret;
a687059c
LW
3669}
3670
3671/*
3672 - regpiece - something followed by possible [*+?]
3673 *
3674 * Note that the branching code sequences used for ? and the general cases
3675 * of * and + are somewhat optimized: they use the same NOTHING node as
3676 * both the endmarker for their branch list and the body of the last branch.
3677 * It might seem that this node could be dispensed with entirely, but the
3678 * endmarker role is not redundant.
3679 */
76e3520e 3680STATIC regnode *
830247a4 3681S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
a687059c 3682{
97aff369 3683 dVAR;
c277df42 3684 register regnode *ret;
a0d0e21e
LW
3685 register char op;
3686 register char *next;
3687 I32 flags;
1df70142 3688 const char * const origparse = RExC_parse;
a0d0e21e
LW
3689 char *maxpos;
3690 I32 min;
c277df42 3691 I32 max = REG_INFTY;
fac92740 3692 char *parse_start;
a0d0e21e 3693
830247a4 3694 ret = regatom(pRExC_state, &flags);
a0d0e21e
LW
3695 if (ret == NULL) {
3696 if (flags & TRYAGAIN)
3697 *flagp |= TRYAGAIN;
3698 return(NULL);
3699 }
3700
830247a4 3701 op = *RExC_parse;
a0d0e21e 3702
830247a4 3703 if (op == '{' && regcurly(RExC_parse)) {
fac92740 3704 parse_start = RExC_parse; /* MJD */
830247a4 3705 next = RExC_parse + 1;
c445ea15 3706 maxpos = NULL;
a0d0e21e
LW
3707 while (isDIGIT(*next) || *next == ',') {
3708 if (*next == ',') {
3709 if (maxpos)
3710 break;
3711 else
3712 maxpos = next;
a687059c 3713 }
a0d0e21e
LW
3714 next++;
3715 }
3716 if (*next == '}') { /* got one */
3717 if (!maxpos)
3718 maxpos = next;
830247a4
IZ
3719 RExC_parse++;
3720 min = atoi(RExC_parse);
a0d0e21e
LW
3721 if (*maxpos == ',')
3722 maxpos++;
3723 else
830247a4 3724 maxpos = RExC_parse;
a0d0e21e
LW
3725 max = atoi(maxpos);
3726 if (!max && *maxpos != '0')
c277df42
IZ
3727 max = REG_INFTY; /* meaning "infinity" */
3728 else if (max >= REG_INFTY)
8615cb43 3729 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
830247a4
IZ
3730 RExC_parse = next;
3731 nextchar(pRExC_state);
a0d0e21e
LW
3732
3733 do_curly:
3734 if ((flags&SIMPLE)) {
830247a4
IZ
3735 RExC_naughty += 2 + RExC_naughty / 2;
3736 reginsert(pRExC_state, CURLY, ret);
fac92740
MJD
3737 Set_Node_Offset(ret, parse_start+1); /* MJD */
3738 Set_Node_Cur_Length(ret);
a0d0e21e
LW
3739 }
3740 else {
830247a4 3741 regnode *w = reg_node(pRExC_state, WHILEM);
2c2d71f5
JH
3742
3743 w->flags = 0;
830247a4
IZ
3744 regtail(pRExC_state, ret, w);
3745 if (!SIZE_ONLY && RExC_extralen) {
3746 reginsert(pRExC_state, LONGJMP,ret);
3747 reginsert(pRExC_state, NOTHING,ret);
c277df42
IZ
3748 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
3749 }
830247a4 3750 reginsert(pRExC_state, CURLYX,ret);
fac92740
MJD
3751 /* MJD hk */
3752 Set_Node_Offset(ret, parse_start+1);
2af232bd 3753 Set_Node_Length(ret,
fac92740 3754 op == '{' ? (RExC_parse - parse_start) : 1);
2af232bd 3755
830247a4 3756 if (!SIZE_ONLY && RExC_extralen)
c277df42 3757 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
830247a4 3758 regtail(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
c277df42 3759 if (SIZE_ONLY)
830247a4
IZ
3760 RExC_whilem_seen++, RExC_extralen += 3;
3761 RExC_naughty += 4 + RExC_naughty; /* compound interest */
a0d0e21e 3762 }
c277df42 3763 ret->flags = 0;
a0d0e21e
LW
3764
3765 if (min > 0)
821b33a5
IZ
3766 *flagp = WORST;
3767 if (max > 0)
3768 *flagp |= HASWIDTH;
a0d0e21e 3769 if (max && max < min)
8615cb43 3770 vFAIL("Can't do {n,m} with n > m");
c277df42 3771 if (!SIZE_ONLY) {
eb160463
GS
3772 ARG1_SET(ret, (U16)min);
3773 ARG2_SET(ret, (U16)max);
a687059c 3774 }
a687059c 3775
a0d0e21e 3776 goto nest_check;
a687059c 3777 }
a0d0e21e 3778 }
a687059c 3779
a0d0e21e
LW
3780 if (!ISMULT1(op)) {
3781 *flagp = flags;
a687059c 3782 return(ret);
a0d0e21e 3783 }
bb20fd44 3784
c277df42 3785#if 0 /* Now runtime fix should be reliable. */
b45f050a
JF
3786
3787 /* if this is reinstated, don't forget to put this back into perldiag:
3788
3789 =item Regexp *+ operand could be empty at {#} in regex m/%s/
3790
3791 (F) The part of the regexp subject to either the * or + quantifier
3792 could match an empty string. The {#} shows in the regular
3793 expression about where the problem was discovered.
3794
3795 */
3796
bb20fd44 3797 if (!(flags&HASWIDTH) && op != '?')
b45f050a 3798 vFAIL("Regexp *+ operand could be empty");
b81d288d 3799#endif
bb20fd44 3800
fac92740 3801 parse_start = RExC_parse;
830247a4 3802 nextchar(pRExC_state);
a0d0e21e 3803
821b33a5 3804 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
a0d0e21e
LW
3805
3806 if (op == '*' && (flags&SIMPLE)) {
830247a4 3807 reginsert(pRExC_state, STAR, ret);
c277df42 3808 ret->flags = 0;
830247a4 3809 RExC_naughty += 4;
a0d0e21e
LW
3810 }
3811 else if (op == '*') {
3812 min = 0;
3813 goto do_curly;
a0ed51b3
LW
3814 }
3815 else if (op == '+' && (flags&SIMPLE)) {
830247a4 3816 reginsert(pRExC_state, PLUS, ret);
c277df42 3817 ret->flags = 0;
830247a4 3818 RExC_naughty += 3;
a0d0e21e
LW
3819 }
3820 else if (op == '+') {
3821 min = 1;
3822 goto do_curly;
a0ed51b3
LW
3823 }
3824 else if (op == '?') {
a0d0e21e
LW
3825 min = 0; max = 1;
3826 goto do_curly;
3827 }
3828 nest_check:
041457d9 3829 if (!SIZE_ONLY && !(flags&HASWIDTH) && max > REG_INFTY/3 && ckWARN(WARN_REGEXP)) {
830247a4 3830 vWARN3(RExC_parse,
b45f050a 3831 "%.*s matches null string many times",
afd78fd5 3832 (int)(RExC_parse >= origparse ? RExC_parse - origparse : 0),
b45f050a 3833 origparse);
a0d0e21e
LW
3834 }
3835
830247a4
IZ
3836 if (*RExC_parse == '?') {
3837 nextchar(pRExC_state);
3838 reginsert(pRExC_state, MINMOD, ret);
3839 regtail(pRExC_state, ret, ret + NODE_STEP_REGNODE);
a0d0e21e 3840 }
830247a4
IZ
3841 if (ISMULT2(RExC_parse)) {
3842 RExC_parse++;
b45f050a
JF
3843 vFAIL("Nested quantifiers");
3844 }
a0d0e21e
LW
3845
3846 return(ret);
a687059c
LW
3847}
3848
3849/*
3850 - regatom - the lowest level
3851 *
3852 * Optimization: gobbles an entire sequence of ordinary characters so that
3853 * it can turn them into a single node, which is smaller to store and
3854 * faster to run. Backslashed characters are exceptions, each becoming a
3855 * separate node; the code is simpler that way and it's not worth fixing.
3856 *
b45f050a 3857 * [Yes, it is worth fixing, some scripts can run twice the speed.] */
76e3520e 3858STATIC regnode *
830247a4 3859S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
a687059c 3860{
97aff369 3861 dVAR;
cbbf8932 3862 register regnode *ret = NULL;
a0d0e21e 3863 I32 flags;
45948336 3864 char *parse_start = RExC_parse;
a0d0e21e
LW
3865
3866 *flagp = WORST; /* Tentatively. */
3867
3868tryagain:
830247a4 3869 switch (*RExC_parse) {
a0d0e21e 3870 case '^':
830247a4
IZ
3871 RExC_seen_zerolen++;
3872 nextchar(pRExC_state);
e2509266 3873 if (RExC_flags & PMf_MULTILINE)
830247a4 3874 ret = reg_node(pRExC_state, MBOL);
e2509266 3875 else if (RExC_flags & PMf_SINGLELINE)
830247a4 3876 ret = reg_node(pRExC_state, SBOL);
a0d0e21e 3877 else
830247a4 3878 ret = reg_node(pRExC_state, BOL);
fac92740 3879 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
3880 break;
3881 case '$':
830247a4 3882 nextchar(pRExC_state);
b81d288d 3883 if (*RExC_parse)
830247a4 3884 RExC_seen_zerolen++;
e2509266 3885 if (RExC_flags & PMf_MULTILINE)
830247a4 3886 ret = reg_node(pRExC_state, MEOL);
e2509266 3887 else if (RExC_flags & PMf_SINGLELINE)
830247a4 3888 ret = reg_node(pRExC_state, SEOL);
a0d0e21e 3889 else
830247a4 3890 ret = reg_node(pRExC_state, EOL);
fac92740 3891 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
3892 break;
3893 case '.':
830247a4 3894 nextchar(pRExC_state);
e2509266 3895 if (RExC_flags & PMf_SINGLELINE)
ffc61ed2
JH
3896 ret = reg_node(pRExC_state, SANY);
3897 else
3898 ret = reg_node(pRExC_state, REG_ANY);
3899 *flagp |= HASWIDTH|SIMPLE;
830247a4 3900 RExC_naughty++;
fac92740 3901 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
3902 break;
3903 case '[':
b45f050a 3904 {
830247a4 3905 char *oregcomp_parse = ++RExC_parse;
ffc61ed2 3906 ret = regclass(pRExC_state);
830247a4
IZ
3907 if (*RExC_parse != ']') {
3908 RExC_parse = oregcomp_parse;
b45f050a
JF
3909 vFAIL("Unmatched [");
3910 }
830247a4 3911 nextchar(pRExC_state);
a0d0e21e 3912 *flagp |= HASWIDTH|SIMPLE;
fac92740 3913 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
a0d0e21e 3914 break;
b45f050a 3915 }
a0d0e21e 3916 case '(':
830247a4
IZ
3917 nextchar(pRExC_state);
3918 ret = reg(pRExC_state, 1, &flags);
a0d0e21e 3919 if (ret == NULL) {
bf93d4cc 3920 if (flags & TRYAGAIN) {
830247a4 3921 if (RExC_parse == RExC_end) {
bf93d4cc
GS
3922 /* Make parent create an empty node if needed. */
3923 *flagp |= TRYAGAIN;
3924 return(NULL);
3925 }
a0d0e21e 3926 goto tryagain;
bf93d4cc 3927 }
a0d0e21e
LW
3928 return(NULL);
3929 }
c277df42 3930 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE);
a0d0e21e
LW
3931 break;
3932 case '|':
3933 case ')':
3934 if (flags & TRYAGAIN) {
3935 *flagp |= TRYAGAIN;
3936 return NULL;
3937 }
b45f050a 3938 vFAIL("Internal urp");
a0d0e21e
LW
3939 /* Supposed to be caught earlier. */
3940 break;
85afd4ae 3941 case '{':
830247a4
IZ
3942 if (!regcurly(RExC_parse)) {
3943 RExC_parse++;
85afd4ae
CS
3944 goto defchar;
3945 }
3946 /* FALL THROUGH */
a0d0e21e
LW
3947 case '?':
3948 case '+':
3949 case '*':
830247a4 3950 RExC_parse++;
b45f050a 3951 vFAIL("Quantifier follows nothing");
a0d0e21e
LW
3952 break;
3953 case '\\':
830247a4 3954 switch (*++RExC_parse) {
a0d0e21e 3955 case 'A':
830247a4
IZ
3956 RExC_seen_zerolen++;
3957 ret = reg_node(pRExC_state, SBOL);
a0d0e21e 3958 *flagp |= SIMPLE;
830247a4 3959 nextchar(pRExC_state);
fac92740 3960 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
3961 break;
3962 case 'G':
830247a4
IZ
3963 ret = reg_node(pRExC_state, GPOS);
3964 RExC_seen |= REG_SEEN_GPOS;
a0d0e21e 3965 *flagp |= SIMPLE;
830247a4 3966 nextchar(pRExC_state);
fac92740 3967 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
3968 break;
3969 case 'Z':
830247a4 3970 ret = reg_node(pRExC_state, SEOL);
a0d0e21e 3971 *flagp |= SIMPLE;
a1917ab9 3972 RExC_seen_zerolen++; /* Do not optimize RE away */
830247a4 3973 nextchar(pRExC_state);
a0d0e21e 3974 break;
b85d18e9 3975 case 'z':
830247a4 3976 ret = reg_node(pRExC_state, EOS);
b85d18e9 3977 *flagp |= SIMPLE;
830247a4
IZ
3978 RExC_seen_zerolen++; /* Do not optimize RE away */
3979 nextchar(pRExC_state);
fac92740 3980 Set_Node_Length(ret, 2); /* MJD */
b85d18e9 3981 break;
4a2d328f 3982 case 'C':
f33976b4
DB
3983 ret = reg_node(pRExC_state, CANY);
3984 RExC_seen |= REG_SEEN_CANY;
a0ed51b3 3985 *flagp |= HASWIDTH|SIMPLE;
830247a4 3986 nextchar(pRExC_state);
fac92740 3987 Set_Node_Length(ret, 2); /* MJD */
a0ed51b3
LW
3988 break;
3989 case 'X':
830247a4 3990 ret = reg_node(pRExC_state, CLUMP);
a0ed51b3 3991 *flagp |= HASWIDTH;
830247a4 3992 nextchar(pRExC_state);
fac92740 3993 Set_Node_Length(ret, 2); /* MJD */
a0ed51b3 3994 break;
a0d0e21e 3995 case 'w':
eb160463 3996 ret = reg_node(pRExC_state, (U8)(LOC ? ALNUML : ALNUM));
a0d0e21e 3997 *flagp |= HASWIDTH|SIMPLE;
830247a4 3998 nextchar(pRExC_state);
fac92740 3999 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
4000 break;
4001 case 'W':
eb160463 4002 ret = reg_node(pRExC_state, (U8)(LOC ? NALNUML : NALNUM));
a0d0e21e 4003 *flagp |= HASWIDTH|SIMPLE;
830247a4 4004 nextchar(pRExC_state);
fac92740 4005 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
4006 break;
4007 case 'b':
830247a4
IZ
4008 RExC_seen_zerolen++;
4009 RExC_seen |= REG_SEEN_LOOKBEHIND;
eb160463 4010 ret = reg_node(pRExC_state, (U8)(LOC ? BOUNDL : BOUND));
a0d0e21e 4011 *flagp |= SIMPLE;
830247a4 4012 nextchar(pRExC_state);
fac92740 4013 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
4014 break;
4015 case 'B':
830247a4
IZ
4016 RExC_seen_zerolen++;
4017 RExC_seen |= REG_SEEN_LOOKBEHIND;
eb160463 4018 ret = reg_node(pRExC_state, (U8)(LOC ? NBOUNDL : NBOUND));
a0d0e21e 4019 *flagp |= SIMPLE;
830247a4 4020 nextchar(pRExC_state);
fac92740 4021 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
4022 break;
4023 case 's':
eb160463 4024 ret = reg_node(pRExC_state, (U8)(LOC ? SPACEL : SPACE));
a0d0e21e 4025 *flagp |= HASWIDTH|SIMPLE;
830247a4 4026 nextchar(pRExC_state);
fac92740 4027 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
4028 break;
4029 case 'S':
eb160463 4030 ret = reg_node(pRExC_state, (U8)(LOC ? NSPACEL : NSPACE));
a0d0e21e 4031 *flagp |= HASWIDTH|SIMPLE;
830247a4 4032 nextchar(pRExC_state);
fac92740 4033 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
4034 break;
4035 case 'd':
ffc61ed2 4036 ret = reg_node(pRExC_state, DIGIT);
a0d0e21e 4037 *flagp |= HASWIDTH|SIMPLE;
830247a4 4038 nextchar(pRExC_state);
fac92740 4039 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
4040 break;
4041 case 'D':
ffc61ed2 4042 ret = reg_node(pRExC_state, NDIGIT);
a0d0e21e 4043 *flagp |= HASWIDTH|SIMPLE;
830247a4 4044 nextchar(pRExC_state);
fac92740 4045 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e 4046 break;
a14b48bc
LW
4047 case 'p':
4048 case 'P':
3568d838 4049 {
830247a4 4050 char* oldregxend = RExC_end;
ccb2c380 4051 char* parse_start = RExC_parse - 2;
a14b48bc 4052
830247a4 4053 if (RExC_parse[1] == '{') {
3568d838 4054 /* a lovely hack--pretend we saw [\pX] instead */
830247a4
IZ
4055 RExC_end = strchr(RExC_parse, '}');
4056 if (!RExC_end) {
0da60cf5 4057 U8 c = (U8)*RExC_parse;
830247a4
IZ
4058 RExC_parse += 2;
4059 RExC_end = oldregxend;
0da60cf5 4060 vFAIL2("Missing right brace on \\%c{}", c);
b45f050a 4061 }
830247a4 4062 RExC_end++;
a14b48bc 4063 }
af6f566e 4064 else {
830247a4 4065 RExC_end = RExC_parse + 2;
af6f566e
HS
4066 if (RExC_end > oldregxend)
4067 RExC_end = oldregxend;
4068 }
830247a4 4069 RExC_parse--;
a14b48bc 4070
ffc61ed2 4071 ret = regclass(pRExC_state);
a14b48bc 4072
830247a4
IZ
4073 RExC_end = oldregxend;
4074 RExC_parse--;
ccb2c380
MP
4075
4076 Set_Node_Offset(ret, parse_start + 2);
4077 Set_Node_Cur_Length(ret);
830247a4 4078 nextchar(pRExC_state);
a14b48bc
LW
4079 *flagp |= HASWIDTH|SIMPLE;
4080 }
4081 break;
a0d0e21e
LW
4082 case 'n':
4083 case 'r':
4084 case 't':
4085 case 'f':
4086 case 'e':
4087 case 'a':
4088 case 'x':
4089 case 'c':
4090 case '0':
4091 goto defchar;
4092 case '1': case '2': case '3': case '4':
4093 case '5': case '6': case '7': case '8': case '9':
4094 {
1df70142 4095 const I32 num = atoi(RExC_parse);
a0d0e21e 4096
830247a4 4097 if (num > 9 && num >= RExC_npar)
a0d0e21e
LW
4098 goto defchar;
4099 else {
fac92740 4100 char * parse_start = RExC_parse - 1; /* MJD */
830247a4
IZ
4101 while (isDIGIT(*RExC_parse))
4102 RExC_parse++;
b45f050a 4103
eb160463 4104 if (!SIZE_ONLY && num > (I32)RExC_rx->nparens)
9baa0206 4105 vFAIL("Reference to nonexistent group");
830247a4 4106 RExC_sawback = 1;
eb160463
GS
4107 ret = reganode(pRExC_state,
4108 (U8)(FOLD ? (LOC ? REFFL : REFF) : REF),
4109 num);
a0d0e21e 4110 *flagp |= HASWIDTH;
2af232bd 4111
fac92740 4112 /* override incorrect value set in reganode MJD */
2af232bd 4113 Set_Node_Offset(ret, parse_start+1);
fac92740 4114 Set_Node_Cur_Length(ret); /* MJD */
830247a4
IZ
4115 RExC_parse--;
4116 nextchar(pRExC_state);
a0d0e21e
LW
4117 }
4118 }
4119 break;
4120 case '\0':
830247a4 4121 if (RExC_parse >= RExC_end)
b45f050a 4122 FAIL("Trailing \\");
a0d0e21e
LW
4123 /* FALL THROUGH */
4124 default:
a0288114 4125 /* Do not generate "unrecognized" warnings here, we fall
c9f97d15 4126 back into the quick-grab loop below */
45948336 4127 parse_start--;
a0d0e21e
LW
4128 goto defchar;
4129 }
4130 break;
4633a7c4
LW
4131
4132 case '#':
e2509266 4133 if (RExC_flags & PMf_EXTENDED) {
830247a4
IZ
4134 while (RExC_parse < RExC_end && *RExC_parse != '\n') RExC_parse++;
4135 if (RExC_parse < RExC_end)
4633a7c4
LW
4136 goto tryagain;
4137 }
4138 /* FALL THROUGH */
4139
a0d0e21e 4140 default: {
ba210ebe 4141 register STRLEN len;
58ae7d3f 4142 register UV ender;
a0d0e21e 4143 register char *p;
c277df42 4144 char *oldp, *s;
80aecb99 4145 STRLEN foldlen;
89ebb4a3 4146 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
f06dbbb7
JH
4147
4148 parse_start = RExC_parse - 1;
a0d0e21e 4149
830247a4 4150 RExC_parse++;
a0d0e21e
LW
4151
4152 defchar:
58ae7d3f 4153 ender = 0;
eb160463
GS
4154 ret = reg_node(pRExC_state,
4155 (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
cd439c50 4156 s = STRING(ret);
830247a4
IZ
4157 for (len = 0, p = RExC_parse - 1;
4158 len < 127 && p < RExC_end;
a0d0e21e
LW
4159 len++)
4160 {
4161 oldp = p;
5b5a24f7 4162
e2509266 4163 if (RExC_flags & PMf_EXTENDED)
830247a4 4164 p = regwhite(p, RExC_end);
a0d0e21e
LW
4165 switch (*p) {
4166 case '^':
4167 case '$':
4168 case '.':
4169 case '[':
4170 case '(':
4171 case ')':
4172 case '|':
4173 goto loopdone;
4174 case '\\':
4175 switch (*++p) {
4176 case 'A':
1ed8eac0
JF
4177 case 'C':
4178 case 'X':
a0d0e21e
LW
4179 case 'G':
4180 case 'Z':
b85d18e9 4181 case 'z':
a0d0e21e
LW
4182 case 'w':
4183 case 'W':
4184 case 'b':
4185 case 'B':
4186 case 's':
4187 case 'S':
4188 case 'd':
4189 case 'D':
a14b48bc
LW
4190 case 'p':
4191 case 'P':
a0d0e21e
LW
4192 --p;
4193 goto loopdone;
4194 case 'n':
4195 ender = '\n';
4196 p++;
a687059c 4197 break;
a0d0e21e
LW
4198 case 'r':
4199 ender = '\r';
4200 p++;
a687059c 4201 break;
a0d0e21e
LW
4202 case 't':
4203 ender = '\t';
4204 p++;
a687059c 4205 break;
a0d0e21e
LW
4206 case 'f':
4207 ender = '\f';
4208 p++;
a687059c 4209 break;
a0d0e21e 4210 case 'e':
c7f1f016 4211 ender = ASCII_TO_NATIVE('\033');
a0d0e21e 4212 p++;
a687059c 4213 break;
a0d0e21e 4214 case 'a':
c7f1f016 4215 ender = ASCII_TO_NATIVE('\007');
a0d0e21e 4216 p++;
a687059c 4217 break;
a0d0e21e 4218 case 'x':
a0ed51b3 4219 if (*++p == '{') {
1df70142 4220 char* const e = strchr(p, '}');
b81d288d 4221
b45f050a 4222 if (!e) {
830247a4 4223 RExC_parse = p + 1;
b45f050a
JF
4224 vFAIL("Missing right brace on \\x{}");
4225 }
de5f0749 4226 else {
a4c04bdc
NC
4227 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
4228 | PERL_SCAN_DISALLOW_PREFIX;
1df70142 4229 STRLEN numlen = e - p - 1;
53305cf1 4230 ender = grok_hex(p + 1, &numlen, &flags, NULL);
aaa80028
JH
4231 if (ender > 0xff)
4232 RExC_utf8 = 1;
a0ed51b3
LW
4233 p = e + 1;
4234 }
a0ed51b3
LW
4235 }
4236 else {
a4c04bdc 4237 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
1df70142 4238 STRLEN numlen = 2;
53305cf1 4239 ender = grok_hex(p, &numlen, &flags, NULL);
a0ed51b3
LW
4240 p += numlen;
4241 }
a687059c 4242 break;
a0d0e21e
LW
4243 case 'c':
4244 p++;
bbce6d69 4245 ender = UCHARAT(p++);
4246 ender = toCTRL(ender);
a687059c 4247 break;
a0d0e21e
LW
4248 case '0': case '1': case '2': case '3':case '4':
4249 case '5': case '6': case '7': case '8':case '9':
4250 if (*p == '0' ||
830247a4 4251 (isDIGIT(p[1]) && atoi(p) >= RExC_npar) ) {
53305cf1 4252 I32 flags = 0;
1df70142 4253 STRLEN numlen = 3;
53305cf1 4254 ender = grok_oct(p, &numlen, &flags, NULL);
a0d0e21e
LW
4255 p += numlen;
4256 }
4257 else {
4258 --p;
4259 goto loopdone;
a687059c
LW
4260 }
4261 break;
a0d0e21e 4262 case '\0':
830247a4 4263 if (p >= RExC_end)
b45f050a 4264 FAIL("Trailing \\");
a687059c 4265 /* FALL THROUGH */
a0d0e21e 4266 default:
041457d9 4267 if (!SIZE_ONLY&& isALPHA(*p) && ckWARN(WARN_REGEXP))
4193bef7 4268 vWARN2(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
a0ed51b3 4269 goto normal_default;
a0d0e21e
LW
4270 }
4271 break;
a687059c 4272 default:
a0ed51b3 4273 normal_default:
fd400ab9 4274 if (UTF8_IS_START(*p) && UTF) {
1df70142 4275 STRLEN numlen;
5e12f4fb 4276 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
ba210ebe 4277 &numlen, 0);
a0ed51b3
LW
4278 p += numlen;
4279 }
4280 else
4281 ender = *p++;
a0d0e21e 4282 break;
a687059c 4283 }
e2509266 4284 if (RExC_flags & PMf_EXTENDED)
830247a4 4285 p = regwhite(p, RExC_end);
60a8b682
JH
4286 if (UTF && FOLD) {
4287 /* Prime the casefolded buffer. */
ac7e0132 4288 ender = toFOLD_uni(ender, tmpbuf, &foldlen);
60a8b682 4289 }
a0d0e21e
LW
4290 if (ISMULT2(p)) { /* Back off on ?+*. */
4291 if (len)
4292 p = oldp;
16ea2a2e 4293 else if (UTF) {
0ebc6274
JH
4294 STRLEN unilen;
4295
80aecb99 4296 if (FOLD) {
60a8b682 4297 /* Emit all the Unicode characters. */
1df70142 4298 STRLEN numlen;
80aecb99
JH
4299 for (foldbuf = tmpbuf;
4300 foldlen;
4301 foldlen -= numlen) {
4302 ender = utf8_to_uvchr(foldbuf, &numlen);
9dc45d57 4303 if (numlen > 0) {
0ebc6274
JH
4304 reguni(pRExC_state, ender, s, &unilen);
4305 s += unilen;
4306 len += unilen;
4307 /* In EBCDIC the numlen
4308 * and unilen can differ. */
9dc45d57 4309 foldbuf += numlen;
47654450
JH
4310 if (numlen >= foldlen)
4311 break;
9dc45d57
JH
4312 }
4313 else
4314 break; /* "Can't happen." */
80aecb99
JH
4315 }
4316 }
4317 else {
0ebc6274 4318 reguni(pRExC_state, ender, s, &unilen);
9ede7db1 4319 if (unilen > 0) {
0ebc6274
JH
4320 s += unilen;
4321 len += unilen;
9dc45d57 4322 }
80aecb99 4323 }
a0ed51b3 4324 }
a0d0e21e
LW
4325 else {
4326 len++;
eb160463 4327 REGC((char)ender, s++);
a0d0e21e
LW
4328 }
4329 break;
a687059c 4330 }
16ea2a2e 4331 if (UTF) {
0ebc6274
JH
4332 STRLEN unilen;
4333
80aecb99 4334 if (FOLD) {
60a8b682 4335 /* Emit all the Unicode characters. */
1df70142 4336 STRLEN numlen;
80aecb99
JH
4337 for (foldbuf = tmpbuf;
4338 foldlen;
4339 foldlen -= numlen) {
4340 ender = utf8_to_uvchr(foldbuf, &numlen);
9dc45d57 4341 if (numlen > 0) {
0ebc6274
JH
4342 reguni(pRExC_state, ender, s, &unilen);
4343 len += unilen;
4344 s += unilen;
4345 /* In EBCDIC the numlen
4346 * and unilen can differ. */
9dc45d57 4347 foldbuf += numlen;
47654450
JH
4348 if (numlen >= foldlen)
4349 break;
9dc45d57
JH
4350 }
4351 else
4352 break;
80aecb99
JH
4353 }
4354 }
4355 else {
0ebc6274 4356 reguni(pRExC_state, ender, s, &unilen);
9ede7db1 4357 if (unilen > 0) {
0ebc6274
JH
4358 s += unilen;
4359 len += unilen;
9dc45d57 4360 }
80aecb99
JH
4361 }
4362 len--;
a0ed51b3
LW
4363 }
4364 else
eb160463 4365 REGC((char)ender, s++);
a0d0e21e
LW
4366 }
4367 loopdone:
830247a4 4368 RExC_parse = p - 1;
fac92740 4369 Set_Node_Cur_Length(ret); /* MJD */
830247a4 4370 nextchar(pRExC_state);
793db0cb
JH
4371 {
4372 /* len is STRLEN which is unsigned, need to copy to signed */
4373 IV iv = len;
4374 if (iv < 0)
4375 vFAIL("Internal disaster");
4376 }
a0d0e21e
LW
4377 if (len > 0)
4378 *flagp |= HASWIDTH;
090f7165 4379 if (len == 1 && UNI_IS_INVARIANT(ender))
a0d0e21e 4380 *flagp |= SIMPLE;
c277df42 4381 if (!SIZE_ONLY)
cd439c50
IZ
4382 STR_LEN(ret) = len;
4383 if (SIZE_ONLY)
830247a4 4384 RExC_size += STR_SZ(len);
cd439c50 4385 else
830247a4 4386 RExC_emit += STR_SZ(len);
a687059c 4387 }
a0d0e21e
LW
4388 break;
4389 }
a687059c 4390
60a8b682
JH
4391 /* If the encoding pragma is in effect recode the text of
4392 * any EXACT-kind nodes. */
22c54be3 4393 if (PL_encoding && PL_regkind[(U8)OP(ret)] == EXACT) {
d0063567
DK
4394 STRLEN oldlen = STR_LEN(ret);
4395 SV *sv = sv_2mortal(newSVpvn(STRING(ret), oldlen));
4396
4397 if (RExC_utf8)
4398 SvUTF8_on(sv);
4399 if (sv_utf8_downgrade(sv, TRUE)) {
1df70142
AL
4400 const char * const s = sv_recode_to_utf8(sv, PL_encoding);
4401 const STRLEN newlen = SvCUR(sv);
d0063567
DK
4402
4403 if (SvUTF8(sv))
4404 RExC_utf8 = 1;
4405 if (!SIZE_ONLY) {
a3621e74
YO
4406 GET_RE_DEBUG_FLAGS_DECL;
4407 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "recode %*s to %*s\n",
d0063567
DK
4408 (int)oldlen, STRING(ret),
4409 (int)newlen, s));
4410 Copy(s, STRING(ret), newlen, char);
4411 STR_LEN(ret) += newlen - oldlen;
4412 RExC_emit += STR_SZ(newlen) - STR_SZ(oldlen);
4413 } else
4414 RExC_size += STR_SZ(newlen) - STR_SZ(oldlen);
4415 }
a72c7584
JH
4416 }
4417
a0d0e21e 4418 return(ret);
a687059c
LW
4419}
4420
873ef191 4421STATIC char *
5f66b61c 4422S_regwhite(char *p, const char *e)
5b5a24f7
CS
4423{
4424 while (p < e) {
4425 if (isSPACE(*p))
4426 ++p;
4427 else if (*p == '#') {
4428 do {
4429 p++;
4430 } while (p < e && *p != '\n');
4431 }
4432 else
4433 break;
4434 }
4435 return p;
4436}
4437
b8c5462f
JH
4438/* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
4439 Character classes ([:foo:]) can also be negated ([:^foo:]).
4440 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
4441 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
beeb77fc 4442 but trigger failures because they are currently unimplemented. */
9a86a77b
JH
4443
4444#define POSIXCC_DONE(c) ((c) == ':')
4445#define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
4446#define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
4447
b8c5462f 4448STATIC I32
830247a4 4449S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
620e46c5 4450{
97aff369 4451 dVAR;
936ed897 4452 I32 namedclass = OOB_NAMEDCLASS;
620e46c5 4453
830247a4 4454 if (value == '[' && RExC_parse + 1 < RExC_end &&
620e46c5 4455 /* I smell either [: or [= or [. -- POSIX has been here, right? */
9a86a77b 4456 POSIXCC(UCHARAT(RExC_parse))) {
1df70142 4457 const char c = UCHARAT(RExC_parse);
830247a4 4458 char* s = RExC_parse++;
b81d288d 4459
9a86a77b 4460 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
830247a4
IZ
4461 RExC_parse++;
4462 if (RExC_parse == RExC_end)
620e46c5 4463 /* Grandfather lone [:, [=, [. */
830247a4 4464 RExC_parse = s;
620e46c5 4465 else {
1df70142 4466 const char* t = RExC_parse++; /* skip over the c */
a28509cc 4467 const char *posixcc;
b8c5462f 4468
80916619
NC
4469 assert(*t == c);
4470
9a86a77b 4471 if (UCHARAT(RExC_parse) == ']') {
830247a4 4472 RExC_parse++; /* skip over the ending ] */
b8c5462f
JH
4473 posixcc = s + 1;
4474 if (*s == ':') {
1df70142
AL
4475 const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
4476 const I32 skip = t - posixcc;
80916619
NC
4477
4478 /* Initially switch on the length of the name. */
4479 switch (skip) {
4480 case 4:
4481 if (memEQ(posixcc, "word", 4)) {
4482 /* this is not POSIX, this is the Perl \w */;
4483 namedclass
4484 = complement ? ANYOF_NALNUM : ANYOF_ALNUM;
4485 }
cc4319de 4486 break;
80916619
NC
4487 case 5:
4488 /* Names all of length 5. */
4489 /* alnum alpha ascii blank cntrl digit graph lower
4490 print punct space upper */
4491 /* Offset 4 gives the best switch position. */
4492 switch (posixcc[4]) {
4493 case 'a':
4494 if (memEQ(posixcc, "alph", 4)) {
4495 /* a */
4496 namedclass
4497 = complement ? ANYOF_NALPHA : ANYOF_ALPHA;
4498 }
4499 break;
4500 case 'e':
4501 if (memEQ(posixcc, "spac", 4)) {
4502 /* e */
4503 namedclass
4504 = complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
4505 }
4506 break;
4507 case 'h':
4508 if (memEQ(posixcc, "grap", 4)) {
4509 /* h */
4510 namedclass
4511 = complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
4512 }
4513 break;
4514 case 'i':
4515 if (memEQ(posixcc, "asci", 4)) {
4516 /* i */
4517 namedclass
4518 = complement ? ANYOF_NASCII : ANYOF_ASCII;
4519 }
4520 break;
4521 case 'k':
4522 if (memEQ(posixcc, "blan", 4)) {
4523 /* k */
4524 namedclass
4525 = complement ? ANYOF_NBLANK : ANYOF_BLANK;
4526 }
4527 break;
4528 case 'l':
4529 if (memEQ(posixcc, "cntr", 4)) {
4530 /* l */
4531 namedclass
4532 = complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
4533 }
4534 break;
4535 case 'm':
4536 if (memEQ(posixcc, "alnu", 4)) {
4537 /* m */
4538 namedclass
4539 = complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
4540 }
4541 break;
4542 case 'r':
4543 if (memEQ(posixcc, "lowe", 4)) {
4544 /* r */
4545 namedclass
4546 = complement ? ANYOF_NLOWER : ANYOF_LOWER;
4547 }
4548 if (memEQ(posixcc, "uppe", 4)) {
8fdec511 4549 /* r */
80916619
NC
4550 namedclass
4551 = complement ? ANYOF_NUPPER : ANYOF_UPPER;
4552 }
4553 break;
4554 case 't':
4555 if (memEQ(posixcc, "digi", 4)) {
4556 /* t */
4557 namedclass
4558 = complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
4559 }
4560 if (memEQ(posixcc, "prin", 4)) {
8fdec511 4561 /* t */
80916619
NC
4562 namedclass
4563 = complement ? ANYOF_NPRINT : ANYOF_PRINT;
4564 }
4565 if (memEQ(posixcc, "punc", 4)) {
4566 /* t */
4567 namedclass
4568 = complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
4569 }
4570 break;
b8c5462f
JH
4571 }
4572 break;
80916619
NC
4573 case 6:
4574 if (memEQ(posixcc, "xdigit", 6)) {
4575 namedclass
4576 = complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
b8c5462f
JH
4577 }
4578 break;
4579 }
80916619
NC
4580
4581 if (namedclass == OOB_NAMEDCLASS)
b45f050a
JF
4582 {
4583 Simple_vFAIL3("POSIX class [:%.*s:] unknown",
4584 t - s - 1, s + 1);
4585 }
80916619
NC
4586 assert (posixcc[skip] == ':');
4587 assert (posixcc[skip+1] == ']');
b45f050a 4588 } else if (!SIZE_ONLY) {
b8c5462f 4589 /* [[=foo=]] and [[.foo.]] are still future. */
b45f050a 4590
830247a4 4591 /* adjust RExC_parse so the warning shows after
b45f050a 4592 the class closes */
9a86a77b 4593 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
830247a4 4594 RExC_parse++;
b45f050a
JF
4595 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
4596 }
b8c5462f
JH
4597 } else {
4598 /* Maternal grandfather:
4599 * "[:" ending in ":" but not in ":]" */
830247a4 4600 RExC_parse = s;
767d463e 4601 }
620e46c5
JH
4602 }
4603 }
4604
b8c5462f
JH
4605 return namedclass;
4606}
4607
4608STATIC void
830247a4 4609S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
b8c5462f 4610{
97aff369 4611 dVAR;
b938889d 4612 if (!SIZE_ONLY && POSIXCC(UCHARAT(RExC_parse))) {
1df70142
AL
4613 const char *s = RExC_parse;
4614 const char c = *s++;
b8c5462f
JH
4615
4616 while(*s && isALNUM(*s))
4617 s++;
4618 if (*s && c == *s && s[1] == ']') {
cd84f5b2
RGS
4619 if (ckWARN(WARN_REGEXP))
4620 vWARN3(s+2,
4621 "POSIX syntax [%c %c] belongs inside character classes",
4622 c, c);
b45f050a
JF
4623
4624 /* [[=foo=]] and [[.foo.]] are still future. */
9a86a77b 4625 if (POSIXCC_NOTYET(c)) {
830247a4 4626 /* adjust RExC_parse so the error shows after
b45f050a 4627 the class closes */
9a86a77b 4628 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
b45f050a
JF
4629 ;
4630 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
4631 }
b8c5462f
JH
4632 }
4633 }
620e46c5
JH
4634}
4635
76e3520e 4636STATIC regnode *
830247a4 4637S_regclass(pTHX_ RExC_state_t *pRExC_state)
a687059c 4638{
97aff369 4639 dVAR;
ffc61ed2 4640 register UV value;
9a86a77b 4641 register UV nextvalue;
3568d838 4642 register IV prevvalue = OOB_UNICODE;
ffc61ed2 4643 register IV range = 0;
c277df42 4644 register regnode *ret;
ba210ebe 4645 STRLEN numlen;
ffc61ed2 4646 IV namedclass;
cbbf8932 4647 char *rangebegin = NULL;
936ed897 4648 bool need_class = 0;
c445ea15 4649 SV *listsv = NULL;
ffc61ed2
JH
4650 register char *e;
4651 UV n;
9e55ce06 4652 bool optimize_invert = TRUE;
cbbf8932 4653 AV* unicode_alternate = NULL;
1b2d223b
JH
4654#ifdef EBCDIC
4655 UV literal_endpoint = 0;
4656#endif
ffc61ed2
JH
4657
4658 ret = reganode(pRExC_state, ANYOF, 0);
4659
4660 if (!SIZE_ONLY)
4661 ANYOF_FLAGS(ret) = 0;
4662
9a86a77b 4663 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
ffc61ed2
JH
4664 RExC_naughty++;
4665 RExC_parse++;
4666 if (!SIZE_ONLY)
4667 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
4668 }
a0d0e21e 4669
936ed897 4670 if (SIZE_ONLY)
830247a4 4671 RExC_size += ANYOF_SKIP;
936ed897 4672 else {
830247a4 4673 RExC_emit += ANYOF_SKIP;
936ed897
IZ
4674 if (FOLD)
4675 ANYOF_FLAGS(ret) |= ANYOF_FOLD;
4676 if (LOC)
4677 ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
ffc61ed2 4678 ANYOF_BITMAP_ZERO(ret);
396482e1 4679 listsv = newSVpvs("# comment\n");
a0d0e21e 4680 }
b8c5462f 4681
9a86a77b
JH
4682 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
4683
b938889d 4684 if (!SIZE_ONLY && POSIXCC(nextvalue))
830247a4 4685 checkposixcc(pRExC_state);
b8c5462f 4686
f064b6ad
HS
4687 /* allow 1st char to be ] (allowing it to be - is dealt with later) */
4688 if (UCHARAT(RExC_parse) == ']')
4689 goto charclassloop;
ffc61ed2 4690
9a86a77b 4691 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
ffc61ed2
JH
4692
4693 charclassloop:
4694
4695 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
4696
73b437c8 4697 if (!range)
830247a4 4698 rangebegin = RExC_parse;
ffc61ed2 4699 if (UTF) {
5e12f4fb 4700 value = utf8n_to_uvchr((U8*)RExC_parse,
3568d838
JH
4701 RExC_end - RExC_parse,
4702 &numlen, 0);
ffc61ed2
JH
4703 RExC_parse += numlen;
4704 }
4705 else
4706 value = UCHARAT(RExC_parse++);
9a86a77b
JH
4707 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
4708 if (value == '[' && POSIXCC(nextvalue))
830247a4 4709 namedclass = regpposixcc(pRExC_state, value);
620e46c5 4710 else if (value == '\\') {
ffc61ed2 4711 if (UTF) {
5e12f4fb 4712 value = utf8n_to_uvchr((U8*)RExC_parse,
ffc61ed2
JH
4713 RExC_end - RExC_parse,
4714 &numlen, 0);
4715 RExC_parse += numlen;
4716 }
4717 else
4718 value = UCHARAT(RExC_parse++);
470c3474 4719 /* Some compilers cannot handle switching on 64-bit integer
ffc61ed2 4720 * values, therefore value cannot be an UV. Yes, this will
e2962f66
JH
4721 * be a problem later if we want switch on Unicode.
4722 * A similar issue a little bit later when switching on
4723 * namedclass. --jhi */
ffc61ed2 4724 switch ((I32)value) {
b8c5462f
JH
4725 case 'w': namedclass = ANYOF_ALNUM; break;
4726 case 'W': namedclass = ANYOF_NALNUM; break;
4727 case 's': namedclass = ANYOF_SPACE; break;
4728 case 'S': namedclass = ANYOF_NSPACE; break;
4729 case 'd': namedclass = ANYOF_DIGIT; break;
4730 case 'D': namedclass = ANYOF_NDIGIT; break;
ffc61ed2
JH
4731 case 'p':
4732 case 'P':
af6f566e 4733 if (RExC_parse >= RExC_end)
2a4859cd 4734 vFAIL2("Empty \\%c{}", (U8)value);
ffc61ed2 4735 if (*RExC_parse == '{') {
1df70142 4736 const U8 c = (U8)value;
ffc61ed2
JH
4737 e = strchr(RExC_parse++, '}');
4738 if (!e)
0da60cf5 4739 vFAIL2("Missing right brace on \\%c{}", c);
ab13f0c7
JH
4740 while (isSPACE(UCHARAT(RExC_parse)))
4741 RExC_parse++;
4742 if (e == RExC_parse)
0da60cf5 4743 vFAIL2("Empty \\%c{}", c);
ffc61ed2 4744 n = e - RExC_parse;
ab13f0c7
JH
4745 while (isSPACE(UCHARAT(RExC_parse + n - 1)))
4746 n--;
ffc61ed2
JH
4747 }
4748 else {
4749 e = RExC_parse;
4750 n = 1;
4751 }
4752 if (!SIZE_ONLY) {
ab13f0c7
JH
4753 if (UCHARAT(RExC_parse) == '^') {
4754 RExC_parse++;
4755 n--;
4756 value = value == 'p' ? 'P' : 'p'; /* toggle */
4757 while (isSPACE(UCHARAT(RExC_parse))) {
4758 RExC_parse++;
4759 n--;
4760 }
4761 }
ffc61ed2 4762 if (value == 'p')
ab13f0c7
JH
4763 Perl_sv_catpvf(aTHX_ listsv,
4764 "+utf8::%.*s\n", (int)n, RExC_parse);
ffc61ed2 4765 else
ab13f0c7
JH
4766 Perl_sv_catpvf(aTHX_ listsv,
4767 "!utf8::%.*s\n", (int)n, RExC_parse);
ffc61ed2
JH
4768 }
4769 RExC_parse = e + 1;
4770 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
f81125e2
JP
4771 namedclass = ANYOF_MAX; /* no official name, but it's named */
4772 break;
b8c5462f
JH
4773 case 'n': value = '\n'; break;
4774 case 'r': value = '\r'; break;
4775 case 't': value = '\t'; break;
4776 case 'f': value = '\f'; break;
4777 case 'b': value = '\b'; break;
c7f1f016
NIS
4778 case 'e': value = ASCII_TO_NATIVE('\033');break;
4779 case 'a': value = ASCII_TO_NATIVE('\007');break;
b8c5462f 4780 case 'x':
ffc61ed2 4781 if (*RExC_parse == '{') {
a4c04bdc
NC
4782 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
4783 | PERL_SCAN_DISALLOW_PREFIX;
ffc61ed2 4784 e = strchr(RExC_parse++, '}');
b81d288d 4785 if (!e)
ffc61ed2 4786 vFAIL("Missing right brace on \\x{}");
53305cf1
NC
4787
4788 numlen = e - RExC_parse;
4789 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
ffc61ed2
JH
4790 RExC_parse = e + 1;
4791 }
4792 else {
a4c04bdc 4793 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
53305cf1
NC
4794 numlen = 2;
4795 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
ffc61ed2
JH
4796 RExC_parse += numlen;
4797 }
b8c5462f
JH
4798 break;
4799 case 'c':
830247a4 4800 value = UCHARAT(RExC_parse++);
b8c5462f
JH
4801 value = toCTRL(value);
4802 break;
4803 case '0': case '1': case '2': case '3': case '4':
4804 case '5': case '6': case '7': case '8': case '9':
53305cf1
NC
4805 {
4806 I32 flags = 0;
4807 numlen = 3;
4808 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
830247a4 4809 RExC_parse += numlen;
b8c5462f 4810 break;
53305cf1 4811 }
1028017a 4812 default:
041457d9 4813 if (!SIZE_ONLY && isALPHA(value) && ckWARN(WARN_REGEXP))
ffc61ed2
JH
4814 vWARN2(RExC_parse,
4815 "Unrecognized escape \\%c in character class passed through",
4816 (int)value);
1028017a 4817 break;
b8c5462f 4818 }
ffc61ed2 4819 } /* end of \blah */
1b2d223b
JH
4820#ifdef EBCDIC
4821 else
4822 literal_endpoint++;
4823#endif
ffc61ed2
JH
4824
4825 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
4826
4827 if (!SIZE_ONLY && !need_class)
936ed897 4828 ANYOF_CLASS_ZERO(ret);
ffc61ed2 4829
936ed897 4830 need_class = 1;
ffc61ed2
JH
4831
4832 /* a bad range like a-\d, a-[:digit:] ? */
4833 if (range) {
73b437c8 4834 if (!SIZE_ONLY) {
afd78fd5
JH
4835 if (ckWARN(WARN_REGEXP)) {
4836 int w =
4837 RExC_parse >= rangebegin ?
4838 RExC_parse - rangebegin : 0;
830247a4 4839 vWARN4(RExC_parse,
b45f050a 4840 "False [] range \"%*.*s\"",
afd78fd5
JH
4841 w,
4842 w,
b45f050a 4843 rangebegin);
afd78fd5 4844 }
3568d838
JH
4845 if (prevvalue < 256) {
4846 ANYOF_BITMAP_SET(ret, prevvalue);
ffc61ed2
JH
4847 ANYOF_BITMAP_SET(ret, '-');
4848 }
4849 else {
4850 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
4851 Perl_sv_catpvf(aTHX_ listsv,
3568d838 4852 "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
ffc61ed2 4853 }
b8c5462f 4854 }
ffc61ed2
JH
4855
4856 range = 0; /* this was not a true range */
73b437c8 4857 }
ffc61ed2 4858
73b437c8 4859 if (!SIZE_ONLY) {
c49a72a9
NC
4860 const char *what = NULL;
4861 char yesno = 0;
4862
3568d838
JH
4863 if (namedclass > OOB_NAMEDCLASS)
4864 optimize_invert = FALSE;
e2962f66
JH
4865 /* Possible truncation here but in some 64-bit environments
4866 * the compiler gets heartburn about switch on 64-bit values.
4867 * A similar issue a little earlier when switching on value.
98f323fa 4868 * --jhi */
e2962f66 4869 switch ((I32)namedclass) {
73b437c8
JH
4870 case ANYOF_ALNUM:
4871 if (LOC)
936ed897 4872 ANYOF_CLASS_SET(ret, ANYOF_ALNUM);
73b437c8
JH
4873 else {
4874 for (value = 0; value < 256; value++)
4875 if (isALNUM(value))
936ed897 4876 ANYOF_BITMAP_SET(ret, value);
73b437c8 4877 }
c49a72a9
NC
4878 yesno = '+';
4879 what = "Word";
73b437c8
JH
4880 break;
4881 case ANYOF_NALNUM:
4882 if (LOC)
936ed897 4883 ANYOF_CLASS_SET(ret, ANYOF_NALNUM);
73b437c8
JH
4884 else {
4885 for (value = 0; value < 256; value++)
4886 if (!isALNUM(value))
936ed897 4887 ANYOF_BITMAP_SET(ret, value);
73b437c8 4888 }
c49a72a9
NC
4889 yesno = '!';
4890 what = "Word";
73b437c8 4891 break;
ffc61ed2 4892 case ANYOF_ALNUMC:
73b437c8 4893 if (LOC)
ffc61ed2 4894 ANYOF_CLASS_SET(ret, ANYOF_ALNUMC);
73b437c8
JH
4895 else {
4896 for (value = 0; value < 256; value++)
ffc61ed2 4897 if (isALNUMC(value))
936ed897 4898 ANYOF_BITMAP_SET(ret, value);
73b437c8 4899 }
c49a72a9
NC
4900 yesno = '+';
4901 what = "Alnum";
73b437c8
JH
4902 break;
4903 case ANYOF_NALNUMC:
4904 if (LOC)
936ed897 4905 ANYOF_CLASS_SET(ret, ANYOF_NALNUMC);
73b437c8
JH
4906 else {
4907 for (value = 0; value < 256; value++)
4908 if (!isALNUMC(value))
936ed897 4909 ANYOF_BITMAP_SET(ret, value);
73b437c8 4910 }
c49a72a9
NC
4911 yesno = '!';
4912 what = "Alnum";
73b437c8
JH
4913 break;
4914 case ANYOF_ALPHA:
4915 if (LOC)
936ed897 4916 ANYOF_CLASS_SET(ret, ANYOF_ALPHA);
73b437c8
JH
4917 else {
4918 for (value = 0; value < 256; value++)
4919 if (isALPHA(value))
936ed897 4920 ANYOF_BITMAP_SET(ret, value);
73b437c8 4921 }
c49a72a9
NC
4922 yesno = '+';
4923 what = "Alpha";
73b437c8
JH
4924 break;
4925 case ANYOF_NALPHA:
4926 if (LOC)
936ed897 4927 ANYOF_CLASS_SET(ret, ANYOF_NALPHA);
73b437c8
JH
4928 else {
4929 for (value = 0; value < 256; value++)
4930 if (!isALPHA(value))
936ed897 4931 ANYOF_BITMAP_SET(ret, value);
73b437c8 4932 }
c49a72a9
NC
4933 yesno = '!';
4934 what = "Alpha";
73b437c8
JH
4935 break;
4936 case ANYOF_ASCII:
4937 if (LOC)
936ed897 4938 ANYOF_CLASS_SET(ret, ANYOF_ASCII);
73b437c8 4939 else {
c7f1f016 4940#ifndef EBCDIC
1ba5c669
JH
4941 for (value = 0; value < 128; value++)
4942 ANYOF_BITMAP_SET(ret, value);
4943#else /* EBCDIC */
ffbc6a93 4944 for (value = 0; value < 256; value++) {
3a3c4447
JH
4945 if (isASCII(value))
4946 ANYOF_BITMAP_SET(ret, value);
ffbc6a93 4947 }
1ba5c669 4948#endif /* EBCDIC */
73b437c8 4949 }
c49a72a9
NC
4950 yesno = '+';
4951 what = "ASCII";
73b437c8
JH
4952 break;
4953 case ANYOF_NASCII:
4954 if (LOC)
936ed897 4955 ANYOF_CLASS_SET(ret, ANYOF_NASCII);
73b437c8 4956 else {
c7f1f016 4957#ifndef EBCDIC
1ba5c669
JH
4958 for (value = 128; value < 256; value++)
4959 ANYOF_BITMAP_SET(ret, value);
4960#else /* EBCDIC */
ffbc6a93 4961 for (value = 0; value < 256; value++) {
3a3c4447
JH
4962 if (!isASCII(value))
4963 ANYOF_BITMAP_SET(ret, value);
ffbc6a93 4964 }
1ba5c669 4965#endif /* EBCDIC */
73b437c8 4966 }
c49a72a9
NC
4967 yesno = '!';
4968 what = "ASCII";
73b437c8 4969 break;
aaa51d5e
JF
4970 case ANYOF_BLANK:
4971 if (LOC)
4972 ANYOF_CLASS_SET(ret, ANYOF_BLANK);
4973 else {
4974 for (value = 0; value < 256; value++)
4975 if (isBLANK(value))
4976 ANYOF_BITMAP_SET(ret, value);
4977 }
c49a72a9
NC
4978 yesno = '+';
4979 what = "Blank";
aaa51d5e
JF
4980 break;
4981 case ANYOF_NBLANK:
4982 if (LOC)
4983 ANYOF_CLASS_SET(ret, ANYOF_NBLANK);
4984 else {
4985 for (value = 0; value < 256; value++)
4986 if (!isBLANK(value))
4987 ANYOF_BITMAP_SET(ret, value);
4988 }
c49a72a9
NC
4989 yesno = '!';
4990 what = "Blank";
aaa51d5e 4991 break;
73b437c8
JH
4992 case ANYOF_CNTRL:
4993 if (LOC)
936ed897 4994 ANYOF_CLASS_SET(ret, ANYOF_CNTRL);
73b437c8
JH
4995 else {
4996 for (value = 0; value < 256; value++)
4997 if (isCNTRL(value))
936ed897 4998 ANYOF_BITMAP_SET(ret, value);
73b437c8 4999 }
c49a72a9
NC
5000 yesno = '+';
5001 what = "Cntrl";
73b437c8
JH
5002 break;
5003 case ANYOF_NCNTRL:
5004 if (LOC)
936ed897 5005 ANYOF_CLASS_SET(ret, ANYOF_NCNTRL);
73b437c8
JH
5006 else {
5007 for (value = 0; value < 256; value++)
5008 if (!isCNTRL(value))
936ed897 5009 ANYOF_BITMAP_SET(ret, value);
73b437c8 5010 }
c49a72a9
NC
5011 yesno = '!';
5012 what = "Cntrl";
ffc61ed2
JH
5013 break;
5014 case ANYOF_DIGIT:
5015 if (LOC)
5016 ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
5017 else {
5018 /* consecutive digits assumed */
5019 for (value = '0'; value <= '9'; value++)
5020 ANYOF_BITMAP_SET(ret, value);
5021 }
c49a72a9
NC
5022 yesno = '+';
5023 what = "Digit";
ffc61ed2
JH
5024 break;
5025 case ANYOF_NDIGIT:
5026 if (LOC)
5027 ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
5028 else {
5029 /* consecutive digits assumed */
5030 for (value = 0; value < '0'; value++)
5031 ANYOF_BITMAP_SET(ret, value);
5032 for (value = '9' + 1; value < 256; value++)
5033 ANYOF_BITMAP_SET(ret, value);
5034 }
c49a72a9
NC
5035 yesno = '!';
5036 what = "Digit";
73b437c8
JH
5037 break;
5038 case ANYOF_GRAPH:
5039 if (LOC)
936ed897 5040 ANYOF_CLASS_SET(ret, ANYOF_GRAPH);
73b437c8
JH
5041 else {
5042 for (value = 0; value < 256; value++)
5043 if (isGRAPH(value))
936ed897 5044 ANYOF_BITMAP_SET(ret, value);
73b437c8 5045 }
c49a72a9
NC
5046 yesno = '+';
5047 what = "Graph";
73b437c8
JH
5048 break;
5049 case ANYOF_NGRAPH:
5050 if (LOC)
936ed897 5051 ANYOF_CLASS_SET(ret, ANYOF_NGRAPH);
73b437c8
JH
5052 else {
5053 for (value = 0; value < 256; value++)
5054 if (!isGRAPH(value))
936ed897 5055 ANYOF_BITMAP_SET(ret, value);
73b437c8 5056 }
c49a72a9
NC
5057 yesno = '!';
5058 what = "Graph";
73b437c8
JH
5059 break;
5060 case ANYOF_LOWER:
5061 if (LOC)
936ed897 5062 ANYOF_CLASS_SET(ret, ANYOF_LOWER);
73b437c8
JH
5063 else {
5064 for (value = 0; value < 256; value++)
5065 if (isLOWER(value))
936ed897 5066 ANYOF_BITMAP_SET(ret, value);
73b437c8 5067 }
c49a72a9
NC
5068 yesno = '+';
5069 what = "Lower";
73b437c8
JH
5070 break;
5071 case ANYOF_NLOWER:
5072 if (LOC)
936ed897 5073 ANYOF_CLASS_SET(ret, ANYOF_NLOWER);
73b437c8
JH
5074 else {
5075 for (value = 0; value < 256; value++)
5076 if (!isLOWER(value))
936ed897 5077 ANYOF_BITMAP_SET(ret, value);
73b437c8 5078 }
c49a72a9
NC
5079 yesno = '!';
5080 what = "Lower";
73b437c8
JH
5081 break;
5082 case ANYOF_PRINT:
5083 if (LOC)
936ed897 5084 ANYOF_CLASS_SET(ret, ANYOF_PRINT);
73b437c8
JH
5085 else {
5086 for (value = 0; value < 256; value++)
5087 if (isPRINT(value))
936ed897 5088 ANYOF_BITMAP_SET(ret, value);
73b437c8 5089 }
c49a72a9
NC
5090 yesno = '+';
5091 what = "Print";
73b437c8
JH
5092 break;
5093 case ANYOF_NPRINT:
5094 if (LOC)
936ed897 5095 ANYOF_CLASS_SET(ret, ANYOF_NPRINT);
73b437c8
JH
5096 else {
5097 for (value = 0; value < 256; value++)
5098 if (!isPRINT(value))
936ed897 5099 ANYOF_BITMAP_SET(ret, value);
73b437c8 5100 }
c49a72a9
NC
5101 yesno = '!';
5102 what = "Print";
73b437c8 5103 break;
aaa51d5e
JF
5104 case ANYOF_PSXSPC:
5105 if (LOC)
5106 ANYOF_CLASS_SET(ret, ANYOF_PSXSPC);
5107 else {
5108 for (value = 0; value < 256; value++)
5109 if (isPSXSPC(value))
5110 ANYOF_BITMAP_SET(ret, value);
5111 }
c49a72a9
NC
5112 yesno = '+';
5113 what = "Space";
aaa51d5e
JF
5114 break;
5115 case ANYOF_NPSXSPC:
5116 if (LOC)
5117 ANYOF_CLASS_SET(ret, ANYOF_NPSXSPC);
5118 else {
5119 for (value = 0; value < 256; value++)
5120 if (!isPSXSPC(value))
5121 ANYOF_BITMAP_SET(ret, value);
5122 }
c49a72a9
NC
5123 yesno = '!';
5124 what = "Space";
aaa51d5e 5125 break;
73b437c8
JH
5126 case ANYOF_PUNCT:
5127 if (LOC)
936ed897 5128 ANYOF_CLASS_SET(ret, ANYOF_PUNCT);
73b437c8
JH
5129 else {
5130 for (value = 0; value < 256; value++)
5131 if (isPUNCT(value))
936ed897 5132 ANYOF_BITMAP_SET(ret, value);
73b437c8 5133 }
c49a72a9
NC
5134 yesno = '+';
5135 what = "Punct";
73b437c8
JH
5136 break;
5137 case ANYOF_NPUNCT:
5138 if (LOC)
936ed897 5139 ANYOF_CLASS_SET(ret, ANYOF_NPUNCT);
73b437c8
JH
5140 else {
5141 for (value = 0; value < 256; value++)
5142 if (!isPUNCT(value))
936ed897 5143 ANYOF_BITMAP_SET(ret, value);
73b437c8 5144 }
c49a72a9
NC
5145 yesno = '!';
5146 what = "Punct";
ffc61ed2
JH
5147 break;
5148 case ANYOF_SPACE:
5149 if (LOC)
5150 ANYOF_CLASS_SET(ret, ANYOF_SPACE);
5151 else {
5152 for (value = 0; value < 256; value++)
5153 if (isSPACE(value))
5154 ANYOF_BITMAP_SET(ret, value);
5155 }
c49a72a9
NC
5156 yesno = '+';
5157 what = "SpacePerl";
ffc61ed2
JH
5158 break;
5159 case ANYOF_NSPACE:
5160 if (LOC)
5161 ANYOF_CLASS_SET(ret, ANYOF_NSPACE);
5162 else {
5163 for (value = 0; value < 256; value++)
5164 if (!isSPACE(value))
5165 ANYOF_BITMAP_SET(ret, value);
5166 }
c49a72a9
NC
5167 yesno = '!';
5168 what = "SpacePerl";
73b437c8
JH
5169 break;
5170 case ANYOF_UPPER:
5171 if (LOC)
936ed897 5172 ANYOF_CLASS_SET(ret, ANYOF_UPPER);
73b437c8
JH
5173 else {
5174 for (value = 0; value < 256; value++)
5175 if (isUPPER(value))
936ed897 5176 ANYOF_BITMAP_SET(ret, value);
73b437c8 5177 }
c49a72a9
NC
5178 yesno = '+';
5179 what = "Upper";
73b437c8
JH
5180 break;
5181 case ANYOF_NUPPER:
5182 if (LOC)
936ed897 5183 ANYOF_CLASS_SET(ret, ANYOF_NUPPER);
73b437c8
JH
5184 else {
5185 for (value = 0; value < 256; value++)
5186 if (!isUPPER(value))
936ed897 5187 ANYOF_BITMAP_SET(ret, value);
73b437c8 5188 }
c49a72a9
NC
5189 yesno = '!';
5190 what = "Upper";
73b437c8
JH
5191 break;
5192 case ANYOF_XDIGIT:
5193 if (LOC)
936ed897 5194 ANYOF_CLASS_SET(ret, ANYOF_XDIGIT);
73b437c8
JH
5195 else {
5196 for (value = 0; value < 256; value++)
5197 if (isXDIGIT(value))
936ed897 5198 ANYOF_BITMAP_SET(ret, value);
73b437c8 5199 }
c49a72a9
NC
5200 yesno = '+';
5201 what = "XDigit";
73b437c8
JH
5202 break;
5203 case ANYOF_NXDIGIT:
5204 if (LOC)
936ed897 5205 ANYOF_CLASS_SET(ret, ANYOF_NXDIGIT);
73b437c8
JH
5206 else {
5207 for (value = 0; value < 256; value++)
5208 if (!isXDIGIT(value))
936ed897 5209 ANYOF_BITMAP_SET(ret, value);
73b437c8 5210 }
c49a72a9
NC
5211 yesno = '!';
5212 what = "XDigit";
73b437c8 5213 break;
f81125e2
JP
5214 case ANYOF_MAX:
5215 /* this is to handle \p and \P */
5216 break;
73b437c8 5217 default:
b45f050a 5218 vFAIL("Invalid [::] class");
73b437c8 5219 break;
b8c5462f 5220 }
c49a72a9
NC
5221 if (what) {
5222 /* Strings such as "+utf8::isWord\n" */
5223 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::Is%s\n", yesno, what);
5224 }
b8c5462f 5225 if (LOC)
936ed897 5226 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
73b437c8 5227 continue;
a0d0e21e 5228 }
ffc61ed2
JH
5229 } /* end of namedclass \blah */
5230
a0d0e21e 5231 if (range) {
eb160463 5232 if (prevvalue > (IV)value) /* b-a */ {
d4c19fe8
AL
5233 const int w = RExC_parse - rangebegin;
5234 Simple_vFAIL4("Invalid [] range \"%*.*s\"", w, w, rangebegin);
3568d838 5235 range = 0; /* not a valid range */
73b437c8 5236 }
a0d0e21e
LW
5237 }
5238 else {
3568d838 5239 prevvalue = value; /* save the beginning of the range */
830247a4
IZ
5240 if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
5241 RExC_parse[1] != ']') {
5242 RExC_parse++;
ffc61ed2
JH
5243
5244 /* a bad range like \w-, [:word:]- ? */
5245 if (namedclass > OOB_NAMEDCLASS) {
afd78fd5 5246 if (ckWARN(WARN_REGEXP)) {
d4c19fe8 5247 const int w =
afd78fd5
JH
5248 RExC_parse >= rangebegin ?
5249 RExC_parse - rangebegin : 0;
830247a4 5250 vWARN4(RExC_parse,
b45f050a 5251 "False [] range \"%*.*s\"",
afd78fd5
JH
5252 w,
5253 w,
b45f050a 5254 rangebegin);
afd78fd5 5255 }
73b437c8 5256 if (!SIZE_ONLY)
936ed897 5257 ANYOF_BITMAP_SET(ret, '-');
73b437c8 5258 } else
ffc61ed2
JH
5259 range = 1; /* yeah, it's a range! */
5260 continue; /* but do it the next time */
a0d0e21e 5261 }
a687059c 5262 }
ffc61ed2 5263
93733859 5264 /* now is the next time */
ae5c130c 5265 if (!SIZE_ONLY) {
3568d838
JH
5266 IV i;
5267
5268 if (prevvalue < 256) {
1df70142 5269 const IV ceilvalue = value < 256 ? value : 255;
3568d838
JH
5270
5271#ifdef EBCDIC
1b2d223b
JH
5272 /* In EBCDIC [\x89-\x91] should include
5273 * the \x8e but [i-j] should not. */
5274 if (literal_endpoint == 2 &&
5275 ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
5276 (isUPPER(prevvalue) && isUPPER(ceilvalue))))
ffc61ed2 5277 {
3568d838
JH
5278 if (isLOWER(prevvalue)) {
5279 for (i = prevvalue; i <= ceilvalue; i++)
ffc61ed2
JH
5280 if (isLOWER(i))
5281 ANYOF_BITMAP_SET(ret, i);
5282 } else {
3568d838 5283 for (i = prevvalue; i <= ceilvalue; i++)
ffc61ed2
JH
5284 if (isUPPER(i))
5285 ANYOF_BITMAP_SET(ret, i);
5286 }
8ada0baa 5287 }
ffc61ed2 5288 else
8ada0baa 5289#endif
a5961de5
JH
5290 for (i = prevvalue; i <= ceilvalue; i++)
5291 ANYOF_BITMAP_SET(ret, i);
3568d838 5292 }
a5961de5 5293 if (value > 255 || UTF) {
1df70142
AL
5294 const UV prevnatvalue = NATIVE_TO_UNI(prevvalue);
5295 const UV natvalue = NATIVE_TO_UNI(value);
b08decb7 5296
ffc61ed2 5297 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
b08decb7 5298 if (prevnatvalue < natvalue) { /* what about > ? */
ffc61ed2 5299 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
b08decb7
JH
5300 prevnatvalue, natvalue);
5301 }
5302 else if (prevnatvalue == natvalue) {
5303 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue);
09091399 5304 if (FOLD) {
89ebb4a3 5305 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
254ba52a 5306 STRLEN foldlen;
1df70142 5307 const UV f = to_uni_fold(natvalue, foldbuf, &foldlen);
254ba52a 5308
c840d2a2
JH
5309 /* If folding and foldable and a single
5310 * character, insert also the folded version
5311 * to the charclass. */
9e55ce06 5312 if (f != value) {
eb160463 5313 if (foldlen == (STRLEN)UNISKIP(f))
9e55ce06
JH
5314 Perl_sv_catpvf(aTHX_ listsv,
5315 "%04"UVxf"\n", f);
5316 else {
5317 /* Any multicharacter foldings
5318 * require the following transform:
5319 * [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst)
5320 * where E folds into "pq" and F folds
5321 * into "rst", all other characters
5322 * fold to single characters. We save
5323 * away these multicharacter foldings,
5324 * to be later saved as part of the
5325 * additional "s" data. */
5326 SV *sv;
5327
5328 if (!unicode_alternate)
5329 unicode_alternate = newAV();
5330 sv = newSVpvn((char*)foldbuf, foldlen);
5331 SvUTF8_on(sv);
5332 av_push(unicode_alternate, sv);
5333 }
5334 }
254ba52a 5335
60a8b682
JH
5336 /* If folding and the value is one of the Greek
5337 * sigmas insert a few more sigmas to make the
5338 * folding rules of the sigmas to work right.
5339 * Note that not all the possible combinations
5340 * are handled here: some of them are handled
9e55ce06
JH
5341 * by the standard folding rules, and some of
5342 * them (literal or EXACTF cases) are handled
5343 * during runtime in regexec.c:S_find_byclass(). */
09091399
JH
5344 if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) {
5345 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 5346 (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA);
09091399 5347 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 5348 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
09091399
JH
5349 }
5350 else if (value == UNICODE_GREEK_CAPITAL_LETTER_SIGMA)
5351 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 5352 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
09091399
JH
5353 }
5354 }
ffc61ed2 5355 }
1b2d223b
JH
5356#ifdef EBCDIC
5357 literal_endpoint = 0;
5358#endif
8ada0baa 5359 }
ffc61ed2
JH
5360
5361 range = 0; /* this range (if it was one) is done now */
a0d0e21e 5362 }
ffc61ed2 5363
936ed897 5364 if (need_class) {
4f66b38d 5365 ANYOF_FLAGS(ret) |= ANYOF_LARGE;
936ed897 5366 if (SIZE_ONLY)
830247a4 5367 RExC_size += ANYOF_CLASS_ADD_SKIP;
936ed897 5368 else
830247a4 5369 RExC_emit += ANYOF_CLASS_ADD_SKIP;
936ed897 5370 }
ffc61ed2 5371
ae5c130c 5372 /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
b8c5462f 5373 if (!SIZE_ONLY &&
ffc61ed2 5374 /* If the only flag is folding (plus possibly inversion). */
516a5887
JH
5375 ((ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD)
5376 ) {
a0ed51b3 5377 for (value = 0; value < 256; ++value) {
936ed897 5378 if (ANYOF_BITMAP_TEST(ret, value)) {
eb160463 5379 UV fold = PL_fold[value];
ffc61ed2
JH
5380
5381 if (fold != value)
5382 ANYOF_BITMAP_SET(ret, fold);
ae5c130c
GS
5383 }
5384 }
936ed897 5385 ANYOF_FLAGS(ret) &= ~ANYOF_FOLD;
ae5c130c 5386 }
ffc61ed2 5387
ae5c130c 5388 /* optimize inverted simple patterns (e.g. [^a-z]) */
3568d838 5389 if (!SIZE_ONLY && optimize_invert &&
ffc61ed2
JH
5390 /* If the only flag is inversion. */
5391 (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
b8c5462f 5392 for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
936ed897 5393 ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL;
1aa99e6b 5394 ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
ae5c130c 5395 }
a0d0e21e 5396
b81d288d 5397 if (!SIZE_ONLY) {
fde631ed 5398 AV *av = newAV();
ffc61ed2
JH
5399 SV *rv;
5400
9e55ce06 5401 /* The 0th element stores the character class description
6a0407ee 5402 * in its textual form: used later (regexec.c:Perl_regclass_swash())
9e55ce06
JH
5403 * to initialize the appropriate swash (which gets stored in
5404 * the 1st element), and also useful for dumping the regnode.
5405 * The 2nd element stores the multicharacter foldings,
6a0407ee 5406 * used later (regexec.c:S_reginclass()). */
ffc61ed2
JH
5407 av_store(av, 0, listsv);
5408 av_store(av, 1, NULL);
9e55ce06 5409 av_store(av, 2, (SV*)unicode_alternate);
ffc61ed2 5410 rv = newRV_noinc((SV*)av);
19860706 5411 n = add_data(pRExC_state, 1, "s");
830247a4 5412 RExC_rx->data->data[n] = (void*)rv;
ffc61ed2 5413 ARG_SET(ret, n);
a0ed51b3
LW
5414 }
5415
5416 return ret;
5417}
5418
76e3520e 5419STATIC char*
830247a4 5420S_nextchar(pTHX_ RExC_state_t *pRExC_state)
a0d0e21e 5421{
830247a4 5422 char* retval = RExC_parse++;
a0d0e21e 5423
4633a7c4 5424 for (;;) {
830247a4
IZ
5425 if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
5426 RExC_parse[2] == '#') {
e994fd66
AE
5427 while (*RExC_parse != ')') {
5428 if (RExC_parse == RExC_end)
5429 FAIL("Sequence (?#... not terminated");
830247a4 5430 RExC_parse++;
e994fd66 5431 }
830247a4 5432 RExC_parse++;
4633a7c4
LW
5433 continue;
5434 }
e2509266 5435 if (RExC_flags & PMf_EXTENDED) {
830247a4
IZ
5436 if (isSPACE(*RExC_parse)) {
5437 RExC_parse++;
748a9306
LW
5438 continue;
5439 }
830247a4 5440 else if (*RExC_parse == '#') {
e994fd66
AE
5441 while (RExC_parse < RExC_end)
5442 if (*RExC_parse++ == '\n') break;
748a9306
LW
5443 continue;
5444 }
748a9306 5445 }
4633a7c4 5446 return retval;
a0d0e21e 5447 }
a687059c
LW
5448}
5449
5450/*
c277df42 5451- reg_node - emit a node
a0d0e21e 5452*/
76e3520e 5453STATIC regnode * /* Location. */
830247a4 5454S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
a687059c 5455{
97aff369 5456 dVAR;
c277df42 5457 register regnode *ptr;
504618e9 5458 regnode * const ret = RExC_emit;
a687059c 5459
c277df42 5460 if (SIZE_ONLY) {
830247a4
IZ
5461 SIZE_ALIGN(RExC_size);
5462 RExC_size += 1;
a0d0e21e
LW
5463 return(ret);
5464 }
a687059c 5465
c277df42 5466 NODE_ALIGN_FILL(ret);
a0d0e21e 5467 ptr = ret;
c277df42 5468 FILL_ADVANCE_NODE(ptr, op);
fac92740 5469 if (RExC_offsets) { /* MJD */
ccb2c380 5470 MJD_OFFSET_DEBUG(("%s:%u: (op %s) %s %u <- %u (len %u) (max %u).\n",
fac92740
MJD
5471 "reg_node", __LINE__,
5472 reg_name[op],
5473 RExC_emit - RExC_emit_start > RExC_offsets[0]
5474 ? "Overwriting end of array!\n" : "OK",
5475 RExC_emit - RExC_emit_start,
5476 RExC_parse - RExC_start,
5477 RExC_offsets[0]));
ccb2c380 5478 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
fac92740
MJD
5479 }
5480
830247a4 5481 RExC_emit = ptr;
a687059c 5482
a0d0e21e 5483 return(ret);
a687059c
LW
5484}
5485
5486/*
a0d0e21e
LW
5487- reganode - emit a node with an argument
5488*/
76e3520e 5489STATIC regnode * /* Location. */
830247a4 5490S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
fe14fcc3 5491{
97aff369 5492 dVAR;
c277df42 5493 register regnode *ptr;
504618e9 5494 regnode * const ret = RExC_emit;
fe14fcc3 5495
c277df42 5496 if (SIZE_ONLY) {
830247a4
IZ
5497 SIZE_ALIGN(RExC_size);
5498 RExC_size += 2;
a0d0e21e
LW
5499 return(ret);
5500 }
fe14fcc3 5501
c277df42 5502 NODE_ALIGN_FILL(ret);
a0d0e21e 5503 ptr = ret;
c277df42 5504 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
fac92740 5505 if (RExC_offsets) { /* MJD */
ccb2c380 5506 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %u <- %u (max %u).\n",
fac92740 5507 "reganode",
ccb2c380
MP
5508 __LINE__,
5509 reg_name[op],
fac92740
MJD
5510 RExC_emit - RExC_emit_start > RExC_offsets[0] ?
5511 "Overwriting end of array!\n" : "OK",
5512 RExC_emit - RExC_emit_start,
5513 RExC_parse - RExC_start,
5514 RExC_offsets[0]));
ccb2c380 5515 Set_Cur_Node_Offset;
fac92740
MJD
5516 }
5517
830247a4 5518 RExC_emit = ptr;
fe14fcc3 5519
a0d0e21e 5520 return(ret);
fe14fcc3
LW
5521}
5522
5523/*
cd439c50 5524- reguni - emit (if appropriate) a Unicode character
a0ed51b3
LW
5525*/
5526STATIC void
a28509cc 5527S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s, STRLEN* lenp)
a0ed51b3 5528{
97aff369 5529 dVAR;
5e12f4fb 5530 *lenp = SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
a0ed51b3
LW
5531}
5532
5533/*
a0d0e21e
LW
5534- reginsert - insert an operator in front of already-emitted operand
5535*
5536* Means relocating the operand.
5537*/
76e3520e 5538STATIC void
830247a4 5539S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd)
a687059c 5540{
97aff369 5541 dVAR;
c277df42
IZ
5542 register regnode *src;
5543 register regnode *dst;
5544 register regnode *place;
504618e9 5545 const int offset = regarglen[(U8)op];
b81d288d 5546
22c35a8c 5547/* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
c277df42
IZ
5548
5549 if (SIZE_ONLY) {
830247a4 5550 RExC_size += NODE_STEP_REGNODE + offset;
a0d0e21e
LW
5551 return;
5552 }
a687059c 5553
830247a4
IZ
5554 src = RExC_emit;
5555 RExC_emit += NODE_STEP_REGNODE + offset;
5556 dst = RExC_emit;
fac92740 5557 while (src > opnd) {
c277df42 5558 StructCopy(--src, --dst, regnode);
fac92740 5559 if (RExC_offsets) { /* MJD 20010112 */
ccb2c380 5560 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %u -> %u (max %u).\n",
fac92740 5561 "reg_insert",
ccb2c380
MP
5562 __LINE__,
5563 reg_name[op],
fac92740
MJD
5564 dst - RExC_emit_start > RExC_offsets[0]
5565 ? "Overwriting end of array!\n" : "OK",
5566 src - RExC_emit_start,
5567 dst - RExC_emit_start,
5568 RExC_offsets[0]));
ccb2c380
MP
5569 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
5570 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
fac92740
MJD
5571 }
5572 }
5573
a0d0e21e
LW
5574
5575 place = opnd; /* Op node, where operand used to be. */
fac92740 5576 if (RExC_offsets) { /* MJD */
ccb2c380 5577 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %u <- %u (max %u).\n",
fac92740 5578 "reginsert",
ccb2c380
MP
5579 __LINE__,
5580 reg_name[op],
fac92740
MJD
5581 place - RExC_emit_start > RExC_offsets[0]
5582 ? "Overwriting end of array!\n" : "OK",
5583 place - RExC_emit_start,
5584 RExC_parse - RExC_start,
5585 RExC_offsets[0]));
ccb2c380 5586 Set_Node_Offset(place, RExC_parse);
45948336 5587 Set_Node_Length(place, 1);
fac92740 5588 }
c277df42
IZ
5589 src = NEXTOPER(place);
5590 FILL_ADVANCE_NODE(place, op);
5591 Zero(src, offset, regnode);
a687059c
LW
5592}
5593
5594/*
c277df42 5595- regtail - set the next-pointer at the end of a node chain of p to val.
a0d0e21e 5596*/
76e3520e 5597STATIC void
830247a4 5598S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, regnode *val)
a687059c 5599{
97aff369 5600 dVAR;
c277df42 5601 register regnode *scan;
a0d0e21e 5602
c277df42 5603 if (SIZE_ONLY)
a0d0e21e
LW
5604 return;
5605
5606 /* Find last node. */
5607 scan = p;
5608 for (;;) {
504618e9 5609 regnode * const temp = regnext(scan);
a0d0e21e
LW
5610 if (temp == NULL)
5611 break;
5612 scan = temp;
5613 }
a687059c 5614
c277df42
IZ
5615 if (reg_off_by_arg[OP(scan)]) {
5616 ARG_SET(scan, val - scan);
a0ed51b3
LW
5617 }
5618 else {
c277df42
IZ
5619 NEXT_OFF(scan) = val - scan;
5620 }
a687059c
LW
5621}
5622
5623/*
a0d0e21e
LW
5624- regoptail - regtail on operand of first argument; nop if operandless
5625*/
76e3520e 5626STATIC void
830247a4 5627S_regoptail(pTHX_ RExC_state_t *pRExC_state, regnode *p, regnode *val)
a687059c 5628{
97aff369 5629 dVAR;
a0d0e21e 5630 /* "Operandless" and "op != BRANCH" are synonymous in practice. */
c277df42
IZ
5631 if (p == NULL || SIZE_ONLY)
5632 return;
22c35a8c 5633 if (PL_regkind[(U8)OP(p)] == BRANCH) {
830247a4 5634 regtail(pRExC_state, NEXTOPER(p), val);
a0ed51b3 5635 }
22c35a8c 5636 else if ( PL_regkind[(U8)OP(p)] == BRANCHJ) {
830247a4 5637 regtail(pRExC_state, NEXTOPER(NEXTOPER(p)), val);
a0ed51b3
LW
5638 }
5639 else
a0d0e21e 5640 return;
a687059c
LW
5641}
5642
5643/*
5644 - regcurly - a little FSA that accepts {\d+,?\d*}
5645 */
79072805 5646STATIC I32
5f66b61c 5647S_regcurly(register const char *s)
a687059c
LW
5648{
5649 if (*s++ != '{')
5650 return FALSE;
f0fcb552 5651 if (!isDIGIT(*s))
a687059c 5652 return FALSE;
f0fcb552 5653 while (isDIGIT(*s))
a687059c
LW
5654 s++;
5655 if (*s == ',')
5656 s++;
f0fcb552 5657 while (isDIGIT(*s))
a687059c
LW
5658 s++;
5659 if (*s != '}')
5660 return FALSE;
5661 return TRUE;
5662}
5663
a687059c
LW
5664
5665/*
fd181c75 5666 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
a687059c
LW
5667 */
5668void
864dbfa3 5669Perl_regdump(pTHX_ regexp *r)
a687059c 5670{
35ff7856 5671#ifdef DEBUGGING
97aff369 5672 dVAR;
c445ea15 5673 SV * const sv = sv_newmortal();
a687059c 5674
c277df42 5675 (void)dumpuntil(r->program, r->program + 1, NULL, sv, 0);
a0d0e21e
LW
5676
5677 /* Header fields of interest. */
c277df42 5678 if (r->anchored_substr)
7b0972df 5679 PerlIO_printf(Perl_debug_log,
a0288114 5680 "anchored \"%s%.*s%s\"%s at %"IVdf" ",
3280af22 5681 PL_colors[0],
7b0972df 5682 (int)(SvCUR(r->anchored_substr) - (SvTAIL(r->anchored_substr)!=0)),
3f7c398e 5683 SvPVX_const(r->anchored_substr),
3280af22 5684 PL_colors[1],
c277df42 5685 SvTAIL(r->anchored_substr) ? "$" : "",
7b0972df 5686 (IV)r->anchored_offset);
33b8afdf
JH
5687 else if (r->anchored_utf8)
5688 PerlIO_printf(Perl_debug_log,
a0288114 5689 "anchored utf8 \"%s%.*s%s\"%s at %"IVdf" ",
33b8afdf
JH
5690 PL_colors[0],
5691 (int)(SvCUR(r->anchored_utf8) - (SvTAIL(r->anchored_utf8)!=0)),
3f7c398e 5692 SvPVX_const(r->anchored_utf8),
33b8afdf
JH
5693 PL_colors[1],
5694 SvTAIL(r->anchored_utf8) ? "$" : "",
5695 (IV)r->anchored_offset);
c277df42 5696 if (r->float_substr)
7b0972df 5697 PerlIO_printf(Perl_debug_log,
a0288114 5698 "floating \"%s%.*s%s\"%s at %"IVdf"..%"UVuf" ",
3280af22 5699 PL_colors[0],
b81d288d 5700 (int)(SvCUR(r->float_substr) - (SvTAIL(r->float_substr)!=0)),
3f7c398e 5701 SvPVX_const(r->float_substr),
3280af22 5702 PL_colors[1],
c277df42 5703 SvTAIL(r->float_substr) ? "$" : "",
7b0972df 5704 (IV)r->float_min_offset, (UV)r->float_max_offset);
33b8afdf
JH
5705 else if (r->float_utf8)
5706 PerlIO_printf(Perl_debug_log,
a0288114 5707 "floating utf8 \"%s%.*s%s\"%s at %"IVdf"..%"UVuf" ",
33b8afdf
JH
5708 PL_colors[0],
5709 (int)(SvCUR(r->float_utf8) - (SvTAIL(r->float_utf8)!=0)),
3f7c398e 5710 SvPVX_const(r->float_utf8),
33b8afdf
JH
5711 PL_colors[1],
5712 SvTAIL(r->float_utf8) ? "$" : "",
5713 (IV)r->float_min_offset, (UV)r->float_max_offset);
5714 if (r->check_substr || r->check_utf8)
b81d288d
AB
5715 PerlIO_printf(Perl_debug_log,
5716 r->check_substr == r->float_substr
33b8afdf 5717 && r->check_utf8 == r->float_utf8
c277df42
IZ
5718 ? "(checking floating" : "(checking anchored");
5719 if (r->reganch & ROPT_NOSCAN)
5720 PerlIO_printf(Perl_debug_log, " noscan");
5721 if (r->reganch & ROPT_CHECK_ALL)
5722 PerlIO_printf(Perl_debug_log, " isall");
33b8afdf 5723 if (r->check_substr || r->check_utf8)
c277df42
IZ
5724 PerlIO_printf(Perl_debug_log, ") ");
5725
46fc3d4c 5726 if (r->regstclass) {
5727 regprop(sv, r->regstclass);
3f7c398e 5728 PerlIO_printf(Perl_debug_log, "stclass \"%s\" ", SvPVX_const(sv));
46fc3d4c 5729 }
774d564b 5730 if (r->reganch & ROPT_ANCH) {
5731 PerlIO_printf(Perl_debug_log, "anchored");
5732 if (r->reganch & ROPT_ANCH_BOL)
5733 PerlIO_printf(Perl_debug_log, "(BOL)");
c277df42
IZ
5734 if (r->reganch & ROPT_ANCH_MBOL)
5735 PerlIO_printf(Perl_debug_log, "(MBOL)");
cad2e5aa
JH
5736 if (r->reganch & ROPT_ANCH_SBOL)
5737 PerlIO_printf(Perl_debug_log, "(SBOL)");
774d564b 5738 if (r->reganch & ROPT_ANCH_GPOS)
5739 PerlIO_printf(Perl_debug_log, "(GPOS)");
5740 PerlIO_putc(Perl_debug_log, ' ');
5741 }
c277df42
IZ
5742 if (r->reganch & ROPT_GPOS_SEEN)
5743 PerlIO_printf(Perl_debug_log, "GPOS ");
a0d0e21e 5744 if (r->reganch & ROPT_SKIP)
760ac839 5745 PerlIO_printf(Perl_debug_log, "plus ");
a0d0e21e 5746 if (r->reganch & ROPT_IMPLICIT)
760ac839 5747 PerlIO_printf(Perl_debug_log, "implicit ");
760ac839 5748 PerlIO_printf(Perl_debug_log, "minlen %ld ", (long) r->minlen);
ce862d02
IZ
5749 if (r->reganch & ROPT_EVAL_SEEN)
5750 PerlIO_printf(Perl_debug_log, "with eval ");
760ac839 5751 PerlIO_printf(Perl_debug_log, "\n");
fac92740 5752 if (r->offsets) {
e4584336 5753 const U32 len = r->offsets[0];
a3621e74
YO
5754 GET_RE_DEBUG_FLAGS_DECL;
5755 DEBUG_OFFSETS_r({
1df70142 5756 U32 i;
e4584336
RB
5757 PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)r->offsets[0]);
5758 for (i = 1; i <= len; i++)
5759 PerlIO_printf(Perl_debug_log, "%"UVuf"[%"UVuf"] ",
5760 (UV)r->offsets[i*2-1], (UV)r->offsets[i*2]);
5761 PerlIO_printf(Perl_debug_log, "\n");
a3621e74 5762 });
fac92740 5763 }
65e66c80 5764#else
96a5add6 5765 PERL_UNUSED_CONTEXT;
65e66c80 5766 PERL_UNUSED_ARG(r);
17c3b450 5767#endif /* DEBUGGING */
a687059c
LW
5768}
5769
5770/*
a0d0e21e
LW
5771- regprop - printable representation of opcode
5772*/
46fc3d4c 5773void
a3b680e6 5774Perl_regprop(pTHX_ SV *sv, const regnode *o)
a687059c 5775{
35ff7856 5776#ifdef DEBUGGING
97aff369 5777 dVAR;
9b155405 5778 register int k;
a0d0e21e 5779
54dc92de 5780 sv_setpvn(sv, "", 0);
9b155405 5781 if (OP(o) >= reg_num) /* regnode.type is unsigned */
830247a4
IZ
5782 /* It would be nice to FAIL() here, but this may be called from
5783 regexec.c, and it would be hard to supply pRExC_state. */
5784 Perl_croak(aTHX_ "Corrupted regexp opcode");
bfed75c6 5785 sv_catpv(sv, reg_name[OP(o)]); /* Take off const! */
9b155405
IZ
5786
5787 k = PL_regkind[(U8)OP(o)];
5788
2a782b5b 5789 if (k == EXACT) {
396482e1 5790 SV * const dsv = sv_2mortal(newSVpvs(""));
c728cb41
JH
5791 /* Using is_utf8_string() is a crude hack but it may
5792 * be the best for now since we have no flag "this EXACTish
5793 * node was UTF-8" --jhi */
1df70142 5794 const bool do_utf8 = is_utf8_string((U8*)STRING(o), STR_LEN(o));
2d03de9c 5795 const char * const s = do_utf8 ?
c728cb41
JH
5796 pv_uni_display(dsv, (U8*)STRING(o), STR_LEN(o), 60,
5797 UNI_DISPLAY_REGEX) :
2a782b5b 5798 STRING(o);
e1ec3a88 5799 const int len = do_utf8 ?
2a782b5b
JH
5800 strlen(s) :
5801 STR_LEN(o);
5802 Perl_sv_catpvf(aTHX_ sv, " <%s%.*s%s>",
5803 PL_colors[0],
5804 len, s,
5805 PL_colors[1]);
bb263b4e
AL
5806 } else if (k == TRIE) {
5807 /*EMPTY*/;
5808 /*
a3621e74
YO
5809 this isn't always safe, as Pl_regdata may not be for this regex yet
5810 (depending on where its called from) so its being moved to dumpuntil
5811 I32 n = ARG(o);
5812 reg_trie_data *trie=(reg_trie_data*)PL_regdata->data[n];
5813 Perl_sv_catpvf(aTHX_ sv, " (W:%d L:%d C:%d S:%d)",
5814 trie->wordcount,
5815 trie->charcount,
5816 trie->uniquecharcount,
5817 trie->laststate);
5818 */
5819 } else if (k == CURLY) {
cb434fcc 5820 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
cea2e8a9
GS
5821 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
5822 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
a0d0e21e 5823 }
2c2d71f5
JH
5824 else if (k == WHILEM && o->flags) /* Ordinal/of */
5825 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
9b155405 5826 else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP )
894356b3 5827 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
9b155405 5828 else if (k == LOGICAL)
04ebc1ab 5829 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
653099ff
GS
5830 else if (k == ANYOF) {
5831 int i, rangestart = -1;
2d03de9c 5832 const U8 flags = ANYOF_FLAGS(o);
0bd48802
AL
5833
5834 /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
5835 static const char * const anyofs[] = {
653099ff
GS
5836 "\\w",
5837 "\\W",
5838 "\\s",
5839 "\\S",
5840 "\\d",
5841 "\\D",
5842 "[:alnum:]",
5843 "[:^alnum:]",
5844 "[:alpha:]",
5845 "[:^alpha:]",
5846 "[:ascii:]",
5847 "[:^ascii:]",
5848 "[:ctrl:]",
5849 "[:^ctrl:]",
5850 "[:graph:]",
5851 "[:^graph:]",
5852 "[:lower:]",
5853 "[:^lower:]",
5854 "[:print:]",
5855 "[:^print:]",
5856 "[:punct:]",
5857 "[:^punct:]",
5858 "[:upper:]",
aaa51d5e 5859 "[:^upper:]",
653099ff 5860 "[:xdigit:]",
aaa51d5e
JF
5861 "[:^xdigit:]",
5862 "[:space:]",
5863 "[:^space:]",
5864 "[:blank:]",
5865 "[:^blank:]"
653099ff
GS
5866 };
5867
19860706 5868 if (flags & ANYOF_LOCALE)
396482e1 5869 sv_catpvs(sv, "{loc}");
19860706 5870 if (flags & ANYOF_FOLD)
396482e1 5871 sv_catpvs(sv, "{i}");
653099ff 5872 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
19860706 5873 if (flags & ANYOF_INVERT)
396482e1 5874 sv_catpvs(sv, "^");
ffc61ed2
JH
5875 for (i = 0; i <= 256; i++) {
5876 if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
5877 if (rangestart == -1)
5878 rangestart = i;
5879 } else if (rangestart != -1) {
5880 if (i <= rangestart + 3)
5881 for (; rangestart < i; rangestart++)
653099ff 5882 put_byte(sv, rangestart);
ffc61ed2
JH
5883 else {
5884 put_byte(sv, rangestart);
396482e1 5885 sv_catpvs(sv, "-");
ffc61ed2 5886 put_byte(sv, i - 1);
653099ff 5887 }
ffc61ed2 5888 rangestart = -1;
653099ff 5889 }
847a199f 5890 }
ffc61ed2
JH
5891
5892 if (o->flags & ANYOF_CLASS)
5893 for (i = 0; i < sizeof(anyofs)/sizeof(char*); i++)
5894 if (ANYOF_CLASS_TEST(o,i))
5895 sv_catpv(sv, anyofs[i]);
5896
5897 if (flags & ANYOF_UNICODE)
396482e1 5898 sv_catpvs(sv, "{unicode}");
1aa99e6b 5899 else if (flags & ANYOF_UNICODE_ALL)
396482e1 5900 sv_catpvs(sv, "{unicode_all}");
ffc61ed2
JH
5901
5902 {
5903 SV *lv;
2d03de9c 5904 SV * const sw = regclass_swash(o, FALSE, &lv, 0);
b81d288d 5905
ffc61ed2
JH
5906 if (lv) {
5907 if (sw) {
89ebb4a3 5908 U8 s[UTF8_MAXBYTES_CASE+1];
b81d288d 5909
ffc61ed2 5910 for (i = 0; i <= 256; i++) { /* just the first 256 */
1df70142 5911 uvchr_to_utf8(s, i);
ffc61ed2 5912
3568d838 5913 if (i < 256 && swash_fetch(sw, s, TRUE)) {
ffc61ed2
JH
5914 if (rangestart == -1)
5915 rangestart = i;
5916 } else if (rangestart != -1) {
ffc61ed2
JH
5917 if (i <= rangestart + 3)
5918 for (; rangestart < i; rangestart++) {
2d03de9c
AL
5919 const U8 * const e = uvchr_to_utf8(s,rangestart);
5920 U8 *p;
5921 for(p = s; p < e; p++)
ffc61ed2
JH
5922 put_byte(sv, *p);
5923 }
5924 else {
2d03de9c
AL
5925 const U8 *e = uvchr_to_utf8(s,rangestart);
5926 U8 *p;
5927 for (p = s; p < e; p++)
ffc61ed2 5928 put_byte(sv, *p);
396482e1 5929 sv_catpvs(sv, "-");
2d03de9c
AL
5930 e = uvchr_to_utf8(s, i-1);
5931 for (p = s; p < e; p++)
1df70142 5932 put_byte(sv, *p);
ffc61ed2
JH
5933 }
5934 rangestart = -1;
5935 }
19860706 5936 }
ffc61ed2 5937
396482e1 5938 sv_catpvs(sv, "..."); /* et cetera */
19860706 5939 }
fde631ed 5940
ffc61ed2 5941 {
2e0de35c 5942 char *s = savesvpv(lv);
c445ea15 5943 char * const origs = s;
b81d288d 5944
ffc61ed2 5945 while(*s && *s != '\n') s++;
b81d288d 5946
ffc61ed2 5947 if (*s == '\n') {
2d03de9c 5948 const char * const t = ++s;
ffc61ed2
JH
5949
5950 while (*s) {
5951 if (*s == '\n')
5952 *s = ' ';
5953 s++;
5954 }
5955 if (s[-1] == ' ')
5956 s[-1] = 0;
5957
5958 sv_catpv(sv, t);
fde631ed 5959 }
b81d288d 5960
ffc61ed2 5961 Safefree(origs);
fde631ed
JH
5962 }
5963 }
653099ff 5964 }
ffc61ed2 5965
653099ff
GS
5966 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
5967 }
9b155405 5968 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
cea2e8a9 5969 Perl_sv_catpvf(aTHX_ sv, "[-%d]", o->flags);
65e66c80 5970#else
96a5add6 5971 PERL_UNUSED_CONTEXT;
65e66c80
SP
5972 PERL_UNUSED_ARG(sv);
5973 PERL_UNUSED_ARG(o);
17c3b450 5974#endif /* DEBUGGING */
35ff7856 5975}
a687059c 5976
cad2e5aa
JH
5977SV *
5978Perl_re_intuit_string(pTHX_ regexp *prog)
5979{ /* Assume that RE_INTUIT is set */
97aff369 5980 dVAR;
a3621e74 5981 GET_RE_DEBUG_FLAGS_DECL;
96a5add6
AL
5982 PERL_UNUSED_CONTEXT;
5983
a3621e74 5984 DEBUG_COMPILE_r(
cfd0369c 5985 {
2d03de9c 5986 const char * const s = SvPV_nolen_const(prog->check_substr
cfd0369c 5987 ? prog->check_substr : prog->check_utf8);
cad2e5aa
JH
5988
5989 if (!PL_colorset) reginitcolors();
5990 PerlIO_printf(Perl_debug_log,
a0288114 5991 "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
33b8afdf
JH
5992 PL_colors[4],
5993 prog->check_substr ? "" : "utf8 ",
5994 PL_colors[5],PL_colors[0],
cad2e5aa
JH
5995 s,
5996 PL_colors[1],
5997 (strlen(s) > 60 ? "..." : ""));
5998 } );
5999
33b8afdf 6000 return prog->check_substr ? prog->check_substr : prog->check_utf8;
cad2e5aa
JH
6001}
6002
2b69d0c2 6003void
864dbfa3 6004Perl_pregfree(pTHX_ struct regexp *r)
a687059c 6005{
27da23d5 6006 dVAR;
9e55ce06 6007#ifdef DEBUGGING
c445ea15
AL
6008 SV * const dsv = PERL_DEBUG_PAD_ZERO(0);
6009 SV * const re_debug_flags=get_sv(RE_DEBUG_FLAGS,0);
9e55ce06 6010#endif
7821416a 6011
a3621e74 6012
7821416a
IZ
6013 if (!r || (--r->refcnt > 0))
6014 return;
a3621e74 6015 DEBUG_r(if (re_debug_flags && (SvIV(re_debug_flags) & RE_DEBUG_COMPILE)) {
c445ea15 6016 const char * const s = (r->reganch & ROPT_UTF8)
e1ec3a88 6017 ? pv_uni_display(dsv, (U8*)r->precomp, r->prelen, 60, UNI_DISPLAY_REGEX)
9f369894 6018 : pv_display(dsv, r->precomp, r->prelen, 0, 60);
e1ec3a88 6019 const int len = SvCUR(dsv);
9e55ce06
JH
6020 if (!PL_colorset)
6021 reginitcolors();
6022 PerlIO_printf(Perl_debug_log,
a3621e74 6023 "%sFreeing REx:%s %s%*.*s%s%s\n",
9e55ce06
JH
6024 PL_colors[4],PL_colors[5],PL_colors[0],
6025 len, len, s,
6026 PL_colors[1],
6027 len > 60 ? "..." : "");
6028 });
cad2e5aa 6029
43c5f42d
NC
6030 /* gcov results gave these as non-null 100% of the time, so there's no
6031 optimisation in checking them before calling Safefree */
6032 Safefree(r->precomp);
6033 Safefree(r->offsets); /* 20010421 MJD */
ed252734 6034 RX_MATCH_COPY_FREE(r);
f8c7b90f 6035#ifdef PERL_OLD_COPY_ON_WRITE
ed252734
NC
6036 if (r->saved_copy)
6037 SvREFCNT_dec(r->saved_copy);
6038#endif
a193d654
GS
6039 if (r->substrs) {
6040 if (r->anchored_substr)
6041 SvREFCNT_dec(r->anchored_substr);
33b8afdf
JH
6042 if (r->anchored_utf8)
6043 SvREFCNT_dec(r->anchored_utf8);
a193d654
GS
6044 if (r->float_substr)
6045 SvREFCNT_dec(r->float_substr);
33b8afdf
JH
6046 if (r->float_utf8)
6047 SvREFCNT_dec(r->float_utf8);
2779dcf1 6048 Safefree(r->substrs);
a193d654 6049 }
c277df42
IZ
6050 if (r->data) {
6051 int n = r->data->count;
f3548bdc
DM
6052 PAD* new_comppad = NULL;
6053 PAD* old_comppad;
4026c95a 6054 PADOFFSET refcnt;
dfad63ad 6055
c277df42 6056 while (--n >= 0) {
261faec3 6057 /* If you add a ->what type here, update the comment in regcomp.h */
c277df42
IZ
6058 switch (r->data->what[n]) {
6059 case 's':
6060 SvREFCNT_dec((SV*)r->data->data[n]);
6061 break;
653099ff
GS
6062 case 'f':
6063 Safefree(r->data->data[n]);
6064 break;
dfad63ad
HS
6065 case 'p':
6066 new_comppad = (AV*)r->data->data[n];
6067 break;
c277df42 6068 case 'o':
dfad63ad 6069 if (new_comppad == NULL)
cea2e8a9 6070 Perl_croak(aTHX_ "panic: pregfree comppad");
f3548bdc
DM
6071 PAD_SAVE_LOCAL(old_comppad,
6072 /* Watch out for global destruction's random ordering. */
c445ea15 6073 (SvTYPE(new_comppad) == SVt_PVAV) ? new_comppad : NULL
f3548bdc 6074 );
b34c0dd4 6075 OP_REFCNT_LOCK;
4026c95a
SH
6076 refcnt = OpREFCNT_dec((OP_4tree*)r->data->data[n]);
6077 OP_REFCNT_UNLOCK;
6078 if (!refcnt)
9b978d73 6079 op_free((OP_4tree*)r->data->data[n]);
9b978d73 6080
f3548bdc 6081 PAD_RESTORE_LOCAL(old_comppad);
dfad63ad
HS
6082 SvREFCNT_dec((SV*)new_comppad);
6083 new_comppad = NULL;
c277df42
IZ
6084 break;
6085 case 'n':
9e55ce06 6086 break;
a3621e74
YO
6087 case 't':
6088 {
c445ea15 6089 reg_trie_data * const trie=(reg_trie_data*)r->data->data[n];
a3621e74
YO
6090 U32 refcount;
6091 OP_REFCNT_LOCK;
e27afef8 6092 refcount = --trie->refcount;
a3621e74
YO
6093 OP_REFCNT_UNLOCK;
6094 if ( !refcount ) {
43c5f42d 6095 Safefree(trie->charmap);
a3621e74
YO
6096 if (trie->widecharmap)
6097 SvREFCNT_dec((SV*)trie->widecharmap);
43c5f42d
NC
6098 Safefree(trie->states);
6099 Safefree(trie->trans);
a3621e74
YO
6100#ifdef DEBUGGING
6101 if (trie->words)
6102 SvREFCNT_dec((SV*)trie->words);
6103 if (trie->revcharmap)
6104 SvREFCNT_dec((SV*)trie->revcharmap);
6105#endif
6106 Safefree(r->data->data[n]); /* do this last!!!! */
6107 }
6108 break;
6109 }
c277df42 6110 default:
830247a4 6111 Perl_croak(aTHX_ "panic: regfree data code '%c'", r->data->what[n]);
c277df42
IZ
6112 }
6113 }
6114 Safefree(r->data->what);
6115 Safefree(r->data);
a0d0e21e
LW
6116 }
6117 Safefree(r->startp);
6118 Safefree(r->endp);
6119 Safefree(r);
a687059c 6120}
c277df42
IZ
6121
6122/*
6123 - regnext - dig the "next" pointer out of a node
c277df42
IZ
6124 */
6125regnode *
864dbfa3 6126Perl_regnext(pTHX_ register regnode *p)
c277df42 6127{
97aff369 6128 dVAR;
c277df42
IZ
6129 register I32 offset;
6130
3280af22 6131 if (p == &PL_regdummy)
c277df42
IZ
6132 return(NULL);
6133
6134 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
6135 if (offset == 0)
6136 return(NULL);
6137
c277df42 6138 return(p+offset);
c277df42
IZ
6139}
6140
01f988be 6141STATIC void
cea2e8a9 6142S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
c277df42
IZ
6143{
6144 va_list args;
6145 STRLEN l1 = strlen(pat1);
6146 STRLEN l2 = strlen(pat2);
6147 char buf[512];
06bf62c7 6148 SV *msv;
73d840c0 6149 const char *message;
c277df42
IZ
6150
6151 if (l1 > 510)
6152 l1 = 510;
6153 if (l1 + l2 > 510)
6154 l2 = 510 - l1;
6155 Copy(pat1, buf, l1 , char);
6156 Copy(pat2, buf + l1, l2 , char);
3b818b81
GS
6157 buf[l1 + l2] = '\n';
6158 buf[l1 + l2 + 1] = '\0';
8736538c
AS
6159#ifdef I_STDARG
6160 /* ANSI variant takes additional second argument */
c277df42 6161 va_start(args, pat2);
8736538c
AS
6162#else
6163 va_start(args);
6164#endif
5a844595 6165 msv = vmess(buf, &args);
c277df42 6166 va_end(args);
cfd0369c 6167 message = SvPV_const(msv,l1);
c277df42
IZ
6168 if (l1 > 512)
6169 l1 = 512;
6170 Copy(message, buf, l1 , char);
197cf9b9 6171 buf[l1-1] = '\0'; /* Overwrite \n */
cea2e8a9 6172 Perl_croak(aTHX_ "%s", buf);
c277df42 6173}
a0ed51b3
LW
6174
6175/* XXX Here's a total kludge. But we need to re-enter for swash routines. */
6176
6177void
864dbfa3 6178Perl_save_re_context(pTHX)
b81d288d 6179{
97aff369 6180 dVAR;
830247a4 6181 SAVEI32(PL_reg_flags); /* from regexec.c */
a0ed51b3 6182 SAVEPPTR(PL_bostr);
a0ed51b3
LW
6183 SAVEPPTR(PL_reginput); /* String-input pointer. */
6184 SAVEPPTR(PL_regbol); /* Beginning of input, for ^ check. */
6185 SAVEPPTR(PL_regeol); /* End of input, for $ check. */
7766f137
GS
6186 SAVEVPTR(PL_regstartp); /* Pointer to startp array. */
6187 SAVEVPTR(PL_regendp); /* Ditto for endp. */
6188 SAVEVPTR(PL_reglastparen); /* Similarly for lastparen. */
a5db57d6 6189 SAVEVPTR(PL_reglastcloseparen); /* Similarly for lastcloseparen. */
a0ed51b3 6190 SAVEPPTR(PL_regtill); /* How far we are required to go. */
b81d288d 6191 SAVEGENERICPV(PL_reg_start_tmp); /* from regexec.c */
a0ed51b3 6192 PL_reg_start_tmp = 0;
a0ed51b3
LW
6193 SAVEI32(PL_reg_start_tmpl); /* from regexec.c */
6194 PL_reg_start_tmpl = 0;
7766f137 6195 SAVEVPTR(PL_regdata);
a0ed51b3
LW
6196 SAVEI32(PL_reg_eval_set); /* from regexec.c */
6197 SAVEI32(PL_regnarrate); /* from regexec.c */
7766f137 6198 SAVEVPTR(PL_regprogram); /* from regexec.c */
a0ed51b3 6199 SAVEINT(PL_regindent); /* from regexec.c */
7766f137 6200 SAVEVPTR(PL_curcop);
7766f137
GS
6201 SAVEVPTR(PL_reg_call_cc); /* from regexec.c */
6202 SAVEVPTR(PL_reg_re); /* from regexec.c */
54b6e2fa
IZ
6203 SAVEPPTR(PL_reg_ganch); /* from regexec.c */
6204 SAVESPTR(PL_reg_sv); /* from regexec.c */
9febdf04 6205 SAVEBOOL(PL_reg_match_utf8); /* from regexec.c */
7766f137 6206 SAVEVPTR(PL_reg_magic); /* from regexec.c */
54b6e2fa 6207 SAVEI32(PL_reg_oldpos); /* from regexec.c */
7766f137
GS
6208 SAVEVPTR(PL_reg_oldcurpm); /* from regexec.c */
6209 SAVEVPTR(PL_reg_curpm); /* from regexec.c */
a5db57d6 6210 SAVEPPTR(PL_reg_oldsaved); /* old saved substr during match */
c445ea15 6211 PL_reg_oldsaved = NULL;
a5db57d6
GS
6212 SAVEI32(PL_reg_oldsavedlen); /* old length of saved substr during match */
6213 PL_reg_oldsavedlen = 0;
f8c7b90f 6214#ifdef PERL_OLD_COPY_ON_WRITE
ed252734 6215 SAVESPTR(PL_nrs);
c445ea15 6216 PL_nrs = NULL;
ed252734 6217#endif
a5db57d6
GS
6218 SAVEI32(PL_reg_maxiter); /* max wait until caching pos */
6219 PL_reg_maxiter = 0;
6220 SAVEI32(PL_reg_leftiter); /* wait until caching pos */
6221 PL_reg_leftiter = 0;
6222 SAVEGENERICPV(PL_reg_poscache); /* cache of pos of WHILEM */
c445ea15 6223 PL_reg_poscache = NULL;
a5db57d6
GS
6224 SAVEI32(PL_reg_poscache_size); /* size of pos cache of WHILEM */
6225 PL_reg_poscache_size = 0;
6226 SAVEPPTR(PL_regprecomp); /* uncompiled string. */
5fb7366e 6227 SAVEI32(PL_regnpar); /* () count. */
e49a9654 6228 SAVEI32(PL_regsize); /* from regexec.c */
ada6e8a9 6229
c445ea15
AL
6230 /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
6231 if (PL_curpm) {
6232 const REGEXP * const rx = PM_GETRE(PL_curpm);
6233 if (rx) {
1df70142 6234 U32 i;
ada6e8a9 6235 for (i = 1; i <= rx->nparens; i++) {
1df70142 6236 char digits[TYPE_CHARS(long)];
e5105eda 6237 const STRLEN len = my_sprintf(digits, "%lu", (long)i);
49f27e4b
NC
6238 GV *const *const gvp
6239 = (GV**)hv_fetch(PL_defstash, digits, len, 0);
6240
b37c2d43
AL
6241 if (gvp) {
6242 GV * const gv = *gvp;
6243 if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
6244 save_scalar(gv);
49f27e4b 6245 }
ada6e8a9
AMS
6246 }
6247 }
6248 }
6249
54b6e2fa 6250#ifdef DEBUGGING
b81d288d 6251 SAVEPPTR(PL_reg_starttry); /* from regexec.c */
54b6e2fa 6252#endif
a0ed51b3 6253}
51371543 6254
51371543 6255static void
acfe0abc 6256clear_re(pTHX_ void *r)
51371543 6257{
97aff369 6258 dVAR;
51371543
GS
6259 ReREFCNT_dec((regexp *)r);
6260}
ffbc6a93 6261
a28509cc
AL
6262#ifdef DEBUGGING
6263
6264STATIC void
6265S_put_byte(pTHX_ SV *sv, int c)
6266{
6267 if (isCNTRL(c) || c == 255 || !isPRINT(c))
6268 Perl_sv_catpvf(aTHX_ sv, "\\%o", c);
6269 else if (c == '-' || c == ']' || c == '\\' || c == '^')
6270 Perl_sv_catpvf(aTHX_ sv, "\\%c", c);
6271 else
6272 Perl_sv_catpvf(aTHX_ sv, "%c", c);
6273}
6274
6275
6276STATIC regnode *
6277S_dumpuntil(pTHX_ regnode *start, regnode *node, regnode *last, SV* sv, I32 l)
6278{
97aff369 6279 dVAR;
a28509cc
AL
6280 register U8 op = EXACT; /* Arbitrary non-END op. */
6281 register regnode *next;
6282
6283 while (op != END && (!last || node < last)) {
6284 /* While that wasn't END last time... */
6285
6286 NODE_ALIGN(node);
6287 op = OP(node);
6288 if (op == CLOSE)
6289 l--;
6290 next = regnext(node);
6291 /* Where, what. */
6292 if (OP(node) == OPTIMIZED)
6293 goto after_print;
6294 regprop(sv, node);
6295 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
6296 (int)(2*l + 1), "", SvPVX_const(sv));
6297 if (next == NULL) /* Next ptr. */
6298 PerlIO_printf(Perl_debug_log, "(0)");
6299 else
6300 PerlIO_printf(Perl_debug_log, "(%"IVdf")", (IV)(next - start));
6301 (void)PerlIO_putc(Perl_debug_log, '\n');
6302 after_print:
6303 if (PL_regkind[(U8)op] == BRANCHJ) {
6304 register regnode *nnode = (OP(next) == LONGJMP
6305 ? regnext(next)
6306 : next);
6307 if (last && nnode > last)
6308 nnode = last;
6309 node = dumpuntil(start, NEXTOPER(NEXTOPER(node)), nnode, sv, l + 1);
6310 }
6311 else if (PL_regkind[(U8)op] == BRANCH) {
6312 node = dumpuntil(start, NEXTOPER(node), next, sv, l + 1);
6313 }
6314 else if ( PL_regkind[(U8)op] == TRIE ) {
6315 const I32 n = ARG(node);
6316 const reg_trie_data * const trie = (reg_trie_data*)PL_regdata->data[n];
6317 const I32 arry_len = av_len(trie->words)+1;
6318 I32 word_idx;
6319 PerlIO_printf(Perl_debug_log,
6320 "%*s[Words:%d Chars Stored:%d Unique Chars:%d States:%"IVdf"%s]\n",
6321 (int)(2*(l+3)),
6322 "",
6323 trie->wordcount,
6324 (int)trie->charcount,
6325 trie->uniquecharcount,
6326 (IV)trie->laststate-1,
6327 node->flags ? " EVAL mode" : "");
6328
6329 for (word_idx=0; word_idx < arry_len; word_idx++) {
6330 SV **elem_ptr=av_fetch(trie->words,word_idx,0);
6331 if (elem_ptr) {
6332 PerlIO_printf(Perl_debug_log, "%*s<%s%s%s>\n",
6333 (int)(2*(l+4)), "",
6334 PL_colors[0],
cfd0369c 6335 SvPV_nolen_const(*elem_ptr),
a28509cc
AL
6336 PL_colors[1]
6337 );
6338 /*
6339 if (next == NULL)
6340 PerlIO_printf(Perl_debug_log, "(0)\n");
6341 else
6342 PerlIO_printf(Perl_debug_log, "(%"IVdf")\n", (IV)(next - start));
6343 */
6344 }
6345
6346 }
6347
6348 node = NEXTOPER(node);
6349 node += regarglen[(U8)op];
6350
6351 }
6352 else if ( op == CURLY) { /* "next" might be very big: optimizer */
6353 node = dumpuntil(start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
6354 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1, sv, l + 1);
6355 }
6356 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
6357 node = dumpuntil(start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
6358 next, sv, l + 1);
6359 }
6360 else if ( op == PLUS || op == STAR) {
6361 node = dumpuntil(start, NEXTOPER(node), NEXTOPER(node) + 1, sv, l + 1);
6362 }
6363 else if (op == ANYOF) {
6364 /* arglen 1 + class block */
6365 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_LARGE)
6366 ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
6367 node = NEXTOPER(node);
6368 }
6369 else if (PL_regkind[(U8)op] == EXACT) {
6370 /* Literal string, where present. */
6371 node += NODE_SZ_STR(node) - 1;
6372 node = NEXTOPER(node);
6373 }
6374 else {
6375 node = NEXTOPER(node);
6376 node += regarglen[(U8)op];
6377 }
6378 if (op == CURLYX || op == OPEN)
6379 l++;
6380 else if (op == WHILEM)
6381 l--;
6382 }
6383 return node;
6384}
6385
6386#endif /* DEBUGGING */
6387
241d1a3b
NC
6388/*
6389 * Local variables:
6390 * c-indentation-style: bsd
6391 * c-basic-offset: 4
6392 * indent-tabs-mode: t
6393 * End:
6394 *
37442d52
RGS
6395 * ex: set ts=8 sts=4 sw=4 noet:
6396 */