This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
performance tweaking op.c
[perl5.git] / regcomp.c
CommitLineData
a0d0e21e
LW
1/* regcomp.c
2 */
3
4/*
5 * "A fair jaw-cracker dwarf-language must be." --Samwise Gamgee
6 */
7
61296642
DM
8/* This file contains functions for compiling a regular expression. See
9 * also regexec.c which funnily enough, contains functions for executing
166f8a29 10 * a regular expression.
e4a054ea
DM
11 *
12 * This file is also copied at build time to ext/re/re_comp.c, where
13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14 * This causes the main functions to be compiled under new names and with
15 * debugging support added, which makes "use re 'debug'" work.
166f8a29
DM
16 */
17
a687059c
LW
18/* NOTE: this is derived from Henry Spencer's regexp code, and should not
19 * confused with the original package (see point 3 below). Thanks, Henry!
20 */
21
22/* Additional note: this code is very heavily munged from Henry's version
23 * in places. In some spots I've traded clarity for efficiency, so don't
24 * blame Henry for some of the lack of readability.
25 */
26
e50aee73
AD
27/* The names of the functions have been changed from regcomp and
28 * regexec to pregcomp and pregexec in order to avoid conflicts
29 * with the POSIX routines of the same names.
30*/
31
b9d5759e
AD
32#ifdef PERL_EXT_RE_BUILD
33/* need to replace pregcomp et al, so enable that */
34# ifndef PERL_IN_XSUB_RE
35# define PERL_IN_XSUB_RE
36# endif
37/* need access to debugger hooks */
cad2e5aa 38# if defined(PERL_EXT_RE_DEBUG) && !defined(DEBUGGING)
b9d5759e
AD
39# define DEBUGGING
40# endif
41#endif
42
43#ifdef PERL_IN_XSUB_RE
d06ea78c 44/* We *really* need to overwrite these symbols: */
56953603
IZ
45# define Perl_pregcomp my_regcomp
46# define Perl_regdump my_regdump
47# define Perl_regprop my_regprop
d06ea78c 48# define Perl_pregfree my_regfree
cad2e5aa
JH
49# define Perl_re_intuit_string my_re_intuit_string
50/* *These* symbols are masked to allow static link. */
d06ea78c 51# define Perl_regnext my_regnext
f0b8d043 52# define Perl_save_re_context my_save_re_context
b81d288d 53# define Perl_reginitcolors my_reginitcolors
c5be433b
GS
54
55# define PERL_NO_GET_CONTEXT
b81d288d 56#endif
56953603 57
a687059c 58/*
e50aee73 59 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
60 *
61 * Copyright (c) 1986 by University of Toronto.
62 * Written by Henry Spencer. Not derived from licensed software.
63 *
64 * Permission is granted to anyone to use this software for any
65 * purpose on any computer system, and to redistribute it freely,
66 * subject to the following restrictions:
67 *
68 * 1. The author is not responsible for the consequences of use of
69 * this software, no matter how awful, even if they arise
70 * from defects in it.
71 *
72 * 2. The origin of this software must not be misrepresented, either
73 * by explicit claim or by omission.
74 *
75 * 3. Altered versions must be plainly marked as such, and must not
76 * be misrepresented as being the original software.
77 *
78 *
79 **** Alterations to Henry's code are...
80 ****
4bb101f2 81 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
b94e2f88 82 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, by Larry Wall and others
a687059c 83 ****
9ef589d8
LW
84 **** You may distribute under the terms of either the GNU General Public
85 **** License or the Artistic License, as specified in the README file.
86
a687059c
LW
87 *
88 * Beware that some of this code is subtly aware of the way operator
89 * precedence is structured in regular expressions. Serious changes in
90 * regular-expression syntax might require a total rethink.
91 */
92#include "EXTERN.h"
864dbfa3 93#define PERL_IN_REGCOMP_C
a687059c 94#include "perl.h"
d06ea78c 95
acfe0abc 96#ifndef PERL_IN_XSUB_RE
d06ea78c
GS
97# include "INTERN.h"
98#endif
c277df42
IZ
99
100#define REG_COMP_C
a687059c
LW
101#include "regcomp.h"
102
d4cce5f1 103#ifdef op
11343788 104#undef op
d4cce5f1 105#endif /* op */
11343788 106
fe14fcc3 107#ifdef MSDOS
7e4e8c89 108# if defined(BUGGY_MSC6)
fe14fcc3 109 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
7e4e8c89 110# pragma optimize("a",off)
fe14fcc3 111 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
7e4e8c89
NC
112# pragma optimize("w",on )
113# endif /* BUGGY_MSC6 */
fe14fcc3
LW
114#endif /* MSDOS */
115
a687059c
LW
116#ifndef STATIC
117#define STATIC static
118#endif
119
830247a4 120typedef struct RExC_state_t {
e2509266 121 U32 flags; /* are we folding, multilining? */
830247a4
IZ
122 char *precomp; /* uncompiled string. */
123 regexp *rx;
fac92740 124 char *start; /* Start of input for compile */
830247a4
IZ
125 char *end; /* End of input for compile */
126 char *parse; /* Input-scan pointer. */
127 I32 whilem_seen; /* number of WHILEM in this expr */
fac92740 128 regnode *emit_start; /* Start of emitted-code area */
ffc61ed2 129 regnode *emit; /* Code-emit pointer; &regdummy = don't = compiling */
830247a4
IZ
130 I32 naughty; /* How bad is this pattern? */
131 I32 sawback; /* Did we see \1, ...? */
132 U32 seen;
133 I32 size; /* Code size. */
134 I32 npar; /* () count. */
135 I32 extralen;
136 I32 seen_zerolen;
137 I32 seen_evals;
1aa99e6b 138 I32 utf8;
830247a4
IZ
139#if ADD_TO_REGEXEC
140 char *starttry; /* -Dr: where regtry was called. */
141#define RExC_starttry (pRExC_state->starttry)
142#endif
143} RExC_state_t;
144
e2509266 145#define RExC_flags (pRExC_state->flags)
830247a4
IZ
146#define RExC_precomp (pRExC_state->precomp)
147#define RExC_rx (pRExC_state->rx)
fac92740 148#define RExC_start (pRExC_state->start)
830247a4
IZ
149#define RExC_end (pRExC_state->end)
150#define RExC_parse (pRExC_state->parse)
151#define RExC_whilem_seen (pRExC_state->whilem_seen)
fac92740 152#define RExC_offsets (pRExC_state->rx->offsets) /* I am not like the others */
830247a4 153#define RExC_emit (pRExC_state->emit)
fac92740 154#define RExC_emit_start (pRExC_state->emit_start)
830247a4
IZ
155#define RExC_naughty (pRExC_state->naughty)
156#define RExC_sawback (pRExC_state->sawback)
157#define RExC_seen (pRExC_state->seen)
158#define RExC_size (pRExC_state->size)
159#define RExC_npar (pRExC_state->npar)
160#define RExC_extralen (pRExC_state->extralen)
161#define RExC_seen_zerolen (pRExC_state->seen_zerolen)
162#define RExC_seen_evals (pRExC_state->seen_evals)
1aa99e6b 163#define RExC_utf8 (pRExC_state->utf8)
830247a4 164
a687059c
LW
165#define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
166#define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
167 ((*s) == '{' && regcurly(s)))
a687059c 168
35c8bce7
LW
169#ifdef SPSTART
170#undef SPSTART /* dratted cpp namespace... */
171#endif
a687059c
LW
172/*
173 * Flags to be passed up and down.
174 */
a687059c 175#define WORST 0 /* Worst case. */
821b33a5 176#define HASWIDTH 0x1 /* Known to match non-null strings. */
a0d0e21e
LW
177#define SIMPLE 0x2 /* Simple enough to be STAR/PLUS operand. */
178#define SPSTART 0x4 /* Starts with * or +. */
179#define TRYAGAIN 0x8 /* Weeded out a declaration. */
a687059c 180
2c2d71f5
JH
181/* Length of a variant. */
182
183typedef struct scan_data_t {
184 I32 len_min;
185 I32 len_delta;
186 I32 pos_min;
187 I32 pos_delta;
188 SV *last_found;
189 I32 last_end; /* min value, <0 unless valid. */
190 I32 last_start_min;
191 I32 last_start_max;
192 SV **longest; /* Either &l_fixed, or &l_float. */
193 SV *longest_fixed;
194 I32 offset_fixed;
195 SV *longest_float;
196 I32 offset_float_min;
197 I32 offset_float_max;
198 I32 flags;
199 I32 whilem_c;
cb434fcc 200 I32 *last_closep;
653099ff 201 struct regnode_charclass_class *start_class;
2c2d71f5
JH
202} scan_data_t;
203
a687059c 204/*
e50aee73 205 * Forward declarations for pregcomp()'s friends.
a687059c 206 */
a0d0e21e 207
27da23d5
JH
208static const scan_data_t zero_scan_data =
209 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
c277df42
IZ
210
211#define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
212#define SF_BEFORE_SEOL 0x1
213#define SF_BEFORE_MEOL 0x2
214#define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
215#define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
216
09b7f37c
CB
217#ifdef NO_UNARY_PLUS
218# define SF_FIX_SHIFT_EOL (0+2)
219# define SF_FL_SHIFT_EOL (0+4)
220#else
221# define SF_FIX_SHIFT_EOL (+2)
222# define SF_FL_SHIFT_EOL (+4)
223#endif
c277df42
IZ
224
225#define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
226#define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
227
228#define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
229#define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
230#define SF_IS_INF 0x40
231#define SF_HAS_PAR 0x80
232#define SF_IN_PAR 0x100
233#define SF_HAS_EVAL 0x200
4bfe0158 234#define SCF_DO_SUBSTR 0x400
653099ff
GS
235#define SCF_DO_STCLASS_AND 0x0800
236#define SCF_DO_STCLASS_OR 0x1000
237#define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
e1901655 238#define SCF_WHILEM_VISITED_POS 0x2000
c277df42 239
eb160463 240#define UTF (RExC_utf8 != 0)
e2509266
JH
241#define LOC ((RExC_flags & PMf_LOCALE) != 0)
242#define FOLD ((RExC_flags & PMf_FOLD) != 0)
a0ed51b3 243
ffc61ed2 244#define OOB_UNICODE 12345678
93733859 245#define OOB_NAMEDCLASS -1
b8c5462f 246
a0ed51b3
LW
247#define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
248#define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
249
8615cb43 250
b45f050a
JF
251/* length of regex to show in messages that don't mark a position within */
252#define RegexLengthToShowInErrorMessages 127
253
254/*
255 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
256 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
257 * op/pragma/warn/regcomp.
258 */
7253e4e3
RK
259#define MARKER1 "<-- HERE" /* marker as it appears in the description */
260#define MARKER2 " <-- HERE " /* marker as it appears within the regex */
b81d288d 261
7253e4e3 262#define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
b45f050a
JF
263
264/*
265 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
266 * arg. Show regex, up to a maximum length. If it's too long, chop and add
267 * "...".
268 */
ccb2c380 269#define FAIL(msg) STMT_START { \
bfed75c6 270 const char *ellipses = ""; \
ccb2c380
MP
271 IV len = RExC_end - RExC_precomp; \
272 \
273 if (!SIZE_ONLY) \
274 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
275 if (len > RegexLengthToShowInErrorMessages) { \
276 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
277 len = RegexLengthToShowInErrorMessages - 10; \
278 ellipses = "..."; \
279 } \
280 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
281 msg, (int)len, RExC_precomp, ellipses); \
282} STMT_END
8615cb43 283
b45f050a
JF
284/*
285 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
286 * args. Show regex, up to a maximum length. If it's too long, chop and add
287 * "...".
288 */
ccb2c380 289#define FAIL2(pat,msg) STMT_START { \
bfed75c6 290 const char *ellipses = ""; \
ccb2c380
MP
291 IV len = RExC_end - RExC_precomp; \
292 \
293 if (!SIZE_ONLY) \
294 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
295 if (len > RegexLengthToShowInErrorMessages) { \
296 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
297 len = RegexLengthToShowInErrorMessages - 10; \
298 ellipses = "..."; \
299 } \
300 S_re_croak2(aTHX_ pat, " in regex m/%.*s%s/", \
301 msg, (int)len, RExC_precomp, ellipses); \
302} STMT_END
b45f050a
JF
303
304
305/*
306 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
307 */
ccb2c380 308#define Simple_vFAIL(m) STMT_START { \
a28509cc 309 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
310 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
311 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
312} STMT_END
b45f050a
JF
313
314/*
315 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
316 */
ccb2c380
MP
317#define vFAIL(m) STMT_START { \
318 if (!SIZE_ONLY) \
319 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
320 Simple_vFAIL(m); \
321} STMT_END
b45f050a
JF
322
323/*
324 * Like Simple_vFAIL(), but accepts two arguments.
325 */
ccb2c380 326#define Simple_vFAIL2(m,a1) STMT_START { \
a28509cc 327 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
328 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
329 (int)offset, RExC_precomp, RExC_precomp + offset); \
330} STMT_END
b45f050a
JF
331
332/*
333 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
334 */
ccb2c380
MP
335#define vFAIL2(m,a1) STMT_START { \
336 if (!SIZE_ONLY) \
337 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
338 Simple_vFAIL2(m, a1); \
339} STMT_END
b45f050a
JF
340
341
342/*
343 * Like Simple_vFAIL(), but accepts three arguments.
344 */
ccb2c380 345#define Simple_vFAIL3(m, a1, a2) STMT_START { \
a28509cc 346 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
347 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
348 (int)offset, RExC_precomp, RExC_precomp + offset); \
349} STMT_END
b45f050a
JF
350
351/*
352 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
353 */
ccb2c380
MP
354#define vFAIL3(m,a1,a2) STMT_START { \
355 if (!SIZE_ONLY) \
356 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
357 Simple_vFAIL3(m, a1, a2); \
358} STMT_END
b45f050a
JF
359
360/*
361 * Like Simple_vFAIL(), but accepts four arguments.
362 */
ccb2c380 363#define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
a28509cc 364 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
365 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
366 (int)offset, RExC_precomp, RExC_precomp + offset); \
367} STMT_END
b45f050a 368
ccb2c380 369#define vWARN(loc,m) STMT_START { \
a28509cc 370 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
371 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \
372 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
373} STMT_END
374
375#define vWARNdep(loc,m) STMT_START { \
a28509cc 376 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
377 Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
378 "%s" REPORT_LOCATION, \
379 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
380} STMT_END
381
382
383#define vWARN2(loc, m, a1) STMT_START { \
a28509cc 384 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
385 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
386 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
387} STMT_END
388
389#define vWARN3(loc, m, a1, a2) STMT_START { \
a28509cc 390 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
391 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
392 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
393} STMT_END
394
395#define vWARN4(loc, m, a1, a2, a3) STMT_START { \
a28509cc 396 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
397 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
398 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
399} STMT_END
400
401#define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
a28509cc 402 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
403 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
404 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
405} STMT_END
9d1d55b5 406
8615cb43 407
cd439c50 408/* Allow for side effects in s */
ccb2c380
MP
409#define REGC(c,s) STMT_START { \
410 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
411} STMT_END
cd439c50 412
fac92740
MJD
413/* Macros for recording node offsets. 20001227 mjd@plover.com
414 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
415 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
416 * Element 0 holds the number n.
417 */
418
419#define MJD_OFFSET_DEBUG(x)
a3621e74 420/* #define MJD_OFFSET_DEBUG(x) DEBUG_r(Perl_warn_nocontext x) */
ccb2c380
MP
421
422
423#define Set_Node_Offset_To_R(node,byte) STMT_START { \
424 if (! SIZE_ONLY) { \
425 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
426 __LINE__, (node), (byte))); \
427 if((node) < 0) { \
551405c4 428 Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
ccb2c380
MP
429 } else { \
430 RExC_offsets[2*(node)-1] = (byte); \
431 } \
432 } \
433} STMT_END
434
435#define Set_Node_Offset(node,byte) \
436 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
437#define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
438
439#define Set_Node_Length_To_R(node,len) STMT_START { \
440 if (! SIZE_ONLY) { \
441 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
551405c4 442 __LINE__, (int)(node), (int)(len))); \
ccb2c380 443 if((node) < 0) { \
551405c4 444 Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
ccb2c380
MP
445 } else { \
446 RExC_offsets[2*(node)] = (len); \
447 } \
448 } \
449} STMT_END
450
451#define Set_Node_Length(node,len) \
452 Set_Node_Length_To_R((node)-RExC_emit_start, len)
453#define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
454#define Set_Node_Cur_Length(node) \
455 Set_Node_Length(node, RExC_parse - parse_start)
fac92740
MJD
456
457/* Get offsets and lengths */
458#define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
459#define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
460
acfe0abc 461static void clear_re(pTHX_ void *r);
4327152a 462
653099ff
GS
463/* Mark that we cannot extend a found fixed substring at this point.
464 Updata the longest found anchored substring and the longest found
465 floating substrings if needed. */
466
4327152a 467STATIC void
830247a4 468S_scan_commit(pTHX_ RExC_state_t *pRExC_state, scan_data_t *data)
c277df42 469{
e1ec3a88
AL
470 const STRLEN l = CHR_SVLEN(data->last_found);
471 const STRLEN old_l = CHR_SVLEN(*data->longest);
b81d288d 472
c277df42 473 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
6b43b216 474 SvSetMagicSV(*data->longest, data->last_found);
c277df42
IZ
475 if (*data->longest == data->longest_fixed) {
476 data->offset_fixed = l ? data->last_start_min : data->pos_min;
477 if (data->flags & SF_BEFORE_EOL)
b81d288d 478 data->flags
c277df42
IZ
479 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
480 else
481 data->flags &= ~SF_FIX_BEFORE_EOL;
a0ed51b3
LW
482 }
483 else {
c277df42 484 data->offset_float_min = l ? data->last_start_min : data->pos_min;
b81d288d
AB
485 data->offset_float_max = (l
486 ? data->last_start_max
c277df42 487 : data->pos_min + data->pos_delta);
9051bda5
HS
488 if ((U32)data->offset_float_max > (U32)I32_MAX)
489 data->offset_float_max = I32_MAX;
c277df42 490 if (data->flags & SF_BEFORE_EOL)
b81d288d 491 data->flags
c277df42
IZ
492 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
493 else
494 data->flags &= ~SF_FL_BEFORE_EOL;
495 }
496 }
497 SvCUR_set(data->last_found, 0);
0eda9292 498 {
a28509cc
AL
499 SV * const sv = data->last_found;
500 MAGIC * const mg =
0eda9292
JH
501 SvUTF8(sv) && SvMAGICAL(sv) ? mg_find(sv, PERL_MAGIC_utf8) : NULL;
502 if (mg && mg->mg_len > 0)
503 mg->mg_len = 0;
504 }
c277df42
IZ
505 data->last_end = -1;
506 data->flags &= ~SF_BEFORE_EOL;
507}
508
653099ff
GS
509/* Can match anything (initialization) */
510STATIC void
830247a4 511S_cl_anything(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 512{
653099ff 513 ANYOF_CLASS_ZERO(cl);
f8bef550 514 ANYOF_BITMAP_SETALL(cl);
1aa99e6b 515 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
653099ff
GS
516 if (LOC)
517 cl->flags |= ANYOF_LOCALE;
518}
519
520/* Can match anything (initialization) */
521STATIC int
a28509cc 522S_cl_is_anything(pTHX_ const struct regnode_charclass_class *cl)
653099ff
GS
523{
524 int value;
525
aaa51d5e 526 for (value = 0; value <= ANYOF_MAX; value += 2)
653099ff
GS
527 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
528 return 1;
1aa99e6b
IH
529 if (!(cl->flags & ANYOF_UNICODE_ALL))
530 return 0;
f8bef550
NC
531 if (!ANYOF_BITMAP_TESTALLSET(cl))
532 return 0;
653099ff
GS
533 return 1;
534}
535
536/* Can match anything (initialization) */
537STATIC void
830247a4 538S_cl_init(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 539{
8ecf7187 540 Zero(cl, 1, struct regnode_charclass_class);
653099ff 541 cl->type = ANYOF;
830247a4 542 cl_anything(pRExC_state, cl);
653099ff
GS
543}
544
545STATIC void
830247a4 546S_cl_init_zero(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 547{
8ecf7187 548 Zero(cl, 1, struct regnode_charclass_class);
653099ff 549 cl->type = ANYOF;
830247a4 550 cl_anything(pRExC_state, cl);
653099ff
GS
551 if (LOC)
552 cl->flags |= ANYOF_LOCALE;
553}
554
555/* 'And' a given class with another one. Can create false positives */
556/* We assume that cl is not inverted */
557STATIC void
558S_cl_and(pTHX_ struct regnode_charclass_class *cl,
a28509cc 559 const struct regnode_charclass_class *and_with)
653099ff 560{
653099ff
GS
561 if (!(and_with->flags & ANYOF_CLASS)
562 && !(cl->flags & ANYOF_CLASS)
563 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
564 && !(and_with->flags & ANYOF_FOLD)
565 && !(cl->flags & ANYOF_FOLD)) {
566 int i;
567
568 if (and_with->flags & ANYOF_INVERT)
569 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
570 cl->bitmap[i] &= ~and_with->bitmap[i];
571 else
572 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
573 cl->bitmap[i] &= and_with->bitmap[i];
574 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
575 if (!(and_with->flags & ANYOF_EOS))
576 cl->flags &= ~ANYOF_EOS;
1aa99e6b 577
14ebb1a2
JH
578 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE &&
579 !(and_with->flags & ANYOF_INVERT)) {
1aa99e6b
IH
580 cl->flags &= ~ANYOF_UNICODE_ALL;
581 cl->flags |= ANYOF_UNICODE;
582 ARG_SET(cl, ARG(and_with));
583 }
14ebb1a2
JH
584 if (!(and_with->flags & ANYOF_UNICODE_ALL) &&
585 !(and_with->flags & ANYOF_INVERT))
1aa99e6b 586 cl->flags &= ~ANYOF_UNICODE_ALL;
14ebb1a2
JH
587 if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)) &&
588 !(and_with->flags & ANYOF_INVERT))
1aa99e6b 589 cl->flags &= ~ANYOF_UNICODE;
653099ff
GS
590}
591
592/* 'OR' a given class with another one. Can create false positives */
593/* We assume that cl is not inverted */
594STATIC void
a28509cc 595S_cl_or(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
653099ff 596{
653099ff
GS
597 if (or_with->flags & ANYOF_INVERT) {
598 /* We do not use
599 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
600 * <= (B1 | !B2) | (CL1 | !CL2)
601 * which is wasteful if CL2 is small, but we ignore CL2:
602 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
603 * XXXX Can we handle case-fold? Unclear:
604 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
605 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
606 */
607 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
608 && !(or_with->flags & ANYOF_FOLD)
609 && !(cl->flags & ANYOF_FOLD) ) {
610 int i;
611
612 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
613 cl->bitmap[i] |= ~or_with->bitmap[i];
614 } /* XXXX: logic is complicated otherwise */
615 else {
830247a4 616 cl_anything(pRExC_state, cl);
653099ff
GS
617 }
618 } else {
619 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
620 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
b81d288d 621 && (!(or_with->flags & ANYOF_FOLD)
653099ff
GS
622 || (cl->flags & ANYOF_FOLD)) ) {
623 int i;
624
625 /* OR char bitmap and class bitmap separately */
626 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
627 cl->bitmap[i] |= or_with->bitmap[i];
628 if (or_with->flags & ANYOF_CLASS) {
629 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
630 cl->classflags[i] |= or_with->classflags[i];
631 cl->flags |= ANYOF_CLASS;
632 }
633 }
634 else { /* XXXX: logic is complicated, leave it along for a moment. */
830247a4 635 cl_anything(pRExC_state, cl);
653099ff
GS
636 }
637 }
638 if (or_with->flags & ANYOF_EOS)
639 cl->flags |= ANYOF_EOS;
1aa99e6b
IH
640
641 if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
642 ARG(cl) != ARG(or_with)) {
643 cl->flags |= ANYOF_UNICODE_ALL;
644 cl->flags &= ~ANYOF_UNICODE;
645 }
646 if (or_with->flags & ANYOF_UNICODE_ALL) {
647 cl->flags |= ANYOF_UNICODE_ALL;
648 cl->flags &= ~ANYOF_UNICODE;
649 }
653099ff
GS
650}
651
5d1c421c 652/*
a3621e74
YO
653
654 make_trie(startbranch,first,last,tail,flags)
655 startbranch: the first branch in the whole branch sequence
656 first : start branch of sequence of branch-exact nodes.
657 May be the same as startbranch
658 last : Thing following the last branch.
659 May be the same as tail.
660 tail : item following the branch sequence
661 flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/
662
663Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
664
665A trie is an N'ary tree where the branches are determined by digital
666decomposition of the key. IE, at the root node you look up the 1st character and
667follow that branch repeat until you find the end of the branches. Nodes can be
668marked as "accepting" meaning they represent a complete word. Eg:
669
670 /he|she|his|hers/
671
672would convert into the following structure. Numbers represent states, letters
673following numbers represent valid transitions on the letter from that state, if
674the number is in square brackets it represents an accepting state, otherwise it
675will be in parenthesis.
676
677 +-h->+-e->[3]-+-r->(8)-+-s->[9]
678 | |
679 | (2)
680 | |
681 (1) +-i->(6)-+-s->[7]
682 |
683 +-s->(3)-+-h->(4)-+-e->[5]
684
685 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
686
687This shows that when matching against the string 'hers' we will begin at state 1
688read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
689then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
690is also accepting. Thus we know that we can match both 'he' and 'hers' with a
691single traverse. We store a mapping from accepting to state to which word was
692matched, and then when we have multiple possibilities we try to complete the
693rest of the regex in the order in which they occured in the alternation.
694
695The only prior NFA like behaviour that would be changed by the TRIE support is
696the silent ignoring of duplicate alternations which are of the form:
697
698 / (DUPE|DUPE) X? (?{ ... }) Y /x
699
700Thus EVAL blocks follwing a trie may be called a different number of times with
701and without the optimisation. With the optimisations dupes will be silently
702ignored. This inconsistant behaviour of EVAL type nodes is well established as
703the following demonstrates:
704
705 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
706
707which prints out 'word' three times, but
708
709 'words'=~/(word|word|word)(?{ print $1 })S/
710
711which doesnt print it out at all. This is due to other optimisations kicking in.
712
713Example of what happens on a structural level:
714
715The regexp /(ac|ad|ab)+/ will produce the folowing debug output:
716
717 1: CURLYM[1] {1,32767}(18)
718 5: BRANCH(8)
719 6: EXACT <ac>(16)
720 8: BRANCH(11)
721 9: EXACT <ad>(16)
722 11: BRANCH(14)
723 12: EXACT <ab>(16)
724 16: SUCCEED(0)
725 17: NOTHING(18)
726 18: END(0)
727
728This would be optimizable with startbranch=5, first=5, last=16, tail=16
729and should turn into:
730
731 1: CURLYM[1] {1,32767}(18)
732 5: TRIE(16)
733 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
734 <ac>
735 <ad>
736 <ab>
737 16: SUCCEED(0)
738 17: NOTHING(18)
739 18: END(0)
740
741Cases where tail != last would be like /(?foo|bar)baz/:
742
743 1: BRANCH(4)
744 2: EXACT <foo>(8)
745 4: BRANCH(7)
746 5: EXACT <bar>(8)
747 7: TAIL(8)
748 8: EXACT <baz>(10)
749 10: END(0)
750
751which would be optimizable with startbranch=1, first=1, last=7, tail=8
752and would end up looking like:
753
754 1: TRIE(8)
755 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
756 <foo>
757 <bar>
758 7: TAIL(8)
759 8: EXACT <baz>(10)
760 10: END(0)
761
762*/
763
764#define TRIE_DEBUG_CHAR \
765 DEBUG_TRIE_COMPILE_r({ \
766 SV *tmp; \
767 if ( UTF ) { \
d0043bd1 768 tmp = newSVpvn( "", 0 ); \
a3621e74
YO
769 pv_uni_display( tmp, uc, len, 60, UNI_DISPLAY_REGEX ); \
770 } else { \
e4584336 771 tmp = Perl_newSVpvf_nocontext( "%c", (int)uvc ); \
a3621e74
YO
772 } \
773 av_push( trie->revcharmap, tmp ); \
774 })
775
776#define TRIE_READ_CHAR STMT_START { \
777 if ( UTF ) { \
778 if ( folder ) { \
779 if ( foldlen > 0 ) { \
780 uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags ); \
781 foldlen -= len; \
782 scan += len; \
783 len = 0; \
784 } else { \
e1ec3a88 785 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
a3621e74
YO
786 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
787 foldlen -= UNISKIP( uvc ); \
788 scan = foldbuf + UNISKIP( uvc ); \
789 } \
790 } else { \
e1ec3a88 791 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
a3621e74
YO
792 } \
793 } else { \
794 uvc = (U32)*uc; \
795 len = 1; \
796 } \
797} STMT_END
798
799
800#define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
801#define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
802#define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
803#define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
804
805#define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
806 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
807 TRIE_LIST_LEN( state ) *= 2; \
808 Renew( trie->states[ state ].trans.list, \
809 TRIE_LIST_LEN( state ), reg_trie_trans_le ); \
810 } \
811 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
812 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
813 TRIE_LIST_CUR( state )++; \
814} STMT_END
815
816#define TRIE_LIST_NEW(state) STMT_START { \
a02a5408 817 Newxz( trie->states[ state ].trans.list, \
a3621e74
YO
818 4, reg_trie_trans_le ); \
819 TRIE_LIST_CUR( state ) = 1; \
820 TRIE_LIST_LEN( state ) = 4; \
821} STMT_END
822
823STATIC I32
824S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 flags)
825{
27da23d5 826 dVAR;
a3621e74
YO
827 /* first pass, loop through and scan words */
828 reg_trie_data *trie;
829 regnode *cur;
e1ec3a88 830 const U32 uniflags = ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY;
a3621e74
YO
831 STRLEN len = 0;
832 UV uvc = 0;
833 U16 curword = 0;
834 U32 next_alloc = 0;
835 /* we just use folder as a flag in utf8 */
e1ec3a88 836 const U8 * const folder = ( flags == EXACTF
a3621e74
YO
837 ? PL_fold
838 : ( flags == EXACTFL
839 ? PL_fold_locale
840 : NULL
841 )
842 );
843
e1ec3a88 844 const U32 data_slot = add_data( pRExC_state, 1, "t" );
a3621e74
YO
845 SV *re_trie_maxbuff;
846
847 GET_RE_DEBUG_FLAGS_DECL;
848
a02a5408 849 Newxz( trie, 1, reg_trie_data );
a3621e74
YO
850 trie->refcount = 1;
851 RExC_rx->data->data[ data_slot ] = (void*)trie;
a02a5408 852 Newxz( trie->charmap, 256, U16 );
a3621e74
YO
853 DEBUG_r({
854 trie->words = newAV();
855 trie->revcharmap = newAV();
856 });
857
858
0111c4fd 859 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
a3621e74 860 if (!SvIOK(re_trie_maxbuff)) {
0111c4fd 861 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
a3621e74
YO
862 }
863
864 /* -- First loop and Setup --
865
866 We first traverse the branches and scan each word to determine if it
867 contains widechars, and how many unique chars there are, this is
868 important as we have to build a table with at least as many columns as we
869 have unique chars.
870
871 We use an array of integers to represent the character codes 0..255
872 (trie->charmap) and we use a an HV* to store unicode characters. We use the
873 native representation of the character value as the key and IV's for the
874 coded index.
875
876 *TODO* If we keep track of how many times each character is used we can
877 remap the columns so that the table compression later on is more
878 efficient in terms of memory by ensuring most common value is in the
879 middle and the least common are on the outside. IMO this would be better
880 than a most to least common mapping as theres a decent chance the most
881 common letter will share a node with the least common, meaning the node
882 will not be compressable. With a middle is most common approach the worst
883 case is when we have the least common nodes twice.
884
885 */
886
887
888 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
c445ea15 889 regnode * const noper = NEXTOPER( cur );
e1ec3a88 890 const U8 *uc = (U8*)STRING( noper );
a28509cc 891 const U8 * const e = uc + STR_LEN( noper );
a3621e74
YO
892 STRLEN foldlen = 0;
893 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2af232bd 894 const U8 *scan = (U8*)NULL;
a3621e74
YO
895
896 for ( ; uc < e ; uc += len ) {
897 trie->charcount++;
898 TRIE_READ_CHAR;
899 if ( uvc < 256 ) {
900 if ( !trie->charmap[ uvc ] ) {
901 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
902 if ( folder )
903 trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
904 TRIE_DEBUG_CHAR;
905 }
906 } else {
907 SV** svpp;
908 if ( !trie->widecharmap )
909 trie->widecharmap = newHV();
910
911 svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 1 );
912
913 if ( !svpp )
e4584336 914 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
a3621e74
YO
915
916 if ( !SvTRUE( *svpp ) ) {
917 sv_setiv( *svpp, ++trie->uniquecharcount );
918 TRIE_DEBUG_CHAR;
919 }
920 }
921 }
922 trie->wordcount++;
923 } /* end first pass */
924 DEBUG_TRIE_COMPILE_r(
925 PerlIO_printf( Perl_debug_log, "TRIE(%s): W:%d C:%d Uq:%d \n",
926 ( trie->widecharmap ? "UTF8" : "NATIVE" ), trie->wordcount,
5d7488b2 927 (int)trie->charcount, trie->uniquecharcount )
a3621e74
YO
928 );
929
930
931 /*
932 We now know what we are dealing with in terms of unique chars and
933 string sizes so we can calculate how much memory a naive
0111c4fd
RGS
934 representation using a flat table will take. If it's over a reasonable
935 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
a3621e74
YO
936 conservative but potentially much slower representation using an array
937 of lists.
938
939 At the end we convert both representations into the same compressed
940 form that will be used in regexec.c for matching with. The latter
941 is a form that cannot be used to construct with but has memory
942 properties similar to the list form and access properties similar
943 to the table form making it both suitable for fast searches and
944 small enough that its feasable to store for the duration of a program.
945
946 See the comment in the code where the compressed table is produced
947 inplace from the flat tabe representation for an explanation of how
948 the compression works.
949
950 */
951
952
953 if ( (IV)( ( trie->charcount + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
954 /*
955 Second Pass -- Array Of Lists Representation
956
957 Each state will be represented by a list of charid:state records
958 (reg_trie_trans_le) the first such element holds the CUR and LEN
959 points of the allocated array. (See defines above).
960
961 We build the initial structure using the lists, and then convert
962 it into the compressed table form which allows faster lookups
963 (but cant be modified once converted).
964
965
966 */
967
968
969 STRLEN transcount = 1;
970
a02a5408 971 Newxz( trie->states, trie->charcount + 2, reg_trie_state );
a3621e74
YO
972 TRIE_LIST_NEW(1);
973 next_alloc = 2;
974
975 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
976
c445ea15
AL
977 regnode * const noper = NEXTOPER( cur );
978 U8 *uc = (U8*)STRING( noper );
979 const U8 * const e = uc + STR_LEN( noper );
980 U32 state = 1; /* required init */
981 U16 charid = 0; /* sanity init */
982 U8 *scan = (U8*)NULL; /* sanity init */
983 STRLEN foldlen = 0; /* required init */
984 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
985
986 for ( ; uc < e ; uc += len ) {
987
988 TRIE_READ_CHAR;
989
990 if ( uvc < 256 ) {
991 charid = trie->charmap[ uvc ];
992 } else {
993 SV** const svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 0);
994 if ( !svpp ) {
995 charid = 0;
996 } else {
997 charid=(U16)SvIV( *svpp );
998 }
999 }
1000 if ( charid ) {
a3621e74 1001
c445ea15
AL
1002 U16 check;
1003 U32 newstate = 0;
a3621e74 1004
c445ea15
AL
1005 charid--;
1006 if ( !trie->states[ state ].trans.list ) {
1007 TRIE_LIST_NEW( state );
1008 }
1009 for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1010 if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1011 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1012 break;
1013 }
1014 }
1015 if ( ! newstate ) {
1016 newstate = next_alloc++;
1017 TRIE_LIST_PUSH( state, charid, newstate );
1018 transcount++;
1019 }
1020 state = newstate;
1021 } else {
1022 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
a28509cc 1023 }
c445ea15
AL
1024 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1025 }
a3621e74 1026
c445ea15
AL
1027 if ( !trie->states[ state ].wordnum ) {
1028 /* we havent inserted this word into the structure yet. */
1029 trie->states[ state ].wordnum = ++curword;
a3621e74 1030
c445ea15
AL
1031 DEBUG_r({
1032 /* store the word for dumping */
1033 SV* tmp = newSVpvn( STRING( noper ), STR_LEN( noper ) );
1034 if ( UTF ) SvUTF8_on( tmp );
1035 av_push( trie->words, tmp );
1036 });
a3621e74 1037
c445ea15
AL
1038 } else {
1039 /* Its a dupe. So ignore it. */
1040 }
a3621e74
YO
1041
1042 } /* end second pass */
1043
1044 trie->laststate = next_alloc;
1045 Renew( trie->states, next_alloc, reg_trie_state );
1046
1047 DEBUG_TRIE_COMPILE_MORE_r({
1048 U32 state;
a3621e74 1049
a28509cc 1050 /* print out the table precompression. */
a3621e74
YO
1051
1052 PerlIO_printf( Perl_debug_log, "\nState :Word | Transition Data\n" );
1053 PerlIO_printf( Perl_debug_log, "------:-----+-----------------" );
1054
1055 for( state=1 ; state < next_alloc ; state ++ ) {
a28509cc 1056 U16 charid;
a3621e74 1057
e4584336 1058 PerlIO_printf( Perl_debug_log, "\n %04"UVXf" :", (UV)state );
a3621e74
YO
1059 if ( ! trie->states[ state ].wordnum ) {
1060 PerlIO_printf( Perl_debug_log, "%5s| ","");
1061 } else {
e4584336 1062 PerlIO_printf( Perl_debug_log, "W%04x| ",
a3621e74
YO
1063 trie->states[ state ].wordnum
1064 );
1065 }
1066 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
1067 SV **tmp = av_fetch( trie->revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
e4584336 1068 PerlIO_printf( Perl_debug_log, "%s:%3X=%04"UVXf" | ",
cfd0369c 1069 SvPV_nolen_const( *tmp ),
a3621e74 1070 TRIE_LIST_ITEM(state,charid).forid,
e4584336 1071 (UV)TRIE_LIST_ITEM(state,charid).newstate
a3621e74
YO
1072 );
1073 }
1074
1075 }
1076 PerlIO_printf( Perl_debug_log, "\n\n" );
1077 });
1078
a02a5408 1079 Newxz( trie->trans, transcount ,reg_trie_trans );
a3621e74
YO
1080 {
1081 U32 state;
a3621e74
YO
1082 U32 tp = 0;
1083 U32 zp = 0;
1084
1085
1086 for( state=1 ; state < next_alloc ; state ++ ) {
1087 U32 base=0;
1088
1089 /*
1090 DEBUG_TRIE_COMPILE_MORE_r(
1091 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1092 );
1093 */
1094
1095 if (trie->states[state].trans.list) {
1096 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1097 U16 maxid=minid;
a28509cc 1098 U16 idx;
a3621e74
YO
1099
1100 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
c445ea15
AL
1101 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1102 if ( forid < minid ) {
1103 minid=forid;
1104 } else if ( forid > maxid ) {
1105 maxid=forid;
1106 }
a3621e74
YO
1107 }
1108 if ( transcount < tp + maxid - minid + 1) {
1109 transcount *= 2;
1110 Renew( trie->trans, transcount, reg_trie_trans );
1111 Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1112 }
1113 base = trie->uniquecharcount + tp - minid;
1114 if ( maxid == minid ) {
1115 U32 set = 0;
1116 for ( ; zp < tp ; zp++ ) {
1117 if ( ! trie->trans[ zp ].next ) {
1118 base = trie->uniquecharcount + zp - minid;
1119 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1120 trie->trans[ zp ].check = state;
1121 set = 1;
1122 break;
1123 }
1124 }
1125 if ( !set ) {
1126 trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1127 trie->trans[ tp ].check = state;
1128 tp++;
1129 zp = tp;
1130 }
1131 } else {
1132 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
c445ea15 1133 const U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
a3621e74
YO
1134 trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1135 trie->trans[ tid ].check = state;
1136 }
1137 tp += ( maxid - minid + 1 );
1138 }
1139 Safefree(trie->states[ state ].trans.list);
1140 }
1141 /*
1142 DEBUG_TRIE_COMPILE_MORE_r(
1143 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1144 );
1145 */
1146 trie->states[ state ].trans.base=base;
1147 }
cc601c31 1148 trie->lasttrans = tp + 1;
a3621e74
YO
1149 }
1150 } else {
1151 /*
1152 Second Pass -- Flat Table Representation.
1153
1154 we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1155 We know that we will need Charcount+1 trans at most to store the data
1156 (one row per char at worst case) So we preallocate both structures
1157 assuming worst case.
1158
1159 We then construct the trie using only the .next slots of the entry
1160 structs.
1161
1162 We use the .check field of the first entry of the node temporarily to
1163 make compression both faster and easier by keeping track of how many non
1164 zero fields are in the node.
1165
1166 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1167 transition.
1168
1169 There are two terms at use here: state as a TRIE_NODEIDX() which is a
1170 number representing the first entry of the node, and state as a
1171 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1172 TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1173 are 2 entrys per node. eg:
1174
1175 A B A B
1176 1. 2 4 1. 3 7
1177 2. 0 3 3. 0 5
1178 3. 0 0 5. 0 0
1179 4. 0 0 7. 0 0
1180
1181 The table is internally in the right hand, idx form. However as we also
1182 have to deal with the states array which is indexed by nodenum we have to
1183 use TRIE_NODENUM() to convert.
1184
1185 */
1186
a02a5408 1187 Newxz( trie->trans, ( trie->charcount + 1 ) * trie->uniquecharcount + 1,
a3621e74 1188 reg_trie_trans );
a02a5408 1189 Newxz( trie->states, trie->charcount + 2, reg_trie_state );
a3621e74
YO
1190 next_alloc = trie->uniquecharcount + 1;
1191
1192 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1193
c445ea15 1194 regnode * const noper = NEXTOPER( cur );
a28509cc
AL
1195 const U8 *uc = (U8*)STRING( noper );
1196 const U8 * const e = uc + STR_LEN( noper );
a3621e74
YO
1197
1198 U32 state = 1; /* required init */
1199
1200 U16 charid = 0; /* sanity init */
1201 U32 accept_state = 0; /* sanity init */
1202 U8 *scan = (U8*)NULL; /* sanity init */
1203
1204 STRLEN foldlen = 0; /* required init */
1205 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1206
1207
1208 for ( ; uc < e ; uc += len ) {
1209
1210 TRIE_READ_CHAR;
1211
1212 if ( uvc < 256 ) {
1213 charid = trie->charmap[ uvc ];
1214 } else {
c445ea15
AL
1215 SV* const * const svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 0);
1216 charid = svpp ? (U16)SvIV(*svpp) : 0;
a3621e74
YO
1217 }
1218 if ( charid ) {
1219 charid--;
1220 if ( !trie->trans[ state + charid ].next ) {
1221 trie->trans[ state + charid ].next = next_alloc;
1222 trie->trans[ state ].check++;
1223 next_alloc += trie->uniquecharcount;
1224 }
1225 state = trie->trans[ state + charid ].next;
1226 } else {
e4584336 1227 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
a3621e74
YO
1228 }
1229 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1230 }
1231
1232 accept_state = TRIE_NODENUM( state );
1233 if ( !trie->states[ accept_state ].wordnum ) {
1234 /* we havent inserted this word into the structure yet. */
1235 trie->states[ accept_state ].wordnum = ++curword;
1236
1237 DEBUG_r({
1238 /* store the word for dumping */
1239 SV* tmp = newSVpvn( STRING( noper ), STR_LEN( noper ) );
1240 if ( UTF ) SvUTF8_on( tmp );
1241 av_push( trie->words, tmp );
1242 });
1243
1244 } else {
1245 /* Its a dupe. So ignore it. */
1246 }
1247
1248 } /* end second pass */
1249
1250 DEBUG_TRIE_COMPILE_MORE_r({
1251 /*
1252 print out the table precompression so that we can do a visual check
1253 that they are identical.
1254 */
1255 U32 state;
1256 U16 charid;
1257 PerlIO_printf( Perl_debug_log, "\nChar : " );
1258
1259 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1260 SV **tmp = av_fetch( trie->revcharmap, charid, 0);
1261 if ( tmp ) {
cfd0369c 1262 PerlIO_printf( Perl_debug_log, "%4.4s ", SvPV_nolen_const( *tmp ) );
a3621e74
YO
1263 }
1264 }
1265
1266 PerlIO_printf( Perl_debug_log, "\nState+-" );
1267
1268 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
1269 PerlIO_printf( Perl_debug_log, "%4s-", "----" );
1270 }
1271
1272 PerlIO_printf( Perl_debug_log, "\n" );
1273
1274 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1275
e4584336 1276 PerlIO_printf( Perl_debug_log, "%04"UVXf" : ", (UV)TRIE_NODENUM( state ) );
a3621e74
YO
1277
1278 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
e4584336
RB
1279 PerlIO_printf( Perl_debug_log, "%04"UVXf" ",
1280 (UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next ) );
a3621e74
YO
1281 }
1282 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
e4584336 1283 PerlIO_printf( Perl_debug_log, " (%04"UVXf")\n", (UV)trie->trans[ state ].check );
a3621e74 1284 } else {
e4584336 1285 PerlIO_printf( Perl_debug_log, " (%04"UVXf") W%04X\n", (UV)trie->trans[ state ].check,
a3621e74
YO
1286 trie->states[ TRIE_NODENUM( state ) ].wordnum );
1287 }
1288 }
1289 PerlIO_printf( Perl_debug_log, "\n\n" );
1290 });
1291 {
1292 /*
1293 * Inplace compress the table.*
1294
1295 For sparse data sets the table constructed by the trie algorithm will
1296 be mostly 0/FAIL transitions or to put it another way mostly empty.
1297 (Note that leaf nodes will not contain any transitions.)
1298
1299 This algorithm compresses the tables by eliminating most such
1300 transitions, at the cost of a modest bit of extra work during lookup:
1301
1302 - Each states[] entry contains a .base field which indicates the
1303 index in the state[] array wheres its transition data is stored.
1304
1305 - If .base is 0 there are no valid transitions from that node.
1306
1307 - If .base is nonzero then charid is added to it to find an entry in
1308 the trans array.
1309
1310 -If trans[states[state].base+charid].check!=state then the
1311 transition is taken to be a 0/Fail transition. Thus if there are fail
1312 transitions at the front of the node then the .base offset will point
1313 somewhere inside the previous nodes data (or maybe even into a node
1314 even earlier), but the .check field determines if the transition is
1315 valid.
1316
1317 The following process inplace converts the table to the compressed
1318 table: We first do not compress the root node 1,and mark its all its
1319 .check pointers as 1 and set its .base pointer as 1 as well. This
1320 allows to do a DFA construction from the compressed table later, and
1321 ensures that any .base pointers we calculate later are greater than
1322 0.
1323
1324 - We set 'pos' to indicate the first entry of the second node.
1325
1326 - We then iterate over the columns of the node, finding the first and
1327 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
1328 and set the .check pointers accordingly, and advance pos
1329 appropriately and repreat for the next node. Note that when we copy
1330 the next pointers we have to convert them from the original
1331 NODEIDX form to NODENUM form as the former is not valid post
1332 compression.
1333
1334 - If a node has no transitions used we mark its base as 0 and do not
1335 advance the pos pointer.
1336
1337 - If a node only has one transition we use a second pointer into the
1338 structure to fill in allocated fail transitions from other states.
1339 This pointer is independent of the main pointer and scans forward
1340 looking for null transitions that are allocated to a state. When it
1341 finds one it writes the single transition into the "hole". If the
1342 pointer doesnt find one the single transition is appeneded as normal.
1343
1344 - Once compressed we can Renew/realloc the structures to release the
1345 excess space.
1346
1347 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
1348 specifically Fig 3.47 and the associated pseudocode.
1349
1350 demq
1351 */
a3b680e6 1352 const U32 laststate = TRIE_NODENUM( next_alloc );
a28509cc 1353 U32 state, charid;
a3621e74
YO
1354 U32 pos = 0, zp=0;
1355 trie->laststate = laststate;
1356
1357 for ( state = 1 ; state < laststate ; state++ ) {
1358 U8 flag = 0;
a28509cc
AL
1359 const U32 stateidx = TRIE_NODEIDX( state );
1360 const U32 o_used = trie->trans[ stateidx ].check;
1361 U32 used = trie->trans[ stateidx ].check;
a3621e74
YO
1362 trie->trans[ stateidx ].check = 0;
1363
1364 for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
1365 if ( flag || trie->trans[ stateidx + charid ].next ) {
1366 if ( trie->trans[ stateidx + charid ].next ) {
1367 if (o_used == 1) {
1368 for ( ; zp < pos ; zp++ ) {
1369 if ( ! trie->trans[ zp ].next ) {
1370 break;
1371 }
1372 }
1373 trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
1374 trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1375 trie->trans[ zp ].check = state;
1376 if ( ++zp > pos ) pos = zp;
1377 break;
1378 }
1379 used--;
1380 }
1381 if ( !flag ) {
1382 flag = 1;
1383 trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
1384 }
1385 trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1386 trie->trans[ pos ].check = state;
1387 pos++;
1388 }
1389 }
1390 }
cc601c31 1391 trie->lasttrans = pos + 1;
a3621e74
YO
1392 Renew( trie->states, laststate + 1, reg_trie_state);
1393 DEBUG_TRIE_COMPILE_MORE_r(
e4584336
RB
1394 PerlIO_printf( Perl_debug_log,
1395 " Alloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
5d7488b2
AL
1396 (int)( ( trie->charcount + 1 ) * trie->uniquecharcount + 1 ),
1397 (IV)next_alloc,
1398 (IV)pos,
a3621e74
YO
1399 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
1400 );
1401
1402 } /* end table compress */
1403 }
cc601c31
YO
1404 /* resize the trans array to remove unused space */
1405 Renew( trie->trans, trie->lasttrans, reg_trie_trans);
a3621e74
YO
1406
1407 DEBUG_TRIE_COMPILE_r({
1408 U32 state;
1409 /*
1410 Now we print it out again, in a slightly different form as there is additional
1411 info we want to be able to see when its compressed. They are close enough for
1412 visual comparison though.
1413 */
1414 PerlIO_printf( Perl_debug_log, "\nChar : %-6s%-6s%-4s ","Match","Base","Ofs" );
1415
1416 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
1417 SV **tmp = av_fetch( trie->revcharmap, state, 0);
1418 if ( tmp ) {
cfd0369c 1419 PerlIO_printf( Perl_debug_log, "%4.4s ", SvPV_nolen_const( *tmp ) );
a3621e74
YO
1420 }
1421 }
1422 PerlIO_printf( Perl_debug_log, "\n-----:-----------------------");
cc601c31 1423
a3621e74
YO
1424 for( state = 0 ; state < trie->uniquecharcount ; state++ )
1425 PerlIO_printf( Perl_debug_log, "-----");
1426 PerlIO_printf( Perl_debug_log, "\n");
cc601c31 1427
a3621e74 1428 for( state = 1 ; state < trie->laststate ; state++ ) {
a28509cc 1429 const U32 base = trie->states[ state ].trans.base;
a3621e74 1430
e4584336 1431 PerlIO_printf( Perl_debug_log, "#%04"UVXf" ", (UV)state);
a3621e74
YO
1432
1433 if ( trie->states[ state ].wordnum ) {
1434 PerlIO_printf( Perl_debug_log, " W%04X", trie->states[ state ].wordnum );
1435 } else {
1436 PerlIO_printf( Perl_debug_log, "%6s", "" );
1437 }
1438
e4584336 1439 PerlIO_printf( Perl_debug_log, " @%04"UVXf" ", (UV)base );
a3621e74
YO
1440
1441 if ( base ) {
1442 U32 ofs = 0;
1443
cc601c31
YO
1444 while( ( base + ofs < trie->uniquecharcount ) ||
1445 ( base + ofs - trie->uniquecharcount < trie->lasttrans
1446 && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
a3621e74
YO
1447 ofs++;
1448
e4584336 1449 PerlIO_printf( Perl_debug_log, "+%02"UVXf"[ ", (UV)ofs);
a3621e74
YO
1450
1451 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
cc601c31
YO
1452 if ( ( base + ofs >= trie->uniquecharcount ) &&
1453 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
a3621e74
YO
1454 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1455 {
e4584336
RB
1456 PerlIO_printf( Perl_debug_log, "%04"UVXf" ",
1457 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
a3621e74
YO
1458 } else {
1459 PerlIO_printf( Perl_debug_log, "%4s "," 0" );
1460 }
1461 }
1462
e4584336 1463 PerlIO_printf( Perl_debug_log, "]");
a3621e74
YO
1464
1465 }
1466 PerlIO_printf( Perl_debug_log, "\n" );
1467 }
1468 });
1469
1470 {
1471 /* now finally we "stitch in" the new TRIE node
1472 This means we convert either the first branch or the first Exact,
1473 depending on whether the thing following (in 'last') is a branch
1474 or not and whther first is the startbranch (ie is it a sub part of
1475 the alternation or is it the whole thing.)
1476 Assuming its a sub part we conver the EXACT otherwise we convert
1477 the whole branch sequence, including the first.
1478 */
1479 regnode *convert;
1480
1481
1482
1483
1484 if ( first == startbranch && OP( last ) != BRANCH ) {
1485 convert = first;
1486 } else {
1487 convert = NEXTOPER( first );
1488 NEXT_OFF( first ) = (U16)(last - first);
1489 }
1490
1491 OP( convert ) = TRIE + (U8)( flags - EXACT );
1492 NEXT_OFF( convert ) = (U16)(tail - convert);
1493 ARG_SET( convert, data_slot );
1494
1495 /* tells us if we need to handle accept buffers specially */
1496 convert->flags = ( RExC_seen_evals ? 1 : 0 );
1497
1498
1499 /* needed for dumping*/
1500 DEBUG_r({
1501 regnode *optimize = convert + NODE_STEP_REGNODE + regarglen[ TRIE ];
1502 /* We now need to mark all of the space originally used by the
1503 branches as optimized away. This keeps the dumpuntil from
1504 throwing a wobbly as it doesnt use regnext() to traverse the
1505 opcodes.
1506 */
1507 while( optimize < last ) {
1508 OP( optimize ) = OPTIMIZED;
1509 optimize++;
1510 }
1511 });
1512 } /* end node insert */
1513 return 1;
1514}
1515
1516
1517
1518/*
5d1c421c
JH
1519 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
1520 * These need to be revisited when a newer toolchain becomes available.
1521 */
1522#if defined(__sparc64__) && defined(__GNUC__)
1523# if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
1524# undef SPARC64_GCC_WORKAROUND
1525# define SPARC64_GCC_WORKAROUND 1
1526# endif
1527#endif
1528
653099ff
GS
1529/* REx optimizer. Converts nodes into quickier variants "in place".
1530 Finds fixed substrings. */
1531
a0288114 1532/* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
c277df42
IZ
1533 to the position after last scanned or to NULL. */
1534
a3621e74 1535
76e3520e 1536STATIC I32
9a957fbc
AL
1537S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap,
1538 regnode *last, scan_data_t *data, U32 flags, U32 depth)
c277df42
IZ
1539 /* scanp: Start here (read-write). */
1540 /* deltap: Write maxlen-minlen here. */
1541 /* last: Stop before this one. */
1542{
1543 I32 min = 0, pars = 0, code;
1544 regnode *scan = *scanp, *next;
1545 I32 delta = 0;
1546 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
aca2d497 1547 int is_inf_internal = 0; /* The studied chunk is infinite */
c277df42
IZ
1548 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
1549 scan_data_t data_fake;
653099ff 1550 struct regnode_charclass_class and_with; /* Valid if flags & SCF_DO_STCLASS_OR */
a3621e74
YO
1551 SV *re_trie_maxbuff = NULL;
1552
1553 GET_RE_DEBUG_FLAGS_DECL;
b81d288d 1554
c277df42
IZ
1555 while (scan && OP(scan) != END && scan < last) {
1556 /* Peephole optimizer: */
a3621e74 1557 DEBUG_OPTIMISE_r({
c445ea15 1558 SV * const mysv=sv_newmortal();
a3621e74 1559 regprop( mysv, scan);
e4584336 1560 PerlIO_printf(Perl_debug_log, "%*speep: %s (0x%08"UVXf")\n",
cfd0369c 1561 (int)depth*2, "", SvPV_nolen_const(mysv), PTR2UV(scan));
a3621e74 1562 });
c277df42 1563
22c35a8c 1564 if (PL_regkind[(U8)OP(scan)] == EXACT) {
653099ff 1565 /* Merge several consecutive EXACTish nodes into one. */
c277df42
IZ
1566 regnode *n = regnext(scan);
1567 U32 stringok = 1;
1568#ifdef DEBUGGING
1569 regnode *stop = scan;
b81d288d 1570#endif
c277df42 1571
cd439c50 1572 next = scan + NODE_SZ_STR(scan);
c277df42
IZ
1573 /* Skip NOTHING, merge EXACT*. */
1574 while (n &&
b81d288d 1575 ( PL_regkind[(U8)OP(n)] == NOTHING ||
c277df42
IZ
1576 (stringok && (OP(n) == OP(scan))))
1577 && NEXT_OFF(n)
1578 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
1579 if (OP(n) == TAIL || n > next)
1580 stringok = 0;
22c35a8c 1581 if (PL_regkind[(U8)OP(n)] == NOTHING) {
c277df42
IZ
1582 NEXT_OFF(scan) += NEXT_OFF(n);
1583 next = n + NODE_STEP_REGNODE;
1584#ifdef DEBUGGING
1585 if (stringok)
1586 stop = n;
b81d288d 1587#endif
c277df42 1588 n = regnext(n);
a0ed51b3 1589 }
f49d4d0f 1590 else if (stringok) {
a3b680e6 1591 const int oldl = STR_LEN(scan);
c445ea15 1592 regnode * const nnext = regnext(n);
f49d4d0f 1593
b81d288d 1594 if (oldl + STR_LEN(n) > U8_MAX)
c277df42
IZ
1595 break;
1596 NEXT_OFF(scan) += NEXT_OFF(n);
cd439c50
IZ
1597 STR_LEN(scan) += STR_LEN(n);
1598 next = n + NODE_SZ_STR(n);
c277df42 1599 /* Now we can overwrite *n : */
f49d4d0f 1600 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
c277df42 1601#ifdef DEBUGGING
f49d4d0f 1602 stop = next - 1;
b81d288d 1603#endif
c277df42
IZ
1604 n = nnext;
1605 }
1606 }
61a36c01 1607
a3621e74 1608 if (UTF && ( OP(scan) == EXACTF ) && ( STR_LEN(scan) >= 6 ) ) {
61a36c01
JH
1609/*
1610 Two problematic code points in Unicode casefolding of EXACT nodes:
1611
1612 U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
1613 U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
1614
1615 which casefold to
1616
1617 Unicode UTF-8
1618
1619 U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
1620 U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
1621
1622 This means that in case-insensitive matching (or "loose matching",
1623 as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
1624 length of the above casefolded versions) can match a target string
1625 of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
1626 This would rather mess up the minimum length computation.
1627
1628 What we'll do is to look for the tail four bytes, and then peek
1629 at the preceding two bytes to see whether we need to decrease
1630 the minimum length by four (six minus two).
1631
1632 Thanks to the design of UTF-8, there cannot be false matches:
1633 A sequence of valid UTF-8 bytes cannot be a subsequence of
1634 another valid sequence of UTF-8 bytes.
1635
1636*/
c445ea15
AL
1637 char * const s0 = STRING(scan), *s, *t;
1638 char * const s1 = s0 + STR_LEN(scan) - 1;
1639 char * const s2 = s1 - 4;
a28509cc
AL
1640 const char * const t0 = "\xcc\x88\xcc\x81";
1641 const char * const t1 = t0 + 3;
2af232bd 1642
61a36c01
JH
1643 for (s = s0 + 2;
1644 s < s2 && (t = ninstr(s, s1, t0, t1));
1645 s = t + 4) {
1646 if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
1647 ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
1648 min -= 4;
1649 }
1650 }
1651
c277df42
IZ
1652#ifdef DEBUGGING
1653 /* Allow dumping */
cd439c50 1654 n = scan + NODE_SZ_STR(scan);
c277df42 1655 while (n <= stop) {
22c35a8c 1656 if (PL_regkind[(U8)OP(n)] != NOTHING || OP(n) == NOTHING) {
c277df42
IZ
1657 OP(n) = OPTIMIZED;
1658 NEXT_OFF(n) = 0;
1659 }
1660 n++;
1661 }
653099ff 1662#endif
c277df42 1663 }
a3621e74
YO
1664
1665
1666
653099ff
GS
1667 /* Follow the next-chain of the current node and optimize
1668 away all the NOTHINGs from it. */
c277df42 1669 if (OP(scan) != CURLYX) {
a3b680e6 1670 const int max = (reg_off_by_arg[OP(scan)]
048cfca1
GS
1671 ? I32_MAX
1672 /* I32 may be smaller than U16 on CRAYs! */
1673 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
c277df42
IZ
1674 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
1675 int noff;
1676 regnode *n = scan;
b81d288d 1677
c277df42
IZ
1678 /* Skip NOTHING and LONGJMP. */
1679 while ((n = regnext(n))
22c35a8c 1680 && ((PL_regkind[(U8)OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
c277df42
IZ
1681 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
1682 && off + noff < max)
1683 off += noff;
1684 if (reg_off_by_arg[OP(scan)])
1685 ARG(scan) = off;
b81d288d 1686 else
c277df42
IZ
1687 NEXT_OFF(scan) = off;
1688 }
a3621e74 1689
653099ff
GS
1690 /* The principal pseudo-switch. Cannot be a switch, since we
1691 look into several different things. */
b81d288d 1692 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
c277df42
IZ
1693 || OP(scan) == IFTHEN || OP(scan) == SUSPEND) {
1694 next = regnext(scan);
1695 code = OP(scan);
a3621e74 1696 /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
b81d288d
AB
1697
1698 if (OP(next) == code || code == IFTHEN || code == SUSPEND) {
c277df42 1699 I32 max1 = 0, min1 = I32_MAX, num = 0;
653099ff 1700 struct regnode_charclass_class accum;
a3621e74 1701 regnode *startbranch=scan;
c277df42 1702
653099ff 1703 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
830247a4 1704 scan_commit(pRExC_state, data); /* Cannot merge strings after this. */
653099ff 1705 if (flags & SCF_DO_STCLASS)
830247a4 1706 cl_init_zero(pRExC_state, &accum);
a3621e74 1707
c277df42 1708 while (OP(scan) == code) {
830247a4 1709 I32 deltanext, minnext, f = 0, fake;
653099ff 1710 struct regnode_charclass_class this_class;
c277df42
IZ
1711
1712 num++;
1713 data_fake.flags = 0;
b81d288d 1714 if (data) {
2c2d71f5 1715 data_fake.whilem_c = data->whilem_c;
cb434fcc
IZ
1716 data_fake.last_closep = data->last_closep;
1717 }
1718 else
1719 data_fake.last_closep = &fake;
c277df42
IZ
1720 next = regnext(scan);
1721 scan = NEXTOPER(scan);
1722 if (code != BRANCH)
1723 scan = NEXTOPER(scan);
653099ff 1724 if (flags & SCF_DO_STCLASS) {
830247a4 1725 cl_init(pRExC_state, &this_class);
653099ff
GS
1726 data_fake.start_class = &this_class;
1727 f = SCF_DO_STCLASS_AND;
b81d288d 1728 }
e1901655
IZ
1729 if (flags & SCF_WHILEM_VISITED_POS)
1730 f |= SCF_WHILEM_VISITED_POS;
a3621e74 1731
653099ff 1732 /* we suppose the run is continuous, last=next...*/
830247a4 1733 minnext = study_chunk(pRExC_state, &scan, &deltanext,
a3621e74 1734 next, &data_fake, f,depth+1);
b81d288d 1735 if (min1 > minnext)
c277df42
IZ
1736 min1 = minnext;
1737 if (max1 < minnext + deltanext)
1738 max1 = minnext + deltanext;
1739 if (deltanext == I32_MAX)
aca2d497 1740 is_inf = is_inf_internal = 1;
c277df42
IZ
1741 scan = next;
1742 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
1743 pars++;
405ff068 1744 if (data && (data_fake.flags & SF_HAS_EVAL))
c277df42 1745 data->flags |= SF_HAS_EVAL;
2c2d71f5
JH
1746 if (data)
1747 data->whilem_c = data_fake.whilem_c;
653099ff 1748 if (flags & SCF_DO_STCLASS)
830247a4 1749 cl_or(pRExC_state, &accum, &this_class);
b81d288d 1750 if (code == SUSPEND)
c277df42
IZ
1751 break;
1752 }
1753 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
1754 min1 = 0;
1755 if (flags & SCF_DO_SUBSTR) {
1756 data->pos_min += min1;
1757 data->pos_delta += max1 - min1;
1758 if (max1 != min1 || is_inf)
1759 data->longest = &(data->longest_float);
1760 }
1761 min += min1;
1762 delta += max1 - min1;
653099ff 1763 if (flags & SCF_DO_STCLASS_OR) {
830247a4 1764 cl_or(pRExC_state, data->start_class, &accum);
653099ff
GS
1765 if (min1) {
1766 cl_and(data->start_class, &and_with);
1767 flags &= ~SCF_DO_STCLASS;
1768 }
1769 }
1770 else if (flags & SCF_DO_STCLASS_AND) {
de0c8cb8
GS
1771 if (min1) {
1772 cl_and(data->start_class, &accum);
653099ff 1773 flags &= ~SCF_DO_STCLASS;
de0c8cb8
GS
1774 }
1775 else {
b81d288d 1776 /* Switch to OR mode: cache the old value of
de0c8cb8
GS
1777 * data->start_class */
1778 StructCopy(data->start_class, &and_with,
1779 struct regnode_charclass_class);
1780 flags &= ~SCF_DO_STCLASS_AND;
1781 StructCopy(&accum, data->start_class,
1782 struct regnode_charclass_class);
1783 flags |= SCF_DO_STCLASS_OR;
1784 data->start_class->flags |= ANYOF_EOS;
1785 }
653099ff 1786 }
a3621e74
YO
1787
1788 /* demq.
1789
1790 Assuming this was/is a branch we are dealing with: 'scan' now
1791 points at the item that follows the branch sequence, whatever
1792 it is. We now start at the beginning of the sequence and look
1793 for subsequences of
1794
1795 BRANCH->EXACT=>X
1796 BRANCH->EXACT=>X
1797
1798 which would be constructed from a pattern like /A|LIST|OF|WORDS/
1799
1800 If we can find such a subseqence we need to turn the first
1801 element into a trie and then add the subsequent branch exact
1802 strings to the trie.
1803
1804 We have two cases
1805
1806 1. patterns where the whole set of branch can be converted to a trie,
1807
1808 2. patterns where only a subset of the alternations can be
1809 converted to a trie.
1810
1811 In case 1 we can replace the whole set with a single regop
1812 for the trie. In case 2 we need to keep the start and end
1813 branchs so
1814
1815 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
1816 becomes BRANCH TRIE; BRANCH X;
1817
1818 Hypthetically when we know the regex isnt anchored we can
1819 turn a case 1 into a DFA and let it rip... Every time it finds a match
1820 it would just call its tail, no WHILEM/CURLY needed.
1821
1822 */
0111c4fd
RGS
1823 if (DO_TRIE) {
1824 if (!re_trie_maxbuff) {
1825 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
1826 if (!SvIOK(re_trie_maxbuff))
1827 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
1828 }
a3621e74
YO
1829 if ( SvIV(re_trie_maxbuff)>=0 && OP( startbranch )==BRANCH ) {
1830 regnode *cur;
1831 regnode *first = (regnode *)NULL;
1832 regnode *last = (regnode *)NULL;
1833 regnode *tail = scan;
1834 U8 optype = 0;
1835 U32 count=0;
1836
1837#ifdef DEBUGGING
c445ea15 1838 SV * const mysv = sv_newmortal(); /* for dumping */
a3621e74
YO
1839#endif
1840 /* var tail is used because there may be a TAIL
1841 regop in the way. Ie, the exacts will point to the
1842 thing following the TAIL, but the last branch will
1843 point at the TAIL. So we advance tail. If we
1844 have nested (?:) we may have to move through several
1845 tails.
1846 */
1847
1848 while ( OP( tail ) == TAIL ) {
1849 /* this is the TAIL generated by (?:) */
1850 tail = regnext( tail );
1851 }
1852
1853 DEBUG_OPTIMISE_r({
1854 regprop( mysv, tail );
1855 PerlIO_printf( Perl_debug_log, "%*s%s%s%s\n",
cfd0369c 1856 (int)depth * 2 + 2, "", "Tail node is:", SvPV_nolen_const( mysv ),
a3621e74
YO
1857 (RExC_seen_evals) ? "[EVAL]" : ""
1858 );
1859 });
1860 /*
1861
1862 step through the branches, cur represents each
1863 branch, noper is the first thing to be matched
1864 as part of that branch and noper_next is the
1865 regnext() of that node. if noper is an EXACT
1866 and noper_next is the same as scan (our current
1867 position in the regex) then the EXACT branch is
1868 a possible optimization target. Once we have
1869 two or more consequetive such branches we can
1870 create a trie of the EXACT's contents and stich
1871 it in place. If the sequence represents all of
1872 the branches we eliminate the whole thing and
1873 replace it with a single TRIE. If it is a
1874 subsequence then we need to stitch it in. This
1875 means the first branch has to remain, and needs
1876 to be repointed at the item on the branch chain
1877 following the last branch optimized. This could
1878 be either a BRANCH, in which case the
1879 subsequence is internal, or it could be the
1880 item following the branch sequence in which
1881 case the subsequence is at the end.
1882
1883 */
1884
1885 /* dont use tail as the end marker for this traverse */
1886 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
aec46f14
AL
1887 regnode * const noper = NEXTOPER( cur );
1888 regnode * const noper_next = regnext( noper );
a3621e74 1889
a3621e74
YO
1890 DEBUG_OPTIMISE_r({
1891 regprop( mysv, cur);
1892 PerlIO_printf( Perl_debug_log, "%*s%s",
cfd0369c 1893 (int)depth * 2 + 2," ", SvPV_nolen_const( mysv ) );
a3621e74
YO
1894
1895 regprop( mysv, noper);
1896 PerlIO_printf( Perl_debug_log, " -> %s",
cfd0369c 1897 SvPV_nolen_const(mysv));
a3621e74
YO
1898
1899 if ( noper_next ) {
1900 regprop( mysv, noper_next );
1901 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
cfd0369c 1902 SvPV_nolen_const(mysv));
a3621e74
YO
1903 }
1904 PerlIO_printf( Perl_debug_log, "0x%p,0x%p,0x%p)\n",
1905 first, last, cur );
1906 });
1907 if ( ( first ? OP( noper ) == optype
1908 : PL_regkind[ (U8)OP( noper ) ] == EXACT )
1909 && noper_next == tail && count<U16_MAX)
1910 {
1911 count++;
1912 if ( !first ) {
1913 first = cur;
1914 optype = OP( noper );
1915 } else {
1916 DEBUG_OPTIMISE_r(
1917 if (!last ) {
1918 regprop( mysv, first);
1919 PerlIO_printf( Perl_debug_log, "%*s%s",
cfd0369c 1920 (int)depth * 2 + 2, "F:", SvPV_nolen_const( mysv ) );
a3621e74
YO
1921 regprop( mysv, NEXTOPER(first) );
1922 PerlIO_printf( Perl_debug_log, " -> %s\n",
cfd0369c 1923 SvPV_nolen_const( mysv ) );
a3621e74
YO
1924 }
1925 );
1926 last = cur;
1927 DEBUG_OPTIMISE_r({
1928 regprop( mysv, cur);
1929 PerlIO_printf( Perl_debug_log, "%*s%s",
cfd0369c 1930 (int)depth * 2 + 2, "N:", SvPV_nolen_const( mysv ) );
a3621e74
YO
1931 regprop( mysv, noper );
1932 PerlIO_printf( Perl_debug_log, " -> %s\n",
cfd0369c 1933 SvPV_nolen_const( mysv ) );
a3621e74
YO
1934 });
1935 }
1936 } else {
1937 if ( last ) {
1938 DEBUG_OPTIMISE_r(
1939 PerlIO_printf( Perl_debug_log, "%*s%s\n",
e4584336 1940 (int)depth * 2 + 2, "E:", "**END**" );
a3621e74
YO
1941 );
1942 make_trie( pRExC_state, startbranch, first, cur, tail, optype );
1943 }
1944 if ( PL_regkind[ (U8)OP( noper ) ] == EXACT
1945 && noper_next == tail )
1946 {
1947 count = 1;
1948 first = cur;
1949 optype = OP( noper );
1950 } else {
1951 count = 0;
1952 first = NULL;
1953 optype = 0;
1954 }
1955 last = NULL;
1956 }
1957 }
1958 DEBUG_OPTIMISE_r({
1959 regprop( mysv, cur);
1960 PerlIO_printf( Perl_debug_log,
e4584336 1961 "%*s%s\t(0x%p,0x%p,0x%p)\n", (int)depth * 2 + 2,
cfd0369c 1962 " ", SvPV_nolen_const( mysv ), first, last, cur);
a3621e74
YO
1963
1964 });
1965 if ( last ) {
1966 DEBUG_OPTIMISE_r(
1967 PerlIO_printf( Perl_debug_log, "%*s%s\n",
e4584336 1968 (int)depth * 2 + 2, "E:", "==END==" );
a3621e74
YO
1969 );
1970 make_trie( pRExC_state, startbranch, first, scan, tail, optype );
1971 }
1972 }
1973 }
a0ed51b3 1974 }
a3621e74 1975 else if ( code == BRANCHJ ) { /* single branch is optimized. */
c277df42 1976 scan = NEXTOPER(NEXTOPER(scan));
a3621e74 1977 } else /* single branch is optimized. */
c277df42
IZ
1978 scan = NEXTOPER(scan);
1979 continue;
a0ed51b3
LW
1980 }
1981 else if (OP(scan) == EXACT) {
cd439c50 1982 I32 l = STR_LEN(scan);
c445ea15 1983 UV uc;
a0ed51b3 1984 if (UTF) {
a3b680e6 1985 const U8 * const s = (U8*)STRING(scan);
1aa99e6b 1986 l = utf8_length(s, s + l);
9041c2e3 1987 uc = utf8_to_uvchr(s, NULL);
c445ea15
AL
1988 } else {
1989 uc = *((U8*)STRING(scan));
a0ed51b3
LW
1990 }
1991 min += l;
c277df42 1992 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
c277df42
IZ
1993 /* The code below prefers earlier match for fixed
1994 offset, later match for variable offset. */
1995 if (data->last_end == -1) { /* Update the start info. */
1996 data->last_start_min = data->pos_min;
1997 data->last_start_max = is_inf
b81d288d 1998 ? I32_MAX : data->pos_min + data->pos_delta;
c277df42 1999 }
cd439c50 2000 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
0eda9292 2001 {
9a957fbc 2002 SV * const sv = data->last_found;
a28509cc 2003 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
0eda9292
JH
2004 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2005 if (mg && mg->mg_len >= 0)
5e43f467
JH
2006 mg->mg_len += utf8_length((U8*)STRING(scan),
2007 (U8*)STRING(scan)+STR_LEN(scan));
0eda9292 2008 }
33b8afdf
JH
2009 if (UTF)
2010 SvUTF8_on(data->last_found);
c277df42
IZ
2011 data->last_end = data->pos_min + l;
2012 data->pos_min += l; /* As in the first entry. */
2013 data->flags &= ~SF_BEFORE_EOL;
2014 }
653099ff
GS
2015 if (flags & SCF_DO_STCLASS_AND) {
2016 /* Check whether it is compatible with what we know already! */
2017 int compat = 1;
2018
1aa99e6b 2019 if (uc >= 0x100 ||
516a5887 2020 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
1aa99e6b 2021 && !ANYOF_BITMAP_TEST(data->start_class, uc)
653099ff 2022 && (!(data->start_class->flags & ANYOF_FOLD)
1aa99e6b 2023 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
516a5887 2024 )
653099ff
GS
2025 compat = 0;
2026 ANYOF_CLASS_ZERO(data->start_class);
2027 ANYOF_BITMAP_ZERO(data->start_class);
2028 if (compat)
1aa99e6b 2029 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff 2030 data->start_class->flags &= ~ANYOF_EOS;
9b877dbb
IH
2031 if (uc < 0x100)
2032 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
653099ff
GS
2033 }
2034 else if (flags & SCF_DO_STCLASS_OR) {
2035 /* false positive possible if the class is case-folded */
1aa99e6b 2036 if (uc < 0x100)
9b877dbb
IH
2037 ANYOF_BITMAP_SET(data->start_class, uc);
2038 else
2039 data->start_class->flags |= ANYOF_UNICODE_ALL;
653099ff
GS
2040 data->start_class->flags &= ~ANYOF_EOS;
2041 cl_and(data->start_class, &and_with);
2042 }
2043 flags &= ~SCF_DO_STCLASS;
a0ed51b3 2044 }
653099ff 2045 else if (PL_regkind[(U8)OP(scan)] == EXACT) { /* But OP != EXACT! */
cd439c50 2046 I32 l = STR_LEN(scan);
1aa99e6b 2047 UV uc = *((U8*)STRING(scan));
653099ff
GS
2048
2049 /* Search for fixed substrings supports EXACT only. */
b81d288d 2050 if (flags & SCF_DO_SUBSTR)
830247a4 2051 scan_commit(pRExC_state, data);
a0ed51b3 2052 if (UTF) {
1aa99e6b
IH
2053 U8 *s = (U8 *)STRING(scan);
2054 l = utf8_length(s, s + l);
9041c2e3 2055 uc = utf8_to_uvchr(s, NULL);
a0ed51b3
LW
2056 }
2057 min += l;
c277df42 2058 if (data && (flags & SCF_DO_SUBSTR))
a0ed51b3 2059 data->pos_min += l;
653099ff
GS
2060 if (flags & SCF_DO_STCLASS_AND) {
2061 /* Check whether it is compatible with what we know already! */
2062 int compat = 1;
2063
1aa99e6b 2064 if (uc >= 0x100 ||
516a5887 2065 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
1aa99e6b 2066 && !ANYOF_BITMAP_TEST(data->start_class, uc)
516a5887 2067 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
653099ff
GS
2068 compat = 0;
2069 ANYOF_CLASS_ZERO(data->start_class);
2070 ANYOF_BITMAP_ZERO(data->start_class);
2071 if (compat) {
1aa99e6b 2072 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff
GS
2073 data->start_class->flags &= ~ANYOF_EOS;
2074 data->start_class->flags |= ANYOF_FOLD;
2075 if (OP(scan) == EXACTFL)
2076 data->start_class->flags |= ANYOF_LOCALE;
2077 }
2078 }
2079 else if (flags & SCF_DO_STCLASS_OR) {
2080 if (data->start_class->flags & ANYOF_FOLD) {
2081 /* false positive possible if the class is case-folded.
2082 Assume that the locale settings are the same... */
1aa99e6b
IH
2083 if (uc < 0x100)
2084 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff
GS
2085 data->start_class->flags &= ~ANYOF_EOS;
2086 }
2087 cl_and(data->start_class, &and_with);
2088 }
2089 flags &= ~SCF_DO_STCLASS;
a0ed51b3 2090 }
bfed75c6 2091 else if (strchr((const char*)PL_varies,OP(scan))) {
9c5ffd7c 2092 I32 mincount, maxcount, minnext, deltanext, fl = 0;
aa7a4b56 2093 I32 f = flags, pos_before = 0;
c277df42 2094 regnode *oscan = scan;
653099ff
GS
2095 struct regnode_charclass_class this_class;
2096 struct regnode_charclass_class *oclass = NULL;
727f22e3 2097 I32 next_is_eval = 0;
653099ff 2098
22c35a8c 2099 switch (PL_regkind[(U8)OP(scan)]) {
653099ff 2100 case WHILEM: /* End of (?:...)* . */
c277df42
IZ
2101 scan = NEXTOPER(scan);
2102 goto finish;
2103 case PLUS:
653099ff 2104 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
c277df42 2105 next = NEXTOPER(scan);
653099ff 2106 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
b81d288d
AB
2107 mincount = 1;
2108 maxcount = REG_INFTY;
c277df42
IZ
2109 next = regnext(scan);
2110 scan = NEXTOPER(scan);
2111 goto do_curly;
2112 }
2113 }
2114 if (flags & SCF_DO_SUBSTR)
2115 data->pos_min++;
2116 min++;
2117 /* Fall through. */
2118 case STAR:
653099ff
GS
2119 if (flags & SCF_DO_STCLASS) {
2120 mincount = 0;
b81d288d 2121 maxcount = REG_INFTY;
653099ff
GS
2122 next = regnext(scan);
2123 scan = NEXTOPER(scan);
2124 goto do_curly;
2125 }
b81d288d 2126 is_inf = is_inf_internal = 1;
c277df42
IZ
2127 scan = regnext(scan);
2128 if (flags & SCF_DO_SUBSTR) {
830247a4 2129 scan_commit(pRExC_state, data); /* Cannot extend fixed substrings */
c277df42
IZ
2130 data->longest = &(data->longest_float);
2131 }
2132 goto optimize_curly_tail;
2133 case CURLY:
b81d288d 2134 mincount = ARG1(scan);
c277df42
IZ
2135 maxcount = ARG2(scan);
2136 next = regnext(scan);
cb434fcc
IZ
2137 if (OP(scan) == CURLYX) {
2138 I32 lp = (data ? *(data->last_closep) : 0);
a3621e74 2139 scan->flags = ((lp <= U8_MAX) ? (U8)lp : U8_MAX);
cb434fcc 2140 }
c277df42 2141 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
727f22e3 2142 next_is_eval = (OP(scan) == EVAL);
c277df42
IZ
2143 do_curly:
2144 if (flags & SCF_DO_SUBSTR) {
830247a4 2145 if (mincount == 0) scan_commit(pRExC_state,data); /* Cannot extend fixed substrings */
c277df42
IZ
2146 pos_before = data->pos_min;
2147 }
2148 if (data) {
2149 fl = data->flags;
2150 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
2151 if (is_inf)
2152 data->flags |= SF_IS_INF;
2153 }
653099ff 2154 if (flags & SCF_DO_STCLASS) {
830247a4 2155 cl_init(pRExC_state, &this_class);
653099ff
GS
2156 oclass = data->start_class;
2157 data->start_class = &this_class;
2158 f |= SCF_DO_STCLASS_AND;
2159 f &= ~SCF_DO_STCLASS_OR;
2160 }
e1901655
IZ
2161 /* These are the cases when once a subexpression
2162 fails at a particular position, it cannot succeed
2163 even after backtracking at the enclosing scope.
b81d288d 2164
e1901655
IZ
2165 XXXX what if minimal match and we are at the
2166 initial run of {n,m}? */
2167 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
2168 f &= ~SCF_WHILEM_VISITED_POS;
653099ff 2169
c277df42 2170 /* This will finish on WHILEM, setting scan, or on NULL: */
b81d288d 2171 minnext = study_chunk(pRExC_state, &scan, &deltanext, last, data,
a3621e74
YO
2172 (mincount == 0
2173 ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
653099ff
GS
2174
2175 if (flags & SCF_DO_STCLASS)
2176 data->start_class = oclass;
2177 if (mincount == 0 || minnext == 0) {
2178 if (flags & SCF_DO_STCLASS_OR) {
830247a4 2179 cl_or(pRExC_state, data->start_class, &this_class);
653099ff
GS
2180 }
2181 else if (flags & SCF_DO_STCLASS_AND) {
b81d288d 2182 /* Switch to OR mode: cache the old value of
653099ff
GS
2183 * data->start_class */
2184 StructCopy(data->start_class, &and_with,
2185 struct regnode_charclass_class);
2186 flags &= ~SCF_DO_STCLASS_AND;
2187 StructCopy(&this_class, data->start_class,
2188 struct regnode_charclass_class);
2189 flags |= SCF_DO_STCLASS_OR;
2190 data->start_class->flags |= ANYOF_EOS;
2191 }
2192 } else { /* Non-zero len */
2193 if (flags & SCF_DO_STCLASS_OR) {
830247a4 2194 cl_or(pRExC_state, data->start_class, &this_class);
653099ff
GS
2195 cl_and(data->start_class, &and_with);
2196 }
2197 else if (flags & SCF_DO_STCLASS_AND)
2198 cl_and(data->start_class, &this_class);
2199 flags &= ~SCF_DO_STCLASS;
2200 }
c277df42
IZ
2201 if (!scan) /* It was not CURLYX, but CURLY. */
2202 scan = next;
041457d9
DM
2203 if ( /* ? quantifier ok, except for (?{ ... }) */
2204 (next_is_eval || !(mincount == 0 && maxcount == 1))
84037bb0 2205 && (minnext == 0) && (deltanext == 0)
99799961 2206 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
041457d9
DM
2207 && maxcount <= REG_INFTY/3 /* Complement check for big count */
2208 && ckWARN(WARN_REGEXP))
b45f050a 2209 {
830247a4 2210 vWARN(RExC_parse,
b45f050a
JF
2211 "Quantifier unexpected on zero-length expression");
2212 }
2213
c277df42 2214 min += minnext * mincount;
b81d288d 2215 is_inf_internal |= ((maxcount == REG_INFTY
155aba94
GS
2216 && (minnext + deltanext) > 0)
2217 || deltanext == I32_MAX);
aca2d497 2218 is_inf |= is_inf_internal;
c277df42
IZ
2219 delta += (minnext + deltanext) * maxcount - minnext * mincount;
2220
2221 /* Try powerful optimization CURLYX => CURLYN. */
b81d288d 2222 if ( OP(oscan) == CURLYX && data
c277df42
IZ
2223 && data->flags & SF_IN_PAR
2224 && !(data->flags & SF_HAS_EVAL)
2225 && !deltanext && minnext == 1 ) {
2226 /* Try to optimize to CURLYN. */
2227 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
497b47a8
JH
2228 regnode *nxt1 = nxt;
2229#ifdef DEBUGGING
2230 regnode *nxt2;
2231#endif
c277df42
IZ
2232
2233 /* Skip open. */
2234 nxt = regnext(nxt);
bfed75c6 2235 if (!strchr((const char*)PL_simple,OP(nxt))
22c35a8c 2236 && !(PL_regkind[(U8)OP(nxt)] == EXACT
b81d288d 2237 && STR_LEN(nxt) == 1))
c277df42 2238 goto nogo;
497b47a8 2239#ifdef DEBUGGING
c277df42 2240 nxt2 = nxt;
497b47a8 2241#endif
c277df42 2242 nxt = regnext(nxt);
b81d288d 2243 if (OP(nxt) != CLOSE)
c277df42
IZ
2244 goto nogo;
2245 /* Now we know that nxt2 is the only contents: */
eb160463 2246 oscan->flags = (U8)ARG(nxt);
c277df42
IZ
2247 OP(oscan) = CURLYN;
2248 OP(nxt1) = NOTHING; /* was OPEN. */
2249#ifdef DEBUGGING
2250 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
2251 NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
2252 NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
2253 OP(nxt) = OPTIMIZED; /* was CLOSE. */
2254 OP(nxt + 1) = OPTIMIZED; /* was count. */
2255 NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
b81d288d 2256#endif
c277df42 2257 }
c277df42
IZ
2258 nogo:
2259
2260 /* Try optimization CURLYX => CURLYM. */
b81d288d 2261 if ( OP(oscan) == CURLYX && data
c277df42 2262 && !(data->flags & SF_HAS_PAR)
c277df42 2263 && !(data->flags & SF_HAS_EVAL)
0e788c72
HS
2264 && !deltanext /* atom is fixed width */
2265 && minnext != 0 /* CURLYM can't handle zero width */
2266 ) {
c277df42
IZ
2267 /* XXXX How to optimize if data == 0? */
2268 /* Optimize to a simpler form. */
2269 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
2270 regnode *nxt2;
2271
2272 OP(oscan) = CURLYM;
2273 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
b81d288d 2274 && (OP(nxt2) != WHILEM))
c277df42
IZ
2275 nxt = nxt2;
2276 OP(nxt2) = SUCCEED; /* Whas WHILEM */
c277df42
IZ
2277 /* Need to optimize away parenths. */
2278 if (data->flags & SF_IN_PAR) {
2279 /* Set the parenth number. */
2280 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
2281
b81d288d 2282 if (OP(nxt) != CLOSE)
b45f050a 2283 FAIL("Panic opt close");
eb160463 2284 oscan->flags = (U8)ARG(nxt);
c277df42
IZ
2285 OP(nxt1) = OPTIMIZED; /* was OPEN. */
2286 OP(nxt) = OPTIMIZED; /* was CLOSE. */
2287#ifdef DEBUGGING
2288 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
2289 OP(nxt + 1) = OPTIMIZED; /* was count. */
2290 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
2291 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
b81d288d 2292#endif
c277df42
IZ
2293#if 0
2294 while ( nxt1 && (OP(nxt1) != WHILEM)) {
2295 regnode *nnxt = regnext(nxt1);
b81d288d 2296
c277df42
IZ
2297 if (nnxt == nxt) {
2298 if (reg_off_by_arg[OP(nxt1)])
2299 ARG_SET(nxt1, nxt2 - nxt1);
2300 else if (nxt2 - nxt1 < U16_MAX)
2301 NEXT_OFF(nxt1) = nxt2 - nxt1;
2302 else
2303 OP(nxt) = NOTHING; /* Cannot beautify */
2304 }
2305 nxt1 = nnxt;
2306 }
2307#endif
2308 /* Optimize again: */
b81d288d 2309 study_chunk(pRExC_state, &nxt1, &deltanext, nxt,
a3621e74 2310 NULL, 0,depth+1);
a0ed51b3
LW
2311 }
2312 else
c277df42 2313 oscan->flags = 0;
c277df42 2314 }
e1901655
IZ
2315 else if ((OP(oscan) == CURLYX)
2316 && (flags & SCF_WHILEM_VISITED_POS)
2317 /* See the comment on a similar expression above.
2318 However, this time it not a subexpression
2319 we care about, but the expression itself. */
2320 && (maxcount == REG_INFTY)
2321 && data && ++data->whilem_c < 16) {
2322 /* This stays as CURLYX, we can put the count/of pair. */
2c2d71f5
JH
2323 /* Find WHILEM (as in regexec.c) */
2324 regnode *nxt = oscan + NEXT_OFF(oscan);
2325
2326 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
2327 nxt += ARG(nxt);
eb160463
GS
2328 PREVOPER(nxt)->flags = (U8)(data->whilem_c
2329 | (RExC_whilem_seen << 4)); /* On WHILEM */
2c2d71f5 2330 }
b81d288d 2331 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
c277df42
IZ
2332 pars++;
2333 if (flags & SCF_DO_SUBSTR) {
c445ea15 2334 SV *last_str = NULL;
c277df42
IZ
2335 int counted = mincount != 0;
2336
2337 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
5d1c421c
JH
2338#if defined(SPARC64_GCC_WORKAROUND)
2339 I32 b = 0;
2340 STRLEN l = 0;
cfd0369c 2341 const char *s = NULL;
5d1c421c
JH
2342 I32 old = 0;
2343
2344 if (pos_before >= data->last_start_min)
2345 b = pos_before;
2346 else
2347 b = data->last_start_min;
2348
2349 l = 0;
cfd0369c 2350 s = SvPV_const(data->last_found, l);
5d1c421c
JH
2351 old = b - data->last_start_min;
2352
2353#else
b81d288d 2354 I32 b = pos_before >= data->last_start_min
c277df42
IZ
2355 ? pos_before : data->last_start_min;
2356 STRLEN l;
cfd0369c 2357 const char *s = SvPV_const(data->last_found, l);
a0ed51b3 2358 I32 old = b - data->last_start_min;
5d1c421c 2359#endif
a0ed51b3
LW
2360
2361 if (UTF)
2362 old = utf8_hop((U8*)s, old) - (U8*)s;
c277df42 2363
a0ed51b3 2364 l -= old;
c277df42 2365 /* Get the added string: */
79cb57f6 2366 last_str = newSVpvn(s + old, l);
0e933229
IH
2367 if (UTF)
2368 SvUTF8_on(last_str);
c277df42
IZ
2369 if (deltanext == 0 && pos_before == b) {
2370 /* What was added is a constant string */
2371 if (mincount > 1) {
2372 SvGROW(last_str, (mincount * l) + 1);
b81d288d 2373 repeatcpy(SvPVX(last_str) + l,
3f7c398e 2374 SvPVX_const(last_str), l, mincount - 1);
b162af07 2375 SvCUR_set(last_str, SvCUR(last_str) * mincount);
c277df42 2376 /* Add additional parts. */
b81d288d 2377 SvCUR_set(data->last_found,
c277df42
IZ
2378 SvCUR(data->last_found) - l);
2379 sv_catsv(data->last_found, last_str);
0eda9292
JH
2380 {
2381 SV * sv = data->last_found;
2382 MAGIC *mg =
2383 SvUTF8(sv) && SvMAGICAL(sv) ?
2384 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2385 if (mg && mg->mg_len >= 0)
2386 mg->mg_len += CHR_SVLEN(last_str);
2387 }
c277df42
IZ
2388 data->last_end += l * (mincount - 1);
2389 }
2a8d9689
HS
2390 } else {
2391 /* start offset must point into the last copy */
2392 data->last_start_min += minnext * (mincount - 1);
c152dc43
HS
2393 data->last_start_max += is_inf ? I32_MAX
2394 : (maxcount - 1) * (minnext + data->pos_delta);
c277df42
IZ
2395 }
2396 }
2397 /* It is counted once already... */
2398 data->pos_min += minnext * (mincount - counted);
2399 data->pos_delta += - counted * deltanext +
2400 (minnext + deltanext) * maxcount - minnext * mincount;
2401 if (mincount != maxcount) {
653099ff
GS
2402 /* Cannot extend fixed substrings found inside
2403 the group. */
830247a4 2404 scan_commit(pRExC_state,data);
c277df42
IZ
2405 if (mincount && last_str) {
2406 sv_setsv(data->last_found, last_str);
2407 data->last_end = data->pos_min;
b81d288d 2408 data->last_start_min =
a0ed51b3 2409 data->pos_min - CHR_SVLEN(last_str);
b81d288d
AB
2410 data->last_start_max = is_inf
2411 ? I32_MAX
c277df42 2412 : data->pos_min + data->pos_delta
a0ed51b3 2413 - CHR_SVLEN(last_str);
c277df42
IZ
2414 }
2415 data->longest = &(data->longest_float);
2416 }
aca2d497 2417 SvREFCNT_dec(last_str);
c277df42 2418 }
405ff068 2419 if (data && (fl & SF_HAS_EVAL))
c277df42
IZ
2420 data->flags |= SF_HAS_EVAL;
2421 optimize_curly_tail:
c277df42 2422 if (OP(oscan) != CURLYX) {
22c35a8c 2423 while (PL_regkind[(U8)OP(next = regnext(oscan))] == NOTHING
c277df42
IZ
2424 && NEXT_OFF(next))
2425 NEXT_OFF(oscan) += NEXT_OFF(next);
2426 }
c277df42 2427 continue;
653099ff 2428 default: /* REF and CLUMP only? */
c277df42 2429 if (flags & SCF_DO_SUBSTR) {
830247a4 2430 scan_commit(pRExC_state,data); /* Cannot expect anything... */
c277df42
IZ
2431 data->longest = &(data->longest_float);
2432 }
aca2d497 2433 is_inf = is_inf_internal = 1;
653099ff 2434 if (flags & SCF_DO_STCLASS_OR)
830247a4 2435 cl_anything(pRExC_state, data->start_class);
653099ff 2436 flags &= ~SCF_DO_STCLASS;
c277df42
IZ
2437 break;
2438 }
a0ed51b3 2439 }
bfed75c6 2440 else if (strchr((const char*)PL_simple,OP(scan))) {
9c5ffd7c 2441 int value = 0;
653099ff 2442
c277df42 2443 if (flags & SCF_DO_SUBSTR) {
830247a4 2444 scan_commit(pRExC_state,data);
c277df42
IZ
2445 data->pos_min++;
2446 }
2447 min++;
653099ff
GS
2448 if (flags & SCF_DO_STCLASS) {
2449 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
2450
2451 /* Some of the logic below assumes that switching
2452 locale on will only add false positives. */
2453 switch (PL_regkind[(U8)OP(scan)]) {
653099ff 2454 case SANY:
653099ff
GS
2455 default:
2456 do_default:
2457 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
2458 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
830247a4 2459 cl_anything(pRExC_state, data->start_class);
653099ff
GS
2460 break;
2461 case REG_ANY:
2462 if (OP(scan) == SANY)
2463 goto do_default;
2464 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
2465 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
2466 || (data->start_class->flags & ANYOF_CLASS));
830247a4 2467 cl_anything(pRExC_state, data->start_class);
653099ff
GS
2468 }
2469 if (flags & SCF_DO_STCLASS_AND || !value)
2470 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
2471 break;
2472 case ANYOF:
2473 if (flags & SCF_DO_STCLASS_AND)
2474 cl_and(data->start_class,
2475 (struct regnode_charclass_class*)scan);
2476 else
830247a4 2477 cl_or(pRExC_state, data->start_class,
653099ff
GS
2478 (struct regnode_charclass_class*)scan);
2479 break;
2480 case ALNUM:
2481 if (flags & SCF_DO_STCLASS_AND) {
2482 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2483 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
2484 for (value = 0; value < 256; value++)
2485 if (!isALNUM(value))
2486 ANYOF_BITMAP_CLEAR(data->start_class, value);
2487 }
2488 }
2489 else {
2490 if (data->start_class->flags & ANYOF_LOCALE)
2491 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
2492 else {
2493 for (value = 0; value < 256; value++)
2494 if (isALNUM(value))
b81d288d 2495 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
2496 }
2497 }
2498 break;
2499 case ALNUML:
2500 if (flags & SCF_DO_STCLASS_AND) {
2501 if (data->start_class->flags & ANYOF_LOCALE)
2502 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
2503 }
2504 else {
2505 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
2506 data->start_class->flags |= ANYOF_LOCALE;
2507 }
2508 break;
2509 case NALNUM:
2510 if (flags & SCF_DO_STCLASS_AND) {
2511 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2512 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
2513 for (value = 0; value < 256; value++)
2514 if (isALNUM(value))
2515 ANYOF_BITMAP_CLEAR(data->start_class, value);
2516 }
2517 }
2518 else {
2519 if (data->start_class->flags & ANYOF_LOCALE)
2520 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
2521 else {
2522 for (value = 0; value < 256; value++)
2523 if (!isALNUM(value))
b81d288d 2524 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
2525 }
2526 }
2527 break;
2528 case NALNUML:
2529 if (flags & SCF_DO_STCLASS_AND) {
2530 if (data->start_class->flags & ANYOF_LOCALE)
2531 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
2532 }
2533 else {
2534 data->start_class->flags |= ANYOF_LOCALE;
2535 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
2536 }
2537 break;
2538 case SPACE:
2539 if (flags & SCF_DO_STCLASS_AND) {
2540 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2541 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
2542 for (value = 0; value < 256; value++)
2543 if (!isSPACE(value))
2544 ANYOF_BITMAP_CLEAR(data->start_class, value);
2545 }
2546 }
2547 else {
2548 if (data->start_class->flags & ANYOF_LOCALE)
2549 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
2550 else {
2551 for (value = 0; value < 256; value++)
2552 if (isSPACE(value))
b81d288d 2553 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
2554 }
2555 }
2556 break;
2557 case SPACEL:
2558 if (flags & SCF_DO_STCLASS_AND) {
2559 if (data->start_class->flags & ANYOF_LOCALE)
2560 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
2561 }
2562 else {
2563 data->start_class->flags |= ANYOF_LOCALE;
2564 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
2565 }
2566 break;
2567 case NSPACE:
2568 if (flags & SCF_DO_STCLASS_AND) {
2569 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2570 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
2571 for (value = 0; value < 256; value++)
2572 if (isSPACE(value))
2573 ANYOF_BITMAP_CLEAR(data->start_class, value);
2574 }
2575 }
2576 else {
2577 if (data->start_class->flags & ANYOF_LOCALE)
2578 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
2579 else {
2580 for (value = 0; value < 256; value++)
2581 if (!isSPACE(value))
b81d288d 2582 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
2583 }
2584 }
2585 break;
2586 case NSPACEL:
2587 if (flags & SCF_DO_STCLASS_AND) {
2588 if (data->start_class->flags & ANYOF_LOCALE) {
2589 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
2590 for (value = 0; value < 256; value++)
2591 if (!isSPACE(value))
2592 ANYOF_BITMAP_CLEAR(data->start_class, value);
2593 }
2594 }
2595 else {
2596 data->start_class->flags |= ANYOF_LOCALE;
2597 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
2598 }
2599 break;
2600 case DIGIT:
2601 if (flags & SCF_DO_STCLASS_AND) {
2602 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
2603 for (value = 0; value < 256; value++)
2604 if (!isDIGIT(value))
2605 ANYOF_BITMAP_CLEAR(data->start_class, value);
2606 }
2607 else {
2608 if (data->start_class->flags & ANYOF_LOCALE)
2609 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
2610 else {
2611 for (value = 0; value < 256; value++)
2612 if (isDIGIT(value))
b81d288d 2613 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
2614 }
2615 }
2616 break;
2617 case NDIGIT:
2618 if (flags & SCF_DO_STCLASS_AND) {
2619 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
2620 for (value = 0; value < 256; value++)
2621 if (isDIGIT(value))
2622 ANYOF_BITMAP_CLEAR(data->start_class, value);
2623 }
2624 else {
2625 if (data->start_class->flags & ANYOF_LOCALE)
2626 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
2627 else {
2628 for (value = 0; value < 256; value++)
2629 if (!isDIGIT(value))
b81d288d 2630 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
2631 }
2632 }
2633 break;
2634 }
2635 if (flags & SCF_DO_STCLASS_OR)
2636 cl_and(data->start_class, &and_with);
2637 flags &= ~SCF_DO_STCLASS;
2638 }
a0ed51b3 2639 }
22c35a8c 2640 else if (PL_regkind[(U8)OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
c277df42
IZ
2641 data->flags |= (OP(scan) == MEOL
2642 ? SF_BEFORE_MEOL
2643 : SF_BEFORE_SEOL);
a0ed51b3 2644 }
653099ff
GS
2645 else if ( PL_regkind[(U8)OP(scan)] == BRANCHJ
2646 /* Lookbehind, or need to calculate parens/evals/stclass: */
2647 && (scan->flags || data || (flags & SCF_DO_STCLASS))
c277df42 2648 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
653099ff 2649 /* Lookahead/lookbehind */
cb434fcc 2650 I32 deltanext, minnext, fake = 0;
c277df42 2651 regnode *nscan;
653099ff
GS
2652 struct regnode_charclass_class intrnl;
2653 int f = 0;
c277df42
IZ
2654
2655 data_fake.flags = 0;
b81d288d 2656 if (data) {
2c2d71f5 2657 data_fake.whilem_c = data->whilem_c;
cb434fcc
IZ
2658 data_fake.last_closep = data->last_closep;
2659 }
2660 else
2661 data_fake.last_closep = &fake;
653099ff
GS
2662 if ( flags & SCF_DO_STCLASS && !scan->flags
2663 && OP(scan) == IFMATCH ) { /* Lookahead */
830247a4 2664 cl_init(pRExC_state, &intrnl);
653099ff 2665 data_fake.start_class = &intrnl;
e1901655 2666 f |= SCF_DO_STCLASS_AND;
653099ff 2667 }
e1901655
IZ
2668 if (flags & SCF_WHILEM_VISITED_POS)
2669 f |= SCF_WHILEM_VISITED_POS;
c277df42
IZ
2670 next = regnext(scan);
2671 nscan = NEXTOPER(NEXTOPER(scan));
a3621e74 2672 minnext = study_chunk(pRExC_state, &nscan, &deltanext, last, &data_fake, f,depth+1);
c277df42
IZ
2673 if (scan->flags) {
2674 if (deltanext) {
9baa0206 2675 vFAIL("Variable length lookbehind not implemented");
a0ed51b3
LW
2676 }
2677 else if (minnext > U8_MAX) {
9baa0206 2678 vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
c277df42 2679 }
eb160463 2680 scan->flags = (U8)minnext;
c277df42
IZ
2681 }
2682 if (data && data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
2683 pars++;
405ff068 2684 if (data && (data_fake.flags & SF_HAS_EVAL))
c277df42 2685 data->flags |= SF_HAS_EVAL;
2c2d71f5
JH
2686 if (data)
2687 data->whilem_c = data_fake.whilem_c;
e1901655 2688 if (f & SCF_DO_STCLASS_AND) {
a28509cc 2689 const int was = (data->start_class->flags & ANYOF_EOS);
653099ff
GS
2690
2691 cl_and(data->start_class, &intrnl);
2692 if (was)
2693 data->start_class->flags |= ANYOF_EOS;
2694 }
a0ed51b3
LW
2695 }
2696 else if (OP(scan) == OPEN) {
c277df42 2697 pars++;
a0ed51b3 2698 }
cb434fcc 2699 else if (OP(scan) == CLOSE) {
eb160463 2700 if ((I32)ARG(scan) == is_par) {
cb434fcc 2701 next = regnext(scan);
c277df42 2702
cb434fcc
IZ
2703 if ( next && (OP(next) != WHILEM) && next < last)
2704 is_par = 0; /* Disable optimization */
2705 }
2706 if (data)
2707 *(data->last_closep) = ARG(scan);
a0ed51b3
LW
2708 }
2709 else if (OP(scan) == EVAL) {
c277df42
IZ
2710 if (data)
2711 data->flags |= SF_HAS_EVAL;
2712 }
96776eda 2713 else if (OP(scan) == LOGICAL && scan->flags == 2) { /* Embedded follows */
0f5d15d6 2714 if (flags & SCF_DO_SUBSTR) {
830247a4 2715 scan_commit(pRExC_state,data);
0f5d15d6
IZ
2716 data->longest = &(data->longest_float);
2717 }
2718 is_inf = is_inf_internal = 1;
653099ff 2719 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
830247a4 2720 cl_anything(pRExC_state, data->start_class);
96776eda 2721 flags &= ~SCF_DO_STCLASS;
0f5d15d6 2722 }
c277df42
IZ
2723 /* Else: zero-length, ignore. */
2724 scan = regnext(scan);
2725 }
2726
2727 finish:
2728 *scanp = scan;
aca2d497 2729 *deltap = is_inf_internal ? I32_MAX : delta;
b81d288d 2730 if (flags & SCF_DO_SUBSTR && is_inf)
c277df42
IZ
2731 data->pos_delta = I32_MAX - data->pos_min;
2732 if (is_par > U8_MAX)
2733 is_par = 0;
2734 if (is_par && pars==1 && data) {
2735 data->flags |= SF_IN_PAR;
2736 data->flags &= ~SF_HAS_PAR;
a0ed51b3
LW
2737 }
2738 else if (pars && data) {
c277df42
IZ
2739 data->flags |= SF_HAS_PAR;
2740 data->flags &= ~SF_IN_PAR;
2741 }
653099ff
GS
2742 if (flags & SCF_DO_STCLASS_OR)
2743 cl_and(data->start_class, &and_with);
c277df42
IZ
2744 return min;
2745}
2746
76e3520e 2747STATIC I32
bfed75c6 2748S_add_data(pTHX_ RExC_state_t *pRExC_state, I32 n, const char *s)
c277df42 2749{
830247a4 2750 if (RExC_rx->data) {
b81d288d
AB
2751 Renewc(RExC_rx->data,
2752 sizeof(*RExC_rx->data) + sizeof(void*) * (RExC_rx->data->count + n - 1),
c277df42 2753 char, struct reg_data);
830247a4
IZ
2754 Renew(RExC_rx->data->what, RExC_rx->data->count + n, U8);
2755 RExC_rx->data->count += n;
a0ed51b3
LW
2756 }
2757 else {
a02a5408 2758 Newxc(RExC_rx->data, sizeof(*RExC_rx->data) + sizeof(void*) * (n - 1),
c277df42 2759 char, struct reg_data);
a02a5408 2760 Newx(RExC_rx->data->what, n, U8);
830247a4 2761 RExC_rx->data->count = n;
c277df42 2762 }
830247a4
IZ
2763 Copy(s, RExC_rx->data->what + RExC_rx->data->count - n, n, U8);
2764 return RExC_rx->data->count - n;
c277df42
IZ
2765}
2766
d88dccdf 2767void
864dbfa3 2768Perl_reginitcolors(pTHX)
d88dccdf 2769{
1df70142 2770 const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
d88dccdf 2771 if (s) {
1df70142
AL
2772 char *t = savepv(s);
2773 int i = 0;
2774 PL_colors[0] = t;
d88dccdf 2775 while (++i < 6) {
1df70142
AL
2776 t = strchr(t, '\t');
2777 if (t) {
2778 *t = '\0';
2779 PL_colors[i] = ++t;
d88dccdf
IZ
2780 }
2781 else
1df70142 2782 PL_colors[i] = t = (char *)"";
d88dccdf
IZ
2783 }
2784 } else {
1df70142 2785 int i = 0;
b81d288d 2786 while (i < 6)
06b5626a 2787 PL_colors[i++] = (char *)"";
d88dccdf
IZ
2788 }
2789 PL_colorset = 1;
2790}
2791
8615cb43 2792
a687059c 2793/*
e50aee73 2794 - pregcomp - compile a regular expression into internal code
a687059c
LW
2795 *
2796 * We can't allocate space until we know how big the compiled form will be,
2797 * but we can't compile it (and thus know how big it is) until we've got a
2798 * place to put the code. So we cheat: we compile it twice, once with code
2799 * generation turned off and size counting turned on, and once "for real".
2800 * This also means that we don't allocate space until we are sure that the
2801 * thing really will compile successfully, and we never have to move the
2802 * code and thus invalidate pointers into it. (Note that it has to be in
2803 * one piece because free() must be able to free it all.) [NB: not true in perl]
2804 *
2805 * Beware that the optimization-preparation code in here knows about some
2806 * of the structure of the compiled regexp. [I'll say.]
2807 */
2808regexp *
864dbfa3 2809Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
a687059c 2810{
a0d0e21e 2811 register regexp *r;
c277df42 2812 regnode *scan;
c277df42 2813 regnode *first;
a0d0e21e 2814 I32 flags;
a0d0e21e
LW
2815 I32 minlen = 0;
2816 I32 sawplus = 0;
2817 I32 sawopen = 0;
2c2d71f5 2818 scan_data_t data;
830247a4
IZ
2819 RExC_state_t RExC_state;
2820 RExC_state_t *pRExC_state = &RExC_state;
a0d0e21e 2821
a3621e74
YO
2822 GET_RE_DEBUG_FLAGS_DECL;
2823
a0d0e21e 2824 if (exp == NULL)
c277df42 2825 FAIL("NULL regexp argument");
a0d0e21e 2826
a5961de5 2827 RExC_utf8 = pm->op_pmdynflags & PMdf_CMP_UTF8;
a0ed51b3 2828
5cfc7842 2829 RExC_precomp = exp;
a3621e74
YO
2830 DEBUG_r(if (!PL_colorset) reginitcolors());
2831 DEBUG_COMPILE_r({
2832 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s \"%s%*s%s\"\n",
a5961de5
JH
2833 PL_colors[4],PL_colors[5],PL_colors[0],
2834 (int)(xend - exp), RExC_precomp, PL_colors[1]);
2835 });
e2509266 2836 RExC_flags = pm->op_pmflags;
830247a4 2837 RExC_sawback = 0;
bbce6d69 2838
830247a4
IZ
2839 RExC_seen = 0;
2840 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
2841 RExC_seen_evals = 0;
2842 RExC_extralen = 0;
c277df42 2843
bbce6d69 2844 /* First pass: determine size, legality. */
830247a4 2845 RExC_parse = exp;
fac92740 2846 RExC_start = exp;
830247a4
IZ
2847 RExC_end = xend;
2848 RExC_naughty = 0;
2849 RExC_npar = 1;
2850 RExC_size = 0L;
2851 RExC_emit = &PL_regdummy;
2852 RExC_whilem_seen = 0;
85ddcde9
JH
2853#if 0 /* REGC() is (currently) a NOP at the first pass.
2854 * Clever compilers notice this and complain. --jhi */
830247a4 2855 REGC((U8)REG_MAGIC, (char*)RExC_emit);
85ddcde9 2856#endif
830247a4 2857 if (reg(pRExC_state, 0, &flags) == NULL) {
c445ea15 2858 RExC_precomp = NULL;
a0d0e21e
LW
2859 return(NULL);
2860 }
a3621e74 2861 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "size %"IVdf" ", (IV)RExC_size));
c277df42 2862
c277df42
IZ
2863 /* Small enough for pointer-storage convention?
2864 If extralen==0, this means that we will not need long jumps. */
830247a4
IZ
2865 if (RExC_size >= 0x10000L && RExC_extralen)
2866 RExC_size += RExC_extralen;
c277df42 2867 else
830247a4
IZ
2868 RExC_extralen = 0;
2869 if (RExC_whilem_seen > 15)
2870 RExC_whilem_seen = 15;
a0d0e21e 2871
bbce6d69 2872 /* Allocate space and initialize. */
a02a5408 2873 Newxc(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode),
c277df42 2874 char, regexp);
a0d0e21e 2875 if (r == NULL)
b45f050a
JF
2876 FAIL("Regexp out of space");
2877
0f79a09d
GS
2878#ifdef DEBUGGING
2879 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
830247a4 2880 Zero(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode), char);
0f79a09d 2881#endif
c277df42 2882 r->refcnt = 1;
bbce6d69 2883 r->prelen = xend - exp;
5cfc7842 2884 r->precomp = savepvn(RExC_precomp, r->prelen);
cf93c79d 2885 r->subbeg = NULL;
f8c7b90f 2886#ifdef PERL_OLD_COPY_ON_WRITE
c445ea15 2887 r->saved_copy = NULL;
ed252734 2888#endif
cf93c79d 2889 r->reganch = pm->op_pmflags & PMf_COMPILETIME;
830247a4 2890 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
4327152a
IZ
2891
2892 r->substrs = 0; /* Useful during FAIL. */
2893 r->startp = 0; /* Useful during FAIL. */
2894 r->endp = 0; /* Useful during FAIL. */
2895
a02a5408 2896 Newxz(r->offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
fac92740 2897 if (r->offsets) {
2af232bd 2898 r->offsets[0] = RExC_size;
fac92740 2899 }
a3621e74 2900 DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
2af232bd
SS
2901 "%s %"UVuf" bytes for offset annotations.\n",
2902 r->offsets ? "Got" : "Couldn't get",
392fbf5d 2903 (UV)((2*RExC_size+1) * sizeof(U32))));
fac92740 2904
830247a4 2905 RExC_rx = r;
bbce6d69 2906
2907 /* Second pass: emit code. */
e2509266 2908 RExC_flags = pm->op_pmflags; /* don't let top level (?i) bleed */
830247a4
IZ
2909 RExC_parse = exp;
2910 RExC_end = xend;
2911 RExC_naughty = 0;
2912 RExC_npar = 1;
fac92740 2913 RExC_emit_start = r->program;
830247a4 2914 RExC_emit = r->program;
2cd61cdb 2915 /* Store the count of eval-groups for security checks: */
eb160463 2916 RExC_emit->next_off = (U16)((RExC_seen_evals > U16_MAX) ? U16_MAX : RExC_seen_evals);
830247a4 2917 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
c277df42 2918 r->data = 0;
830247a4 2919 if (reg(pRExC_state, 0, &flags) == NULL)
a0d0e21e
LW
2920 return(NULL);
2921
a3621e74 2922
a0d0e21e 2923 /* Dig out information for optimizations. */
cf93c79d 2924 r->reganch = pm->op_pmflags & PMf_COMPILETIME; /* Again? */
e2509266 2925 pm->op_pmflags = RExC_flags;
a0ed51b3 2926 if (UTF)
5ff6fc6d 2927 r->reganch |= ROPT_UTF8; /* Unicode in it? */
c277df42 2928 r->regstclass = NULL;
830247a4 2929 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
a0ed51b3 2930 r->reganch |= ROPT_NAUGHTY;
c277df42 2931 scan = r->program + 1; /* First BRANCH. */
2779dcf1
IZ
2932
2933 /* XXXX To minimize changes to RE engine we always allocate
2934 3-units-long substrs field. */
a02a5408 2935 Newxz(r->substrs, 1, struct reg_substr_data);
2779dcf1 2936
2c2d71f5 2937 StructCopy(&zero_scan_data, &data, scan_data_t);
653099ff 2938 /* XXXX Should not we check for something else? Usually it is OPEN1... */
c277df42 2939 if (OP(scan) != BRANCH) { /* Only one top-level choice. */
c277df42 2940 I32 fake;
c5254dd6 2941 STRLEN longest_float_length, longest_fixed_length;
653099ff
GS
2942 struct regnode_charclass_class ch_class;
2943 int stclass_flag;
cb434fcc 2944 I32 last_close = 0;
a0d0e21e
LW
2945
2946 first = scan;
c277df42 2947 /* Skip introductions and multiplicators >= 1. */
a0d0e21e 2948 while ((OP(first) == OPEN && (sawopen = 1)) ||
653099ff 2949 /* An OR of *one* alternative - should not happen now. */
a0d0e21e
LW
2950 (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) ||
2951 (OP(first) == PLUS) ||
2952 (OP(first) == MINMOD) ||
653099ff 2953 /* An {n,m} with n>0 */
22c35a8c 2954 (PL_regkind[(U8)OP(first)] == CURLY && ARG1(first) > 0) ) {
a0d0e21e
LW
2955 if (OP(first) == PLUS)
2956 sawplus = 1;
2957 else
2958 first += regarglen[(U8)OP(first)];
2959 first = NEXTOPER(first);
a687059c
LW
2960 }
2961
a0d0e21e
LW
2962 /* Starting-point info. */
2963 again:
653099ff 2964 if (PL_regkind[(U8)OP(first)] == EXACT) {
1aa99e6b
IH
2965 if (OP(first) == EXACT)
2966 ; /* Empty, get anchored substr later. */
2967 else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
b3c9acc1
IZ
2968 r->regstclass = first;
2969 }
bfed75c6 2970 else if (strchr((const char*)PL_simple,OP(first)))
a0d0e21e 2971 r->regstclass = first;
22c35a8c
GS
2972 else if (PL_regkind[(U8)OP(first)] == BOUND ||
2973 PL_regkind[(U8)OP(first)] == NBOUND)
a0d0e21e 2974 r->regstclass = first;
22c35a8c 2975 else if (PL_regkind[(U8)OP(first)] == BOL) {
cad2e5aa
JH
2976 r->reganch |= (OP(first) == MBOL
2977 ? ROPT_ANCH_MBOL
2978 : (OP(first) == SBOL
2979 ? ROPT_ANCH_SBOL
2980 : ROPT_ANCH_BOL));
a0d0e21e 2981 first = NEXTOPER(first);
774d564b 2982 goto again;
2983 }
2984 else if (OP(first) == GPOS) {
2985 r->reganch |= ROPT_ANCH_GPOS;
2986 first = NEXTOPER(first);
2987 goto again;
a0d0e21e 2988 }
e09294f4 2989 else if (!sawopen && (OP(first) == STAR &&
22c35a8c 2990 PL_regkind[(U8)OP(NEXTOPER(first))] == REG_ANY) &&
a0d0e21e
LW
2991 !(r->reganch & ROPT_ANCH) )
2992 {
2993 /* turn .* into ^.* with an implied $*=1 */
1df70142
AL
2994 const int type =
2995 (OP(NEXTOPER(first)) == REG_ANY)
2996 ? ROPT_ANCH_MBOL
2997 : ROPT_ANCH_SBOL;
cad2e5aa 2998 r->reganch |= type | ROPT_IMPLICIT;
a0d0e21e 2999 first = NEXTOPER(first);
774d564b 3000 goto again;
a0d0e21e 3001 }
b81d288d 3002 if (sawplus && (!sawopen || !RExC_sawback)
830247a4 3003 && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
cad2e5aa
JH
3004 /* x+ must match at the 1st pos of run of x's */
3005 r->reganch |= ROPT_SKIP;
a0d0e21e 3006
c277df42 3007 /* Scan is after the zeroth branch, first is atomic matcher. */
a3621e74 3008 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
b900a521 3009 (IV)(first - scan + 1)));
a0d0e21e
LW
3010 /*
3011 * If there's something expensive in the r.e., find the
3012 * longest literal string that must appear and make it the
3013 * regmust. Resolve ties in favor of later strings, since
3014 * the regstart check works with the beginning of the r.e.
3015 * and avoiding duplication strengthens checking. Not a
3016 * strong reason, but sufficient in the absence of others.
3017 * [Now we resolve ties in favor of the earlier string if
c277df42 3018 * it happens that c_offset_min has been invalidated, since the
a0d0e21e
LW
3019 * earlier string may buy us something the later one won't.]
3020 */
a0d0e21e 3021 minlen = 0;
a687059c 3022
396482e1
GA
3023 data.longest_fixed = newSVpvs("");
3024 data.longest_float = newSVpvs("");
3025 data.last_found = newSVpvs("");
c277df42
IZ
3026 data.longest = &(data.longest_fixed);
3027 first = scan;
653099ff 3028 if (!r->regstclass) {
830247a4 3029 cl_init(pRExC_state, &ch_class);
653099ff
GS
3030 data.start_class = &ch_class;
3031 stclass_flag = SCF_DO_STCLASS_AND;
3032 } else /* XXXX Check for BOUND? */
3033 stclass_flag = 0;
cb434fcc 3034 data.last_closep = &last_close;
653099ff 3035
830247a4 3036 minlen = study_chunk(pRExC_state, &first, &fake, scan + RExC_size, /* Up to end */
a3621e74 3037 &data, SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
830247a4 3038 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
b81d288d 3039 && data.last_start_min == 0 && data.last_end > 0
830247a4
IZ
3040 && !RExC_seen_zerolen
3041 && (!(RExC_seen & REG_SEEN_GPOS) || (r->reganch & ROPT_ANCH_GPOS)))
c277df42 3042 r->reganch |= ROPT_CHECK_ALL;
830247a4 3043 scan_commit(pRExC_state, &data);
c277df42
IZ
3044 SvREFCNT_dec(data.last_found);
3045
a0ed51b3 3046 longest_float_length = CHR_SVLEN(data.longest_float);
c5254dd6 3047 if (longest_float_length
c277df42
IZ
3048 || (data.flags & SF_FL_BEFORE_EOL
3049 && (!(data.flags & SF_FL_BEFORE_MEOL)
e2509266 3050 || (RExC_flags & PMf_MULTILINE)))) {
cf93c79d
IZ
3051 int t;
3052
a0ed51b3 3053 if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
aca2d497
IZ
3054 && data.offset_fixed == data.offset_float_min
3055 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
3056 goto remove_float; /* As in (a)+. */
3057
33b8afdf
JH
3058 if (SvUTF8(data.longest_float)) {
3059 r->float_utf8 = data.longest_float;
c445ea15 3060 r->float_substr = NULL;
33b8afdf
JH
3061 } else {
3062 r->float_substr = data.longest_float;
c445ea15 3063 r->float_utf8 = NULL;
33b8afdf 3064 }
c277df42
IZ
3065 r->float_min_offset = data.offset_float_min;
3066 r->float_max_offset = data.offset_float_max;
cf93c79d
IZ
3067 t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
3068 && (!(data.flags & SF_FL_BEFORE_MEOL)
e2509266 3069 || (RExC_flags & PMf_MULTILINE)));
33b8afdf 3070 fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
a0ed51b3
LW
3071 }
3072 else {
aca2d497 3073 remove_float:
c445ea15 3074 r->float_substr = r->float_utf8 = NULL;
c277df42 3075 SvREFCNT_dec(data.longest_float);
c5254dd6 3076 longest_float_length = 0;
a0d0e21e 3077 }
c277df42 3078
a0ed51b3 3079 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
c5254dd6 3080 if (longest_fixed_length
c277df42
IZ
3081 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
3082 && (!(data.flags & SF_FIX_BEFORE_MEOL)
e2509266 3083 || (RExC_flags & PMf_MULTILINE)))) {
cf93c79d
IZ
3084 int t;
3085
33b8afdf
JH
3086 if (SvUTF8(data.longest_fixed)) {
3087 r->anchored_utf8 = data.longest_fixed;
c445ea15 3088 r->anchored_substr = NULL;
33b8afdf
JH
3089 } else {
3090 r->anchored_substr = data.longest_fixed;
c445ea15 3091 r->anchored_utf8 = NULL;
33b8afdf 3092 }
c277df42 3093 r->anchored_offset = data.offset_fixed;
cf93c79d
IZ
3094 t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
3095 && (!(data.flags & SF_FIX_BEFORE_MEOL)
e2509266 3096 || (RExC_flags & PMf_MULTILINE)));
33b8afdf 3097 fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
a0ed51b3
LW
3098 }
3099 else {
c445ea15 3100 r->anchored_substr = r->anchored_utf8 = NULL;
c277df42 3101 SvREFCNT_dec(data.longest_fixed);
c5254dd6 3102 longest_fixed_length = 0;
a0d0e21e 3103 }
b81d288d 3104 if (r->regstclass
ffc61ed2 3105 && (OP(r->regstclass) == REG_ANY || OP(r->regstclass) == SANY))
653099ff 3106 r->regstclass = NULL;
33b8afdf
JH
3107 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
3108 && stclass_flag
653099ff 3109 && !(data.start_class->flags & ANYOF_EOS)
eb160463
GS
3110 && !cl_is_anything(data.start_class))
3111 {
1df70142 3112 const I32 n = add_data(pRExC_state, 1, "f");
653099ff 3113
a02a5408 3114 Newx(RExC_rx->data->data[n], 1,
653099ff
GS
3115 struct regnode_charclass_class);
3116 StructCopy(data.start_class,
830247a4 3117 (struct regnode_charclass_class*)RExC_rx->data->data[n],
653099ff 3118 struct regnode_charclass_class);
830247a4 3119 r->regstclass = (regnode*)RExC_rx->data->data[n];
653099ff 3120 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
ffc61ed2 3121 PL_regdata = r->data; /* for regprop() */
a3621e74 3122 DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
9c5ffd7c
JH
3123 regprop(sv, (regnode*)data.start_class);
3124 PerlIO_printf(Perl_debug_log,
a0288114 3125 "synthetic stclass \"%s\".\n",
3f7c398e 3126 SvPVX_const(sv));});
653099ff 3127 }
c277df42
IZ
3128
3129 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
c5254dd6 3130 if (longest_fixed_length > longest_float_length) {
c277df42 3131 r->check_substr = r->anchored_substr;
33b8afdf 3132 r->check_utf8 = r->anchored_utf8;
c277df42
IZ
3133 r->check_offset_min = r->check_offset_max = r->anchored_offset;
3134 if (r->reganch & ROPT_ANCH_SINGLE)
3135 r->reganch |= ROPT_NOSCAN;
a0ed51b3
LW
3136 }
3137 else {
c277df42 3138 r->check_substr = r->float_substr;
33b8afdf 3139 r->check_utf8 = r->float_utf8;
c277df42
IZ
3140 r->check_offset_min = data.offset_float_min;
3141 r->check_offset_max = data.offset_float_max;
a0d0e21e 3142 }
30382c73
IZ
3143 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
3144 This should be changed ASAP! */
33b8afdf 3145 if ((r->check_substr || r->check_utf8) && !(r->reganch & ROPT_ANCH_GPOS)) {
cad2e5aa 3146 r->reganch |= RE_USE_INTUIT;
33b8afdf 3147 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
cad2e5aa
JH
3148 r->reganch |= RE_INTUIT_TAIL;
3149 }
a0ed51b3
LW
3150 }
3151 else {
c277df42
IZ
3152 /* Several toplevels. Best we can is to set minlen. */
3153 I32 fake;
653099ff 3154 struct regnode_charclass_class ch_class;
cb434fcc 3155 I32 last_close = 0;
c277df42 3156
a3621e74 3157 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "\n"));
c277df42 3158 scan = r->program + 1;
830247a4 3159 cl_init(pRExC_state, &ch_class);
653099ff 3160 data.start_class = &ch_class;
cb434fcc 3161 data.last_closep = &last_close;
a3621e74 3162 minlen = study_chunk(pRExC_state, &scan, &fake, scan + RExC_size, &data, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
33b8afdf 3163 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
c445ea15 3164 = r->float_substr = r->float_utf8 = NULL;
653099ff 3165 if (!(data.start_class->flags & ANYOF_EOS)
eb160463
GS
3166 && !cl_is_anything(data.start_class))
3167 {
1df70142 3168 const I32 n = add_data(pRExC_state, 1, "f");
653099ff 3169
a02a5408 3170 Newx(RExC_rx->data->data[n], 1,
653099ff
GS
3171 struct regnode_charclass_class);
3172 StructCopy(data.start_class,
830247a4 3173 (struct regnode_charclass_class*)RExC_rx->data->data[n],
653099ff 3174 struct regnode_charclass_class);
830247a4 3175 r->regstclass = (regnode*)RExC_rx->data->data[n];
653099ff 3176 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
a3621e74 3177 DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
9c5ffd7c
JH
3178 regprop(sv, (regnode*)data.start_class);
3179 PerlIO_printf(Perl_debug_log,
a0288114 3180 "synthetic stclass \"%s\".\n",
3f7c398e 3181 SvPVX_const(sv));});
653099ff 3182 }
a0d0e21e
LW
3183 }
3184
a0d0e21e 3185 r->minlen = minlen;
b81d288d 3186 if (RExC_seen & REG_SEEN_GPOS)
c277df42 3187 r->reganch |= ROPT_GPOS_SEEN;
830247a4 3188 if (RExC_seen & REG_SEEN_LOOKBEHIND)
c277df42 3189 r->reganch |= ROPT_LOOKBEHIND_SEEN;
830247a4 3190 if (RExC_seen & REG_SEEN_EVAL)
ce862d02 3191 r->reganch |= ROPT_EVAL_SEEN;
f33976b4
DB
3192 if (RExC_seen & REG_SEEN_CANY)
3193 r->reganch |= ROPT_CANY_SEEN;
a02a5408
JC
3194 Newxz(r->startp, RExC_npar, I32);
3195 Newxz(r->endp, RExC_npar, I32);
ffc61ed2 3196 PL_regdata = r->data; /* for regprop() */
a3621e74 3197 DEBUG_COMPILE_r(regdump(r));
a0d0e21e 3198 return(r);
a687059c
LW
3199}
3200
3201/*
3202 - reg - regular expression, i.e. main body or parenthesized thing
3203 *
3204 * Caller must absorb opening parenthesis.
3205 *
3206 * Combining parenthesis handling with the base level of regular expression
3207 * is a trifle forced, but the need to tie the tails of the branches to what
3208 * follows makes it hard to avoid.
3209 */
76e3520e 3210STATIC regnode *
830247a4 3211S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp)
c277df42 3212 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
a687059c 3213{
27da23d5 3214 dVAR;
c277df42
IZ
3215 register regnode *ret; /* Will be the head of the group. */
3216 register regnode *br;
3217 register regnode *lastbr;
cbbf8932 3218 register regnode *ender = NULL;
a0d0e21e 3219 register I32 parno = 0;
cbbf8932
AL
3220 I32 flags;
3221 const I32 oregflags = RExC_flags;
3222 I32 have_branch = 0;
3223 I32 open = 0;
9d1d55b5
JP
3224
3225 /* for (?g), (?gc), and (?o) warnings; warning
3226 about (?c) will warn about (?g) -- japhy */
3227
cbbf8932
AL
3228 I32 wastedflags = 0x00;
3229 const I32 wasted_o = 0x01;
3230 const I32 wasted_g = 0x02;
3231 const I32 wasted_gc = 0x02 | 0x04;
3232 const I32 wasted_c = 0x04;
9d1d55b5 3233
fac92740 3234 char * parse_start = RExC_parse; /* MJD */
a28509cc 3235 char * const oregcomp_parse = RExC_parse;
c277df42 3236 char c;
a0d0e21e 3237
821b33a5 3238 *flagp = 0; /* Tentatively. */
a0d0e21e 3239
9d1d55b5 3240
a0d0e21e
LW
3241 /* Make an OPEN node, if parenthesized. */
3242 if (paren) {
fac92740 3243 if (*RExC_parse == '?') { /* (?...) */
2b36a5a0
JH
3244 U32 posflags = 0, negflags = 0;
3245 U32 *flagsp = &posflags;
0f5d15d6 3246 int logical = 0;
a28509cc 3247 const char * const seqstart = RExC_parse;
ca9dfc88 3248
830247a4
IZ
3249 RExC_parse++;
3250 paren = *RExC_parse++;
c277df42 3251 ret = NULL; /* For look-ahead/behind. */
a0d0e21e 3252 switch (paren) {
fac92740 3253 case '<': /* (?<...) */
830247a4 3254 RExC_seen |= REG_SEEN_LOOKBEHIND;
b81d288d 3255 if (*RExC_parse == '!')
c277df42 3256 paren = ',';
b81d288d 3257 if (*RExC_parse != '=' && *RExC_parse != '!')
c277df42 3258 goto unknown;
830247a4 3259 RExC_parse++;
fac92740
MJD
3260 case '=': /* (?=...) */
3261 case '!': /* (?!...) */
830247a4 3262 RExC_seen_zerolen++;
fac92740
MJD
3263 case ':': /* (?:...) */
3264 case '>': /* (?>...) */
a0d0e21e 3265 break;
fac92740
MJD
3266 case '$': /* (?$...) */
3267 case '@': /* (?@...) */
8615cb43 3268 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
a0d0e21e 3269 break;
fac92740 3270 case '#': /* (?#...) */
830247a4
IZ
3271 while (*RExC_parse && *RExC_parse != ')')
3272 RExC_parse++;
3273 if (*RExC_parse != ')')
c277df42 3274 FAIL("Sequence (?#... not terminated");
830247a4 3275 nextchar(pRExC_state);
a0d0e21e
LW
3276 *flagp = TRYAGAIN;
3277 return NULL;
fac92740 3278 case 'p': /* (?p...) */
9014280d 3279 if (SIZE_ONLY && ckWARN2(WARN_DEPRECATED, WARN_REGEXP))
f9373011 3280 vWARNdep(RExC_parse, "(?p{}) is deprecated - use (??{})");
8c8ad484 3281 /* FALL THROUGH*/
fac92740 3282 case '?': /* (??...) */
0f5d15d6 3283 logical = 1;
438a3801
YST
3284 if (*RExC_parse != '{')
3285 goto unknown;
830247a4 3286 paren = *RExC_parse++;
0f5d15d6 3287 /* FALL THROUGH */
fac92740 3288 case '{': /* (?{...}) */
c277df42 3289 {
c277df42
IZ
3290 I32 count = 1, n = 0;
3291 char c;
830247a4 3292 char *s = RExC_parse;
c277df42
IZ
3293 SV *sv;
3294 OP_4tree *sop, *rop;
3295
830247a4
IZ
3296 RExC_seen_zerolen++;
3297 RExC_seen |= REG_SEEN_EVAL;
3298 while (count && (c = *RExC_parse)) {
3299 if (c == '\\' && RExC_parse[1])
3300 RExC_parse++;
b81d288d 3301 else if (c == '{')
c277df42 3302 count++;
b81d288d 3303 else if (c == '}')
c277df42 3304 count--;
830247a4 3305 RExC_parse++;
c277df42 3306 }
830247a4 3307 if (*RExC_parse != ')')
b45f050a 3308 {
b81d288d 3309 RExC_parse = s;
b45f050a
JF
3310 vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
3311 }
c277df42 3312 if (!SIZE_ONLY) {
f3548bdc 3313 PAD *pad;
b81d288d
AB
3314
3315 if (RExC_parse - 1 - s)
830247a4 3316 sv = newSVpvn(s, RExC_parse - 1 - s);
c277df42 3317 else
396482e1 3318 sv = newSVpvs("");
c277df42 3319
569233ed
SB
3320 ENTER;
3321 Perl_save_re_context(aTHX);
f3548bdc 3322 rop = sv_compile_2op(sv, &sop, "re", &pad);
9b978d73
DM
3323 sop->op_private |= OPpREFCOUNTED;
3324 /* re_dup will OpREFCNT_inc */
3325 OpREFCNT_set(sop, 1);
569233ed 3326 LEAVE;
c277df42 3327
830247a4
IZ
3328 n = add_data(pRExC_state, 3, "nop");
3329 RExC_rx->data->data[n] = (void*)rop;
3330 RExC_rx->data->data[n+1] = (void*)sop;
f3548bdc 3331 RExC_rx->data->data[n+2] = (void*)pad;
c277df42 3332 SvREFCNT_dec(sv);
a0ed51b3 3333 }
e24b16f9 3334 else { /* First pass */
830247a4 3335 if (PL_reginterp_cnt < ++RExC_seen_evals
923e4eb5 3336 && IN_PERL_RUNTIME)
2cd61cdb
IZ
3337 /* No compiled RE interpolated, has runtime
3338 components ===> unsafe. */
3339 FAIL("Eval-group not allowed at runtime, use re 'eval'");
5b61d3f7 3340 if (PL_tainting && PL_tainted)
cc6b7395 3341 FAIL("Eval-group in insecure regular expression");
923e4eb5 3342 if (IN_PERL_COMPILETIME)
b5c19bd7 3343 PL_cv_has_eval = 1;
c277df42 3344 }
b5c19bd7 3345
830247a4 3346 nextchar(pRExC_state);
0f5d15d6 3347 if (logical) {
830247a4 3348 ret = reg_node(pRExC_state, LOGICAL);
0f5d15d6
IZ
3349 if (!SIZE_ONLY)
3350 ret->flags = 2;
830247a4 3351 regtail(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
fac92740 3352 /* deal with the length of this later - MJD */
0f5d15d6
IZ
3353 return ret;
3354 }
ccb2c380
MP
3355 ret = reganode(pRExC_state, EVAL, n);
3356 Set_Node_Length(ret, RExC_parse - parse_start + 1);
3357 Set_Node_Offset(ret, parse_start);
3358 return ret;
c277df42 3359 }
fac92740 3360 case '(': /* (?(?{...})...) and (?(?=...)...) */
c277df42 3361 {
fac92740 3362 if (RExC_parse[0] == '?') { /* (?(?...)) */
b81d288d
AB
3363 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
3364 || RExC_parse[1] == '<'
830247a4 3365 || RExC_parse[1] == '{') { /* Lookahead or eval. */
c277df42
IZ
3366 I32 flag;
3367
830247a4 3368 ret = reg_node(pRExC_state, LOGICAL);
0f5d15d6
IZ
3369 if (!SIZE_ONLY)
3370 ret->flags = 1;
830247a4 3371 regtail(pRExC_state, ret, reg(pRExC_state, 1, &flag));
c277df42 3372 goto insert_if;
b81d288d 3373 }
a0ed51b3 3374 }
830247a4 3375 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
fac92740 3376 /* (?(1)...) */
830247a4 3377 parno = atoi(RExC_parse++);
c277df42 3378
830247a4
IZ
3379 while (isDIGIT(*RExC_parse))
3380 RExC_parse++;
fac92740 3381 ret = reganode(pRExC_state, GROUPP, parno);
2af232bd 3382
830247a4 3383 if ((c = *nextchar(pRExC_state)) != ')')
b45f050a 3384 vFAIL("Switch condition not recognized");
c277df42 3385 insert_if:
830247a4
IZ
3386 regtail(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
3387 br = regbranch(pRExC_state, &flags, 1);
c277df42 3388 if (br == NULL)
830247a4 3389 br = reganode(pRExC_state, LONGJMP, 0);
c277df42 3390 else
830247a4
IZ
3391 regtail(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
3392 c = *nextchar(pRExC_state);
d1b80229
IZ
3393 if (flags&HASWIDTH)
3394 *flagp |= HASWIDTH;
c277df42 3395 if (c == '|') {
830247a4
IZ
3396 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
3397 regbranch(pRExC_state, &flags, 1);
3398 regtail(pRExC_state, ret, lastbr);
d1b80229
IZ
3399 if (flags&HASWIDTH)
3400 *flagp |= HASWIDTH;
830247a4 3401 c = *nextchar(pRExC_state);
a0ed51b3
LW
3402 }
3403 else
c277df42
IZ
3404 lastbr = NULL;
3405 if (c != ')')
8615cb43 3406 vFAIL("Switch (?(condition)... contains too many branches");
830247a4
IZ
3407 ender = reg_node(pRExC_state, TAIL);
3408 regtail(pRExC_state, br, ender);
c277df42 3409 if (lastbr) {
830247a4
IZ
3410 regtail(pRExC_state, lastbr, ender);
3411 regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
a0ed51b3
LW
3412 }
3413 else
830247a4 3414 regtail(pRExC_state, ret, ender);
c277df42 3415 return ret;
a0ed51b3
LW
3416 }
3417 else {
830247a4 3418 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
c277df42
IZ
3419 }
3420 }
1b1626e4 3421 case 0:
830247a4 3422 RExC_parse--; /* for vFAIL to print correctly */
8615cb43 3423 vFAIL("Sequence (? incomplete");
1b1626e4 3424 break;
a0d0e21e 3425 default:
830247a4 3426 --RExC_parse;
fac92740 3427 parse_flags: /* (?i) */
830247a4 3428 while (*RExC_parse && strchr("iogcmsx", *RExC_parse)) {
9d1d55b5
JP
3429 /* (?g), (?gc) and (?o) are useless here
3430 and must be globally applied -- japhy */
3431
3432 if (*RExC_parse == 'o' || *RExC_parse == 'g') {
3433 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
3434 I32 wflagbit = *RExC_parse == 'o' ? wasted_o : wasted_g;
3435 if (! (wastedflags & wflagbit) ) {
3436 wastedflags |= wflagbit;
3437 vWARN5(
3438 RExC_parse + 1,
3439 "Useless (%s%c) - %suse /%c modifier",
3440 flagsp == &negflags ? "?-" : "?",
3441 *RExC_parse,
3442 flagsp == &negflags ? "don't " : "",
3443 *RExC_parse
3444 );
3445 }
3446 }
3447 }
3448 else if (*RExC_parse == 'c') {
3449 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
3450 if (! (wastedflags & wasted_c) ) {
3451 wastedflags |= wasted_gc;
3452 vWARN3(
3453 RExC_parse + 1,
3454 "Useless (%sc) - %suse /gc modifier",
3455 flagsp == &negflags ? "?-" : "?",
3456 flagsp == &negflags ? "don't " : ""
3457 );
3458 }
3459 }
3460 }
3461 else { pmflag(flagsp, *RExC_parse); }
3462
830247a4 3463 ++RExC_parse;
ca9dfc88 3464 }
830247a4 3465 if (*RExC_parse == '-') {
ca9dfc88 3466 flagsp = &negflags;
9d1d55b5 3467 wastedflags = 0; /* reset so (?g-c) warns twice */
830247a4 3468 ++RExC_parse;
ca9dfc88 3469 goto parse_flags;
48c036b1 3470 }
e2509266
JH
3471 RExC_flags |= posflags;
3472 RExC_flags &= ~negflags;
830247a4
IZ
3473 if (*RExC_parse == ':') {
3474 RExC_parse++;
ca9dfc88
IZ
3475 paren = ':';
3476 break;
3477 }
c277df42 3478 unknown:
830247a4
IZ
3479 if (*RExC_parse != ')') {
3480 RExC_parse++;
3481 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
b45f050a 3482 }
830247a4 3483 nextchar(pRExC_state);
a0d0e21e
LW
3484 *flagp = TRYAGAIN;
3485 return NULL;
3486 }
3487 }
fac92740 3488 else { /* (...) */
830247a4
IZ
3489 parno = RExC_npar;
3490 RExC_npar++;
3491 ret = reganode(pRExC_state, OPEN, parno);
fac92740
MJD
3492 Set_Node_Length(ret, 1); /* MJD */
3493 Set_Node_Offset(ret, RExC_parse); /* MJD */
c277df42 3494 open = 1;
a0d0e21e 3495 }
a0ed51b3 3496 }
fac92740 3497 else /* ! paren */
a0d0e21e
LW
3498 ret = NULL;
3499
3500 /* Pick up the branches, linking them together. */
fac92740 3501 parse_start = RExC_parse; /* MJD */
830247a4 3502 br = regbranch(pRExC_state, &flags, 1);
fac92740 3503 /* branch_len = (paren != 0); */
2af232bd 3504
a0d0e21e
LW
3505 if (br == NULL)
3506 return(NULL);
830247a4
IZ
3507 if (*RExC_parse == '|') {
3508 if (!SIZE_ONLY && RExC_extralen) {
3509 reginsert(pRExC_state, BRANCHJ, br);
a0ed51b3 3510 }
fac92740 3511 else { /* MJD */
830247a4 3512 reginsert(pRExC_state, BRANCH, br);
fac92740
MJD
3513 Set_Node_Length(br, paren != 0);
3514 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
3515 }
c277df42
IZ
3516 have_branch = 1;
3517 if (SIZE_ONLY)
830247a4 3518 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
a0ed51b3
LW
3519 }
3520 else if (paren == ':') {
c277df42
IZ
3521 *flagp |= flags&SIMPLE;
3522 }
3523 if (open) { /* Starts with OPEN. */
830247a4 3524 regtail(pRExC_state, ret, br); /* OPEN -> first. */
a0ed51b3
LW
3525 }
3526 else if (paren != '?') /* Not Conditional */
a0d0e21e 3527 ret = br;
32a0ca98 3528 *flagp |= flags & (SPSTART | HASWIDTH);
c277df42 3529 lastbr = br;
830247a4
IZ
3530 while (*RExC_parse == '|') {
3531 if (!SIZE_ONLY && RExC_extralen) {
3532 ender = reganode(pRExC_state, LONGJMP,0);
3533 regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
c277df42
IZ
3534 }
3535 if (SIZE_ONLY)
830247a4
IZ
3536 RExC_extralen += 2; /* Account for LONGJMP. */
3537 nextchar(pRExC_state);
3538 br = regbranch(pRExC_state, &flags, 0);
2af232bd 3539
a687059c 3540 if (br == NULL)
a0d0e21e 3541 return(NULL);
830247a4 3542 regtail(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
c277df42 3543 lastbr = br;
821b33a5
IZ
3544 if (flags&HASWIDTH)
3545 *flagp |= HASWIDTH;
a687059c 3546 *flagp |= flags&SPSTART;
a0d0e21e
LW
3547 }
3548
c277df42
IZ
3549 if (have_branch || paren != ':') {
3550 /* Make a closing node, and hook it on the end. */
3551 switch (paren) {
3552 case ':':
830247a4 3553 ender = reg_node(pRExC_state, TAIL);
c277df42
IZ
3554 break;
3555 case 1:
830247a4 3556 ender = reganode(pRExC_state, CLOSE, parno);
fac92740
MJD
3557 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
3558 Set_Node_Length(ender,1); /* MJD */
c277df42
IZ
3559 break;
3560 case '<':
c277df42
IZ
3561 case ',':
3562 case '=':
3563 case '!':
c277df42 3564 *flagp &= ~HASWIDTH;
821b33a5
IZ
3565 /* FALL THROUGH */
3566 case '>':
830247a4 3567 ender = reg_node(pRExC_state, SUCCEED);
c277df42
IZ
3568 break;
3569 case 0:
830247a4 3570 ender = reg_node(pRExC_state, END);
c277df42
IZ
3571 break;
3572 }
830247a4 3573 regtail(pRExC_state, lastbr, ender);
a0d0e21e 3574
c277df42
IZ
3575 if (have_branch) {
3576 /* Hook the tails of the branches to the closing node. */
3577 for (br = ret; br != NULL; br = regnext(br)) {
830247a4 3578 regoptail(pRExC_state, br, ender);
c277df42
IZ
3579 }
3580 }
a0d0e21e 3581 }
c277df42
IZ
3582
3583 {
e1ec3a88
AL
3584 const char *p;
3585 static const char parens[] = "=!<,>";
c277df42
IZ
3586
3587 if (paren && (p = strchr(parens, paren))) {
eb160463 3588 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
c277df42
IZ
3589 int flag = (p - parens) > 1;
3590
3591 if (paren == '>')
3592 node = SUSPEND, flag = 0;
830247a4 3593 reginsert(pRExC_state, node,ret);
45948336
EP
3594 Set_Node_Cur_Length(ret);
3595 Set_Node_Offset(ret, parse_start + 1);
c277df42 3596 ret->flags = flag;
830247a4 3597 regtail(pRExC_state, ret, reg_node(pRExC_state, TAIL));
c277df42 3598 }
a0d0e21e
LW
3599 }
3600
3601 /* Check for proper termination. */
ce3e6498 3602 if (paren) {
e2509266 3603 RExC_flags = oregflags;
830247a4
IZ
3604 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
3605 RExC_parse = oregcomp_parse;
380a0633 3606 vFAIL("Unmatched (");
ce3e6498 3607 }
a0ed51b3 3608 }
830247a4
IZ
3609 else if (!paren && RExC_parse < RExC_end) {
3610 if (*RExC_parse == ')') {
3611 RExC_parse++;
380a0633 3612 vFAIL("Unmatched )");
a0ed51b3
LW
3613 }
3614 else
b45f050a 3615 FAIL("Junk on end of regexp"); /* "Can't happen". */
a0d0e21e
LW
3616 /* NOTREACHED */
3617 }
a687059c 3618
a0d0e21e 3619 return(ret);
a687059c
LW
3620}
3621
3622/*
3623 - regbranch - one alternative of an | operator
3624 *
3625 * Implements the concatenation operator.
3626 */
76e3520e 3627STATIC regnode *
830247a4 3628S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first)
a687059c 3629{
c277df42
IZ
3630 register regnode *ret;
3631 register regnode *chain = NULL;
3632 register regnode *latest;
3633 I32 flags = 0, c = 0;
a0d0e21e 3634
b81d288d 3635 if (first)
c277df42
IZ
3636 ret = NULL;
3637 else {
b81d288d 3638 if (!SIZE_ONLY && RExC_extralen)
830247a4 3639 ret = reganode(pRExC_state, BRANCHJ,0);
fac92740 3640 else {
830247a4 3641 ret = reg_node(pRExC_state, BRANCH);
fac92740
MJD
3642 Set_Node_Length(ret, 1);
3643 }
c277df42
IZ
3644 }
3645
b81d288d 3646 if (!first && SIZE_ONLY)
830247a4 3647 RExC_extralen += 1; /* BRANCHJ */
b81d288d 3648
c277df42 3649 *flagp = WORST; /* Tentatively. */
a0d0e21e 3650
830247a4
IZ
3651 RExC_parse--;
3652 nextchar(pRExC_state);
3653 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
a0d0e21e 3654 flags &= ~TRYAGAIN;
830247a4 3655 latest = regpiece(pRExC_state, &flags);
a0d0e21e
LW
3656 if (latest == NULL) {
3657 if (flags & TRYAGAIN)
3658 continue;
3659 return(NULL);
a0ed51b3
LW
3660 }
3661 else if (ret == NULL)
c277df42 3662 ret = latest;
a0d0e21e 3663 *flagp |= flags&HASWIDTH;
c277df42 3664 if (chain == NULL) /* First piece. */
a0d0e21e
LW
3665 *flagp |= flags&SPSTART;
3666 else {
830247a4
IZ
3667 RExC_naughty++;
3668 regtail(pRExC_state, chain, latest);
a687059c 3669 }
a0d0e21e 3670 chain = latest;
c277df42
IZ
3671 c++;
3672 }
3673 if (chain == NULL) { /* Loop ran zero times. */
830247a4 3674 chain = reg_node(pRExC_state, NOTHING);
c277df42
IZ
3675 if (ret == NULL)
3676 ret = chain;
3677 }
3678 if (c == 1) {
3679 *flagp |= flags&SIMPLE;
a0d0e21e 3680 }
a687059c 3681
a0d0e21e 3682 return(ret);
a687059c
LW
3683}
3684
3685/*
3686 - regpiece - something followed by possible [*+?]
3687 *
3688 * Note that the branching code sequences used for ? and the general cases
3689 * of * and + are somewhat optimized: they use the same NOTHING node as
3690 * both the endmarker for their branch list and the body of the last branch.
3691 * It might seem that this node could be dispensed with entirely, but the
3692 * endmarker role is not redundant.
3693 */
76e3520e 3694STATIC regnode *
830247a4 3695S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
a687059c 3696{
c277df42 3697 register regnode *ret;
a0d0e21e
LW
3698 register char op;
3699 register char *next;
3700 I32 flags;
1df70142 3701 const char * const origparse = RExC_parse;
a0d0e21e
LW
3702 char *maxpos;
3703 I32 min;
c277df42 3704 I32 max = REG_INFTY;
fac92740 3705 char *parse_start;
a0d0e21e 3706
830247a4 3707 ret = regatom(pRExC_state, &flags);
a0d0e21e
LW
3708 if (ret == NULL) {
3709 if (flags & TRYAGAIN)
3710 *flagp |= TRYAGAIN;
3711 return(NULL);
3712 }
3713
830247a4 3714 op = *RExC_parse;
a0d0e21e 3715
830247a4 3716 if (op == '{' && regcurly(RExC_parse)) {
fac92740 3717 parse_start = RExC_parse; /* MJD */
830247a4 3718 next = RExC_parse + 1;
c445ea15 3719 maxpos = NULL;
a0d0e21e
LW
3720 while (isDIGIT(*next) || *next == ',') {
3721 if (*next == ',') {
3722 if (maxpos)
3723 break;
3724 else
3725 maxpos = next;
a687059c 3726 }
a0d0e21e
LW
3727 next++;
3728 }
3729 if (*next == '}') { /* got one */
3730 if (!maxpos)
3731 maxpos = next;
830247a4
IZ
3732 RExC_parse++;
3733 min = atoi(RExC_parse);
a0d0e21e
LW
3734 if (*maxpos == ',')
3735 maxpos++;
3736 else
830247a4 3737 maxpos = RExC_parse;
a0d0e21e
LW
3738 max = atoi(maxpos);
3739 if (!max && *maxpos != '0')
c277df42
IZ
3740 max = REG_INFTY; /* meaning "infinity" */
3741 else if (max >= REG_INFTY)
8615cb43 3742 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
830247a4
IZ
3743 RExC_parse = next;
3744 nextchar(pRExC_state);
a0d0e21e
LW
3745
3746 do_curly:
3747 if ((flags&SIMPLE)) {
830247a4
IZ
3748 RExC_naughty += 2 + RExC_naughty / 2;
3749 reginsert(pRExC_state, CURLY, ret);
fac92740
MJD
3750 Set_Node_Offset(ret, parse_start+1); /* MJD */
3751 Set_Node_Cur_Length(ret);
a0d0e21e
LW
3752 }
3753 else {
830247a4 3754 regnode *w = reg_node(pRExC_state, WHILEM);
2c2d71f5
JH
3755
3756 w->flags = 0;
830247a4
IZ
3757 regtail(pRExC_state, ret, w);
3758 if (!SIZE_ONLY && RExC_extralen) {
3759 reginsert(pRExC_state, LONGJMP,ret);
3760 reginsert(pRExC_state, NOTHING,ret);
c277df42
IZ
3761 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
3762 }
830247a4 3763 reginsert(pRExC_state, CURLYX,ret);
fac92740
MJD
3764 /* MJD hk */
3765 Set_Node_Offset(ret, parse_start+1);
2af232bd 3766 Set_Node_Length(ret,
fac92740 3767 op == '{' ? (RExC_parse - parse_start) : 1);
2af232bd 3768
830247a4 3769 if (!SIZE_ONLY && RExC_extralen)
c277df42 3770 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
830247a4 3771 regtail(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
c277df42 3772 if (SIZE_ONLY)
830247a4
IZ
3773 RExC_whilem_seen++, RExC_extralen += 3;
3774 RExC_naughty += 4 + RExC_naughty; /* compound interest */
a0d0e21e 3775 }
c277df42 3776 ret->flags = 0;
a0d0e21e
LW
3777
3778 if (min > 0)
821b33a5
IZ
3779 *flagp = WORST;
3780 if (max > 0)
3781 *flagp |= HASWIDTH;
a0d0e21e 3782 if (max && max < min)
8615cb43 3783 vFAIL("Can't do {n,m} with n > m");
c277df42 3784 if (!SIZE_ONLY) {
eb160463
GS
3785 ARG1_SET(ret, (U16)min);
3786 ARG2_SET(ret, (U16)max);
a687059c 3787 }
a687059c 3788
a0d0e21e 3789 goto nest_check;
a687059c 3790 }
a0d0e21e 3791 }
a687059c 3792
a0d0e21e
LW
3793 if (!ISMULT1(op)) {
3794 *flagp = flags;
a687059c 3795 return(ret);
a0d0e21e 3796 }
bb20fd44 3797
c277df42 3798#if 0 /* Now runtime fix should be reliable. */
b45f050a
JF
3799
3800 /* if this is reinstated, don't forget to put this back into perldiag:
3801
3802 =item Regexp *+ operand could be empty at {#} in regex m/%s/
3803
3804 (F) The part of the regexp subject to either the * or + quantifier
3805 could match an empty string. The {#} shows in the regular
3806 expression about where the problem was discovered.
3807
3808 */
3809
bb20fd44 3810 if (!(flags&HASWIDTH) && op != '?')
b45f050a 3811 vFAIL("Regexp *+ operand could be empty");
b81d288d 3812#endif
bb20fd44 3813
fac92740 3814 parse_start = RExC_parse;
830247a4 3815 nextchar(pRExC_state);
a0d0e21e 3816
821b33a5 3817 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
a0d0e21e
LW
3818
3819 if (op == '*' && (flags&SIMPLE)) {
830247a4 3820 reginsert(pRExC_state, STAR, ret);
c277df42 3821 ret->flags = 0;
830247a4 3822 RExC_naughty += 4;
a0d0e21e
LW
3823 }
3824 else if (op == '*') {
3825 min = 0;
3826 goto do_curly;
a0ed51b3
LW
3827 }
3828 else if (op == '+' && (flags&SIMPLE)) {
830247a4 3829 reginsert(pRExC_state, PLUS, ret);
c277df42 3830 ret->flags = 0;
830247a4 3831 RExC_naughty += 3;
a0d0e21e
LW
3832 }
3833 else if (op == '+') {
3834 min = 1;
3835 goto do_curly;
a0ed51b3
LW
3836 }
3837 else if (op == '?') {
a0d0e21e
LW
3838 min = 0; max = 1;
3839 goto do_curly;
3840 }
3841 nest_check:
041457d9 3842 if (!SIZE_ONLY && !(flags&HASWIDTH) && max > REG_INFTY/3 && ckWARN(WARN_REGEXP)) {
830247a4 3843 vWARN3(RExC_parse,
b45f050a 3844 "%.*s matches null string many times",
830247a4 3845 RExC_parse - origparse,
b45f050a 3846 origparse);
a0d0e21e
LW
3847 }
3848
830247a4
IZ
3849 if (*RExC_parse == '?') {
3850 nextchar(pRExC_state);
3851 reginsert(pRExC_state, MINMOD, ret);
3852 regtail(pRExC_state, ret, ret + NODE_STEP_REGNODE);
a0d0e21e 3853 }
830247a4
IZ
3854 if (ISMULT2(RExC_parse)) {
3855 RExC_parse++;
b45f050a
JF
3856 vFAIL("Nested quantifiers");
3857 }
a0d0e21e
LW
3858
3859 return(ret);
a687059c
LW
3860}
3861
3862/*
3863 - regatom - the lowest level
3864 *
3865 * Optimization: gobbles an entire sequence of ordinary characters so that
3866 * it can turn them into a single node, which is smaller to store and
3867 * faster to run. Backslashed characters are exceptions, each becoming a
3868 * separate node; the code is simpler that way and it's not worth fixing.
3869 *
b45f050a 3870 * [Yes, it is worth fixing, some scripts can run twice the speed.] */
76e3520e 3871STATIC regnode *
830247a4 3872S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
a687059c 3873{
cbbf8932 3874 register regnode *ret = NULL;
a0d0e21e 3875 I32 flags;
45948336 3876 char *parse_start = RExC_parse;
a0d0e21e
LW
3877
3878 *flagp = WORST; /* Tentatively. */
3879
3880tryagain:
830247a4 3881 switch (*RExC_parse) {
a0d0e21e 3882 case '^':
830247a4
IZ
3883 RExC_seen_zerolen++;
3884 nextchar(pRExC_state);
e2509266 3885 if (RExC_flags & PMf_MULTILINE)
830247a4 3886 ret = reg_node(pRExC_state, MBOL);
e2509266 3887 else if (RExC_flags & PMf_SINGLELINE)
830247a4 3888 ret = reg_node(pRExC_state, SBOL);
a0d0e21e 3889 else
830247a4 3890 ret = reg_node(pRExC_state, BOL);
fac92740 3891 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
3892 break;
3893 case '$':
830247a4 3894 nextchar(pRExC_state);
b81d288d 3895 if (*RExC_parse)
830247a4 3896 RExC_seen_zerolen++;
e2509266 3897 if (RExC_flags & PMf_MULTILINE)
830247a4 3898 ret = reg_node(pRExC_state, MEOL);
e2509266 3899 else if (RExC_flags & PMf_SINGLELINE)
830247a4 3900 ret = reg_node(pRExC_state, SEOL);
a0d0e21e 3901 else
830247a4 3902 ret = reg_node(pRExC_state, EOL);
fac92740 3903 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
3904 break;
3905 case '.':
830247a4 3906 nextchar(pRExC_state);
e2509266 3907 if (RExC_flags & PMf_SINGLELINE)
ffc61ed2
JH
3908 ret = reg_node(pRExC_state, SANY);
3909 else
3910 ret = reg_node(pRExC_state, REG_ANY);
3911 *flagp |= HASWIDTH|SIMPLE;
830247a4 3912 RExC_naughty++;
fac92740 3913 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
3914 break;
3915 case '[':
b45f050a 3916 {
830247a4 3917 char *oregcomp_parse = ++RExC_parse;
ffc61ed2 3918 ret = regclass(pRExC_state);
830247a4
IZ
3919 if (*RExC_parse != ']') {
3920 RExC_parse = oregcomp_parse;
b45f050a
JF
3921 vFAIL("Unmatched [");
3922 }
830247a4 3923 nextchar(pRExC_state);
a0d0e21e 3924 *flagp |= HASWIDTH|SIMPLE;
fac92740 3925 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
a0d0e21e 3926 break;
b45f050a 3927 }
a0d0e21e 3928 case '(':
830247a4
IZ
3929 nextchar(pRExC_state);
3930 ret = reg(pRExC_state, 1, &flags);
a0d0e21e 3931 if (ret == NULL) {
bf93d4cc 3932 if (flags & TRYAGAIN) {
830247a4 3933 if (RExC_parse == RExC_end) {
bf93d4cc
GS
3934 /* Make parent create an empty node if needed. */
3935 *flagp |= TRYAGAIN;
3936 return(NULL);
3937 }
a0d0e21e 3938 goto tryagain;
bf93d4cc 3939 }
a0d0e21e
LW
3940 return(NULL);
3941 }
c277df42 3942 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE);
a0d0e21e
LW
3943 break;
3944 case '|':
3945 case ')':
3946 if (flags & TRYAGAIN) {
3947 *flagp |= TRYAGAIN;
3948 return NULL;
3949 }
b45f050a 3950 vFAIL("Internal urp");
a0d0e21e
LW
3951 /* Supposed to be caught earlier. */
3952 break;
85afd4ae 3953 case '{':
830247a4
IZ
3954 if (!regcurly(RExC_parse)) {
3955 RExC_parse++;
85afd4ae
CS
3956 goto defchar;
3957 }
3958 /* FALL THROUGH */
a0d0e21e
LW
3959 case '?':
3960 case '+':
3961 case '*':
830247a4 3962 RExC_parse++;
b45f050a 3963 vFAIL("Quantifier follows nothing");
a0d0e21e
LW
3964 break;
3965 case '\\':
830247a4 3966 switch (*++RExC_parse) {
a0d0e21e 3967 case 'A':
830247a4
IZ
3968 RExC_seen_zerolen++;
3969 ret = reg_node(pRExC_state, SBOL);
a0d0e21e 3970 *flagp |= SIMPLE;
830247a4 3971 nextchar(pRExC_state);
fac92740 3972 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
3973 break;
3974 case 'G':
830247a4
IZ
3975 ret = reg_node(pRExC_state, GPOS);
3976 RExC_seen |= REG_SEEN_GPOS;
a0d0e21e 3977 *flagp |= SIMPLE;
830247a4 3978 nextchar(pRExC_state);
fac92740 3979 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
3980 break;
3981 case 'Z':
830247a4 3982 ret = reg_node(pRExC_state, SEOL);
a0d0e21e 3983 *flagp |= SIMPLE;
a1917ab9 3984 RExC_seen_zerolen++; /* Do not optimize RE away */
830247a4 3985 nextchar(pRExC_state);
a0d0e21e 3986 break;
b85d18e9 3987 case 'z':
830247a4 3988 ret = reg_node(pRExC_state, EOS);
b85d18e9 3989 *flagp |= SIMPLE;
830247a4
IZ
3990 RExC_seen_zerolen++; /* Do not optimize RE away */
3991 nextchar(pRExC_state);
fac92740 3992 Set_Node_Length(ret, 2); /* MJD */
b85d18e9 3993 break;
4a2d328f 3994 case 'C':
f33976b4
DB
3995 ret = reg_node(pRExC_state, CANY);
3996 RExC_seen |= REG_SEEN_CANY;
a0ed51b3 3997 *flagp |= HASWIDTH|SIMPLE;
830247a4 3998 nextchar(pRExC_state);
fac92740 3999 Set_Node_Length(ret, 2); /* MJD */
a0ed51b3
LW
4000 break;
4001 case 'X':
830247a4 4002 ret = reg_node(pRExC_state, CLUMP);
a0ed51b3 4003 *flagp |= HASWIDTH;
830247a4 4004 nextchar(pRExC_state);
fac92740 4005 Set_Node_Length(ret, 2); /* MJD */
a0ed51b3 4006 break;
a0d0e21e 4007 case 'w':
eb160463 4008 ret = reg_node(pRExC_state, (U8)(LOC ? ALNUML : ALNUM));
a0d0e21e 4009 *flagp |= HASWIDTH|SIMPLE;
830247a4 4010 nextchar(pRExC_state);
fac92740 4011 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
4012 break;
4013 case 'W':
eb160463 4014 ret = reg_node(pRExC_state, (U8)(LOC ? NALNUML : NALNUM));
a0d0e21e 4015 *flagp |= HASWIDTH|SIMPLE;
830247a4 4016 nextchar(pRExC_state);
fac92740 4017 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
4018 break;
4019 case 'b':
830247a4
IZ
4020 RExC_seen_zerolen++;
4021 RExC_seen |= REG_SEEN_LOOKBEHIND;
eb160463 4022 ret = reg_node(pRExC_state, (U8)(LOC ? BOUNDL : BOUND));
a0d0e21e 4023 *flagp |= SIMPLE;
830247a4 4024 nextchar(pRExC_state);
fac92740 4025 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
4026 break;
4027 case 'B':
830247a4
IZ
4028 RExC_seen_zerolen++;
4029 RExC_seen |= REG_SEEN_LOOKBEHIND;
eb160463 4030 ret = reg_node(pRExC_state, (U8)(LOC ? NBOUNDL : NBOUND));
a0d0e21e 4031 *flagp |= SIMPLE;
830247a4 4032 nextchar(pRExC_state);
fac92740 4033 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
4034 break;
4035 case 's':
eb160463 4036 ret = reg_node(pRExC_state, (U8)(LOC ? SPACEL : SPACE));
a0d0e21e 4037 *flagp |= HASWIDTH|SIMPLE;
830247a4 4038 nextchar(pRExC_state);
fac92740 4039 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
4040 break;
4041 case 'S':
eb160463 4042 ret = reg_node(pRExC_state, (U8)(LOC ? NSPACEL : NSPACE));
a0d0e21e 4043 *flagp |= HASWIDTH|SIMPLE;
830247a4 4044 nextchar(pRExC_state);
fac92740 4045 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
4046 break;
4047 case 'd':
ffc61ed2 4048 ret = reg_node(pRExC_state, DIGIT);
a0d0e21e 4049 *flagp |= HASWIDTH|SIMPLE;
830247a4 4050 nextchar(pRExC_state);
fac92740 4051 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
4052 break;
4053 case 'D':
ffc61ed2 4054 ret = reg_node(pRExC_state, NDIGIT);
a0d0e21e 4055 *flagp |= HASWIDTH|SIMPLE;
830247a4 4056 nextchar(pRExC_state);
fac92740 4057 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e 4058 break;
a14b48bc
LW
4059 case 'p':
4060 case 'P':
3568d838 4061 {
830247a4 4062 char* oldregxend = RExC_end;
ccb2c380 4063 char* parse_start = RExC_parse - 2;
a14b48bc 4064
830247a4 4065 if (RExC_parse[1] == '{') {
3568d838 4066 /* a lovely hack--pretend we saw [\pX] instead */
830247a4
IZ
4067 RExC_end = strchr(RExC_parse, '}');
4068 if (!RExC_end) {
0da60cf5 4069 U8 c = (U8)*RExC_parse;
830247a4
IZ
4070 RExC_parse += 2;
4071 RExC_end = oldregxend;
0da60cf5 4072 vFAIL2("Missing right brace on \\%c{}", c);
b45f050a 4073 }
830247a4 4074 RExC_end++;
a14b48bc 4075 }
af6f566e 4076 else {
830247a4 4077 RExC_end = RExC_parse + 2;
af6f566e
HS
4078 if (RExC_end > oldregxend)
4079 RExC_end = oldregxend;
4080 }
830247a4 4081 RExC_parse--;
a14b48bc 4082
ffc61ed2 4083 ret = regclass(pRExC_state);
a14b48bc 4084
830247a4
IZ
4085 RExC_end = oldregxend;
4086 RExC_parse--;
ccb2c380
MP
4087
4088 Set_Node_Offset(ret, parse_start + 2);
4089 Set_Node_Cur_Length(ret);
830247a4 4090 nextchar(pRExC_state);
a14b48bc
LW
4091 *flagp |= HASWIDTH|SIMPLE;
4092 }
4093 break;
a0d0e21e
LW
4094 case 'n':
4095 case 'r':
4096 case 't':
4097 case 'f':
4098 case 'e':
4099 case 'a':
4100 case 'x':
4101 case 'c':
4102 case '0':
4103 goto defchar;
4104 case '1': case '2': case '3': case '4':
4105 case '5': case '6': case '7': case '8': case '9':
4106 {
1df70142 4107 const I32 num = atoi(RExC_parse);
a0d0e21e 4108
830247a4 4109 if (num > 9 && num >= RExC_npar)
a0d0e21e
LW
4110 goto defchar;
4111 else {
fac92740 4112 char * parse_start = RExC_parse - 1; /* MJD */
830247a4
IZ
4113 while (isDIGIT(*RExC_parse))
4114 RExC_parse++;
b45f050a 4115
eb160463 4116 if (!SIZE_ONLY && num > (I32)RExC_rx->nparens)
9baa0206 4117 vFAIL("Reference to nonexistent group");
830247a4 4118 RExC_sawback = 1;
eb160463
GS
4119 ret = reganode(pRExC_state,
4120 (U8)(FOLD ? (LOC ? REFFL : REFF) : REF),
4121 num);
a0d0e21e 4122 *flagp |= HASWIDTH;
2af232bd 4123
fac92740 4124 /* override incorrect value set in reganode MJD */
2af232bd 4125 Set_Node_Offset(ret, parse_start+1);
fac92740 4126 Set_Node_Cur_Length(ret); /* MJD */
830247a4
IZ
4127 RExC_parse--;
4128 nextchar(pRExC_state);
a0d0e21e
LW
4129 }
4130 }
4131 break;
4132 case '\0':
830247a4 4133 if (RExC_parse >= RExC_end)
b45f050a 4134 FAIL("Trailing \\");
a0d0e21e
LW
4135 /* FALL THROUGH */
4136 default:
a0288114 4137 /* Do not generate "unrecognized" warnings here, we fall
c9f97d15 4138 back into the quick-grab loop below */
45948336 4139 parse_start--;
a0d0e21e
LW
4140 goto defchar;
4141 }
4142 break;
4633a7c4
LW
4143
4144 case '#':
e2509266 4145 if (RExC_flags & PMf_EXTENDED) {
830247a4
IZ
4146 while (RExC_parse < RExC_end && *RExC_parse != '\n') RExC_parse++;
4147 if (RExC_parse < RExC_end)
4633a7c4
LW
4148 goto tryagain;
4149 }
4150 /* FALL THROUGH */
4151
a0d0e21e 4152 default: {
ba210ebe 4153 register STRLEN len;
58ae7d3f 4154 register UV ender;
a0d0e21e 4155 register char *p;
c277df42 4156 char *oldp, *s;
80aecb99 4157 STRLEN foldlen;
89ebb4a3 4158 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
f06dbbb7
JH
4159
4160 parse_start = RExC_parse - 1;
a0d0e21e 4161
830247a4 4162 RExC_parse++;
a0d0e21e
LW
4163
4164 defchar:
58ae7d3f 4165 ender = 0;
eb160463
GS
4166 ret = reg_node(pRExC_state,
4167 (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
cd439c50 4168 s = STRING(ret);
830247a4
IZ
4169 for (len = 0, p = RExC_parse - 1;
4170 len < 127 && p < RExC_end;
a0d0e21e
LW
4171 len++)
4172 {
4173 oldp = p;
5b5a24f7 4174
e2509266 4175 if (RExC_flags & PMf_EXTENDED)
830247a4 4176 p = regwhite(p, RExC_end);
a0d0e21e
LW
4177 switch (*p) {
4178 case '^':
4179 case '$':
4180 case '.':
4181 case '[':
4182 case '(':
4183 case ')':
4184 case '|':
4185 goto loopdone;
4186 case '\\':
4187 switch (*++p) {
4188 case 'A':
1ed8eac0
JF
4189 case 'C':
4190 case 'X':
a0d0e21e
LW
4191 case 'G':
4192 case 'Z':
b85d18e9 4193 case 'z':
a0d0e21e
LW
4194 case 'w':
4195 case 'W':
4196 case 'b':
4197 case 'B':
4198 case 's':
4199 case 'S':
4200 case 'd':
4201 case 'D':
a14b48bc
LW
4202 case 'p':
4203 case 'P':
a0d0e21e
LW
4204 --p;
4205 goto loopdone;
4206 case 'n':
4207 ender = '\n';
4208 p++;
a687059c 4209 break;
a0d0e21e
LW
4210 case 'r':
4211 ender = '\r';
4212 p++;
a687059c 4213 break;
a0d0e21e
LW
4214 case 't':
4215 ender = '\t';
4216 p++;
a687059c 4217 break;
a0d0e21e
LW
4218 case 'f':
4219 ender = '\f';
4220 p++;
a687059c 4221 break;
a0d0e21e 4222 case 'e':
c7f1f016 4223 ender = ASCII_TO_NATIVE('\033');
a0d0e21e 4224 p++;
a687059c 4225 break;
a0d0e21e 4226 case 'a':
c7f1f016 4227 ender = ASCII_TO_NATIVE('\007');
a0d0e21e 4228 p++;
a687059c 4229 break;
a0d0e21e 4230 case 'x':
a0ed51b3 4231 if (*++p == '{') {
1df70142 4232 char* const e = strchr(p, '}');
b81d288d 4233
b45f050a 4234 if (!e) {
830247a4 4235 RExC_parse = p + 1;
b45f050a
JF
4236 vFAIL("Missing right brace on \\x{}");
4237 }
de5f0749 4238 else {
a4c04bdc
NC
4239 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
4240 | PERL_SCAN_DISALLOW_PREFIX;
1df70142 4241 STRLEN numlen = e - p - 1;
53305cf1 4242 ender = grok_hex(p + 1, &numlen, &flags, NULL);
aaa80028
JH
4243 if (ender > 0xff)
4244 RExC_utf8 = 1;
a0ed51b3
LW
4245 p = e + 1;
4246 }
a0ed51b3
LW
4247 }
4248 else {
a4c04bdc 4249 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
1df70142 4250 STRLEN numlen = 2;
53305cf1 4251 ender = grok_hex(p, &numlen, &flags, NULL);
a0ed51b3
LW
4252 p += numlen;
4253 }
a687059c 4254 break;
a0d0e21e
LW
4255 case 'c':
4256 p++;
bbce6d69 4257 ender = UCHARAT(p++);
4258 ender = toCTRL(ender);
a687059c 4259 break;
a0d0e21e
LW
4260 case '0': case '1': case '2': case '3':case '4':
4261 case '5': case '6': case '7': case '8':case '9':
4262 if (*p == '0' ||
830247a4 4263 (isDIGIT(p[1]) && atoi(p) >= RExC_npar) ) {
53305cf1 4264 I32 flags = 0;
1df70142 4265 STRLEN numlen = 3;
53305cf1 4266 ender = grok_oct(p, &numlen, &flags, NULL);
a0d0e21e
LW
4267 p += numlen;
4268 }
4269 else {
4270 --p;
4271 goto loopdone;
a687059c
LW
4272 }
4273 break;
a0d0e21e 4274 case '\0':
830247a4 4275 if (p >= RExC_end)
b45f050a 4276 FAIL("Trailing \\");
a687059c 4277 /* FALL THROUGH */
a0d0e21e 4278 default:
041457d9 4279 if (!SIZE_ONLY&& isALPHA(*p) && ckWARN(WARN_REGEXP))
4193bef7 4280 vWARN2(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
a0ed51b3 4281 goto normal_default;
a0d0e21e
LW
4282 }
4283 break;
a687059c 4284 default:
a0ed51b3 4285 normal_default:
fd400ab9 4286 if (UTF8_IS_START(*p) && UTF) {
1df70142 4287 STRLEN numlen;
5e12f4fb 4288 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
ba210ebe 4289 &numlen, 0);
a0ed51b3
LW
4290 p += numlen;
4291 }
4292 else
4293 ender = *p++;
a0d0e21e 4294 break;
a687059c 4295 }
e2509266 4296 if (RExC_flags & PMf_EXTENDED)
830247a4 4297 p = regwhite(p, RExC_end);
60a8b682
JH
4298 if (UTF && FOLD) {
4299 /* Prime the casefolded buffer. */
ac7e0132 4300 ender = toFOLD_uni(ender, tmpbuf, &foldlen);
60a8b682 4301 }
a0d0e21e
LW
4302 if (ISMULT2(p)) { /* Back off on ?+*. */
4303 if (len)
4304 p = oldp;
16ea2a2e 4305 else if (UTF) {
0ebc6274
JH
4306 STRLEN unilen;
4307
80aecb99 4308 if (FOLD) {
60a8b682 4309 /* Emit all the Unicode characters. */
1df70142 4310 STRLEN numlen;
80aecb99
JH
4311 for (foldbuf = tmpbuf;
4312 foldlen;
4313 foldlen -= numlen) {
4314 ender = utf8_to_uvchr(foldbuf, &numlen);
9dc45d57 4315 if (numlen > 0) {
0ebc6274
JH
4316 reguni(pRExC_state, ender, s, &unilen);
4317 s += unilen;
4318 len += unilen;
4319 /* In EBCDIC the numlen
4320 * and unilen can differ. */
9dc45d57 4321 foldbuf += numlen;
47654450
JH
4322 if (numlen >= foldlen)
4323 break;
9dc45d57
JH
4324 }
4325 else
4326 break; /* "Can't happen." */
80aecb99
JH
4327 }
4328 }
4329 else {
0ebc6274 4330 reguni(pRExC_state, ender, s, &unilen);
9ede7db1 4331 if (unilen > 0) {
0ebc6274
JH
4332 s += unilen;
4333 len += unilen;
9dc45d57 4334 }
80aecb99 4335 }
a0ed51b3 4336 }
a0d0e21e
LW
4337 else {
4338 len++;
eb160463 4339 REGC((char)ender, s++);
a0d0e21e
LW
4340 }
4341 break;
a687059c 4342 }
16ea2a2e 4343 if (UTF) {
0ebc6274
JH
4344 STRLEN unilen;
4345
80aecb99 4346 if (FOLD) {
60a8b682 4347 /* Emit all the Unicode characters. */
1df70142 4348 STRLEN numlen;
80aecb99
JH
4349 for (foldbuf = tmpbuf;
4350 foldlen;
4351 foldlen -= numlen) {
4352 ender = utf8_to_uvchr(foldbuf, &numlen);
9dc45d57 4353 if (numlen > 0) {
0ebc6274
JH
4354 reguni(pRExC_state, ender, s, &unilen);
4355 len += unilen;
4356 s += unilen;
4357 /* In EBCDIC the numlen
4358 * and unilen can differ. */
9dc45d57 4359 foldbuf += numlen;
47654450
JH
4360 if (numlen >= foldlen)
4361 break;
9dc45d57
JH
4362 }
4363 else
4364 break;
80aecb99
JH
4365 }
4366 }
4367 else {
0ebc6274 4368 reguni(pRExC_state, ender, s, &unilen);
9ede7db1 4369 if (unilen > 0) {
0ebc6274
JH
4370 s += unilen;
4371 len += unilen;
9dc45d57 4372 }
80aecb99
JH
4373 }
4374 len--;
a0ed51b3
LW
4375 }
4376 else
eb160463 4377 REGC((char)ender, s++);
a0d0e21e
LW
4378 }
4379 loopdone:
830247a4 4380 RExC_parse = p - 1;
fac92740 4381 Set_Node_Cur_Length(ret); /* MJD */
830247a4 4382 nextchar(pRExC_state);
793db0cb
JH
4383 {
4384 /* len is STRLEN which is unsigned, need to copy to signed */
4385 IV iv = len;
4386 if (iv < 0)
4387 vFAIL("Internal disaster");
4388 }
a0d0e21e
LW
4389 if (len > 0)
4390 *flagp |= HASWIDTH;
090f7165 4391 if (len == 1 && UNI_IS_INVARIANT(ender))
a0d0e21e 4392 *flagp |= SIMPLE;
c277df42 4393 if (!SIZE_ONLY)
cd439c50
IZ
4394 STR_LEN(ret) = len;
4395 if (SIZE_ONLY)
830247a4 4396 RExC_size += STR_SZ(len);
cd439c50 4397 else
830247a4 4398 RExC_emit += STR_SZ(len);
a687059c 4399 }
a0d0e21e
LW
4400 break;
4401 }
a687059c 4402
60a8b682
JH
4403 /* If the encoding pragma is in effect recode the text of
4404 * any EXACT-kind nodes. */
22c54be3 4405 if (PL_encoding && PL_regkind[(U8)OP(ret)] == EXACT) {
d0063567
DK
4406 STRLEN oldlen = STR_LEN(ret);
4407 SV *sv = sv_2mortal(newSVpvn(STRING(ret), oldlen));
4408
4409 if (RExC_utf8)
4410 SvUTF8_on(sv);
4411 if (sv_utf8_downgrade(sv, TRUE)) {
1df70142
AL
4412 const char * const s = sv_recode_to_utf8(sv, PL_encoding);
4413 const STRLEN newlen = SvCUR(sv);
d0063567
DK
4414
4415 if (SvUTF8(sv))
4416 RExC_utf8 = 1;
4417 if (!SIZE_ONLY) {
a3621e74
YO
4418 GET_RE_DEBUG_FLAGS_DECL;
4419 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "recode %*s to %*s\n",
d0063567
DK
4420 (int)oldlen, STRING(ret),
4421 (int)newlen, s));
4422 Copy(s, STRING(ret), newlen, char);
4423 STR_LEN(ret) += newlen - oldlen;
4424 RExC_emit += STR_SZ(newlen) - STR_SZ(oldlen);
4425 } else
4426 RExC_size += STR_SZ(newlen) - STR_SZ(oldlen);
4427 }
a72c7584
JH
4428 }
4429
a0d0e21e 4430 return(ret);
a687059c
LW
4431}
4432
873ef191 4433STATIC char *
504618e9 4434S_regwhite(pTHX_ char *p, const char *e)
5b5a24f7
CS
4435{
4436 while (p < e) {
4437 if (isSPACE(*p))
4438 ++p;
4439 else if (*p == '#') {
4440 do {
4441 p++;
4442 } while (p < e && *p != '\n');
4443 }
4444 else
4445 break;
4446 }
4447 return p;
4448}
4449
b8c5462f
JH
4450/* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
4451 Character classes ([:foo:]) can also be negated ([:^foo:]).
4452 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
4453 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
beeb77fc 4454 but trigger failures because they are currently unimplemented. */
9a86a77b
JH
4455
4456#define POSIXCC_DONE(c) ((c) == ':')
4457#define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
4458#define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
4459
b8c5462f 4460STATIC I32
830247a4 4461S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
620e46c5 4462{
936ed897 4463 I32 namedclass = OOB_NAMEDCLASS;
620e46c5 4464
830247a4 4465 if (value == '[' && RExC_parse + 1 < RExC_end &&
620e46c5 4466 /* I smell either [: or [= or [. -- POSIX has been here, right? */
9a86a77b 4467 POSIXCC(UCHARAT(RExC_parse))) {
1df70142 4468 const char c = UCHARAT(RExC_parse);
830247a4 4469 char* s = RExC_parse++;
b81d288d 4470
9a86a77b 4471 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
830247a4
IZ
4472 RExC_parse++;
4473 if (RExC_parse == RExC_end)
620e46c5 4474 /* Grandfather lone [:, [=, [. */
830247a4 4475 RExC_parse = s;
620e46c5 4476 else {
1df70142 4477 const char* t = RExC_parse++; /* skip over the c */
a28509cc 4478 const char *posixcc;
b8c5462f 4479
80916619
NC
4480 assert(*t == c);
4481
9a86a77b 4482 if (UCHARAT(RExC_parse) == ']') {
830247a4 4483 RExC_parse++; /* skip over the ending ] */
b8c5462f
JH
4484 posixcc = s + 1;
4485 if (*s == ':') {
1df70142
AL
4486 const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
4487 const I32 skip = t - posixcc;
80916619
NC
4488
4489 /* Initially switch on the length of the name. */
4490 switch (skip) {
4491 case 4:
4492 if (memEQ(posixcc, "word", 4)) {
4493 /* this is not POSIX, this is the Perl \w */;
4494 namedclass
4495 = complement ? ANYOF_NALNUM : ANYOF_ALNUM;
4496 }
cc4319de 4497 break;
80916619
NC
4498 case 5:
4499 /* Names all of length 5. */
4500 /* alnum alpha ascii blank cntrl digit graph lower
4501 print punct space upper */
4502 /* Offset 4 gives the best switch position. */
4503 switch (posixcc[4]) {
4504 case 'a':
4505 if (memEQ(posixcc, "alph", 4)) {
4506 /* a */
4507 namedclass
4508 = complement ? ANYOF_NALPHA : ANYOF_ALPHA;
4509 }
4510 break;
4511 case 'e':
4512 if (memEQ(posixcc, "spac", 4)) {
4513 /* e */
4514 namedclass
4515 = complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
4516 }
4517 break;
4518 case 'h':
4519 if (memEQ(posixcc, "grap", 4)) {
4520 /* h */
4521 namedclass
4522 = complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
4523 }
4524 break;
4525 case 'i':
4526 if (memEQ(posixcc, "asci", 4)) {
4527 /* i */
4528 namedclass
4529 = complement ? ANYOF_NASCII : ANYOF_ASCII;
4530 }
4531 break;
4532 case 'k':
4533 if (memEQ(posixcc, "blan", 4)) {
4534 /* k */
4535 namedclass
4536 = complement ? ANYOF_NBLANK : ANYOF_BLANK;
4537 }
4538 break;
4539 case 'l':
4540 if (memEQ(posixcc, "cntr", 4)) {
4541 /* l */
4542 namedclass
4543 = complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
4544 }
4545 break;
4546 case 'm':
4547 if (memEQ(posixcc, "alnu", 4)) {
4548 /* m */
4549 namedclass
4550 = complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
4551 }
4552 break;
4553 case 'r':
4554 if (memEQ(posixcc, "lowe", 4)) {
4555 /* r */
4556 namedclass
4557 = complement ? ANYOF_NLOWER : ANYOF_LOWER;
4558 }
4559 if (memEQ(posixcc, "uppe", 4)) {
8fdec511 4560 /* r */
80916619
NC
4561 namedclass
4562 = complement ? ANYOF_NUPPER : ANYOF_UPPER;
4563 }
4564 break;
4565 case 't':
4566 if (memEQ(posixcc, "digi", 4)) {
4567 /* t */
4568 namedclass
4569 = complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
4570 }
4571 if (memEQ(posixcc, "prin", 4)) {
8fdec511 4572 /* t */
80916619
NC
4573 namedclass
4574 = complement ? ANYOF_NPRINT : ANYOF_PRINT;
4575 }
4576 if (memEQ(posixcc, "punc", 4)) {
4577 /* t */
4578 namedclass
4579 = complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
4580 }
4581 break;
b8c5462f
JH
4582 }
4583 break;
80916619
NC
4584 case 6:
4585 if (memEQ(posixcc, "xdigit", 6)) {
4586 namedclass
4587 = complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
b8c5462f
JH
4588 }
4589 break;
4590 }
80916619
NC
4591
4592 if (namedclass == OOB_NAMEDCLASS)
b45f050a
JF
4593 {
4594 Simple_vFAIL3("POSIX class [:%.*s:] unknown",
4595 t - s - 1, s + 1);
4596 }
80916619
NC
4597 assert (posixcc[skip] == ':');
4598 assert (posixcc[skip+1] == ']');
b45f050a 4599 } else if (!SIZE_ONLY) {
b8c5462f 4600 /* [[=foo=]] and [[.foo.]] are still future. */
b45f050a 4601
830247a4 4602 /* adjust RExC_parse so the warning shows after
b45f050a 4603 the class closes */
9a86a77b 4604 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
830247a4 4605 RExC_parse++;
b45f050a
JF
4606 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
4607 }
b8c5462f
JH
4608 } else {
4609 /* Maternal grandfather:
4610 * "[:" ending in ":" but not in ":]" */
830247a4 4611 RExC_parse = s;
767d463e 4612 }
620e46c5
JH
4613 }
4614 }
4615
b8c5462f
JH
4616 return namedclass;
4617}
4618
4619STATIC void
830247a4 4620S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
b8c5462f 4621{
b938889d 4622 if (!SIZE_ONLY && POSIXCC(UCHARAT(RExC_parse))) {
1df70142
AL
4623 const char *s = RExC_parse;
4624 const char c = *s++;
b8c5462f
JH
4625
4626 while(*s && isALNUM(*s))
4627 s++;
4628 if (*s && c == *s && s[1] == ']') {
cd84f5b2
RGS
4629 if (ckWARN(WARN_REGEXP))
4630 vWARN3(s+2,
4631 "POSIX syntax [%c %c] belongs inside character classes",
4632 c, c);
b45f050a
JF
4633
4634 /* [[=foo=]] and [[.foo.]] are still future. */
9a86a77b 4635 if (POSIXCC_NOTYET(c)) {
830247a4 4636 /* adjust RExC_parse so the error shows after
b45f050a 4637 the class closes */
9a86a77b 4638 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
b45f050a
JF
4639 ;
4640 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
4641 }
b8c5462f
JH
4642 }
4643 }
620e46c5
JH
4644}
4645
76e3520e 4646STATIC regnode *
830247a4 4647S_regclass(pTHX_ RExC_state_t *pRExC_state)
a687059c 4648{
ffc61ed2 4649 register UV value;
9a86a77b 4650 register UV nextvalue;
3568d838 4651 register IV prevvalue = OOB_UNICODE;
ffc61ed2 4652 register IV range = 0;
c277df42 4653 register regnode *ret;
ba210ebe 4654 STRLEN numlen;
ffc61ed2 4655 IV namedclass;
cbbf8932 4656 char *rangebegin = NULL;
936ed897 4657 bool need_class = 0;
c445ea15 4658 SV *listsv = NULL;
ffc61ed2
JH
4659 register char *e;
4660 UV n;
9e55ce06 4661 bool optimize_invert = TRUE;
cbbf8932 4662 AV* unicode_alternate = NULL;
1b2d223b
JH
4663#ifdef EBCDIC
4664 UV literal_endpoint = 0;
4665#endif
ffc61ed2
JH
4666
4667 ret = reganode(pRExC_state, ANYOF, 0);
4668
4669 if (!SIZE_ONLY)
4670 ANYOF_FLAGS(ret) = 0;
4671
9a86a77b 4672 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
ffc61ed2
JH
4673 RExC_naughty++;
4674 RExC_parse++;
4675 if (!SIZE_ONLY)
4676 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
4677 }
a0d0e21e 4678
936ed897 4679 if (SIZE_ONLY)
830247a4 4680 RExC_size += ANYOF_SKIP;
936ed897 4681 else {
830247a4 4682 RExC_emit += ANYOF_SKIP;
936ed897
IZ
4683 if (FOLD)
4684 ANYOF_FLAGS(ret) |= ANYOF_FOLD;
4685 if (LOC)
4686 ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
ffc61ed2 4687 ANYOF_BITMAP_ZERO(ret);
396482e1 4688 listsv = newSVpvs("# comment\n");
a0d0e21e 4689 }
b8c5462f 4690
9a86a77b
JH
4691 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
4692
b938889d 4693 if (!SIZE_ONLY && POSIXCC(nextvalue))
830247a4 4694 checkposixcc(pRExC_state);
b8c5462f 4695
f064b6ad
HS
4696 /* allow 1st char to be ] (allowing it to be - is dealt with later) */
4697 if (UCHARAT(RExC_parse) == ']')
4698 goto charclassloop;
ffc61ed2 4699
9a86a77b 4700 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
ffc61ed2
JH
4701
4702 charclassloop:
4703
4704 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
4705
73b437c8 4706 if (!range)
830247a4 4707 rangebegin = RExC_parse;
ffc61ed2 4708 if (UTF) {
5e12f4fb 4709 value = utf8n_to_uvchr((U8*)RExC_parse,
3568d838
JH
4710 RExC_end - RExC_parse,
4711 &numlen, 0);
ffc61ed2
JH
4712 RExC_parse += numlen;
4713 }
4714 else
4715 value = UCHARAT(RExC_parse++);
9a86a77b
JH
4716 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
4717 if (value == '[' && POSIXCC(nextvalue))
830247a4 4718 namedclass = regpposixcc(pRExC_state, value);
620e46c5 4719 else if (value == '\\') {
ffc61ed2 4720 if (UTF) {
5e12f4fb 4721 value = utf8n_to_uvchr((U8*)RExC_parse,
ffc61ed2
JH
4722 RExC_end - RExC_parse,
4723 &numlen, 0);
4724 RExC_parse += numlen;
4725 }
4726 else
4727 value = UCHARAT(RExC_parse++);
470c3474 4728 /* Some compilers cannot handle switching on 64-bit integer
ffc61ed2 4729 * values, therefore value cannot be an UV. Yes, this will
e2962f66
JH
4730 * be a problem later if we want switch on Unicode.
4731 * A similar issue a little bit later when switching on
4732 * namedclass. --jhi */
ffc61ed2 4733 switch ((I32)value) {
b8c5462f
JH
4734 case 'w': namedclass = ANYOF_ALNUM; break;
4735 case 'W': namedclass = ANYOF_NALNUM; break;
4736 case 's': namedclass = ANYOF_SPACE; break;
4737 case 'S': namedclass = ANYOF_NSPACE; break;
4738 case 'd': namedclass = ANYOF_DIGIT; break;
4739 case 'D': namedclass = ANYOF_NDIGIT; break;
ffc61ed2
JH
4740 case 'p':
4741 case 'P':
af6f566e 4742 if (RExC_parse >= RExC_end)
2a4859cd 4743 vFAIL2("Empty \\%c{}", (U8)value);
ffc61ed2 4744 if (*RExC_parse == '{') {
1df70142 4745 const U8 c = (U8)value;
ffc61ed2
JH
4746 e = strchr(RExC_parse++, '}');
4747 if (!e)
0da60cf5 4748 vFAIL2("Missing right brace on \\%c{}", c);
ab13f0c7
JH
4749 while (isSPACE(UCHARAT(RExC_parse)))
4750 RExC_parse++;
4751 if (e == RExC_parse)
0da60cf5 4752 vFAIL2("Empty \\%c{}", c);
ffc61ed2 4753 n = e - RExC_parse;
ab13f0c7
JH
4754 while (isSPACE(UCHARAT(RExC_parse + n - 1)))
4755 n--;
ffc61ed2
JH
4756 }
4757 else {
4758 e = RExC_parse;
4759 n = 1;
4760 }
4761 if (!SIZE_ONLY) {
ab13f0c7
JH
4762 if (UCHARAT(RExC_parse) == '^') {
4763 RExC_parse++;
4764 n--;
4765 value = value == 'p' ? 'P' : 'p'; /* toggle */
4766 while (isSPACE(UCHARAT(RExC_parse))) {
4767 RExC_parse++;
4768 n--;
4769 }
4770 }
ffc61ed2 4771 if (value == 'p')
ab13f0c7
JH
4772 Perl_sv_catpvf(aTHX_ listsv,
4773 "+utf8::%.*s\n", (int)n, RExC_parse);
ffc61ed2 4774 else
ab13f0c7
JH
4775 Perl_sv_catpvf(aTHX_ listsv,
4776 "!utf8::%.*s\n", (int)n, RExC_parse);
ffc61ed2
JH
4777 }
4778 RExC_parse = e + 1;
4779 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
f81125e2
JP
4780 namedclass = ANYOF_MAX; /* no official name, but it's named */
4781 break;
b8c5462f
JH
4782 case 'n': value = '\n'; break;
4783 case 'r': value = '\r'; break;
4784 case 't': value = '\t'; break;
4785 case 'f': value = '\f'; break;
4786 case 'b': value = '\b'; break;
c7f1f016
NIS
4787 case 'e': value = ASCII_TO_NATIVE('\033');break;
4788 case 'a': value = ASCII_TO_NATIVE('\007');break;
b8c5462f 4789 case 'x':
ffc61ed2 4790 if (*RExC_parse == '{') {
a4c04bdc
NC
4791 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
4792 | PERL_SCAN_DISALLOW_PREFIX;
ffc61ed2 4793 e = strchr(RExC_parse++, '}');
b81d288d 4794 if (!e)
ffc61ed2 4795 vFAIL("Missing right brace on \\x{}");
53305cf1
NC
4796
4797 numlen = e - RExC_parse;
4798 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
ffc61ed2
JH
4799 RExC_parse = e + 1;
4800 }
4801 else {
a4c04bdc 4802 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
53305cf1
NC
4803 numlen = 2;
4804 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
ffc61ed2
JH
4805 RExC_parse += numlen;
4806 }
b8c5462f
JH
4807 break;
4808 case 'c':
830247a4 4809 value = UCHARAT(RExC_parse++);
b8c5462f
JH
4810 value = toCTRL(value);
4811 break;
4812 case '0': case '1': case '2': case '3': case '4':
4813 case '5': case '6': case '7': case '8': case '9':
53305cf1
NC
4814 {
4815 I32 flags = 0;
4816 numlen = 3;
4817 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
830247a4 4818 RExC_parse += numlen;
b8c5462f 4819 break;
53305cf1 4820 }
1028017a 4821 default:
041457d9 4822 if (!SIZE_ONLY && isALPHA(value) && ckWARN(WARN_REGEXP))
ffc61ed2
JH
4823 vWARN2(RExC_parse,
4824 "Unrecognized escape \\%c in character class passed through",
4825 (int)value);
1028017a 4826 break;
b8c5462f 4827 }
ffc61ed2 4828 } /* end of \blah */
1b2d223b
JH
4829#ifdef EBCDIC
4830 else
4831 literal_endpoint++;
4832#endif
ffc61ed2
JH
4833
4834 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
4835
4836 if (!SIZE_ONLY && !need_class)
936ed897 4837 ANYOF_CLASS_ZERO(ret);
ffc61ed2 4838
936ed897 4839 need_class = 1;
ffc61ed2
JH
4840
4841 /* a bad range like a-\d, a-[:digit:] ? */
4842 if (range) {
73b437c8 4843 if (!SIZE_ONLY) {
e476b1b5 4844 if (ckWARN(WARN_REGEXP))
830247a4 4845 vWARN4(RExC_parse,
b45f050a 4846 "False [] range \"%*.*s\"",
830247a4
IZ
4847 RExC_parse - rangebegin,
4848 RExC_parse - rangebegin,
b45f050a 4849 rangebegin);
3568d838
JH
4850 if (prevvalue < 256) {
4851 ANYOF_BITMAP_SET(ret, prevvalue);
ffc61ed2
JH
4852 ANYOF_BITMAP_SET(ret, '-');
4853 }
4854 else {
4855 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
4856 Perl_sv_catpvf(aTHX_ listsv,
3568d838 4857 "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
ffc61ed2 4858 }
b8c5462f 4859 }
ffc61ed2
JH
4860
4861 range = 0; /* this was not a true range */
73b437c8 4862 }
ffc61ed2 4863
73b437c8 4864 if (!SIZE_ONLY) {
c49a72a9
NC
4865 const char *what = NULL;
4866 char yesno = 0;
4867
3568d838
JH
4868 if (namedclass > OOB_NAMEDCLASS)
4869 optimize_invert = FALSE;
e2962f66
JH
4870 /* Possible truncation here but in some 64-bit environments
4871 * the compiler gets heartburn about switch on 64-bit values.
4872 * A similar issue a little earlier when switching on value.
98f323fa 4873 * --jhi */
e2962f66 4874 switch ((I32)namedclass) {
73b437c8
JH
4875 case ANYOF_ALNUM:
4876 if (LOC)
936ed897 4877 ANYOF_CLASS_SET(ret, ANYOF_ALNUM);
73b437c8
JH
4878 else {
4879 for (value = 0; value < 256; value++)
4880 if (isALNUM(value))
936ed897 4881 ANYOF_BITMAP_SET(ret, value);
73b437c8 4882 }
c49a72a9
NC
4883 yesno = '+';
4884 what = "Word";
73b437c8
JH
4885 break;
4886 case ANYOF_NALNUM:
4887 if (LOC)
936ed897 4888 ANYOF_CLASS_SET(ret, ANYOF_NALNUM);
73b437c8
JH
4889 else {
4890 for (value = 0; value < 256; value++)
4891 if (!isALNUM(value))
936ed897 4892 ANYOF_BITMAP_SET(ret, value);
73b437c8 4893 }
c49a72a9
NC
4894 yesno = '!';
4895 what = "Word";
73b437c8 4896 break;
ffc61ed2 4897 case ANYOF_ALNUMC:
73b437c8 4898 if (LOC)
ffc61ed2 4899 ANYOF_CLASS_SET(ret, ANYOF_ALNUMC);
73b437c8
JH
4900 else {
4901 for (value = 0; value < 256; value++)
ffc61ed2 4902 if (isALNUMC(value))
936ed897 4903 ANYOF_BITMAP_SET(ret, value);
73b437c8 4904 }
c49a72a9
NC
4905 yesno = '+';
4906 what = "Alnum";
73b437c8
JH
4907 break;
4908 case ANYOF_NALNUMC:
4909 if (LOC)
936ed897 4910 ANYOF_CLASS_SET(ret, ANYOF_NALNUMC);
73b437c8
JH
4911 else {
4912 for (value = 0; value < 256; value++)
4913 if (!isALNUMC(value))
936ed897 4914 ANYOF_BITMAP_SET(ret, value);
73b437c8 4915 }
c49a72a9
NC
4916 yesno = '!';
4917 what = "Alnum";
73b437c8
JH
4918 break;
4919 case ANYOF_ALPHA:
4920 if (LOC)
936ed897 4921 ANYOF_CLASS_SET(ret, ANYOF_ALPHA);
73b437c8
JH
4922 else {
4923 for (value = 0; value < 256; value++)
4924 if (isALPHA(value))
936ed897 4925 ANYOF_BITMAP_SET(ret, value);
73b437c8 4926 }
c49a72a9
NC
4927 yesno = '+';
4928 what = "Alpha";
73b437c8
JH
4929 break;
4930 case ANYOF_NALPHA:
4931 if (LOC)
936ed897 4932 ANYOF_CLASS_SET(ret, ANYOF_NALPHA);
73b437c8
JH
4933 else {
4934 for (value = 0; value < 256; value++)
4935 if (!isALPHA(value))
936ed897 4936 ANYOF_BITMAP_SET(ret, value);
73b437c8 4937 }
c49a72a9
NC
4938 yesno = '!';
4939 what = "Alpha";
73b437c8
JH
4940 break;
4941 case ANYOF_ASCII:
4942 if (LOC)
936ed897 4943 ANYOF_CLASS_SET(ret, ANYOF_ASCII);
73b437c8 4944 else {
c7f1f016 4945#ifndef EBCDIC
1ba5c669
JH
4946 for (value = 0; value < 128; value++)
4947 ANYOF_BITMAP_SET(ret, value);
4948#else /* EBCDIC */
ffbc6a93 4949 for (value = 0; value < 256; value++) {
3a3c4447
JH
4950 if (isASCII(value))
4951 ANYOF_BITMAP_SET(ret, value);
ffbc6a93 4952 }
1ba5c669 4953#endif /* EBCDIC */
73b437c8 4954 }
c49a72a9
NC
4955 yesno = '+';
4956 what = "ASCII";
73b437c8
JH
4957 break;
4958 case ANYOF_NASCII:
4959 if (LOC)
936ed897 4960 ANYOF_CLASS_SET(ret, ANYOF_NASCII);
73b437c8 4961 else {
c7f1f016 4962#ifndef EBCDIC
1ba5c669
JH
4963 for (value = 128; value < 256; value++)
4964 ANYOF_BITMAP_SET(ret, value);
4965#else /* EBCDIC */
ffbc6a93 4966 for (value = 0; value < 256; value++) {
3a3c4447
JH
4967 if (!isASCII(value))
4968 ANYOF_BITMAP_SET(ret, value);
ffbc6a93 4969 }
1ba5c669 4970#endif /* EBCDIC */
73b437c8 4971 }
c49a72a9
NC
4972 yesno = '!';
4973 what = "ASCII";
73b437c8 4974 break;
aaa51d5e
JF
4975 case ANYOF_BLANK:
4976 if (LOC)
4977 ANYOF_CLASS_SET(ret, ANYOF_BLANK);
4978 else {
4979 for (value = 0; value < 256; value++)
4980 if (isBLANK(value))
4981 ANYOF_BITMAP_SET(ret, value);
4982 }
c49a72a9
NC
4983 yesno = '+';
4984 what = "Blank";
aaa51d5e
JF
4985 break;
4986 case ANYOF_NBLANK:
4987 if (LOC)
4988 ANYOF_CLASS_SET(ret, ANYOF_NBLANK);
4989 else {
4990 for (value = 0; value < 256; value++)
4991 if (!isBLANK(value))
4992 ANYOF_BITMAP_SET(ret, value);
4993 }
c49a72a9
NC
4994 yesno = '!';
4995 what = "Blank";
aaa51d5e 4996 break;
73b437c8
JH
4997 case ANYOF_CNTRL:
4998 if (LOC)
936ed897 4999 ANYOF_CLASS_SET(ret, ANYOF_CNTRL);
73b437c8
JH
5000 else {
5001 for (value = 0; value < 256; value++)
5002 if (isCNTRL(value))
936ed897 5003 ANYOF_BITMAP_SET(ret, value);
73b437c8 5004 }
c49a72a9
NC
5005 yesno = '+';
5006 what = "Cntrl";
73b437c8
JH
5007 break;
5008 case ANYOF_NCNTRL:
5009 if (LOC)
936ed897 5010 ANYOF_CLASS_SET(ret, ANYOF_NCNTRL);
73b437c8
JH
5011 else {
5012 for (value = 0; value < 256; value++)
5013 if (!isCNTRL(value))
936ed897 5014 ANYOF_BITMAP_SET(ret, value);
73b437c8 5015 }
c49a72a9
NC
5016 yesno = '!';
5017 what = "Cntrl";
ffc61ed2
JH
5018 break;
5019 case ANYOF_DIGIT:
5020 if (LOC)
5021 ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
5022 else {
5023 /* consecutive digits assumed */
5024 for (value = '0'; value <= '9'; value++)
5025 ANYOF_BITMAP_SET(ret, value);
5026 }
c49a72a9
NC
5027 yesno = '+';
5028 what = "Digit";
ffc61ed2
JH
5029 break;
5030 case ANYOF_NDIGIT:
5031 if (LOC)
5032 ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
5033 else {
5034 /* consecutive digits assumed */
5035 for (value = 0; value < '0'; value++)
5036 ANYOF_BITMAP_SET(ret, value);
5037 for (value = '9' + 1; value < 256; value++)
5038 ANYOF_BITMAP_SET(ret, value);
5039 }
c49a72a9
NC
5040 yesno = '!';
5041 what = "Digit";
73b437c8
JH
5042 break;
5043 case ANYOF_GRAPH:
5044 if (LOC)
936ed897 5045 ANYOF_CLASS_SET(ret, ANYOF_GRAPH);
73b437c8
JH
5046 else {
5047 for (value = 0; value < 256; value++)
5048 if (isGRAPH(value))
936ed897 5049 ANYOF_BITMAP_SET(ret, value);
73b437c8 5050 }
c49a72a9
NC
5051 yesno = '+';
5052 what = "Graph";
73b437c8
JH
5053 break;
5054 case ANYOF_NGRAPH:
5055 if (LOC)
936ed897 5056 ANYOF_CLASS_SET(ret, ANYOF_NGRAPH);
73b437c8
JH
5057 else {
5058 for (value = 0; value < 256; value++)
5059 if (!isGRAPH(value))
936ed897 5060 ANYOF_BITMAP_SET(ret, value);
73b437c8 5061 }
c49a72a9
NC
5062 yesno = '!';
5063 what = "Graph";
73b437c8
JH
5064 break;
5065 case ANYOF_LOWER:
5066 if (LOC)
936ed897 5067 ANYOF_CLASS_SET(ret, ANYOF_LOWER);
73b437c8
JH
5068 else {
5069 for (value = 0; value < 256; value++)
5070 if (isLOWER(value))
936ed897 5071 ANYOF_BITMAP_SET(ret, value);
73b437c8 5072 }
c49a72a9
NC
5073 yesno = '+';
5074 what = "Lower";
73b437c8
JH
5075 break;
5076 case ANYOF_NLOWER:
5077 if (LOC)
936ed897 5078 ANYOF_CLASS_SET(ret, ANYOF_NLOWER);
73b437c8
JH
5079 else {
5080 for (value = 0; value < 256; value++)
5081 if (!isLOWER(value))
936ed897 5082 ANYOF_BITMAP_SET(ret, value);
73b437c8 5083 }
c49a72a9
NC
5084 yesno = '!';
5085 what = "Lower";
73b437c8
JH
5086 break;
5087 case ANYOF_PRINT:
5088 if (LOC)
936ed897 5089 ANYOF_CLASS_SET(ret, ANYOF_PRINT);
73b437c8
JH
5090 else {
5091 for (value = 0; value < 256; value++)
5092 if (isPRINT(value))
936ed897 5093 ANYOF_BITMAP_SET(ret, value);
73b437c8 5094 }
c49a72a9
NC
5095 yesno = '+';
5096 what = "Print";
73b437c8
JH
5097 break;
5098 case ANYOF_NPRINT:
5099 if (LOC)
936ed897 5100 ANYOF_CLASS_SET(ret, ANYOF_NPRINT);
73b437c8
JH
5101 else {
5102 for (value = 0; value < 256; value++)
5103 if (!isPRINT(value))
936ed897 5104 ANYOF_BITMAP_SET(ret, value);
73b437c8 5105 }
c49a72a9
NC
5106 yesno = '!';
5107 what = "Print";
73b437c8 5108 break;
aaa51d5e
JF
5109 case ANYOF_PSXSPC:
5110 if (LOC)
5111 ANYOF_CLASS_SET(ret, ANYOF_PSXSPC);
5112 else {
5113 for (value = 0; value < 256; value++)
5114 if (isPSXSPC(value))
5115 ANYOF_BITMAP_SET(ret, value);
5116 }
c49a72a9
NC
5117 yesno = '+';
5118 what = "Space";
aaa51d5e
JF
5119 break;
5120 case ANYOF_NPSXSPC:
5121 if (LOC)
5122 ANYOF_CLASS_SET(ret, ANYOF_NPSXSPC);
5123 else {
5124 for (value = 0; value < 256; value++)
5125 if (!isPSXSPC(value))
5126 ANYOF_BITMAP_SET(ret, value);
5127 }
c49a72a9
NC
5128 yesno = '!';
5129 what = "Space";
aaa51d5e 5130 break;
73b437c8
JH
5131 case ANYOF_PUNCT:
5132 if (LOC)
936ed897 5133 ANYOF_CLASS_SET(ret, ANYOF_PUNCT);
73b437c8
JH
5134 else {
5135 for (value = 0; value < 256; value++)
5136 if (isPUNCT(value))
936ed897 5137 ANYOF_BITMAP_SET(ret, value);
73b437c8 5138 }
c49a72a9
NC
5139 yesno = '+';
5140 what = "Punct";
73b437c8
JH
5141 break;
5142 case ANYOF_NPUNCT:
5143 if (LOC)
936ed897 5144 ANYOF_CLASS_SET(ret, ANYOF_NPUNCT);
73b437c8
JH
5145 else {
5146 for (value = 0; value < 256; value++)
5147 if (!isPUNCT(value))
936ed897 5148 ANYOF_BITMAP_SET(ret, value);
73b437c8 5149 }
c49a72a9
NC
5150 yesno = '!';
5151 what = "Punct";
ffc61ed2
JH
5152 break;
5153 case ANYOF_SPACE:
5154 if (LOC)
5155 ANYOF_CLASS_SET(ret, ANYOF_SPACE);
5156 else {
5157 for (value = 0; value < 256; value++)
5158 if (isSPACE(value))
5159 ANYOF_BITMAP_SET(ret, value);
5160 }
c49a72a9
NC
5161 yesno = '+';
5162 what = "SpacePerl";
ffc61ed2
JH
5163 break;
5164 case ANYOF_NSPACE:
5165 if (LOC)
5166 ANYOF_CLASS_SET(ret, ANYOF_NSPACE);
5167 else {
5168 for (value = 0; value < 256; value++)
5169 if (!isSPACE(value))
5170 ANYOF_BITMAP_SET(ret, value);
5171 }
c49a72a9
NC
5172 yesno = '!';
5173 what = "SpacePerl";
73b437c8
JH
5174 break;
5175 case ANYOF_UPPER:
5176 if (LOC)
936ed897 5177 ANYOF_CLASS_SET(ret, ANYOF_UPPER);
73b437c8
JH
5178 else {
5179 for (value = 0; value < 256; value++)
5180 if (isUPPER(value))
936ed897 5181 ANYOF_BITMAP_SET(ret, value);
73b437c8 5182 }
c49a72a9
NC
5183 yesno = '+';
5184 what = "Upper";
73b437c8
JH
5185 break;
5186 case ANYOF_NUPPER:
5187 if (LOC)
936ed897 5188 ANYOF_CLASS_SET(ret, ANYOF_NUPPER);
73b437c8
JH
5189 else {
5190 for (value = 0; value < 256; value++)
5191 if (!isUPPER(value))
936ed897 5192 ANYOF_BITMAP_SET(ret, value);
73b437c8 5193 }
c49a72a9
NC
5194 yesno = '!';
5195 what = "Upper";
73b437c8
JH
5196 break;
5197 case ANYOF_XDIGIT:
5198 if (LOC)
936ed897 5199 ANYOF_CLASS_SET(ret, ANYOF_XDIGIT);
73b437c8
JH
5200 else {
5201 for (value = 0; value < 256; value++)
5202 if (isXDIGIT(value))
936ed897 5203 ANYOF_BITMAP_SET(ret, value);
73b437c8 5204 }
c49a72a9
NC
5205 yesno = '+';
5206 what = "XDigit";
73b437c8
JH
5207 break;
5208 case ANYOF_NXDIGIT:
5209 if (LOC)
936ed897 5210 ANYOF_CLASS_SET(ret, ANYOF_NXDIGIT);
73b437c8
JH
5211 else {
5212 for (value = 0; value < 256; value++)
5213 if (!isXDIGIT(value))
936ed897 5214 ANYOF_BITMAP_SET(ret, value);
73b437c8 5215 }
c49a72a9
NC
5216 yesno = '!';
5217 what = "XDigit";
73b437c8 5218 break;
f81125e2
JP
5219 case ANYOF_MAX:
5220 /* this is to handle \p and \P */
5221 break;
73b437c8 5222 default:
b45f050a 5223 vFAIL("Invalid [::] class");
73b437c8 5224 break;
b8c5462f 5225 }
c49a72a9
NC
5226 if (what) {
5227 /* Strings such as "+utf8::isWord\n" */
5228 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::Is%s\n", yesno, what);
5229 }
b8c5462f 5230 if (LOC)
936ed897 5231 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
73b437c8 5232 continue;
a0d0e21e 5233 }
ffc61ed2
JH
5234 } /* end of namedclass \blah */
5235
a0d0e21e 5236 if (range) {
eb160463 5237 if (prevvalue > (IV)value) /* b-a */ {
b45f050a 5238 Simple_vFAIL4("Invalid [] range \"%*.*s\"",
830247a4
IZ
5239 RExC_parse - rangebegin,
5240 RExC_parse - rangebegin,
b45f050a 5241 rangebegin);
3568d838 5242 range = 0; /* not a valid range */
73b437c8 5243 }
a0d0e21e
LW
5244 }
5245 else {
3568d838 5246 prevvalue = value; /* save the beginning of the range */
830247a4
IZ
5247 if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
5248 RExC_parse[1] != ']') {
5249 RExC_parse++;
ffc61ed2
JH
5250
5251 /* a bad range like \w-, [:word:]- ? */
5252 if (namedclass > OOB_NAMEDCLASS) {
e476b1b5 5253 if (ckWARN(WARN_REGEXP))
830247a4 5254 vWARN4(RExC_parse,
b45f050a 5255 "False [] range \"%*.*s\"",
830247a4
IZ
5256 RExC_parse - rangebegin,
5257 RExC_parse - rangebegin,
b45f050a 5258 rangebegin);
73b437c8 5259 if (!SIZE_ONLY)
936ed897 5260 ANYOF_BITMAP_SET(ret, '-');
73b437c8 5261 } else
ffc61ed2
JH
5262 range = 1; /* yeah, it's a range! */
5263 continue; /* but do it the next time */
a0d0e21e 5264 }
a687059c 5265 }
ffc61ed2 5266
93733859 5267 /* now is the next time */
ae5c130c 5268 if (!SIZE_ONLY) {
3568d838
JH
5269 IV i;
5270
5271 if (prevvalue < 256) {
1df70142 5272 const IV ceilvalue = value < 256 ? value : 255;
3568d838
JH
5273
5274#ifdef EBCDIC
1b2d223b
JH
5275 /* In EBCDIC [\x89-\x91] should include
5276 * the \x8e but [i-j] should not. */
5277 if (literal_endpoint == 2 &&
5278 ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
5279 (isUPPER(prevvalue) && isUPPER(ceilvalue))))
ffc61ed2 5280 {
3568d838
JH
5281 if (isLOWER(prevvalue)) {
5282 for (i = prevvalue; i <= ceilvalue; i++)
ffc61ed2
JH
5283 if (isLOWER(i))
5284 ANYOF_BITMAP_SET(ret, i);
5285 } else {
3568d838 5286 for (i = prevvalue; i <= ceilvalue; i++)
ffc61ed2
JH
5287 if (isUPPER(i))
5288 ANYOF_BITMAP_SET(ret, i);
5289 }
8ada0baa 5290 }
ffc61ed2 5291 else
8ada0baa 5292#endif
a5961de5
JH
5293 for (i = prevvalue; i <= ceilvalue; i++)
5294 ANYOF_BITMAP_SET(ret, i);
3568d838 5295 }
a5961de5 5296 if (value > 255 || UTF) {
1df70142
AL
5297 const UV prevnatvalue = NATIVE_TO_UNI(prevvalue);
5298 const UV natvalue = NATIVE_TO_UNI(value);
b08decb7 5299
ffc61ed2 5300 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
b08decb7 5301 if (prevnatvalue < natvalue) { /* what about > ? */
ffc61ed2 5302 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
b08decb7
JH
5303 prevnatvalue, natvalue);
5304 }
5305 else if (prevnatvalue == natvalue) {
5306 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue);
09091399 5307 if (FOLD) {
89ebb4a3 5308 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
254ba52a 5309 STRLEN foldlen;
1df70142 5310 const UV f = to_uni_fold(natvalue, foldbuf, &foldlen);
254ba52a 5311
c840d2a2
JH
5312 /* If folding and foldable and a single
5313 * character, insert also the folded version
5314 * to the charclass. */
9e55ce06 5315 if (f != value) {
eb160463 5316 if (foldlen == (STRLEN)UNISKIP(f))
9e55ce06
JH
5317 Perl_sv_catpvf(aTHX_ listsv,
5318 "%04"UVxf"\n", f);
5319 else {
5320 /* Any multicharacter foldings
5321 * require the following transform:
5322 * [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst)
5323 * where E folds into "pq" and F folds
5324 * into "rst", all other characters
5325 * fold to single characters. We save
5326 * away these multicharacter foldings,
5327 * to be later saved as part of the
5328 * additional "s" data. */
5329 SV *sv;
5330
5331 if (!unicode_alternate)
5332 unicode_alternate = newAV();
5333 sv = newSVpvn((char*)foldbuf, foldlen);
5334 SvUTF8_on(sv);
5335 av_push(unicode_alternate, sv);
5336 }
5337 }
254ba52a 5338
60a8b682
JH
5339 /* If folding and the value is one of the Greek
5340 * sigmas insert a few more sigmas to make the
5341 * folding rules of the sigmas to work right.
5342 * Note that not all the possible combinations
5343 * are handled here: some of them are handled
9e55ce06
JH
5344 * by the standard folding rules, and some of
5345 * them (literal or EXACTF cases) are handled
5346 * during runtime in regexec.c:S_find_byclass(). */
09091399
JH
5347 if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) {
5348 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 5349 (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA);
09091399 5350 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 5351 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
09091399
JH
5352 }
5353 else if (value == UNICODE_GREEK_CAPITAL_LETTER_SIGMA)
5354 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 5355 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
09091399
JH
5356 }
5357 }
ffc61ed2 5358 }
1b2d223b
JH
5359#ifdef EBCDIC
5360 literal_endpoint = 0;
5361#endif
8ada0baa 5362 }
ffc61ed2
JH
5363
5364 range = 0; /* this range (if it was one) is done now */
a0d0e21e 5365 }
ffc61ed2 5366
936ed897 5367 if (need_class) {
4f66b38d 5368 ANYOF_FLAGS(ret) |= ANYOF_LARGE;
936ed897 5369 if (SIZE_ONLY)
830247a4 5370 RExC_size += ANYOF_CLASS_ADD_SKIP;
936ed897 5371 else
830247a4 5372 RExC_emit += ANYOF_CLASS_ADD_SKIP;
936ed897 5373 }
ffc61ed2 5374
ae5c130c 5375 /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
b8c5462f 5376 if (!SIZE_ONLY &&
ffc61ed2 5377 /* If the only flag is folding (plus possibly inversion). */
516a5887
JH
5378 ((ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD)
5379 ) {
a0ed51b3 5380 for (value = 0; value < 256; ++value) {
936ed897 5381 if (ANYOF_BITMAP_TEST(ret, value)) {
eb160463 5382 UV fold = PL_fold[value];
ffc61ed2
JH
5383
5384 if (fold != value)
5385 ANYOF_BITMAP_SET(ret, fold);
ae5c130c
GS
5386 }
5387 }
936ed897 5388 ANYOF_FLAGS(ret) &= ~ANYOF_FOLD;
ae5c130c 5389 }
ffc61ed2 5390
ae5c130c 5391 /* optimize inverted simple patterns (e.g. [^a-z]) */
3568d838 5392 if (!SIZE_ONLY && optimize_invert &&
ffc61ed2
JH
5393 /* If the only flag is inversion. */
5394 (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
b8c5462f 5395 for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
936ed897 5396 ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL;
1aa99e6b 5397 ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
ae5c130c 5398 }
a0d0e21e 5399
b81d288d 5400 if (!SIZE_ONLY) {
fde631ed 5401 AV *av = newAV();
ffc61ed2
JH
5402 SV *rv;
5403
9e55ce06 5404 /* The 0th element stores the character class description
6a0407ee 5405 * in its textual form: used later (regexec.c:Perl_regclass_swash())
9e55ce06
JH
5406 * to initialize the appropriate swash (which gets stored in
5407 * the 1st element), and also useful for dumping the regnode.
5408 * The 2nd element stores the multicharacter foldings,
6a0407ee 5409 * used later (regexec.c:S_reginclass()). */
ffc61ed2
JH
5410 av_store(av, 0, listsv);
5411 av_store(av, 1, NULL);
9e55ce06 5412 av_store(av, 2, (SV*)unicode_alternate);
ffc61ed2 5413 rv = newRV_noinc((SV*)av);
19860706 5414 n = add_data(pRExC_state, 1, "s");
830247a4 5415 RExC_rx->data->data[n] = (void*)rv;
ffc61ed2 5416 ARG_SET(ret, n);
a0ed51b3
LW
5417 }
5418
5419 return ret;
5420}
5421
76e3520e 5422STATIC char*
830247a4 5423S_nextchar(pTHX_ RExC_state_t *pRExC_state)
a0d0e21e 5424{
830247a4 5425 char* retval = RExC_parse++;
a0d0e21e 5426
4633a7c4 5427 for (;;) {
830247a4
IZ
5428 if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
5429 RExC_parse[2] == '#') {
e994fd66
AE
5430 while (*RExC_parse != ')') {
5431 if (RExC_parse == RExC_end)
5432 FAIL("Sequence (?#... not terminated");
830247a4 5433 RExC_parse++;
e994fd66 5434 }
830247a4 5435 RExC_parse++;
4633a7c4
LW
5436 continue;
5437 }
e2509266 5438 if (RExC_flags & PMf_EXTENDED) {
830247a4
IZ
5439 if (isSPACE(*RExC_parse)) {
5440 RExC_parse++;
748a9306
LW
5441 continue;
5442 }
830247a4 5443 else if (*RExC_parse == '#') {
e994fd66
AE
5444 while (RExC_parse < RExC_end)
5445 if (*RExC_parse++ == '\n') break;
748a9306
LW
5446 continue;
5447 }
748a9306 5448 }
4633a7c4 5449 return retval;
a0d0e21e 5450 }
a687059c
LW
5451}
5452
5453/*
c277df42 5454- reg_node - emit a node
a0d0e21e 5455*/
76e3520e 5456STATIC regnode * /* Location. */
830247a4 5457S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
a687059c 5458{
c277df42 5459 register regnode *ptr;
504618e9 5460 regnode * const ret = RExC_emit;
a687059c 5461
c277df42 5462 if (SIZE_ONLY) {
830247a4
IZ
5463 SIZE_ALIGN(RExC_size);
5464 RExC_size += 1;
a0d0e21e
LW
5465 return(ret);
5466 }
a687059c 5467
c277df42 5468 NODE_ALIGN_FILL(ret);
a0d0e21e 5469 ptr = ret;
c277df42 5470 FILL_ADVANCE_NODE(ptr, op);
fac92740 5471 if (RExC_offsets) { /* MJD */
ccb2c380 5472 MJD_OFFSET_DEBUG(("%s:%u: (op %s) %s %u <- %u (len %u) (max %u).\n",
fac92740
MJD
5473 "reg_node", __LINE__,
5474 reg_name[op],
5475 RExC_emit - RExC_emit_start > RExC_offsets[0]
5476 ? "Overwriting end of array!\n" : "OK",
5477 RExC_emit - RExC_emit_start,
5478 RExC_parse - RExC_start,
5479 RExC_offsets[0]));
ccb2c380 5480 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
fac92740
MJD
5481 }
5482
830247a4 5483 RExC_emit = ptr;
a687059c 5484
a0d0e21e 5485 return(ret);
a687059c
LW
5486}
5487
5488/*
a0d0e21e
LW
5489- reganode - emit a node with an argument
5490*/
76e3520e 5491STATIC regnode * /* Location. */
830247a4 5492S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
fe14fcc3 5493{
c277df42 5494 register regnode *ptr;
504618e9 5495 regnode * const ret = RExC_emit;
fe14fcc3 5496
c277df42 5497 if (SIZE_ONLY) {
830247a4
IZ
5498 SIZE_ALIGN(RExC_size);
5499 RExC_size += 2;
a0d0e21e
LW
5500 return(ret);
5501 }
fe14fcc3 5502
c277df42 5503 NODE_ALIGN_FILL(ret);
a0d0e21e 5504 ptr = ret;
c277df42 5505 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
fac92740 5506 if (RExC_offsets) { /* MJD */
ccb2c380 5507 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %u <- %u (max %u).\n",
fac92740 5508 "reganode",
ccb2c380
MP
5509 __LINE__,
5510 reg_name[op],
fac92740
MJD
5511 RExC_emit - RExC_emit_start > RExC_offsets[0] ?
5512 "Overwriting end of array!\n" : "OK",
5513 RExC_emit - RExC_emit_start,
5514 RExC_parse - RExC_start,
5515 RExC_offsets[0]));
ccb2c380 5516 Set_Cur_Node_Offset;
fac92740
MJD
5517 }
5518
830247a4 5519 RExC_emit = ptr;
fe14fcc3 5520
a0d0e21e 5521 return(ret);
fe14fcc3
LW
5522}
5523
5524/*
cd439c50 5525- reguni - emit (if appropriate) a Unicode character
a0ed51b3
LW
5526*/
5527STATIC void
a28509cc 5528S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s, STRLEN* lenp)
a0ed51b3 5529{
5e12f4fb 5530 *lenp = SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
a0ed51b3
LW
5531}
5532
5533/*
a0d0e21e
LW
5534- reginsert - insert an operator in front of already-emitted operand
5535*
5536* Means relocating the operand.
5537*/
76e3520e 5538STATIC void
830247a4 5539S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd)
a687059c 5540{
c277df42
IZ
5541 register regnode *src;
5542 register regnode *dst;
5543 register regnode *place;
504618e9 5544 const int offset = regarglen[(U8)op];
b81d288d 5545
22c35a8c 5546/* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
c277df42
IZ
5547
5548 if (SIZE_ONLY) {
830247a4 5549 RExC_size += NODE_STEP_REGNODE + offset;
a0d0e21e
LW
5550 return;
5551 }
a687059c 5552
830247a4
IZ
5553 src = RExC_emit;
5554 RExC_emit += NODE_STEP_REGNODE + offset;
5555 dst = RExC_emit;
fac92740 5556 while (src > opnd) {
c277df42 5557 StructCopy(--src, --dst, regnode);
fac92740 5558 if (RExC_offsets) { /* MJD 20010112 */
ccb2c380 5559 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %u -> %u (max %u).\n",
fac92740 5560 "reg_insert",
ccb2c380
MP
5561 __LINE__,
5562 reg_name[op],
fac92740
MJD
5563 dst - RExC_emit_start > RExC_offsets[0]
5564 ? "Overwriting end of array!\n" : "OK",
5565 src - RExC_emit_start,
5566 dst - RExC_emit_start,
5567 RExC_offsets[0]));
ccb2c380
MP
5568 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
5569 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
fac92740
MJD
5570 }
5571 }
5572
a0d0e21e
LW
5573
5574 place = opnd; /* Op node, where operand used to be. */
fac92740 5575 if (RExC_offsets) { /* MJD */
ccb2c380 5576 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %u <- %u (max %u).\n",
fac92740 5577 "reginsert",
ccb2c380
MP
5578 __LINE__,
5579 reg_name[op],
fac92740
MJD
5580 place - RExC_emit_start > RExC_offsets[0]
5581 ? "Overwriting end of array!\n" : "OK",
5582 place - RExC_emit_start,
5583 RExC_parse - RExC_start,
5584 RExC_offsets[0]));
ccb2c380 5585 Set_Node_Offset(place, RExC_parse);
45948336 5586 Set_Node_Length(place, 1);
fac92740 5587 }
c277df42
IZ
5588 src = NEXTOPER(place);
5589 FILL_ADVANCE_NODE(place, op);
5590 Zero(src, offset, regnode);
a687059c
LW
5591}
5592
5593/*
c277df42 5594- regtail - set the next-pointer at the end of a node chain of p to val.
a0d0e21e 5595*/
76e3520e 5596STATIC void
830247a4 5597S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, regnode *val)
a687059c 5598{
c277df42 5599 register regnode *scan;
a0d0e21e 5600
c277df42 5601 if (SIZE_ONLY)
a0d0e21e
LW
5602 return;
5603
5604 /* Find last node. */
5605 scan = p;
5606 for (;;) {
504618e9 5607 regnode * const temp = regnext(scan);
a0d0e21e
LW
5608 if (temp == NULL)
5609 break;
5610 scan = temp;
5611 }
a687059c 5612
c277df42
IZ
5613 if (reg_off_by_arg[OP(scan)]) {
5614 ARG_SET(scan, val - scan);
a0ed51b3
LW
5615 }
5616 else {
c277df42
IZ
5617 NEXT_OFF(scan) = val - scan;
5618 }
a687059c
LW
5619}
5620
5621/*
a0d0e21e
LW
5622- regoptail - regtail on operand of first argument; nop if operandless
5623*/
76e3520e 5624STATIC void
830247a4 5625S_regoptail(pTHX_ RExC_state_t *pRExC_state, regnode *p, regnode *val)
a687059c 5626{
a0d0e21e 5627 /* "Operandless" and "op != BRANCH" are synonymous in practice. */
c277df42
IZ
5628 if (p == NULL || SIZE_ONLY)
5629 return;
22c35a8c 5630 if (PL_regkind[(U8)OP(p)] == BRANCH) {
830247a4 5631 regtail(pRExC_state, NEXTOPER(p), val);
a0ed51b3 5632 }
22c35a8c 5633 else if ( PL_regkind[(U8)OP(p)] == BRANCHJ) {
830247a4 5634 regtail(pRExC_state, NEXTOPER(NEXTOPER(p)), val);
a0ed51b3
LW
5635 }
5636 else
a0d0e21e 5637 return;
a687059c
LW
5638}
5639
5640/*
5641 - regcurly - a little FSA that accepts {\d+,?\d*}
5642 */
79072805 5643STATIC I32
a3b680e6 5644S_regcurly(pTHX_ register const char *s)
a687059c
LW
5645{
5646 if (*s++ != '{')
5647 return FALSE;
f0fcb552 5648 if (!isDIGIT(*s))
a687059c 5649 return FALSE;
f0fcb552 5650 while (isDIGIT(*s))
a687059c
LW
5651 s++;
5652 if (*s == ',')
5653 s++;
f0fcb552 5654 while (isDIGIT(*s))
a687059c
LW
5655 s++;
5656 if (*s != '}')
5657 return FALSE;
5658 return TRUE;
5659}
5660
a687059c
LW
5661
5662/*
fd181c75 5663 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
a687059c
LW
5664 */
5665void
864dbfa3 5666Perl_regdump(pTHX_ regexp *r)
a687059c 5667{
35ff7856 5668#ifdef DEBUGGING
c445ea15 5669 SV * const sv = sv_newmortal();
a687059c 5670
c277df42 5671 (void)dumpuntil(r->program, r->program + 1, NULL, sv, 0);
a0d0e21e
LW
5672
5673 /* Header fields of interest. */
c277df42 5674 if (r->anchored_substr)
7b0972df 5675 PerlIO_printf(Perl_debug_log,
a0288114 5676 "anchored \"%s%.*s%s\"%s at %"IVdf" ",
3280af22 5677 PL_colors[0],
7b0972df 5678 (int)(SvCUR(r->anchored_substr) - (SvTAIL(r->anchored_substr)!=0)),
3f7c398e 5679 SvPVX_const(r->anchored_substr),
3280af22 5680 PL_colors[1],
c277df42 5681 SvTAIL(r->anchored_substr) ? "$" : "",
7b0972df 5682 (IV)r->anchored_offset);
33b8afdf
JH
5683 else if (r->anchored_utf8)
5684 PerlIO_printf(Perl_debug_log,
a0288114 5685 "anchored utf8 \"%s%.*s%s\"%s at %"IVdf" ",
33b8afdf
JH
5686 PL_colors[0],
5687 (int)(SvCUR(r->anchored_utf8) - (SvTAIL(r->anchored_utf8)!=0)),
3f7c398e 5688 SvPVX_const(r->anchored_utf8),
33b8afdf
JH
5689 PL_colors[1],
5690 SvTAIL(r->anchored_utf8) ? "$" : "",
5691 (IV)r->anchored_offset);
c277df42 5692 if (r->float_substr)
7b0972df 5693 PerlIO_printf(Perl_debug_log,
a0288114 5694 "floating \"%s%.*s%s\"%s at %"IVdf"..%"UVuf" ",
3280af22 5695 PL_colors[0],
b81d288d 5696 (int)(SvCUR(r->float_substr) - (SvTAIL(r->float_substr)!=0)),
3f7c398e 5697 SvPVX_const(r->float_substr),
3280af22 5698 PL_colors[1],
c277df42 5699 SvTAIL(r->float_substr) ? "$" : "",
7b0972df 5700 (IV)r->float_min_offset, (UV)r->float_max_offset);
33b8afdf
JH
5701 else if (r->float_utf8)
5702 PerlIO_printf(Perl_debug_log,
a0288114 5703 "floating utf8 \"%s%.*s%s\"%s at %"IVdf"..%"UVuf" ",
33b8afdf
JH
5704 PL_colors[0],
5705 (int)(SvCUR(r->float_utf8) - (SvTAIL(r->float_utf8)!=0)),
3f7c398e 5706 SvPVX_const(r->float_utf8),
33b8afdf
JH
5707 PL_colors[1],
5708 SvTAIL(r->float_utf8) ? "$" : "",
5709 (IV)r->float_min_offset, (UV)r->float_max_offset);
5710 if (r->check_substr || r->check_utf8)
b81d288d
AB
5711 PerlIO_printf(Perl_debug_log,
5712 r->check_substr == r->float_substr
33b8afdf 5713 && r->check_utf8 == r->float_utf8
c277df42
IZ
5714 ? "(checking floating" : "(checking anchored");
5715 if (r->reganch & ROPT_NOSCAN)
5716 PerlIO_printf(Perl_debug_log, " noscan");
5717 if (r->reganch & ROPT_CHECK_ALL)
5718 PerlIO_printf(Perl_debug_log, " isall");
33b8afdf 5719 if (r->check_substr || r->check_utf8)
c277df42
IZ
5720 PerlIO_printf(Perl_debug_log, ") ");
5721
46fc3d4c 5722 if (r->regstclass) {
5723 regprop(sv, r->regstclass);
3f7c398e 5724 PerlIO_printf(Perl_debug_log, "stclass \"%s\" ", SvPVX_const(sv));
46fc3d4c 5725 }
774d564b 5726 if (r->reganch & ROPT_ANCH) {
5727 PerlIO_printf(Perl_debug_log, "anchored");
5728 if (r->reganch & ROPT_ANCH_BOL)
5729 PerlIO_printf(Perl_debug_log, "(BOL)");
c277df42
IZ
5730 if (r->reganch & ROPT_ANCH_MBOL)
5731 PerlIO_printf(Perl_debug_log, "(MBOL)");
cad2e5aa
JH
5732 if (r->reganch & ROPT_ANCH_SBOL)
5733 PerlIO_printf(Perl_debug_log, "(SBOL)");
774d564b 5734 if (r->reganch & ROPT_ANCH_GPOS)
5735 PerlIO_printf(Perl_debug_log, "(GPOS)");
5736 PerlIO_putc(Perl_debug_log, ' ');
5737 }
c277df42
IZ
5738 if (r->reganch & ROPT_GPOS_SEEN)
5739 PerlIO_printf(Perl_debug_log, "GPOS ");
a0d0e21e 5740 if (r->reganch & ROPT_SKIP)
760ac839 5741 PerlIO_printf(Perl_debug_log, "plus ");
a0d0e21e 5742 if (r->reganch & ROPT_IMPLICIT)
760ac839 5743 PerlIO_printf(Perl_debug_log, "implicit ");
760ac839 5744 PerlIO_printf(Perl_debug_log, "minlen %ld ", (long) r->minlen);
ce862d02
IZ
5745 if (r->reganch & ROPT_EVAL_SEEN)
5746 PerlIO_printf(Perl_debug_log, "with eval ");
760ac839 5747 PerlIO_printf(Perl_debug_log, "\n");
fac92740 5748 if (r->offsets) {
e4584336 5749 const U32 len = r->offsets[0];
a3621e74
YO
5750 GET_RE_DEBUG_FLAGS_DECL;
5751 DEBUG_OFFSETS_r({
1df70142 5752 U32 i;
e4584336
RB
5753 PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)r->offsets[0]);
5754 for (i = 1; i <= len; i++)
5755 PerlIO_printf(Perl_debug_log, "%"UVuf"[%"UVuf"] ",
5756 (UV)r->offsets[i*2-1], (UV)r->offsets[i*2]);
5757 PerlIO_printf(Perl_debug_log, "\n");
a3621e74 5758 });
fac92740 5759 }
65e66c80
SP
5760#else
5761 PERL_UNUSED_ARG(r);
17c3b450 5762#endif /* DEBUGGING */
a687059c
LW
5763}
5764
5765/*
a0d0e21e
LW
5766- regprop - printable representation of opcode
5767*/
46fc3d4c 5768void
a3b680e6 5769Perl_regprop(pTHX_ SV *sv, const regnode *o)
a687059c 5770{
35ff7856 5771#ifdef DEBUGGING
9b155405 5772 register int k;
a0d0e21e 5773
54dc92de 5774 sv_setpvn(sv, "", 0);
9b155405 5775 if (OP(o) >= reg_num) /* regnode.type is unsigned */
830247a4
IZ
5776 /* It would be nice to FAIL() here, but this may be called from
5777 regexec.c, and it would be hard to supply pRExC_state. */
5778 Perl_croak(aTHX_ "Corrupted regexp opcode");
bfed75c6 5779 sv_catpv(sv, reg_name[OP(o)]); /* Take off const! */
9b155405
IZ
5780
5781 k = PL_regkind[(U8)OP(o)];
5782
2a782b5b 5783 if (k == EXACT) {
396482e1 5784 SV * const dsv = sv_2mortal(newSVpvs(""));
c728cb41
JH
5785 /* Using is_utf8_string() is a crude hack but it may
5786 * be the best for now since we have no flag "this EXACTish
5787 * node was UTF-8" --jhi */
1df70142 5788 const bool do_utf8 = is_utf8_string((U8*)STRING(o), STR_LEN(o));
2d03de9c 5789 const char * const s = do_utf8 ?
c728cb41
JH
5790 pv_uni_display(dsv, (U8*)STRING(o), STR_LEN(o), 60,
5791 UNI_DISPLAY_REGEX) :
2a782b5b 5792 STRING(o);
e1ec3a88 5793 const int len = do_utf8 ?
2a782b5b
JH
5794 strlen(s) :
5795 STR_LEN(o);
5796 Perl_sv_catpvf(aTHX_ sv, " <%s%.*s%s>",
5797 PL_colors[0],
5798 len, s,
5799 PL_colors[1]);
a3621e74
YO
5800 } else if (k == TRIE) {/*
5801 this isn't always safe, as Pl_regdata may not be for this regex yet
5802 (depending on where its called from) so its being moved to dumpuntil
5803 I32 n = ARG(o);
5804 reg_trie_data *trie=(reg_trie_data*)PL_regdata->data[n];
5805 Perl_sv_catpvf(aTHX_ sv, " (W:%d L:%d C:%d S:%d)",
5806 trie->wordcount,
5807 trie->charcount,
5808 trie->uniquecharcount,
5809 trie->laststate);
5810 */
5811 } else if (k == CURLY) {
cb434fcc 5812 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
cea2e8a9
GS
5813 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
5814 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
a0d0e21e 5815 }
2c2d71f5
JH
5816 else if (k == WHILEM && o->flags) /* Ordinal/of */
5817 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
9b155405 5818 else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP )
894356b3 5819 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
9b155405 5820 else if (k == LOGICAL)
04ebc1ab 5821 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
653099ff
GS
5822 else if (k == ANYOF) {
5823 int i, rangestart = -1;
2d03de9c 5824 const U8 flags = ANYOF_FLAGS(o);
0bd48802
AL
5825
5826 /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
5827 static const char * const anyofs[] = {
653099ff
GS
5828 "\\w",
5829 "\\W",
5830 "\\s",
5831 "\\S",
5832 "\\d",
5833 "\\D",
5834 "[:alnum:]",
5835 "[:^alnum:]",
5836 "[:alpha:]",
5837 "[:^alpha:]",
5838 "[:ascii:]",
5839 "[:^ascii:]",
5840 "[:ctrl:]",
5841 "[:^ctrl:]",
5842 "[:graph:]",
5843 "[:^graph:]",
5844 "[:lower:]",
5845 "[:^lower:]",
5846 "[:print:]",
5847 "[:^print:]",
5848 "[:punct:]",
5849 "[:^punct:]",
5850 "[:upper:]",
aaa51d5e 5851 "[:^upper:]",
653099ff 5852 "[:xdigit:]",
aaa51d5e
JF
5853 "[:^xdigit:]",
5854 "[:space:]",
5855 "[:^space:]",
5856 "[:blank:]",
5857 "[:^blank:]"
653099ff
GS
5858 };
5859
19860706 5860 if (flags & ANYOF_LOCALE)
396482e1 5861 sv_catpvs(sv, "{loc}");
19860706 5862 if (flags & ANYOF_FOLD)
396482e1 5863 sv_catpvs(sv, "{i}");
653099ff 5864 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
19860706 5865 if (flags & ANYOF_INVERT)
396482e1 5866 sv_catpvs(sv, "^");
ffc61ed2
JH
5867 for (i = 0; i <= 256; i++) {
5868 if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
5869 if (rangestart == -1)
5870 rangestart = i;
5871 } else if (rangestart != -1) {
5872 if (i <= rangestart + 3)
5873 for (; rangestart < i; rangestart++)
653099ff 5874 put_byte(sv, rangestart);
ffc61ed2
JH
5875 else {
5876 put_byte(sv, rangestart);
396482e1 5877 sv_catpvs(sv, "-");
ffc61ed2 5878 put_byte(sv, i - 1);
653099ff 5879 }
ffc61ed2 5880 rangestart = -1;
653099ff 5881 }
847a199f 5882 }
ffc61ed2
JH
5883
5884 if (o->flags & ANYOF_CLASS)
5885 for (i = 0; i < sizeof(anyofs)/sizeof(char*); i++)
5886 if (ANYOF_CLASS_TEST(o,i))
5887 sv_catpv(sv, anyofs[i]);
5888
5889 if (flags & ANYOF_UNICODE)
396482e1 5890 sv_catpvs(sv, "{unicode}");
1aa99e6b 5891 else if (flags & ANYOF_UNICODE_ALL)
396482e1 5892 sv_catpvs(sv, "{unicode_all}");
ffc61ed2
JH
5893
5894 {
5895 SV *lv;
2d03de9c 5896 SV * const sw = regclass_swash(o, FALSE, &lv, 0);
b81d288d 5897
ffc61ed2
JH
5898 if (lv) {
5899 if (sw) {
89ebb4a3 5900 U8 s[UTF8_MAXBYTES_CASE+1];
b81d288d 5901
ffc61ed2 5902 for (i = 0; i <= 256; i++) { /* just the first 256 */
1df70142 5903 uvchr_to_utf8(s, i);
ffc61ed2 5904
3568d838 5905 if (i < 256 && swash_fetch(sw, s, TRUE)) {
ffc61ed2
JH
5906 if (rangestart == -1)
5907 rangestart = i;
5908 } else if (rangestart != -1) {
ffc61ed2
JH
5909 if (i <= rangestart + 3)
5910 for (; rangestart < i; rangestart++) {
2d03de9c
AL
5911 const U8 * const e = uvchr_to_utf8(s,rangestart);
5912 U8 *p;
5913 for(p = s; p < e; p++)
ffc61ed2
JH
5914 put_byte(sv, *p);
5915 }
5916 else {
2d03de9c
AL
5917 const U8 *e = uvchr_to_utf8(s,rangestart);
5918 U8 *p;
5919 for (p = s; p < e; p++)
ffc61ed2 5920 put_byte(sv, *p);
396482e1 5921 sv_catpvs(sv, "-");
2d03de9c
AL
5922 e = uvchr_to_utf8(s, i-1);
5923 for (p = s; p < e; p++)
1df70142 5924 put_byte(sv, *p);
ffc61ed2
JH
5925 }
5926 rangestart = -1;
5927 }
19860706 5928 }
ffc61ed2 5929
396482e1 5930 sv_catpvs(sv, "..."); /* et cetera */
19860706 5931 }
fde631ed 5932
ffc61ed2 5933 {
2e0de35c 5934 char *s = savesvpv(lv);
c445ea15 5935 char * const origs = s;
b81d288d 5936
ffc61ed2 5937 while(*s && *s != '\n') s++;
b81d288d 5938
ffc61ed2 5939 if (*s == '\n') {
2d03de9c 5940 const char * const t = ++s;
ffc61ed2
JH
5941
5942 while (*s) {
5943 if (*s == '\n')
5944 *s = ' ';
5945 s++;
5946 }
5947 if (s[-1] == ' ')
5948 s[-1] = 0;
5949
5950 sv_catpv(sv, t);
fde631ed 5951 }
b81d288d 5952
ffc61ed2 5953 Safefree(origs);
fde631ed
JH
5954 }
5955 }
653099ff 5956 }
ffc61ed2 5957
653099ff
GS
5958 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
5959 }
9b155405 5960 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
cea2e8a9 5961 Perl_sv_catpvf(aTHX_ sv, "[-%d]", o->flags);
65e66c80
SP
5962#else
5963 PERL_UNUSED_ARG(sv);
5964 PERL_UNUSED_ARG(o);
17c3b450 5965#endif /* DEBUGGING */
35ff7856 5966}
a687059c 5967
cad2e5aa
JH
5968SV *
5969Perl_re_intuit_string(pTHX_ regexp *prog)
5970{ /* Assume that RE_INTUIT is set */
a3621e74
YO
5971 GET_RE_DEBUG_FLAGS_DECL;
5972 DEBUG_COMPILE_r(
cfd0369c 5973 {
2d03de9c 5974 const char * const s = SvPV_nolen_const(prog->check_substr
cfd0369c 5975 ? prog->check_substr : prog->check_utf8);
cad2e5aa
JH
5976
5977 if (!PL_colorset) reginitcolors();
5978 PerlIO_printf(Perl_debug_log,
a0288114 5979 "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
33b8afdf
JH
5980 PL_colors[4],
5981 prog->check_substr ? "" : "utf8 ",
5982 PL_colors[5],PL_colors[0],
cad2e5aa
JH
5983 s,
5984 PL_colors[1],
5985 (strlen(s) > 60 ? "..." : ""));
5986 } );
5987
33b8afdf 5988 return prog->check_substr ? prog->check_substr : prog->check_utf8;
cad2e5aa
JH
5989}
5990
2b69d0c2 5991void
864dbfa3 5992Perl_pregfree(pTHX_ struct regexp *r)
a687059c 5993{
27da23d5 5994 dVAR;
9e55ce06 5995#ifdef DEBUGGING
c445ea15
AL
5996 SV * const dsv = PERL_DEBUG_PAD_ZERO(0);
5997 SV * const re_debug_flags=get_sv(RE_DEBUG_FLAGS,0);
9e55ce06 5998#endif
7821416a 5999
a3621e74 6000
7821416a
IZ
6001 if (!r || (--r->refcnt > 0))
6002 return;
a3621e74 6003 DEBUG_r(if (re_debug_flags && (SvIV(re_debug_flags) & RE_DEBUG_COMPILE)) {
c445ea15 6004 const char * const s = (r->reganch & ROPT_UTF8)
e1ec3a88 6005 ? pv_uni_display(dsv, (U8*)r->precomp, r->prelen, 60, UNI_DISPLAY_REGEX)
9f369894 6006 : pv_display(dsv, r->precomp, r->prelen, 0, 60);
e1ec3a88 6007 const int len = SvCUR(dsv);
9e55ce06
JH
6008 if (!PL_colorset)
6009 reginitcolors();
6010 PerlIO_printf(Perl_debug_log,
a3621e74 6011 "%sFreeing REx:%s %s%*.*s%s%s\n",
9e55ce06
JH
6012 PL_colors[4],PL_colors[5],PL_colors[0],
6013 len, len, s,
6014 PL_colors[1],
6015 len > 60 ? "..." : "");
6016 });
cad2e5aa 6017
43c5f42d
NC
6018 /* gcov results gave these as non-null 100% of the time, so there's no
6019 optimisation in checking them before calling Safefree */
6020 Safefree(r->precomp);
6021 Safefree(r->offsets); /* 20010421 MJD */
ed252734 6022 RX_MATCH_COPY_FREE(r);
f8c7b90f 6023#ifdef PERL_OLD_COPY_ON_WRITE
ed252734
NC
6024 if (r->saved_copy)
6025 SvREFCNT_dec(r->saved_copy);
6026#endif
a193d654
GS
6027 if (r->substrs) {
6028 if (r->anchored_substr)
6029 SvREFCNT_dec(r->anchored_substr);
33b8afdf
JH
6030 if (r->anchored_utf8)
6031 SvREFCNT_dec(r->anchored_utf8);
a193d654
GS
6032 if (r->float_substr)
6033 SvREFCNT_dec(r->float_substr);
33b8afdf
JH
6034 if (r->float_utf8)
6035 SvREFCNT_dec(r->float_utf8);
2779dcf1 6036 Safefree(r->substrs);
a193d654 6037 }
c277df42
IZ
6038 if (r->data) {
6039 int n = r->data->count;
f3548bdc
DM
6040 PAD* new_comppad = NULL;
6041 PAD* old_comppad;
4026c95a 6042 PADOFFSET refcnt;
dfad63ad 6043
c277df42 6044 while (--n >= 0) {
261faec3 6045 /* If you add a ->what type here, update the comment in regcomp.h */
c277df42
IZ
6046 switch (r->data->what[n]) {
6047 case 's':
6048 SvREFCNT_dec((SV*)r->data->data[n]);
6049 break;
653099ff
GS
6050 case 'f':
6051 Safefree(r->data->data[n]);
6052 break;
dfad63ad
HS
6053 case 'p':
6054 new_comppad = (AV*)r->data->data[n];
6055 break;
c277df42 6056 case 'o':
dfad63ad 6057 if (new_comppad == NULL)
cea2e8a9 6058 Perl_croak(aTHX_ "panic: pregfree comppad");
f3548bdc
DM
6059 PAD_SAVE_LOCAL(old_comppad,
6060 /* Watch out for global destruction's random ordering. */
c445ea15 6061 (SvTYPE(new_comppad) == SVt_PVAV) ? new_comppad : NULL
f3548bdc 6062 );
b34c0dd4 6063 OP_REFCNT_LOCK;
4026c95a
SH
6064 refcnt = OpREFCNT_dec((OP_4tree*)r->data->data[n]);
6065 OP_REFCNT_UNLOCK;
6066 if (!refcnt)
9b978d73 6067 op_free((OP_4tree*)r->data->data[n]);
9b978d73 6068
f3548bdc 6069 PAD_RESTORE_LOCAL(old_comppad);
dfad63ad
HS
6070 SvREFCNT_dec((SV*)new_comppad);
6071 new_comppad = NULL;
c277df42
IZ
6072 break;
6073 case 'n':
9e55ce06 6074 break;
a3621e74
YO
6075 case 't':
6076 {
c445ea15 6077 reg_trie_data * const trie=(reg_trie_data*)r->data->data[n];
a3621e74
YO
6078 U32 refcount;
6079 OP_REFCNT_LOCK;
e27afef8 6080 refcount = --trie->refcount;
a3621e74
YO
6081 OP_REFCNT_UNLOCK;
6082 if ( !refcount ) {
43c5f42d 6083 Safefree(trie->charmap);
a3621e74
YO
6084 if (trie->widecharmap)
6085 SvREFCNT_dec((SV*)trie->widecharmap);
43c5f42d
NC
6086 Safefree(trie->states);
6087 Safefree(trie->trans);
a3621e74
YO
6088#ifdef DEBUGGING
6089 if (trie->words)
6090 SvREFCNT_dec((SV*)trie->words);
6091 if (trie->revcharmap)
6092 SvREFCNT_dec((SV*)trie->revcharmap);
6093#endif
6094 Safefree(r->data->data[n]); /* do this last!!!! */
6095 }
6096 break;
6097 }
c277df42 6098 default:
830247a4 6099 Perl_croak(aTHX_ "panic: regfree data code '%c'", r->data->what[n]);
c277df42
IZ
6100 }
6101 }
6102 Safefree(r->data->what);
6103 Safefree(r->data);
a0d0e21e
LW
6104 }
6105 Safefree(r->startp);
6106 Safefree(r->endp);
6107 Safefree(r);
a687059c 6108}
c277df42
IZ
6109
6110/*
6111 - regnext - dig the "next" pointer out of a node
c277df42
IZ
6112 */
6113regnode *
864dbfa3 6114Perl_regnext(pTHX_ register regnode *p)
c277df42
IZ
6115{
6116 register I32 offset;
6117
3280af22 6118 if (p == &PL_regdummy)
c277df42
IZ
6119 return(NULL);
6120
6121 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
6122 if (offset == 0)
6123 return(NULL);
6124
c277df42 6125 return(p+offset);
c277df42
IZ
6126}
6127
01f988be 6128STATIC void
cea2e8a9 6129S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
c277df42
IZ
6130{
6131 va_list args;
6132 STRLEN l1 = strlen(pat1);
6133 STRLEN l2 = strlen(pat2);
6134 char buf[512];
06bf62c7 6135 SV *msv;
73d840c0 6136 const char *message;
c277df42
IZ
6137
6138 if (l1 > 510)
6139 l1 = 510;
6140 if (l1 + l2 > 510)
6141 l2 = 510 - l1;
6142 Copy(pat1, buf, l1 , char);
6143 Copy(pat2, buf + l1, l2 , char);
3b818b81
GS
6144 buf[l1 + l2] = '\n';
6145 buf[l1 + l2 + 1] = '\0';
8736538c
AS
6146#ifdef I_STDARG
6147 /* ANSI variant takes additional second argument */
c277df42 6148 va_start(args, pat2);
8736538c
AS
6149#else
6150 va_start(args);
6151#endif
5a844595 6152 msv = vmess(buf, &args);
c277df42 6153 va_end(args);
cfd0369c 6154 message = SvPV_const(msv,l1);
c277df42
IZ
6155 if (l1 > 512)
6156 l1 = 512;
6157 Copy(message, buf, l1 , char);
197cf9b9 6158 buf[l1-1] = '\0'; /* Overwrite \n */
cea2e8a9 6159 Perl_croak(aTHX_ "%s", buf);
c277df42 6160}
a0ed51b3
LW
6161
6162/* XXX Here's a total kludge. But we need to re-enter for swash routines. */
6163
6164void
864dbfa3 6165Perl_save_re_context(pTHX)
b81d288d 6166{
830247a4 6167 SAVEI32(PL_reg_flags); /* from regexec.c */
a0ed51b3 6168 SAVEPPTR(PL_bostr);
a0ed51b3
LW
6169 SAVEPPTR(PL_reginput); /* String-input pointer. */
6170 SAVEPPTR(PL_regbol); /* Beginning of input, for ^ check. */
6171 SAVEPPTR(PL_regeol); /* End of input, for $ check. */
7766f137
GS
6172 SAVEVPTR(PL_regstartp); /* Pointer to startp array. */
6173 SAVEVPTR(PL_regendp); /* Ditto for endp. */
6174 SAVEVPTR(PL_reglastparen); /* Similarly for lastparen. */
a5db57d6 6175 SAVEVPTR(PL_reglastcloseparen); /* Similarly for lastcloseparen. */
a0ed51b3 6176 SAVEPPTR(PL_regtill); /* How far we are required to go. */
b81d288d 6177 SAVEGENERICPV(PL_reg_start_tmp); /* from regexec.c */
a0ed51b3 6178 PL_reg_start_tmp = 0;
a0ed51b3
LW
6179 SAVEI32(PL_reg_start_tmpl); /* from regexec.c */
6180 PL_reg_start_tmpl = 0;
7766f137 6181 SAVEVPTR(PL_regdata);
a0ed51b3
LW
6182 SAVEI32(PL_reg_eval_set); /* from regexec.c */
6183 SAVEI32(PL_regnarrate); /* from regexec.c */
7766f137 6184 SAVEVPTR(PL_regprogram); /* from regexec.c */
a0ed51b3 6185 SAVEINT(PL_regindent); /* from regexec.c */
7766f137
GS
6186 SAVEVPTR(PL_regcc); /* from regexec.c */
6187 SAVEVPTR(PL_curcop);
7766f137
GS
6188 SAVEVPTR(PL_reg_call_cc); /* from regexec.c */
6189 SAVEVPTR(PL_reg_re); /* from regexec.c */
54b6e2fa
IZ
6190 SAVEPPTR(PL_reg_ganch); /* from regexec.c */
6191 SAVESPTR(PL_reg_sv); /* from regexec.c */
9febdf04 6192 SAVEBOOL(PL_reg_match_utf8); /* from regexec.c */
7766f137 6193 SAVEVPTR(PL_reg_magic); /* from regexec.c */
54b6e2fa 6194 SAVEI32(PL_reg_oldpos); /* from regexec.c */
7766f137
GS
6195 SAVEVPTR(PL_reg_oldcurpm); /* from regexec.c */
6196 SAVEVPTR(PL_reg_curpm); /* from regexec.c */
a5db57d6 6197 SAVEPPTR(PL_reg_oldsaved); /* old saved substr during match */
c445ea15 6198 PL_reg_oldsaved = NULL;
a5db57d6
GS
6199 SAVEI32(PL_reg_oldsavedlen); /* old length of saved substr during match */
6200 PL_reg_oldsavedlen = 0;
f8c7b90f 6201#ifdef PERL_OLD_COPY_ON_WRITE
ed252734 6202 SAVESPTR(PL_nrs);
c445ea15 6203 PL_nrs = NULL;
ed252734 6204#endif
a5db57d6
GS
6205 SAVEI32(PL_reg_maxiter); /* max wait until caching pos */
6206 PL_reg_maxiter = 0;
6207 SAVEI32(PL_reg_leftiter); /* wait until caching pos */
6208 PL_reg_leftiter = 0;
6209 SAVEGENERICPV(PL_reg_poscache); /* cache of pos of WHILEM */
c445ea15 6210 PL_reg_poscache = NULL;
a5db57d6
GS
6211 SAVEI32(PL_reg_poscache_size); /* size of pos cache of WHILEM */
6212 PL_reg_poscache_size = 0;
6213 SAVEPPTR(PL_regprecomp); /* uncompiled string. */
5fb7366e 6214 SAVEI32(PL_regnpar); /* () count. */
e49a9654 6215 SAVEI32(PL_regsize); /* from regexec.c */
ada6e8a9 6216
c445ea15
AL
6217 /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
6218 if (PL_curpm) {
6219 const REGEXP * const rx = PM_GETRE(PL_curpm);
6220 if (rx) {
1df70142 6221 U32 i;
ada6e8a9 6222 for (i = 1; i <= rx->nparens; i++) {
1df70142 6223 char digits[TYPE_CHARS(long)];
e5105eda 6224 const STRLEN len = my_sprintf(digits, "%lu", (long)i);
c445ea15
AL
6225 GV * const mgv = gv_fetchpvn_flags(digits, len, 0, SVt_PV);
6226 if (mgv)
ada6e8a9
AMS
6227 save_scalar(mgv);
6228 }
6229 }
6230 }
6231
54b6e2fa 6232#ifdef DEBUGGING
b81d288d 6233 SAVEPPTR(PL_reg_starttry); /* from regexec.c */
54b6e2fa 6234#endif
a0ed51b3 6235}
51371543 6236
51371543 6237static void
acfe0abc 6238clear_re(pTHX_ void *r)
51371543
GS
6239{
6240 ReREFCNT_dec((regexp *)r);
6241}
ffbc6a93 6242
a28509cc
AL
6243#ifdef DEBUGGING
6244
6245STATIC void
6246S_put_byte(pTHX_ SV *sv, int c)
6247{
6248 if (isCNTRL(c) || c == 255 || !isPRINT(c))
6249 Perl_sv_catpvf(aTHX_ sv, "\\%o", c);
6250 else if (c == '-' || c == ']' || c == '\\' || c == '^')
6251 Perl_sv_catpvf(aTHX_ sv, "\\%c", c);
6252 else
6253 Perl_sv_catpvf(aTHX_ sv, "%c", c);
6254}
6255
6256
6257STATIC regnode *
6258S_dumpuntil(pTHX_ regnode *start, regnode *node, regnode *last, SV* sv, I32 l)
6259{
6260 register U8 op = EXACT; /* Arbitrary non-END op. */
6261 register regnode *next;
6262
6263 while (op != END && (!last || node < last)) {
6264 /* While that wasn't END last time... */
6265
6266 NODE_ALIGN(node);
6267 op = OP(node);
6268 if (op == CLOSE)
6269 l--;
6270 next = regnext(node);
6271 /* Where, what. */
6272 if (OP(node) == OPTIMIZED)
6273 goto after_print;
6274 regprop(sv, node);
6275 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
6276 (int)(2*l + 1), "", SvPVX_const(sv));
6277 if (next == NULL) /* Next ptr. */
6278 PerlIO_printf(Perl_debug_log, "(0)");
6279 else
6280 PerlIO_printf(Perl_debug_log, "(%"IVdf")", (IV)(next - start));
6281 (void)PerlIO_putc(Perl_debug_log, '\n');
6282 after_print:
6283 if (PL_regkind[(U8)op] == BRANCHJ) {
6284 register regnode *nnode = (OP(next) == LONGJMP
6285 ? regnext(next)
6286 : next);
6287 if (last && nnode > last)
6288 nnode = last;
6289 node = dumpuntil(start, NEXTOPER(NEXTOPER(node)), nnode, sv, l + 1);
6290 }
6291 else if (PL_regkind[(U8)op] == BRANCH) {
6292 node = dumpuntil(start, NEXTOPER(node), next, sv, l + 1);
6293 }
6294 else if ( PL_regkind[(U8)op] == TRIE ) {
6295 const I32 n = ARG(node);
6296 const reg_trie_data * const trie = (reg_trie_data*)PL_regdata->data[n];
6297 const I32 arry_len = av_len(trie->words)+1;
6298 I32 word_idx;
6299 PerlIO_printf(Perl_debug_log,
6300 "%*s[Words:%d Chars Stored:%d Unique Chars:%d States:%"IVdf"%s]\n",
6301 (int)(2*(l+3)),
6302 "",
6303 trie->wordcount,
6304 (int)trie->charcount,
6305 trie->uniquecharcount,
6306 (IV)trie->laststate-1,
6307 node->flags ? " EVAL mode" : "");
6308
6309 for (word_idx=0; word_idx < arry_len; word_idx++) {
6310 SV **elem_ptr=av_fetch(trie->words,word_idx,0);
6311 if (elem_ptr) {
6312 PerlIO_printf(Perl_debug_log, "%*s<%s%s%s>\n",
6313 (int)(2*(l+4)), "",
6314 PL_colors[0],
cfd0369c 6315 SvPV_nolen_const(*elem_ptr),
a28509cc
AL
6316 PL_colors[1]
6317 );
6318 /*
6319 if (next == NULL)
6320 PerlIO_printf(Perl_debug_log, "(0)\n");
6321 else
6322 PerlIO_printf(Perl_debug_log, "(%"IVdf")\n", (IV)(next - start));
6323 */
6324 }
6325
6326 }
6327
6328 node = NEXTOPER(node);
6329 node += regarglen[(U8)op];
6330
6331 }
6332 else if ( op == CURLY) { /* "next" might be very big: optimizer */
6333 node = dumpuntil(start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
6334 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1, sv, l + 1);
6335 }
6336 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
6337 node = dumpuntil(start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
6338 next, sv, l + 1);
6339 }
6340 else if ( op == PLUS || op == STAR) {
6341 node = dumpuntil(start, NEXTOPER(node), NEXTOPER(node) + 1, sv, l + 1);
6342 }
6343 else if (op == ANYOF) {
6344 /* arglen 1 + class block */
6345 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_LARGE)
6346 ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
6347 node = NEXTOPER(node);
6348 }
6349 else if (PL_regkind[(U8)op] == EXACT) {
6350 /* Literal string, where present. */
6351 node += NODE_SZ_STR(node) - 1;
6352 node = NEXTOPER(node);
6353 }
6354 else {
6355 node = NEXTOPER(node);
6356 node += regarglen[(U8)op];
6357 }
6358 if (op == CURLYX || op == OPEN)
6359 l++;
6360 else if (op == WHILEM)
6361 l--;
6362 }
6363 return node;
6364}
6365
6366#endif /* DEBUGGING */
6367
241d1a3b
NC
6368/*
6369 * Local variables:
6370 * c-indentation-style: bsd
6371 * c-basic-offset: 4
6372 * indent-tabs-mode: t
6373 * End:
6374 *
37442d52
RGS
6375 * ex: set ts=8 sts=4 sw=4 noet:
6376 */