This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Correct the macros in the comments in hv.c. Given the improvements in
[perl5.git] / regcomp.c
CommitLineData
a0d0e21e
LW
1/* regcomp.c
2 */
3
4/*
5 * "A fair jaw-cracker dwarf-language must be." --Samwise Gamgee
6 */
7
61296642
DM
8/* This file contains functions for compiling a regular expression. See
9 * also regexec.c which funnily enough, contains functions for executing
166f8a29 10 * a regular expression.
e4a054ea
DM
11 *
12 * This file is also copied at build time to ext/re/re_comp.c, where
13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14 * This causes the main functions to be compiled under new names and with
15 * debugging support added, which makes "use re 'debug'" work.
166f8a29
DM
16 */
17
a687059c
LW
18/* NOTE: this is derived from Henry Spencer's regexp code, and should not
19 * confused with the original package (see point 3 below). Thanks, Henry!
20 */
21
22/* Additional note: this code is very heavily munged from Henry's version
23 * in places. In some spots I've traded clarity for efficiency, so don't
24 * blame Henry for some of the lack of readability.
25 */
26
e50aee73
AD
27/* The names of the functions have been changed from regcomp and
28 * regexec to pregcomp and pregexec in order to avoid conflicts
29 * with the POSIX routines of the same names.
30*/
31
b9d5759e
AD
32#ifdef PERL_EXT_RE_BUILD
33/* need to replace pregcomp et al, so enable that */
34# ifndef PERL_IN_XSUB_RE
35# define PERL_IN_XSUB_RE
36# endif
37/* need access to debugger hooks */
cad2e5aa 38# if defined(PERL_EXT_RE_DEBUG) && !defined(DEBUGGING)
b9d5759e
AD
39# define DEBUGGING
40# endif
41#endif
42
43#ifdef PERL_IN_XSUB_RE
d06ea78c 44/* We *really* need to overwrite these symbols: */
56953603
IZ
45# define Perl_pregcomp my_regcomp
46# define Perl_regdump my_regdump
47# define Perl_regprop my_regprop
d06ea78c 48# define Perl_pregfree my_regfree
cad2e5aa
JH
49# define Perl_re_intuit_string my_re_intuit_string
50/* *These* symbols are masked to allow static link. */
d06ea78c 51# define Perl_regnext my_regnext
f0b8d043 52# define Perl_save_re_context my_save_re_context
b81d288d 53# define Perl_reginitcolors my_reginitcolors
c5be433b
GS
54
55# define PERL_NO_GET_CONTEXT
b81d288d 56#endif
56953603 57
a687059c 58/*
e50aee73 59 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
60 *
61 * Copyright (c) 1986 by University of Toronto.
62 * Written by Henry Spencer. Not derived from licensed software.
63 *
64 * Permission is granted to anyone to use this software for any
65 * purpose on any computer system, and to redistribute it freely,
66 * subject to the following restrictions:
67 *
68 * 1. The author is not responsible for the consequences of use of
69 * this software, no matter how awful, even if they arise
70 * from defects in it.
71 *
72 * 2. The origin of this software must not be misrepresented, either
73 * by explicit claim or by omission.
74 *
75 * 3. Altered versions must be plainly marked as such, and must not
76 * be misrepresented as being the original software.
77 *
78 *
79 **** Alterations to Henry's code are...
80 ****
4bb101f2 81 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
b94e2f88 82 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, by Larry Wall and others
a687059c 83 ****
9ef589d8
LW
84 **** You may distribute under the terms of either the GNU General Public
85 **** License or the Artistic License, as specified in the README file.
86
a687059c
LW
87 *
88 * Beware that some of this code is subtly aware of the way operator
89 * precedence is structured in regular expressions. Serious changes in
90 * regular-expression syntax might require a total rethink.
91 */
92#include "EXTERN.h"
864dbfa3 93#define PERL_IN_REGCOMP_C
a687059c 94#include "perl.h"
d06ea78c 95
acfe0abc 96#ifndef PERL_IN_XSUB_RE
d06ea78c
GS
97# include "INTERN.h"
98#endif
c277df42
IZ
99
100#define REG_COMP_C
a687059c
LW
101#include "regcomp.h"
102
d4cce5f1 103#ifdef op
11343788 104#undef op
d4cce5f1 105#endif /* op */
11343788 106
fe14fcc3 107#ifdef MSDOS
7e4e8c89 108# if defined(BUGGY_MSC6)
fe14fcc3 109 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
7e4e8c89 110# pragma optimize("a",off)
fe14fcc3 111 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
7e4e8c89
NC
112# pragma optimize("w",on )
113# endif /* BUGGY_MSC6 */
fe14fcc3
LW
114#endif /* MSDOS */
115
a687059c
LW
116#ifndef STATIC
117#define STATIC static
118#endif
119
830247a4 120typedef struct RExC_state_t {
e2509266 121 U32 flags; /* are we folding, multilining? */
830247a4
IZ
122 char *precomp; /* uncompiled string. */
123 regexp *rx;
fac92740 124 char *start; /* Start of input for compile */
830247a4
IZ
125 char *end; /* End of input for compile */
126 char *parse; /* Input-scan pointer. */
127 I32 whilem_seen; /* number of WHILEM in this expr */
fac92740 128 regnode *emit_start; /* Start of emitted-code area */
ffc61ed2 129 regnode *emit; /* Code-emit pointer; &regdummy = don't = compiling */
830247a4
IZ
130 I32 naughty; /* How bad is this pattern? */
131 I32 sawback; /* Did we see \1, ...? */
132 U32 seen;
133 I32 size; /* Code size. */
134 I32 npar; /* () count. */
135 I32 extralen;
136 I32 seen_zerolen;
137 I32 seen_evals;
1aa99e6b 138 I32 utf8;
830247a4
IZ
139#if ADD_TO_REGEXEC
140 char *starttry; /* -Dr: where regtry was called. */
141#define RExC_starttry (pRExC_state->starttry)
142#endif
143} RExC_state_t;
144
e2509266 145#define RExC_flags (pRExC_state->flags)
830247a4
IZ
146#define RExC_precomp (pRExC_state->precomp)
147#define RExC_rx (pRExC_state->rx)
fac92740 148#define RExC_start (pRExC_state->start)
830247a4
IZ
149#define RExC_end (pRExC_state->end)
150#define RExC_parse (pRExC_state->parse)
151#define RExC_whilem_seen (pRExC_state->whilem_seen)
fac92740 152#define RExC_offsets (pRExC_state->rx->offsets) /* I am not like the others */
830247a4 153#define RExC_emit (pRExC_state->emit)
fac92740 154#define RExC_emit_start (pRExC_state->emit_start)
830247a4
IZ
155#define RExC_naughty (pRExC_state->naughty)
156#define RExC_sawback (pRExC_state->sawback)
157#define RExC_seen (pRExC_state->seen)
158#define RExC_size (pRExC_state->size)
159#define RExC_npar (pRExC_state->npar)
160#define RExC_extralen (pRExC_state->extralen)
161#define RExC_seen_zerolen (pRExC_state->seen_zerolen)
162#define RExC_seen_evals (pRExC_state->seen_evals)
1aa99e6b 163#define RExC_utf8 (pRExC_state->utf8)
830247a4 164
a687059c
LW
165#define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
166#define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
167 ((*s) == '{' && regcurly(s)))
a687059c 168
35c8bce7
LW
169#ifdef SPSTART
170#undef SPSTART /* dratted cpp namespace... */
171#endif
a687059c
LW
172/*
173 * Flags to be passed up and down.
174 */
a687059c 175#define WORST 0 /* Worst case. */
821b33a5 176#define HASWIDTH 0x1 /* Known to match non-null strings. */
a0d0e21e
LW
177#define SIMPLE 0x2 /* Simple enough to be STAR/PLUS operand. */
178#define SPSTART 0x4 /* Starts with * or +. */
179#define TRYAGAIN 0x8 /* Weeded out a declaration. */
a687059c 180
2c2d71f5
JH
181/* Length of a variant. */
182
183typedef struct scan_data_t {
184 I32 len_min;
185 I32 len_delta;
186 I32 pos_min;
187 I32 pos_delta;
188 SV *last_found;
189 I32 last_end; /* min value, <0 unless valid. */
190 I32 last_start_min;
191 I32 last_start_max;
192 SV **longest; /* Either &l_fixed, or &l_float. */
193 SV *longest_fixed;
194 I32 offset_fixed;
195 SV *longest_float;
196 I32 offset_float_min;
197 I32 offset_float_max;
198 I32 flags;
199 I32 whilem_c;
cb434fcc 200 I32 *last_closep;
653099ff 201 struct regnode_charclass_class *start_class;
2c2d71f5
JH
202} scan_data_t;
203
a687059c 204/*
e50aee73 205 * Forward declarations for pregcomp()'s friends.
a687059c 206 */
a0d0e21e 207
27da23d5
JH
208static const scan_data_t zero_scan_data =
209 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
c277df42
IZ
210
211#define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
212#define SF_BEFORE_SEOL 0x1
213#define SF_BEFORE_MEOL 0x2
214#define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
215#define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
216
09b7f37c
CB
217#ifdef NO_UNARY_PLUS
218# define SF_FIX_SHIFT_EOL (0+2)
219# define SF_FL_SHIFT_EOL (0+4)
220#else
221# define SF_FIX_SHIFT_EOL (+2)
222# define SF_FL_SHIFT_EOL (+4)
223#endif
c277df42
IZ
224
225#define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
226#define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
227
228#define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
229#define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
230#define SF_IS_INF 0x40
231#define SF_HAS_PAR 0x80
232#define SF_IN_PAR 0x100
233#define SF_HAS_EVAL 0x200
4bfe0158 234#define SCF_DO_SUBSTR 0x400
653099ff
GS
235#define SCF_DO_STCLASS_AND 0x0800
236#define SCF_DO_STCLASS_OR 0x1000
237#define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
e1901655 238#define SCF_WHILEM_VISITED_POS 0x2000
c277df42 239
eb160463 240#define UTF (RExC_utf8 != 0)
e2509266
JH
241#define LOC ((RExC_flags & PMf_LOCALE) != 0)
242#define FOLD ((RExC_flags & PMf_FOLD) != 0)
a0ed51b3 243
ffc61ed2 244#define OOB_UNICODE 12345678
93733859 245#define OOB_NAMEDCLASS -1
b8c5462f 246
a0ed51b3
LW
247#define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
248#define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
249
8615cb43 250
b45f050a
JF
251/* length of regex to show in messages that don't mark a position within */
252#define RegexLengthToShowInErrorMessages 127
253
254/*
255 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
256 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
257 * op/pragma/warn/regcomp.
258 */
7253e4e3
RK
259#define MARKER1 "<-- HERE" /* marker as it appears in the description */
260#define MARKER2 " <-- HERE " /* marker as it appears within the regex */
b81d288d 261
7253e4e3 262#define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
b45f050a
JF
263
264/*
265 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
266 * arg. Show regex, up to a maximum length. If it's too long, chop and add
267 * "...".
268 */
ccb2c380 269#define FAIL(msg) STMT_START { \
bfed75c6 270 const char *ellipses = ""; \
ccb2c380
MP
271 IV len = RExC_end - RExC_precomp; \
272 \
273 if (!SIZE_ONLY) \
274 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
275 if (len > RegexLengthToShowInErrorMessages) { \
276 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
277 len = RegexLengthToShowInErrorMessages - 10; \
278 ellipses = "..."; \
279 } \
280 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
281 msg, (int)len, RExC_precomp, ellipses); \
282} STMT_END
8615cb43 283
b45f050a 284/*
b45f050a
JF
285 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
286 */
ccb2c380 287#define Simple_vFAIL(m) STMT_START { \
a28509cc 288 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
289 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
290 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
291} STMT_END
b45f050a
JF
292
293/*
294 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
295 */
ccb2c380
MP
296#define vFAIL(m) STMT_START { \
297 if (!SIZE_ONLY) \
298 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
299 Simple_vFAIL(m); \
300} STMT_END
b45f050a
JF
301
302/*
303 * Like Simple_vFAIL(), but accepts two arguments.
304 */
ccb2c380 305#define Simple_vFAIL2(m,a1) STMT_START { \
a28509cc 306 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
307 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
308 (int)offset, RExC_precomp, RExC_precomp + offset); \
309} STMT_END
b45f050a
JF
310
311/*
312 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
313 */
ccb2c380
MP
314#define vFAIL2(m,a1) STMT_START { \
315 if (!SIZE_ONLY) \
316 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
317 Simple_vFAIL2(m, a1); \
318} STMT_END
b45f050a
JF
319
320
321/*
322 * Like Simple_vFAIL(), but accepts three arguments.
323 */
ccb2c380 324#define Simple_vFAIL3(m, a1, a2) STMT_START { \
a28509cc 325 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
326 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
327 (int)offset, RExC_precomp, RExC_precomp + offset); \
328} STMT_END
b45f050a
JF
329
330/*
331 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
332 */
ccb2c380
MP
333#define vFAIL3(m,a1,a2) STMT_START { \
334 if (!SIZE_ONLY) \
335 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
336 Simple_vFAIL3(m, a1, a2); \
337} STMT_END
b45f050a
JF
338
339/*
340 * Like Simple_vFAIL(), but accepts four arguments.
341 */
ccb2c380 342#define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
a28509cc 343 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
344 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
345 (int)offset, RExC_precomp, RExC_precomp + offset); \
346} STMT_END
b45f050a 347
ccb2c380 348#define vWARN(loc,m) STMT_START { \
a28509cc 349 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
350 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \
351 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
352} STMT_END
353
354#define vWARNdep(loc,m) STMT_START { \
a28509cc 355 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
356 Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
357 "%s" REPORT_LOCATION, \
358 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
359} STMT_END
360
361
362#define vWARN2(loc, m, a1) STMT_START { \
a28509cc 363 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
364 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
365 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
366} STMT_END
367
368#define vWARN3(loc, m, a1, a2) STMT_START { \
a28509cc 369 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
370 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
371 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
372} STMT_END
373
374#define vWARN4(loc, m, a1, a2, a3) STMT_START { \
a28509cc 375 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
376 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
377 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
378} STMT_END
379
380#define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
a28509cc 381 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
382 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
383 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
384} STMT_END
9d1d55b5 385
8615cb43 386
cd439c50 387/* Allow for side effects in s */
ccb2c380
MP
388#define REGC(c,s) STMT_START { \
389 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
390} STMT_END
cd439c50 391
fac92740
MJD
392/* Macros for recording node offsets. 20001227 mjd@plover.com
393 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
394 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
395 * Element 0 holds the number n.
396 */
397
398#define MJD_OFFSET_DEBUG(x)
a3621e74 399/* #define MJD_OFFSET_DEBUG(x) DEBUG_r(Perl_warn_nocontext x) */
ccb2c380
MP
400
401
402#define Set_Node_Offset_To_R(node,byte) STMT_START { \
403 if (! SIZE_ONLY) { \
404 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
405 __LINE__, (node), (byte))); \
406 if((node) < 0) { \
551405c4 407 Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
ccb2c380
MP
408 } else { \
409 RExC_offsets[2*(node)-1] = (byte); \
410 } \
411 } \
412} STMT_END
413
414#define Set_Node_Offset(node,byte) \
415 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
416#define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
417
418#define Set_Node_Length_To_R(node,len) STMT_START { \
419 if (! SIZE_ONLY) { \
420 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
551405c4 421 __LINE__, (int)(node), (int)(len))); \
ccb2c380 422 if((node) < 0) { \
551405c4 423 Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
ccb2c380
MP
424 } else { \
425 RExC_offsets[2*(node)] = (len); \
426 } \
427 } \
428} STMT_END
429
430#define Set_Node_Length(node,len) \
431 Set_Node_Length_To_R((node)-RExC_emit_start, len)
432#define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
433#define Set_Node_Cur_Length(node) \
434 Set_Node_Length(node, RExC_parse - parse_start)
fac92740
MJD
435
436/* Get offsets and lengths */
437#define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
438#define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
439
acfe0abc 440static void clear_re(pTHX_ void *r);
4327152a 441
653099ff
GS
442/* Mark that we cannot extend a found fixed substring at this point.
443 Updata the longest found anchored substring and the longest found
444 floating substrings if needed. */
445
4327152a 446STATIC void
830247a4 447S_scan_commit(pTHX_ RExC_state_t *pRExC_state, scan_data_t *data)
c277df42 448{
e1ec3a88
AL
449 const STRLEN l = CHR_SVLEN(data->last_found);
450 const STRLEN old_l = CHR_SVLEN(*data->longest);
b81d288d 451
c277df42 452 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
6b43b216 453 SvSetMagicSV(*data->longest, data->last_found);
c277df42
IZ
454 if (*data->longest == data->longest_fixed) {
455 data->offset_fixed = l ? data->last_start_min : data->pos_min;
456 if (data->flags & SF_BEFORE_EOL)
b81d288d 457 data->flags
c277df42
IZ
458 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
459 else
460 data->flags &= ~SF_FIX_BEFORE_EOL;
a0ed51b3
LW
461 }
462 else {
c277df42 463 data->offset_float_min = l ? data->last_start_min : data->pos_min;
b81d288d
AB
464 data->offset_float_max = (l
465 ? data->last_start_max
c277df42 466 : data->pos_min + data->pos_delta);
9051bda5
HS
467 if ((U32)data->offset_float_max > (U32)I32_MAX)
468 data->offset_float_max = I32_MAX;
c277df42 469 if (data->flags & SF_BEFORE_EOL)
b81d288d 470 data->flags
c277df42
IZ
471 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
472 else
473 data->flags &= ~SF_FL_BEFORE_EOL;
474 }
475 }
476 SvCUR_set(data->last_found, 0);
0eda9292 477 {
a28509cc
AL
478 SV * const sv = data->last_found;
479 MAGIC * const mg =
0eda9292
JH
480 SvUTF8(sv) && SvMAGICAL(sv) ? mg_find(sv, PERL_MAGIC_utf8) : NULL;
481 if (mg && mg->mg_len > 0)
482 mg->mg_len = 0;
483 }
c277df42
IZ
484 data->last_end = -1;
485 data->flags &= ~SF_BEFORE_EOL;
486}
487
653099ff
GS
488/* Can match anything (initialization) */
489STATIC void
5f66b61c 490S_cl_anything(RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 491{
653099ff 492 ANYOF_CLASS_ZERO(cl);
f8bef550 493 ANYOF_BITMAP_SETALL(cl);
1aa99e6b 494 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
653099ff
GS
495 if (LOC)
496 cl->flags |= ANYOF_LOCALE;
497}
498
499/* Can match anything (initialization) */
500STATIC int
5f66b61c 501S_cl_is_anything(const struct regnode_charclass_class *cl)
653099ff
GS
502{
503 int value;
504
aaa51d5e 505 for (value = 0; value <= ANYOF_MAX; value += 2)
653099ff
GS
506 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
507 return 1;
1aa99e6b
IH
508 if (!(cl->flags & ANYOF_UNICODE_ALL))
509 return 0;
f8bef550
NC
510 if (!ANYOF_BITMAP_TESTALLSET(cl))
511 return 0;
653099ff
GS
512 return 1;
513}
514
515/* Can match anything (initialization) */
516STATIC void
5f66b61c 517S_cl_init(RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 518{
8ecf7187 519 Zero(cl, 1, struct regnode_charclass_class);
653099ff 520 cl->type = ANYOF;
830247a4 521 cl_anything(pRExC_state, cl);
653099ff
GS
522}
523
524STATIC void
5f66b61c 525S_cl_init_zero(RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 526{
8ecf7187 527 Zero(cl, 1, struct regnode_charclass_class);
653099ff 528 cl->type = ANYOF;
830247a4 529 cl_anything(pRExC_state, cl);
653099ff
GS
530 if (LOC)
531 cl->flags |= ANYOF_LOCALE;
532}
533
534/* 'And' a given class with another one. Can create false positives */
535/* We assume that cl is not inverted */
536STATIC void
5f66b61c 537S_cl_and(struct regnode_charclass_class *cl,
a28509cc 538 const struct regnode_charclass_class *and_with)
653099ff 539{
653099ff
GS
540 if (!(and_with->flags & ANYOF_CLASS)
541 && !(cl->flags & ANYOF_CLASS)
542 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
543 && !(and_with->flags & ANYOF_FOLD)
544 && !(cl->flags & ANYOF_FOLD)) {
545 int i;
546
547 if (and_with->flags & ANYOF_INVERT)
548 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
549 cl->bitmap[i] &= ~and_with->bitmap[i];
550 else
551 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
552 cl->bitmap[i] &= and_with->bitmap[i];
553 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
554 if (!(and_with->flags & ANYOF_EOS))
555 cl->flags &= ~ANYOF_EOS;
1aa99e6b 556
14ebb1a2
JH
557 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE &&
558 !(and_with->flags & ANYOF_INVERT)) {
1aa99e6b
IH
559 cl->flags &= ~ANYOF_UNICODE_ALL;
560 cl->flags |= ANYOF_UNICODE;
561 ARG_SET(cl, ARG(and_with));
562 }
14ebb1a2
JH
563 if (!(and_with->flags & ANYOF_UNICODE_ALL) &&
564 !(and_with->flags & ANYOF_INVERT))
1aa99e6b 565 cl->flags &= ~ANYOF_UNICODE_ALL;
14ebb1a2
JH
566 if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)) &&
567 !(and_with->flags & ANYOF_INVERT))
1aa99e6b 568 cl->flags &= ~ANYOF_UNICODE;
653099ff
GS
569}
570
571/* 'OR' a given class with another one. Can create false positives */
572/* We assume that cl is not inverted */
573STATIC void
5f66b61c 574S_cl_or(RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
653099ff 575{
653099ff
GS
576 if (or_with->flags & ANYOF_INVERT) {
577 /* We do not use
578 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
579 * <= (B1 | !B2) | (CL1 | !CL2)
580 * which is wasteful if CL2 is small, but we ignore CL2:
581 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
582 * XXXX Can we handle case-fold? Unclear:
583 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
584 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
585 */
586 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
587 && !(or_with->flags & ANYOF_FOLD)
588 && !(cl->flags & ANYOF_FOLD) ) {
589 int i;
590
591 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
592 cl->bitmap[i] |= ~or_with->bitmap[i];
593 } /* XXXX: logic is complicated otherwise */
594 else {
830247a4 595 cl_anything(pRExC_state, cl);
653099ff
GS
596 }
597 } else {
598 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
599 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
b81d288d 600 && (!(or_with->flags & ANYOF_FOLD)
653099ff
GS
601 || (cl->flags & ANYOF_FOLD)) ) {
602 int i;
603
604 /* OR char bitmap and class bitmap separately */
605 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
606 cl->bitmap[i] |= or_with->bitmap[i];
607 if (or_with->flags & ANYOF_CLASS) {
608 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
609 cl->classflags[i] |= or_with->classflags[i];
610 cl->flags |= ANYOF_CLASS;
611 }
612 }
613 else { /* XXXX: logic is complicated, leave it along for a moment. */
830247a4 614 cl_anything(pRExC_state, cl);
653099ff
GS
615 }
616 }
617 if (or_with->flags & ANYOF_EOS)
618 cl->flags |= ANYOF_EOS;
1aa99e6b
IH
619
620 if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
621 ARG(cl) != ARG(or_with)) {
622 cl->flags |= ANYOF_UNICODE_ALL;
623 cl->flags &= ~ANYOF_UNICODE;
624 }
625 if (or_with->flags & ANYOF_UNICODE_ALL) {
626 cl->flags |= ANYOF_UNICODE_ALL;
627 cl->flags &= ~ANYOF_UNICODE;
628 }
653099ff
GS
629}
630
5d1c421c 631/*
a3621e74
YO
632
633 make_trie(startbranch,first,last,tail,flags)
634 startbranch: the first branch in the whole branch sequence
635 first : start branch of sequence of branch-exact nodes.
636 May be the same as startbranch
637 last : Thing following the last branch.
638 May be the same as tail.
639 tail : item following the branch sequence
640 flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/
641
642Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
643
644A trie is an N'ary tree where the branches are determined by digital
645decomposition of the key. IE, at the root node you look up the 1st character and
646follow that branch repeat until you find the end of the branches. Nodes can be
647marked as "accepting" meaning they represent a complete word. Eg:
648
649 /he|she|his|hers/
650
651would convert into the following structure. Numbers represent states, letters
652following numbers represent valid transitions on the letter from that state, if
653the number is in square brackets it represents an accepting state, otherwise it
654will be in parenthesis.
655
656 +-h->+-e->[3]-+-r->(8)-+-s->[9]
657 | |
658 | (2)
659 | |
660 (1) +-i->(6)-+-s->[7]
661 |
662 +-s->(3)-+-h->(4)-+-e->[5]
663
664 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
665
666This shows that when matching against the string 'hers' we will begin at state 1
667read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
668then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
669is also accepting. Thus we know that we can match both 'he' and 'hers' with a
670single traverse. We store a mapping from accepting to state to which word was
671matched, and then when we have multiple possibilities we try to complete the
672rest of the regex in the order in which they occured in the alternation.
673
674The only prior NFA like behaviour that would be changed by the TRIE support is
675the silent ignoring of duplicate alternations which are of the form:
676
677 / (DUPE|DUPE) X? (?{ ... }) Y /x
678
679Thus EVAL blocks follwing a trie may be called a different number of times with
680and without the optimisation. With the optimisations dupes will be silently
681ignored. This inconsistant behaviour of EVAL type nodes is well established as
682the following demonstrates:
683
684 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
685
686which prints out 'word' three times, but
687
688 'words'=~/(word|word|word)(?{ print $1 })S/
689
690which doesnt print it out at all. This is due to other optimisations kicking in.
691
692Example of what happens on a structural level:
693
694The regexp /(ac|ad|ab)+/ will produce the folowing debug output:
695
696 1: CURLYM[1] {1,32767}(18)
697 5: BRANCH(8)
698 6: EXACT <ac>(16)
699 8: BRANCH(11)
700 9: EXACT <ad>(16)
701 11: BRANCH(14)
702 12: EXACT <ab>(16)
703 16: SUCCEED(0)
704 17: NOTHING(18)
705 18: END(0)
706
707This would be optimizable with startbranch=5, first=5, last=16, tail=16
708and should turn into:
709
710 1: CURLYM[1] {1,32767}(18)
711 5: TRIE(16)
712 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
713 <ac>
714 <ad>
715 <ab>
716 16: SUCCEED(0)
717 17: NOTHING(18)
718 18: END(0)
719
720Cases where tail != last would be like /(?foo|bar)baz/:
721
722 1: BRANCH(4)
723 2: EXACT <foo>(8)
724 4: BRANCH(7)
725 5: EXACT <bar>(8)
726 7: TAIL(8)
727 8: EXACT <baz>(10)
728 10: END(0)
729
730which would be optimizable with startbranch=1, first=1, last=7, tail=8
731and would end up looking like:
732
733 1: TRIE(8)
734 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
735 <foo>
736 <bar>
737 7: TAIL(8)
738 8: EXACT <baz>(10)
739 10: END(0)
740
741*/
742
743#define TRIE_DEBUG_CHAR \
744 DEBUG_TRIE_COMPILE_r({ \
745 SV *tmp; \
746 if ( UTF ) { \
6136c704 747 tmp = newSVpvs( "" ); \
a3621e74
YO
748 pv_uni_display( tmp, uc, len, 60, UNI_DISPLAY_REGEX ); \
749 } else { \
e4584336 750 tmp = Perl_newSVpvf_nocontext( "%c", (int)uvc ); \
a3621e74
YO
751 } \
752 av_push( trie->revcharmap, tmp ); \
753 })
754
755#define TRIE_READ_CHAR STMT_START { \
756 if ( UTF ) { \
757 if ( folder ) { \
758 if ( foldlen > 0 ) { \
759 uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags ); \
760 foldlen -= len; \
761 scan += len; \
762 len = 0; \
763 } else { \
e1ec3a88 764 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
a3621e74
YO
765 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
766 foldlen -= UNISKIP( uvc ); \
767 scan = foldbuf + UNISKIP( uvc ); \
768 } \
769 } else { \
e1ec3a88 770 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
a3621e74
YO
771 } \
772 } else { \
773 uvc = (U32)*uc; \
774 len = 1; \
775 } \
776} STMT_END
777
778
779#define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
780#define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
781#define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
782#define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
783
784#define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
785 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
786 TRIE_LIST_LEN( state ) *= 2; \
787 Renew( trie->states[ state ].trans.list, \
788 TRIE_LIST_LEN( state ), reg_trie_trans_le ); \
789 } \
790 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
791 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
792 TRIE_LIST_CUR( state )++; \
793} STMT_END
794
795#define TRIE_LIST_NEW(state) STMT_START { \
a02a5408 796 Newxz( trie->states[ state ].trans.list, \
a3621e74
YO
797 4, reg_trie_trans_le ); \
798 TRIE_LIST_CUR( state ) = 1; \
799 TRIE_LIST_LEN( state ) = 4; \
800} STMT_END
801
802STATIC I32
803S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 flags)
804{
27da23d5 805 dVAR;
a3621e74
YO
806 /* first pass, loop through and scan words */
807 reg_trie_data *trie;
808 regnode *cur;
e1ec3a88 809 const U32 uniflags = ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY;
a3621e74
YO
810 STRLEN len = 0;
811 UV uvc = 0;
812 U16 curword = 0;
813 U32 next_alloc = 0;
814 /* we just use folder as a flag in utf8 */
e1ec3a88 815 const U8 * const folder = ( flags == EXACTF
a3621e74
YO
816 ? PL_fold
817 : ( flags == EXACTFL
818 ? PL_fold_locale
819 : NULL
820 )
821 );
822
e1ec3a88 823 const U32 data_slot = add_data( pRExC_state, 1, "t" );
a3621e74
YO
824 SV *re_trie_maxbuff;
825
826 GET_RE_DEBUG_FLAGS_DECL;
827
a02a5408 828 Newxz( trie, 1, reg_trie_data );
a3621e74
YO
829 trie->refcount = 1;
830 RExC_rx->data->data[ data_slot ] = (void*)trie;
a02a5408 831 Newxz( trie->charmap, 256, U16 );
a3621e74
YO
832 DEBUG_r({
833 trie->words = newAV();
834 trie->revcharmap = newAV();
835 });
836
837
0111c4fd 838 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
a3621e74 839 if (!SvIOK(re_trie_maxbuff)) {
0111c4fd 840 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
a3621e74
YO
841 }
842
843 /* -- First loop and Setup --
844
845 We first traverse the branches and scan each word to determine if it
846 contains widechars, and how many unique chars there are, this is
847 important as we have to build a table with at least as many columns as we
848 have unique chars.
849
850 We use an array of integers to represent the character codes 0..255
851 (trie->charmap) and we use a an HV* to store unicode characters. We use the
852 native representation of the character value as the key and IV's for the
853 coded index.
854
855 *TODO* If we keep track of how many times each character is used we can
856 remap the columns so that the table compression later on is more
857 efficient in terms of memory by ensuring most common value is in the
858 middle and the least common are on the outside. IMO this would be better
859 than a most to least common mapping as theres a decent chance the most
860 common letter will share a node with the least common, meaning the node
861 will not be compressable. With a middle is most common approach the worst
862 case is when we have the least common nodes twice.
863
864 */
865
866
867 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
c445ea15 868 regnode * const noper = NEXTOPER( cur );
e1ec3a88 869 const U8 *uc = (U8*)STRING( noper );
a28509cc 870 const U8 * const e = uc + STR_LEN( noper );
a3621e74
YO
871 STRLEN foldlen = 0;
872 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2af232bd 873 const U8 *scan = (U8*)NULL;
a3621e74
YO
874
875 for ( ; uc < e ; uc += len ) {
876 trie->charcount++;
877 TRIE_READ_CHAR;
878 if ( uvc < 256 ) {
879 if ( !trie->charmap[ uvc ] ) {
880 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
881 if ( folder )
882 trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
883 TRIE_DEBUG_CHAR;
884 }
885 } else {
886 SV** svpp;
887 if ( !trie->widecharmap )
888 trie->widecharmap = newHV();
889
890 svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 1 );
891
892 if ( !svpp )
e4584336 893 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
a3621e74
YO
894
895 if ( !SvTRUE( *svpp ) ) {
896 sv_setiv( *svpp, ++trie->uniquecharcount );
897 TRIE_DEBUG_CHAR;
898 }
899 }
900 }
901 trie->wordcount++;
902 } /* end first pass */
903 DEBUG_TRIE_COMPILE_r(
904 PerlIO_printf( Perl_debug_log, "TRIE(%s): W:%d C:%d Uq:%d \n",
905 ( trie->widecharmap ? "UTF8" : "NATIVE" ), trie->wordcount,
5d7488b2 906 (int)trie->charcount, trie->uniquecharcount )
a3621e74
YO
907 );
908
909
910 /*
911 We now know what we are dealing with in terms of unique chars and
912 string sizes so we can calculate how much memory a naive
0111c4fd
RGS
913 representation using a flat table will take. If it's over a reasonable
914 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
a3621e74
YO
915 conservative but potentially much slower representation using an array
916 of lists.
917
918 At the end we convert both representations into the same compressed
919 form that will be used in regexec.c for matching with. The latter
920 is a form that cannot be used to construct with but has memory
921 properties similar to the list form and access properties similar
922 to the table form making it both suitable for fast searches and
923 small enough that its feasable to store for the duration of a program.
924
925 See the comment in the code where the compressed table is produced
926 inplace from the flat tabe representation for an explanation of how
927 the compression works.
928
929 */
930
931
932 if ( (IV)( ( trie->charcount + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
933 /*
934 Second Pass -- Array Of Lists Representation
935
936 Each state will be represented by a list of charid:state records
937 (reg_trie_trans_le) the first such element holds the CUR and LEN
938 points of the allocated array. (See defines above).
939
940 We build the initial structure using the lists, and then convert
941 it into the compressed table form which allows faster lookups
942 (but cant be modified once converted).
943
944
945 */
946
947
948 STRLEN transcount = 1;
949
a02a5408 950 Newxz( trie->states, trie->charcount + 2, reg_trie_state );
a3621e74
YO
951 TRIE_LIST_NEW(1);
952 next_alloc = 2;
953
954 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
955
c445ea15
AL
956 regnode * const noper = NEXTOPER( cur );
957 U8 *uc = (U8*)STRING( noper );
958 const U8 * const e = uc + STR_LEN( noper );
959 U32 state = 1; /* required init */
960 U16 charid = 0; /* sanity init */
961 U8 *scan = (U8*)NULL; /* sanity init */
962 STRLEN foldlen = 0; /* required init */
963 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
964
965 for ( ; uc < e ; uc += len ) {
966
967 TRIE_READ_CHAR;
968
969 if ( uvc < 256 ) {
970 charid = trie->charmap[ uvc ];
971 } else {
972 SV** const svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 0);
973 if ( !svpp ) {
974 charid = 0;
975 } else {
976 charid=(U16)SvIV( *svpp );
977 }
978 }
979 if ( charid ) {
a3621e74 980
c445ea15
AL
981 U16 check;
982 U32 newstate = 0;
a3621e74 983
c445ea15
AL
984 charid--;
985 if ( !trie->states[ state ].trans.list ) {
986 TRIE_LIST_NEW( state );
987 }
988 for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
989 if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
990 newstate = TRIE_LIST_ITEM( state, check ).newstate;
991 break;
992 }
993 }
994 if ( ! newstate ) {
995 newstate = next_alloc++;
996 TRIE_LIST_PUSH( state, charid, newstate );
997 transcount++;
998 }
999 state = newstate;
1000 } else {
1001 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
a28509cc 1002 }
c445ea15
AL
1003 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1004 }
a3621e74 1005
c445ea15
AL
1006 if ( !trie->states[ state ].wordnum ) {
1007 /* we havent inserted this word into the structure yet. */
1008 trie->states[ state ].wordnum = ++curword;
a3621e74 1009
c445ea15
AL
1010 DEBUG_r({
1011 /* store the word for dumping */
1012 SV* tmp = newSVpvn( STRING( noper ), STR_LEN( noper ) );
1013 if ( UTF ) SvUTF8_on( tmp );
1014 av_push( trie->words, tmp );
1015 });
a3621e74 1016
c445ea15 1017 } else {
bb263b4e 1018 /*EMPTY*/; /* It's a dupe. So ignore it. */
c445ea15 1019 }
a3621e74
YO
1020
1021 } /* end second pass */
1022
1023 trie->laststate = next_alloc;
1024 Renew( trie->states, next_alloc, reg_trie_state );
1025
1026 DEBUG_TRIE_COMPILE_MORE_r({
1027 U32 state;
a3621e74 1028
a28509cc 1029 /* print out the table precompression. */
a3621e74
YO
1030
1031 PerlIO_printf( Perl_debug_log, "\nState :Word | Transition Data\n" );
1032 PerlIO_printf( Perl_debug_log, "------:-----+-----------------" );
1033
1034 for( state=1 ; state < next_alloc ; state ++ ) {
a28509cc 1035 U16 charid;
a3621e74 1036
e4584336 1037 PerlIO_printf( Perl_debug_log, "\n %04"UVXf" :", (UV)state );
a3621e74
YO
1038 if ( ! trie->states[ state ].wordnum ) {
1039 PerlIO_printf( Perl_debug_log, "%5s| ","");
1040 } else {
e4584336 1041 PerlIO_printf( Perl_debug_log, "W%04x| ",
a3621e74
YO
1042 trie->states[ state ].wordnum
1043 );
1044 }
1045 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
1046 SV **tmp = av_fetch( trie->revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
e4584336 1047 PerlIO_printf( Perl_debug_log, "%s:%3X=%04"UVXf" | ",
cfd0369c 1048 SvPV_nolen_const( *tmp ),
a3621e74 1049 TRIE_LIST_ITEM(state,charid).forid,
e4584336 1050 (UV)TRIE_LIST_ITEM(state,charid).newstate
a3621e74
YO
1051 );
1052 }
1053
1054 }
1055 PerlIO_printf( Perl_debug_log, "\n\n" );
1056 });
1057
a02a5408 1058 Newxz( trie->trans, transcount ,reg_trie_trans );
a3621e74
YO
1059 {
1060 U32 state;
a3621e74
YO
1061 U32 tp = 0;
1062 U32 zp = 0;
1063
1064
1065 for( state=1 ; state < next_alloc ; state ++ ) {
1066 U32 base=0;
1067
1068 /*
1069 DEBUG_TRIE_COMPILE_MORE_r(
1070 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1071 );
1072 */
1073
1074 if (trie->states[state].trans.list) {
1075 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1076 U16 maxid=minid;
a28509cc 1077 U16 idx;
a3621e74
YO
1078
1079 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
c445ea15
AL
1080 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1081 if ( forid < minid ) {
1082 minid=forid;
1083 } else if ( forid > maxid ) {
1084 maxid=forid;
1085 }
a3621e74
YO
1086 }
1087 if ( transcount < tp + maxid - minid + 1) {
1088 transcount *= 2;
1089 Renew( trie->trans, transcount, reg_trie_trans );
1090 Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1091 }
1092 base = trie->uniquecharcount + tp - minid;
1093 if ( maxid == minid ) {
1094 U32 set = 0;
1095 for ( ; zp < tp ; zp++ ) {
1096 if ( ! trie->trans[ zp ].next ) {
1097 base = trie->uniquecharcount + zp - minid;
1098 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1099 trie->trans[ zp ].check = state;
1100 set = 1;
1101 break;
1102 }
1103 }
1104 if ( !set ) {
1105 trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1106 trie->trans[ tp ].check = state;
1107 tp++;
1108 zp = tp;
1109 }
1110 } else {
1111 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
c445ea15 1112 const U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
a3621e74
YO
1113 trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1114 trie->trans[ tid ].check = state;
1115 }
1116 tp += ( maxid - minid + 1 );
1117 }
1118 Safefree(trie->states[ state ].trans.list);
1119 }
1120 /*
1121 DEBUG_TRIE_COMPILE_MORE_r(
1122 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1123 );
1124 */
1125 trie->states[ state ].trans.base=base;
1126 }
cc601c31 1127 trie->lasttrans = tp + 1;
a3621e74
YO
1128 }
1129 } else {
1130 /*
1131 Second Pass -- Flat Table Representation.
1132
1133 we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1134 We know that we will need Charcount+1 trans at most to store the data
1135 (one row per char at worst case) So we preallocate both structures
1136 assuming worst case.
1137
1138 We then construct the trie using only the .next slots of the entry
1139 structs.
1140
1141 We use the .check field of the first entry of the node temporarily to
1142 make compression both faster and easier by keeping track of how many non
1143 zero fields are in the node.
1144
1145 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1146 transition.
1147
1148 There are two terms at use here: state as a TRIE_NODEIDX() which is a
1149 number representing the first entry of the node, and state as a
1150 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1151 TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1152 are 2 entrys per node. eg:
1153
1154 A B A B
1155 1. 2 4 1. 3 7
1156 2. 0 3 3. 0 5
1157 3. 0 0 5. 0 0
1158 4. 0 0 7. 0 0
1159
1160 The table is internally in the right hand, idx form. However as we also
1161 have to deal with the states array which is indexed by nodenum we have to
1162 use TRIE_NODENUM() to convert.
1163
1164 */
1165
a02a5408 1166 Newxz( trie->trans, ( trie->charcount + 1 ) * trie->uniquecharcount + 1,
a3621e74 1167 reg_trie_trans );
a02a5408 1168 Newxz( trie->states, trie->charcount + 2, reg_trie_state );
a3621e74
YO
1169 next_alloc = trie->uniquecharcount + 1;
1170
1171 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1172
c445ea15 1173 regnode * const noper = NEXTOPER( cur );
a28509cc
AL
1174 const U8 *uc = (U8*)STRING( noper );
1175 const U8 * const e = uc + STR_LEN( noper );
a3621e74
YO
1176
1177 U32 state = 1; /* required init */
1178
1179 U16 charid = 0; /* sanity init */
1180 U32 accept_state = 0; /* sanity init */
1181 U8 *scan = (U8*)NULL; /* sanity init */
1182
1183 STRLEN foldlen = 0; /* required init */
1184 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1185
1186
1187 for ( ; uc < e ; uc += len ) {
1188
1189 TRIE_READ_CHAR;
1190
1191 if ( uvc < 256 ) {
1192 charid = trie->charmap[ uvc ];
1193 } else {
c445ea15
AL
1194 SV* const * const svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 0);
1195 charid = svpp ? (U16)SvIV(*svpp) : 0;
a3621e74
YO
1196 }
1197 if ( charid ) {
1198 charid--;
1199 if ( !trie->trans[ state + charid ].next ) {
1200 trie->trans[ state + charid ].next = next_alloc;
1201 trie->trans[ state ].check++;
1202 next_alloc += trie->uniquecharcount;
1203 }
1204 state = trie->trans[ state + charid ].next;
1205 } else {
e4584336 1206 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
a3621e74
YO
1207 }
1208 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1209 }
1210
1211 accept_state = TRIE_NODENUM( state );
1212 if ( !trie->states[ accept_state ].wordnum ) {
1213 /* we havent inserted this word into the structure yet. */
1214 trie->states[ accept_state ].wordnum = ++curword;
1215
1216 DEBUG_r({
1217 /* store the word for dumping */
1218 SV* tmp = newSVpvn( STRING( noper ), STR_LEN( noper ) );
1219 if ( UTF ) SvUTF8_on( tmp );
1220 av_push( trie->words, tmp );
1221 });
1222
1223 } else {
bb263b4e 1224 /*EMPTY*/; /* Its a dupe. So ignore it. */
a3621e74
YO
1225 }
1226
1227 } /* end second pass */
1228
1229 DEBUG_TRIE_COMPILE_MORE_r({
1230 /*
1231 print out the table precompression so that we can do a visual check
1232 that they are identical.
1233 */
1234 U32 state;
1235 U16 charid;
1236 PerlIO_printf( Perl_debug_log, "\nChar : " );
1237
1238 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1239 SV **tmp = av_fetch( trie->revcharmap, charid, 0);
1240 if ( tmp ) {
cfd0369c 1241 PerlIO_printf( Perl_debug_log, "%4.4s ", SvPV_nolen_const( *tmp ) );
a3621e74
YO
1242 }
1243 }
1244
1245 PerlIO_printf( Perl_debug_log, "\nState+-" );
1246
1247 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
1248 PerlIO_printf( Perl_debug_log, "%4s-", "----" );
1249 }
1250
1251 PerlIO_printf( Perl_debug_log, "\n" );
1252
1253 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1254
e4584336 1255 PerlIO_printf( Perl_debug_log, "%04"UVXf" : ", (UV)TRIE_NODENUM( state ) );
a3621e74
YO
1256
1257 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
e4584336
RB
1258 PerlIO_printf( Perl_debug_log, "%04"UVXf" ",
1259 (UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next ) );
a3621e74
YO
1260 }
1261 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
e4584336 1262 PerlIO_printf( Perl_debug_log, " (%04"UVXf")\n", (UV)trie->trans[ state ].check );
a3621e74 1263 } else {
e4584336 1264 PerlIO_printf( Perl_debug_log, " (%04"UVXf") W%04X\n", (UV)trie->trans[ state ].check,
a3621e74
YO
1265 trie->states[ TRIE_NODENUM( state ) ].wordnum );
1266 }
1267 }
1268 PerlIO_printf( Perl_debug_log, "\n\n" );
1269 });
1270 {
1271 /*
1272 * Inplace compress the table.*
1273
1274 For sparse data sets the table constructed by the trie algorithm will
1275 be mostly 0/FAIL transitions or to put it another way mostly empty.
1276 (Note that leaf nodes will not contain any transitions.)
1277
1278 This algorithm compresses the tables by eliminating most such
1279 transitions, at the cost of a modest bit of extra work during lookup:
1280
1281 - Each states[] entry contains a .base field which indicates the
1282 index in the state[] array wheres its transition data is stored.
1283
1284 - If .base is 0 there are no valid transitions from that node.
1285
1286 - If .base is nonzero then charid is added to it to find an entry in
1287 the trans array.
1288
1289 -If trans[states[state].base+charid].check!=state then the
1290 transition is taken to be a 0/Fail transition. Thus if there are fail
1291 transitions at the front of the node then the .base offset will point
1292 somewhere inside the previous nodes data (or maybe even into a node
1293 even earlier), but the .check field determines if the transition is
1294 valid.
1295
1296 The following process inplace converts the table to the compressed
1297 table: We first do not compress the root node 1,and mark its all its
1298 .check pointers as 1 and set its .base pointer as 1 as well. This
1299 allows to do a DFA construction from the compressed table later, and
1300 ensures that any .base pointers we calculate later are greater than
1301 0.
1302
1303 - We set 'pos' to indicate the first entry of the second node.
1304
1305 - We then iterate over the columns of the node, finding the first and
1306 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
1307 and set the .check pointers accordingly, and advance pos
1308 appropriately and repreat for the next node. Note that when we copy
1309 the next pointers we have to convert them from the original
1310 NODEIDX form to NODENUM form as the former is not valid post
1311 compression.
1312
1313 - If a node has no transitions used we mark its base as 0 and do not
1314 advance the pos pointer.
1315
1316 - If a node only has one transition we use a second pointer into the
1317 structure to fill in allocated fail transitions from other states.
1318 This pointer is independent of the main pointer and scans forward
1319 looking for null transitions that are allocated to a state. When it
1320 finds one it writes the single transition into the "hole". If the
1321 pointer doesnt find one the single transition is appeneded as normal.
1322
1323 - Once compressed we can Renew/realloc the structures to release the
1324 excess space.
1325
1326 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
1327 specifically Fig 3.47 and the associated pseudocode.
1328
1329 demq
1330 */
a3b680e6 1331 const U32 laststate = TRIE_NODENUM( next_alloc );
a28509cc 1332 U32 state, charid;
a3621e74
YO
1333 U32 pos = 0, zp=0;
1334 trie->laststate = laststate;
1335
1336 for ( state = 1 ; state < laststate ; state++ ) {
1337 U8 flag = 0;
a28509cc
AL
1338 const U32 stateidx = TRIE_NODEIDX( state );
1339 const U32 o_used = trie->trans[ stateidx ].check;
1340 U32 used = trie->trans[ stateidx ].check;
a3621e74
YO
1341 trie->trans[ stateidx ].check = 0;
1342
1343 for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
1344 if ( flag || trie->trans[ stateidx + charid ].next ) {
1345 if ( trie->trans[ stateidx + charid ].next ) {
1346 if (o_used == 1) {
1347 for ( ; zp < pos ; zp++ ) {
1348 if ( ! trie->trans[ zp ].next ) {
1349 break;
1350 }
1351 }
1352 trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
1353 trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1354 trie->trans[ zp ].check = state;
1355 if ( ++zp > pos ) pos = zp;
1356 break;
1357 }
1358 used--;
1359 }
1360 if ( !flag ) {
1361 flag = 1;
1362 trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
1363 }
1364 trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1365 trie->trans[ pos ].check = state;
1366 pos++;
1367 }
1368 }
1369 }
cc601c31 1370 trie->lasttrans = pos + 1;
a3621e74
YO
1371 Renew( trie->states, laststate + 1, reg_trie_state);
1372 DEBUG_TRIE_COMPILE_MORE_r(
e4584336
RB
1373 PerlIO_printf( Perl_debug_log,
1374 " Alloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
5d7488b2
AL
1375 (int)( ( trie->charcount + 1 ) * trie->uniquecharcount + 1 ),
1376 (IV)next_alloc,
1377 (IV)pos,
a3621e74
YO
1378 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
1379 );
1380
1381 } /* end table compress */
1382 }
cc601c31
YO
1383 /* resize the trans array to remove unused space */
1384 Renew( trie->trans, trie->lasttrans, reg_trie_trans);
a3621e74
YO
1385
1386 DEBUG_TRIE_COMPILE_r({
1387 U32 state;
1388 /*
1389 Now we print it out again, in a slightly different form as there is additional
1390 info we want to be able to see when its compressed. They are close enough for
1391 visual comparison though.
1392 */
1393 PerlIO_printf( Perl_debug_log, "\nChar : %-6s%-6s%-4s ","Match","Base","Ofs" );
1394
1395 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
1396 SV **tmp = av_fetch( trie->revcharmap, state, 0);
1397 if ( tmp ) {
cfd0369c 1398 PerlIO_printf( Perl_debug_log, "%4.4s ", SvPV_nolen_const( *tmp ) );
a3621e74
YO
1399 }
1400 }
1401 PerlIO_printf( Perl_debug_log, "\n-----:-----------------------");
cc601c31 1402
a3621e74
YO
1403 for( state = 0 ; state < trie->uniquecharcount ; state++ )
1404 PerlIO_printf( Perl_debug_log, "-----");
1405 PerlIO_printf( Perl_debug_log, "\n");
cc601c31 1406
a3621e74 1407 for( state = 1 ; state < trie->laststate ; state++ ) {
a28509cc 1408 const U32 base = trie->states[ state ].trans.base;
a3621e74 1409
e4584336 1410 PerlIO_printf( Perl_debug_log, "#%04"UVXf" ", (UV)state);
a3621e74
YO
1411
1412 if ( trie->states[ state ].wordnum ) {
1413 PerlIO_printf( Perl_debug_log, " W%04X", trie->states[ state ].wordnum );
1414 } else {
1415 PerlIO_printf( Perl_debug_log, "%6s", "" );
1416 }
1417
e4584336 1418 PerlIO_printf( Perl_debug_log, " @%04"UVXf" ", (UV)base );
a3621e74
YO
1419
1420 if ( base ) {
1421 U32 ofs = 0;
1422
cc601c31
YO
1423 while( ( base + ofs < trie->uniquecharcount ) ||
1424 ( base + ofs - trie->uniquecharcount < trie->lasttrans
1425 && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
a3621e74
YO
1426 ofs++;
1427
e4584336 1428 PerlIO_printf( Perl_debug_log, "+%02"UVXf"[ ", (UV)ofs);
a3621e74
YO
1429
1430 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
cc601c31
YO
1431 if ( ( base + ofs >= trie->uniquecharcount ) &&
1432 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
a3621e74
YO
1433 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1434 {
e4584336
RB
1435 PerlIO_printf( Perl_debug_log, "%04"UVXf" ",
1436 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
a3621e74
YO
1437 } else {
1438 PerlIO_printf( Perl_debug_log, "%4s "," 0" );
1439 }
1440 }
1441
e4584336 1442 PerlIO_printf( Perl_debug_log, "]");
a3621e74
YO
1443
1444 }
1445 PerlIO_printf( Perl_debug_log, "\n" );
1446 }
1447 });
1448
1449 {
1450 /* now finally we "stitch in" the new TRIE node
1451 This means we convert either the first branch or the first Exact,
1452 depending on whether the thing following (in 'last') is a branch
1453 or not and whther first is the startbranch (ie is it a sub part of
1454 the alternation or is it the whole thing.)
1455 Assuming its a sub part we conver the EXACT otherwise we convert
1456 the whole branch sequence, including the first.
1457 */
1458 regnode *convert;
1459
1460
1461
1462
1463 if ( first == startbranch && OP( last ) != BRANCH ) {
1464 convert = first;
1465 } else {
1466 convert = NEXTOPER( first );
1467 NEXT_OFF( first ) = (U16)(last - first);
1468 }
1469
1470 OP( convert ) = TRIE + (U8)( flags - EXACT );
1471 NEXT_OFF( convert ) = (U16)(tail - convert);
1472 ARG_SET( convert, data_slot );
1473
1474 /* tells us if we need to handle accept buffers specially */
1475 convert->flags = ( RExC_seen_evals ? 1 : 0 );
1476
1477
1478 /* needed for dumping*/
1479 DEBUG_r({
1480 regnode *optimize = convert + NODE_STEP_REGNODE + regarglen[ TRIE ];
1481 /* We now need to mark all of the space originally used by the
1482 branches as optimized away. This keeps the dumpuntil from
1483 throwing a wobbly as it doesnt use regnext() to traverse the
1484 opcodes.
1485 */
1486 while( optimize < last ) {
1487 OP( optimize ) = OPTIMIZED;
1488 optimize++;
1489 }
1490 });
1491 } /* end node insert */
1492 return 1;
1493}
1494
1495
1496
1497/*
5d1c421c
JH
1498 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
1499 * These need to be revisited when a newer toolchain becomes available.
1500 */
1501#if defined(__sparc64__) && defined(__GNUC__)
1502# if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
1503# undef SPARC64_GCC_WORKAROUND
1504# define SPARC64_GCC_WORKAROUND 1
1505# endif
1506#endif
1507
653099ff
GS
1508/* REx optimizer. Converts nodes into quickier variants "in place".
1509 Finds fixed substrings. */
1510
a0288114 1511/* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
c277df42
IZ
1512 to the position after last scanned or to NULL. */
1513
a3621e74 1514
76e3520e 1515STATIC I32
9a957fbc
AL
1516S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap,
1517 regnode *last, scan_data_t *data, U32 flags, U32 depth)
c277df42
IZ
1518 /* scanp: Start here (read-write). */
1519 /* deltap: Write maxlen-minlen here. */
1520 /* last: Stop before this one. */
1521{
97aff369 1522 dVAR;
c277df42
IZ
1523 I32 min = 0, pars = 0, code;
1524 regnode *scan = *scanp, *next;
1525 I32 delta = 0;
1526 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
aca2d497 1527 int is_inf_internal = 0; /* The studied chunk is infinite */
c277df42
IZ
1528 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
1529 scan_data_t data_fake;
653099ff 1530 struct regnode_charclass_class and_with; /* Valid if flags & SCF_DO_STCLASS_OR */
a3621e74
YO
1531 SV *re_trie_maxbuff = NULL;
1532
1533 GET_RE_DEBUG_FLAGS_DECL;
b81d288d 1534
c277df42
IZ
1535 while (scan && OP(scan) != END && scan < last) {
1536 /* Peephole optimizer: */
a3621e74 1537 DEBUG_OPTIMISE_r({
c445ea15 1538 SV * const mysv=sv_newmortal();
a3621e74 1539 regprop( mysv, scan);
e4584336 1540 PerlIO_printf(Perl_debug_log, "%*speep: %s (0x%08"UVXf")\n",
cfd0369c 1541 (int)depth*2, "", SvPV_nolen_const(mysv), PTR2UV(scan));
a3621e74 1542 });
c277df42 1543
22c35a8c 1544 if (PL_regkind[(U8)OP(scan)] == EXACT) {
653099ff 1545 /* Merge several consecutive EXACTish nodes into one. */
c277df42
IZ
1546 regnode *n = regnext(scan);
1547 U32 stringok = 1;
1548#ifdef DEBUGGING
1549 regnode *stop = scan;
b81d288d 1550#endif
c277df42 1551
cd439c50 1552 next = scan + NODE_SZ_STR(scan);
c277df42
IZ
1553 /* Skip NOTHING, merge EXACT*. */
1554 while (n &&
b81d288d 1555 ( PL_regkind[(U8)OP(n)] == NOTHING ||
c277df42
IZ
1556 (stringok && (OP(n) == OP(scan))))
1557 && NEXT_OFF(n)
1558 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
1559 if (OP(n) == TAIL || n > next)
1560 stringok = 0;
22c35a8c 1561 if (PL_regkind[(U8)OP(n)] == NOTHING) {
c277df42
IZ
1562 NEXT_OFF(scan) += NEXT_OFF(n);
1563 next = n + NODE_STEP_REGNODE;
1564#ifdef DEBUGGING
1565 if (stringok)
1566 stop = n;
b81d288d 1567#endif
c277df42 1568 n = regnext(n);
a0ed51b3 1569 }
f49d4d0f 1570 else if (stringok) {
a3b680e6 1571 const int oldl = STR_LEN(scan);
c445ea15 1572 regnode * const nnext = regnext(n);
f49d4d0f 1573
b81d288d 1574 if (oldl + STR_LEN(n) > U8_MAX)
c277df42
IZ
1575 break;
1576 NEXT_OFF(scan) += NEXT_OFF(n);
cd439c50
IZ
1577 STR_LEN(scan) += STR_LEN(n);
1578 next = n + NODE_SZ_STR(n);
c277df42 1579 /* Now we can overwrite *n : */
f49d4d0f 1580 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
c277df42 1581#ifdef DEBUGGING
f49d4d0f 1582 stop = next - 1;
b81d288d 1583#endif
c277df42
IZ
1584 n = nnext;
1585 }
1586 }
61a36c01 1587
a3621e74 1588 if (UTF && ( OP(scan) == EXACTF ) && ( STR_LEN(scan) >= 6 ) ) {
61a36c01
JH
1589/*
1590 Two problematic code points in Unicode casefolding of EXACT nodes:
1591
1592 U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
1593 U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
1594
1595 which casefold to
1596
1597 Unicode UTF-8
1598
1599 U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
1600 U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
1601
1602 This means that in case-insensitive matching (or "loose matching",
1603 as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
1604 length of the above casefolded versions) can match a target string
1605 of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
1606 This would rather mess up the minimum length computation.
1607
1608 What we'll do is to look for the tail four bytes, and then peek
1609 at the preceding two bytes to see whether we need to decrease
1610 the minimum length by four (six minus two).
1611
1612 Thanks to the design of UTF-8, there cannot be false matches:
1613 A sequence of valid UTF-8 bytes cannot be a subsequence of
1614 another valid sequence of UTF-8 bytes.
1615
1616*/
c445ea15
AL
1617 char * const s0 = STRING(scan), *s, *t;
1618 char * const s1 = s0 + STR_LEN(scan) - 1;
1619 char * const s2 = s1 - 4;
a28509cc
AL
1620 const char * const t0 = "\xcc\x88\xcc\x81";
1621 const char * const t1 = t0 + 3;
2af232bd 1622
61a36c01
JH
1623 for (s = s0 + 2;
1624 s < s2 && (t = ninstr(s, s1, t0, t1));
1625 s = t + 4) {
1626 if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
1627 ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
1628 min -= 4;
1629 }
1630 }
1631
c277df42
IZ
1632#ifdef DEBUGGING
1633 /* Allow dumping */
cd439c50 1634 n = scan + NODE_SZ_STR(scan);
c277df42 1635 while (n <= stop) {
22c35a8c 1636 if (PL_regkind[(U8)OP(n)] != NOTHING || OP(n) == NOTHING) {
c277df42
IZ
1637 OP(n) = OPTIMIZED;
1638 NEXT_OFF(n) = 0;
1639 }
1640 n++;
1641 }
653099ff 1642#endif
c277df42 1643 }
a3621e74
YO
1644
1645
1646
653099ff
GS
1647 /* Follow the next-chain of the current node and optimize
1648 away all the NOTHINGs from it. */
c277df42 1649 if (OP(scan) != CURLYX) {
a3b680e6 1650 const int max = (reg_off_by_arg[OP(scan)]
048cfca1
GS
1651 ? I32_MAX
1652 /* I32 may be smaller than U16 on CRAYs! */
1653 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
c277df42
IZ
1654 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
1655 int noff;
1656 regnode *n = scan;
b81d288d 1657
c277df42
IZ
1658 /* Skip NOTHING and LONGJMP. */
1659 while ((n = regnext(n))
22c35a8c 1660 && ((PL_regkind[(U8)OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
c277df42
IZ
1661 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
1662 && off + noff < max)
1663 off += noff;
1664 if (reg_off_by_arg[OP(scan)])
1665 ARG(scan) = off;
b81d288d 1666 else
c277df42
IZ
1667 NEXT_OFF(scan) = off;
1668 }
a3621e74 1669
653099ff
GS
1670 /* The principal pseudo-switch. Cannot be a switch, since we
1671 look into several different things. */
b81d288d 1672 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
c277df42
IZ
1673 || OP(scan) == IFTHEN || OP(scan) == SUSPEND) {
1674 next = regnext(scan);
1675 code = OP(scan);
a3621e74 1676 /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
b81d288d
AB
1677
1678 if (OP(next) == code || code == IFTHEN || code == SUSPEND) {
c277df42 1679 I32 max1 = 0, min1 = I32_MAX, num = 0;
653099ff 1680 struct regnode_charclass_class accum;
a3621e74 1681 regnode *startbranch=scan;
c277df42 1682
653099ff 1683 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
830247a4 1684 scan_commit(pRExC_state, data); /* Cannot merge strings after this. */
653099ff 1685 if (flags & SCF_DO_STCLASS)
830247a4 1686 cl_init_zero(pRExC_state, &accum);
a3621e74 1687
c277df42 1688 while (OP(scan) == code) {
830247a4 1689 I32 deltanext, minnext, f = 0, fake;
653099ff 1690 struct regnode_charclass_class this_class;
c277df42
IZ
1691
1692 num++;
1693 data_fake.flags = 0;
b81d288d 1694 if (data) {
2c2d71f5 1695 data_fake.whilem_c = data->whilem_c;
cb434fcc
IZ
1696 data_fake.last_closep = data->last_closep;
1697 }
1698 else
1699 data_fake.last_closep = &fake;
c277df42
IZ
1700 next = regnext(scan);
1701 scan = NEXTOPER(scan);
1702 if (code != BRANCH)
1703 scan = NEXTOPER(scan);
653099ff 1704 if (flags & SCF_DO_STCLASS) {
830247a4 1705 cl_init(pRExC_state, &this_class);
653099ff
GS
1706 data_fake.start_class = &this_class;
1707 f = SCF_DO_STCLASS_AND;
b81d288d 1708 }
e1901655
IZ
1709 if (flags & SCF_WHILEM_VISITED_POS)
1710 f |= SCF_WHILEM_VISITED_POS;
a3621e74 1711
653099ff 1712 /* we suppose the run is continuous, last=next...*/
830247a4 1713 minnext = study_chunk(pRExC_state, &scan, &deltanext,
a3621e74 1714 next, &data_fake, f,depth+1);
b81d288d 1715 if (min1 > minnext)
c277df42
IZ
1716 min1 = minnext;
1717 if (max1 < minnext + deltanext)
1718 max1 = minnext + deltanext;
1719 if (deltanext == I32_MAX)
aca2d497 1720 is_inf = is_inf_internal = 1;
c277df42
IZ
1721 scan = next;
1722 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
1723 pars++;
405ff068 1724 if (data && (data_fake.flags & SF_HAS_EVAL))
c277df42 1725 data->flags |= SF_HAS_EVAL;
2c2d71f5
JH
1726 if (data)
1727 data->whilem_c = data_fake.whilem_c;
653099ff 1728 if (flags & SCF_DO_STCLASS)
830247a4 1729 cl_or(pRExC_state, &accum, &this_class);
b81d288d 1730 if (code == SUSPEND)
c277df42
IZ
1731 break;
1732 }
1733 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
1734 min1 = 0;
1735 if (flags & SCF_DO_SUBSTR) {
1736 data->pos_min += min1;
1737 data->pos_delta += max1 - min1;
1738 if (max1 != min1 || is_inf)
1739 data->longest = &(data->longest_float);
1740 }
1741 min += min1;
1742 delta += max1 - min1;
653099ff 1743 if (flags & SCF_DO_STCLASS_OR) {
830247a4 1744 cl_or(pRExC_state, data->start_class, &accum);
653099ff
GS
1745 if (min1) {
1746 cl_and(data->start_class, &and_with);
1747 flags &= ~SCF_DO_STCLASS;
1748 }
1749 }
1750 else if (flags & SCF_DO_STCLASS_AND) {
de0c8cb8
GS
1751 if (min1) {
1752 cl_and(data->start_class, &accum);
653099ff 1753 flags &= ~SCF_DO_STCLASS;
de0c8cb8
GS
1754 }
1755 else {
b81d288d 1756 /* Switch to OR mode: cache the old value of
de0c8cb8
GS
1757 * data->start_class */
1758 StructCopy(data->start_class, &and_with,
1759 struct regnode_charclass_class);
1760 flags &= ~SCF_DO_STCLASS_AND;
1761 StructCopy(&accum, data->start_class,
1762 struct regnode_charclass_class);
1763 flags |= SCF_DO_STCLASS_OR;
1764 data->start_class->flags |= ANYOF_EOS;
1765 }
653099ff 1766 }
a3621e74
YO
1767
1768 /* demq.
1769
1770 Assuming this was/is a branch we are dealing with: 'scan' now
1771 points at the item that follows the branch sequence, whatever
1772 it is. We now start at the beginning of the sequence and look
1773 for subsequences of
1774
1775 BRANCH->EXACT=>X
1776 BRANCH->EXACT=>X
1777
1778 which would be constructed from a pattern like /A|LIST|OF|WORDS/
1779
1780 If we can find such a subseqence we need to turn the first
1781 element into a trie and then add the subsequent branch exact
1782 strings to the trie.
1783
1784 We have two cases
1785
1786 1. patterns where the whole set of branch can be converted to a trie,
1787
1788 2. patterns where only a subset of the alternations can be
1789 converted to a trie.
1790
1791 In case 1 we can replace the whole set with a single regop
1792 for the trie. In case 2 we need to keep the start and end
1793 branchs so
1794
1795 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
1796 becomes BRANCH TRIE; BRANCH X;
1797
1798 Hypthetically when we know the regex isnt anchored we can
1799 turn a case 1 into a DFA and let it rip... Every time it finds a match
1800 it would just call its tail, no WHILEM/CURLY needed.
1801
1802 */
0111c4fd
RGS
1803 if (DO_TRIE) {
1804 if (!re_trie_maxbuff) {
1805 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
1806 if (!SvIOK(re_trie_maxbuff))
1807 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
1808 }
a3621e74
YO
1809 if ( SvIV(re_trie_maxbuff)>=0 && OP( startbranch )==BRANCH ) {
1810 regnode *cur;
1811 regnode *first = (regnode *)NULL;
1812 regnode *last = (regnode *)NULL;
1813 regnode *tail = scan;
1814 U8 optype = 0;
1815 U32 count=0;
1816
1817#ifdef DEBUGGING
c445ea15 1818 SV * const mysv = sv_newmortal(); /* for dumping */
a3621e74
YO
1819#endif
1820 /* var tail is used because there may be a TAIL
1821 regop in the way. Ie, the exacts will point to the
1822 thing following the TAIL, but the last branch will
1823 point at the TAIL. So we advance tail. If we
1824 have nested (?:) we may have to move through several
1825 tails.
1826 */
1827
1828 while ( OP( tail ) == TAIL ) {
1829 /* this is the TAIL generated by (?:) */
1830 tail = regnext( tail );
1831 }
1832
1833 DEBUG_OPTIMISE_r({
1834 regprop( mysv, tail );
1835 PerlIO_printf( Perl_debug_log, "%*s%s%s%s\n",
cfd0369c 1836 (int)depth * 2 + 2, "", "Tail node is:", SvPV_nolen_const( mysv ),
a3621e74
YO
1837 (RExC_seen_evals) ? "[EVAL]" : ""
1838 );
1839 });
1840 /*
1841
1842 step through the branches, cur represents each
1843 branch, noper is the first thing to be matched
1844 as part of that branch and noper_next is the
1845 regnext() of that node. if noper is an EXACT
1846 and noper_next is the same as scan (our current
1847 position in the regex) then the EXACT branch is
1848 a possible optimization target. Once we have
1849 two or more consequetive such branches we can
1850 create a trie of the EXACT's contents and stich
1851 it in place. If the sequence represents all of
1852 the branches we eliminate the whole thing and
1853 replace it with a single TRIE. If it is a
1854 subsequence then we need to stitch it in. This
1855 means the first branch has to remain, and needs
1856 to be repointed at the item on the branch chain
1857 following the last branch optimized. This could
1858 be either a BRANCH, in which case the
1859 subsequence is internal, or it could be the
1860 item following the branch sequence in which
1861 case the subsequence is at the end.
1862
1863 */
1864
1865 /* dont use tail as the end marker for this traverse */
1866 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
aec46f14
AL
1867 regnode * const noper = NEXTOPER( cur );
1868 regnode * const noper_next = regnext( noper );
a3621e74 1869
a3621e74
YO
1870 DEBUG_OPTIMISE_r({
1871 regprop( mysv, cur);
1872 PerlIO_printf( Perl_debug_log, "%*s%s",
cfd0369c 1873 (int)depth * 2 + 2," ", SvPV_nolen_const( mysv ) );
a3621e74
YO
1874
1875 regprop( mysv, noper);
1876 PerlIO_printf( Perl_debug_log, " -> %s",
cfd0369c 1877 SvPV_nolen_const(mysv));
a3621e74
YO
1878
1879 if ( noper_next ) {
1880 regprop( mysv, noper_next );
1881 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
cfd0369c 1882 SvPV_nolen_const(mysv));
a3621e74
YO
1883 }
1884 PerlIO_printf( Perl_debug_log, "0x%p,0x%p,0x%p)\n",
1885 first, last, cur );
1886 });
1887 if ( ( first ? OP( noper ) == optype
1888 : PL_regkind[ (U8)OP( noper ) ] == EXACT )
1889 && noper_next == tail && count<U16_MAX)
1890 {
1891 count++;
1892 if ( !first ) {
1893 first = cur;
1894 optype = OP( noper );
1895 } else {
1896 DEBUG_OPTIMISE_r(
1897 if (!last ) {
1898 regprop( mysv, first);
1899 PerlIO_printf( Perl_debug_log, "%*s%s",
cfd0369c 1900 (int)depth * 2 + 2, "F:", SvPV_nolen_const( mysv ) );
a3621e74
YO
1901 regprop( mysv, NEXTOPER(first) );
1902 PerlIO_printf( Perl_debug_log, " -> %s\n",
cfd0369c 1903 SvPV_nolen_const( mysv ) );
a3621e74
YO
1904 }
1905 );
1906 last = cur;
1907 DEBUG_OPTIMISE_r({
1908 regprop( mysv, cur);
1909 PerlIO_printf( Perl_debug_log, "%*s%s",
cfd0369c 1910 (int)depth * 2 + 2, "N:", SvPV_nolen_const( mysv ) );
a3621e74
YO
1911 regprop( mysv, noper );
1912 PerlIO_printf( Perl_debug_log, " -> %s\n",
cfd0369c 1913 SvPV_nolen_const( mysv ) );
a3621e74
YO
1914 });
1915 }
1916 } else {
1917 if ( last ) {
1918 DEBUG_OPTIMISE_r(
1919 PerlIO_printf( Perl_debug_log, "%*s%s\n",
e4584336 1920 (int)depth * 2 + 2, "E:", "**END**" );
a3621e74
YO
1921 );
1922 make_trie( pRExC_state, startbranch, first, cur, tail, optype );
1923 }
1924 if ( PL_regkind[ (U8)OP( noper ) ] == EXACT
1925 && noper_next == tail )
1926 {
1927 count = 1;
1928 first = cur;
1929 optype = OP( noper );
1930 } else {
1931 count = 0;
1932 first = NULL;
1933 optype = 0;
1934 }
1935 last = NULL;
1936 }
1937 }
1938 DEBUG_OPTIMISE_r({
1939 regprop( mysv, cur);
1940 PerlIO_printf( Perl_debug_log,
e4584336 1941 "%*s%s\t(0x%p,0x%p,0x%p)\n", (int)depth * 2 + 2,
cfd0369c 1942 " ", SvPV_nolen_const( mysv ), first, last, cur);
a3621e74
YO
1943
1944 });
1945 if ( last ) {
1946 DEBUG_OPTIMISE_r(
1947 PerlIO_printf( Perl_debug_log, "%*s%s\n",
e4584336 1948 (int)depth * 2 + 2, "E:", "==END==" );
a3621e74
YO
1949 );
1950 make_trie( pRExC_state, startbranch, first, scan, tail, optype );
1951 }
1952 }
1953 }
a0ed51b3 1954 }
a3621e74 1955 else if ( code == BRANCHJ ) { /* single branch is optimized. */
c277df42 1956 scan = NEXTOPER(NEXTOPER(scan));
a3621e74 1957 } else /* single branch is optimized. */
c277df42
IZ
1958 scan = NEXTOPER(scan);
1959 continue;
a0ed51b3
LW
1960 }
1961 else if (OP(scan) == EXACT) {
cd439c50 1962 I32 l = STR_LEN(scan);
c445ea15 1963 UV uc;
a0ed51b3 1964 if (UTF) {
a3b680e6 1965 const U8 * const s = (U8*)STRING(scan);
1aa99e6b 1966 l = utf8_length(s, s + l);
9041c2e3 1967 uc = utf8_to_uvchr(s, NULL);
c445ea15
AL
1968 } else {
1969 uc = *((U8*)STRING(scan));
a0ed51b3
LW
1970 }
1971 min += l;
c277df42 1972 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
c277df42
IZ
1973 /* The code below prefers earlier match for fixed
1974 offset, later match for variable offset. */
1975 if (data->last_end == -1) { /* Update the start info. */
1976 data->last_start_min = data->pos_min;
1977 data->last_start_max = is_inf
b81d288d 1978 ? I32_MAX : data->pos_min + data->pos_delta;
c277df42 1979 }
cd439c50 1980 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
0eda9292 1981 {
9a957fbc 1982 SV * const sv = data->last_found;
a28509cc 1983 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
0eda9292
JH
1984 mg_find(sv, PERL_MAGIC_utf8) : NULL;
1985 if (mg && mg->mg_len >= 0)
5e43f467
JH
1986 mg->mg_len += utf8_length((U8*)STRING(scan),
1987 (U8*)STRING(scan)+STR_LEN(scan));
0eda9292 1988 }
33b8afdf
JH
1989 if (UTF)
1990 SvUTF8_on(data->last_found);
c277df42
IZ
1991 data->last_end = data->pos_min + l;
1992 data->pos_min += l; /* As in the first entry. */
1993 data->flags &= ~SF_BEFORE_EOL;
1994 }
653099ff
GS
1995 if (flags & SCF_DO_STCLASS_AND) {
1996 /* Check whether it is compatible with what we know already! */
1997 int compat = 1;
1998
1aa99e6b 1999 if (uc >= 0x100 ||
516a5887 2000 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
1aa99e6b 2001 && !ANYOF_BITMAP_TEST(data->start_class, uc)
653099ff 2002 && (!(data->start_class->flags & ANYOF_FOLD)
1aa99e6b 2003 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
516a5887 2004 )
653099ff
GS
2005 compat = 0;
2006 ANYOF_CLASS_ZERO(data->start_class);
2007 ANYOF_BITMAP_ZERO(data->start_class);
2008 if (compat)
1aa99e6b 2009 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff 2010 data->start_class->flags &= ~ANYOF_EOS;
9b877dbb
IH
2011 if (uc < 0x100)
2012 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
653099ff
GS
2013 }
2014 else if (flags & SCF_DO_STCLASS_OR) {
2015 /* false positive possible if the class is case-folded */
1aa99e6b 2016 if (uc < 0x100)
9b877dbb
IH
2017 ANYOF_BITMAP_SET(data->start_class, uc);
2018 else
2019 data->start_class->flags |= ANYOF_UNICODE_ALL;
653099ff
GS
2020 data->start_class->flags &= ~ANYOF_EOS;
2021 cl_and(data->start_class, &and_with);
2022 }
2023 flags &= ~SCF_DO_STCLASS;
a0ed51b3 2024 }
653099ff 2025 else if (PL_regkind[(U8)OP(scan)] == EXACT) { /* But OP != EXACT! */
cd439c50 2026 I32 l = STR_LEN(scan);
1aa99e6b 2027 UV uc = *((U8*)STRING(scan));
653099ff
GS
2028
2029 /* Search for fixed substrings supports EXACT only. */
b81d288d 2030 if (flags & SCF_DO_SUBSTR)
830247a4 2031 scan_commit(pRExC_state, data);
a0ed51b3 2032 if (UTF) {
6136c704 2033 const U8 * const s = (U8 *)STRING(scan);
1aa99e6b 2034 l = utf8_length(s, s + l);
9041c2e3 2035 uc = utf8_to_uvchr(s, NULL);
a0ed51b3
LW
2036 }
2037 min += l;
c277df42 2038 if (data && (flags & SCF_DO_SUBSTR))
a0ed51b3 2039 data->pos_min += l;
653099ff
GS
2040 if (flags & SCF_DO_STCLASS_AND) {
2041 /* Check whether it is compatible with what we know already! */
2042 int compat = 1;
2043
1aa99e6b 2044 if (uc >= 0x100 ||
516a5887 2045 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
1aa99e6b 2046 && !ANYOF_BITMAP_TEST(data->start_class, uc)
516a5887 2047 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
653099ff
GS
2048 compat = 0;
2049 ANYOF_CLASS_ZERO(data->start_class);
2050 ANYOF_BITMAP_ZERO(data->start_class);
2051 if (compat) {
1aa99e6b 2052 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff
GS
2053 data->start_class->flags &= ~ANYOF_EOS;
2054 data->start_class->flags |= ANYOF_FOLD;
2055 if (OP(scan) == EXACTFL)
2056 data->start_class->flags |= ANYOF_LOCALE;
2057 }
2058 }
2059 else if (flags & SCF_DO_STCLASS_OR) {
2060 if (data->start_class->flags & ANYOF_FOLD) {
2061 /* false positive possible if the class is case-folded.
2062 Assume that the locale settings are the same... */
1aa99e6b
IH
2063 if (uc < 0x100)
2064 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff
GS
2065 data->start_class->flags &= ~ANYOF_EOS;
2066 }
2067 cl_and(data->start_class, &and_with);
2068 }
2069 flags &= ~SCF_DO_STCLASS;
a0ed51b3 2070 }
bfed75c6 2071 else if (strchr((const char*)PL_varies,OP(scan))) {
9c5ffd7c 2072 I32 mincount, maxcount, minnext, deltanext, fl = 0;
aa7a4b56 2073 I32 f = flags, pos_before = 0;
c277df42 2074 regnode *oscan = scan;
653099ff
GS
2075 struct regnode_charclass_class this_class;
2076 struct regnode_charclass_class *oclass = NULL;
727f22e3 2077 I32 next_is_eval = 0;
653099ff 2078
22c35a8c 2079 switch (PL_regkind[(U8)OP(scan)]) {
653099ff 2080 case WHILEM: /* End of (?:...)* . */
c277df42
IZ
2081 scan = NEXTOPER(scan);
2082 goto finish;
2083 case PLUS:
653099ff 2084 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
c277df42 2085 next = NEXTOPER(scan);
653099ff 2086 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
b81d288d
AB
2087 mincount = 1;
2088 maxcount = REG_INFTY;
c277df42
IZ
2089 next = regnext(scan);
2090 scan = NEXTOPER(scan);
2091 goto do_curly;
2092 }
2093 }
2094 if (flags & SCF_DO_SUBSTR)
2095 data->pos_min++;
2096 min++;
2097 /* Fall through. */
2098 case STAR:
653099ff
GS
2099 if (flags & SCF_DO_STCLASS) {
2100 mincount = 0;
b81d288d 2101 maxcount = REG_INFTY;
653099ff
GS
2102 next = regnext(scan);
2103 scan = NEXTOPER(scan);
2104 goto do_curly;
2105 }
b81d288d 2106 is_inf = is_inf_internal = 1;
c277df42
IZ
2107 scan = regnext(scan);
2108 if (flags & SCF_DO_SUBSTR) {
830247a4 2109 scan_commit(pRExC_state, data); /* Cannot extend fixed substrings */
c277df42
IZ
2110 data->longest = &(data->longest_float);
2111 }
2112 goto optimize_curly_tail;
2113 case CURLY:
b81d288d 2114 mincount = ARG1(scan);
c277df42
IZ
2115 maxcount = ARG2(scan);
2116 next = regnext(scan);
cb434fcc
IZ
2117 if (OP(scan) == CURLYX) {
2118 I32 lp = (data ? *(data->last_closep) : 0);
a3621e74 2119 scan->flags = ((lp <= U8_MAX) ? (U8)lp : U8_MAX);
cb434fcc 2120 }
c277df42 2121 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
727f22e3 2122 next_is_eval = (OP(scan) == EVAL);
c277df42
IZ
2123 do_curly:
2124 if (flags & SCF_DO_SUBSTR) {
830247a4 2125 if (mincount == 0) scan_commit(pRExC_state,data); /* Cannot extend fixed substrings */
c277df42
IZ
2126 pos_before = data->pos_min;
2127 }
2128 if (data) {
2129 fl = data->flags;
2130 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
2131 if (is_inf)
2132 data->flags |= SF_IS_INF;
2133 }
653099ff 2134 if (flags & SCF_DO_STCLASS) {
830247a4 2135 cl_init(pRExC_state, &this_class);
653099ff
GS
2136 oclass = data->start_class;
2137 data->start_class = &this_class;
2138 f |= SCF_DO_STCLASS_AND;
2139 f &= ~SCF_DO_STCLASS_OR;
2140 }
e1901655
IZ
2141 /* These are the cases when once a subexpression
2142 fails at a particular position, it cannot succeed
2143 even after backtracking at the enclosing scope.
b81d288d 2144
e1901655
IZ
2145 XXXX what if minimal match and we are at the
2146 initial run of {n,m}? */
2147 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
2148 f &= ~SCF_WHILEM_VISITED_POS;
653099ff 2149
c277df42 2150 /* This will finish on WHILEM, setting scan, or on NULL: */
b81d288d 2151 minnext = study_chunk(pRExC_state, &scan, &deltanext, last, data,
a3621e74
YO
2152 (mincount == 0
2153 ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
653099ff
GS
2154
2155 if (flags & SCF_DO_STCLASS)
2156 data->start_class = oclass;
2157 if (mincount == 0 || minnext == 0) {
2158 if (flags & SCF_DO_STCLASS_OR) {
830247a4 2159 cl_or(pRExC_state, data->start_class, &this_class);
653099ff
GS
2160 }
2161 else if (flags & SCF_DO_STCLASS_AND) {
b81d288d 2162 /* Switch to OR mode: cache the old value of
653099ff
GS
2163 * data->start_class */
2164 StructCopy(data->start_class, &and_with,
2165 struct regnode_charclass_class);
2166 flags &= ~SCF_DO_STCLASS_AND;
2167 StructCopy(&this_class, data->start_class,
2168 struct regnode_charclass_class);
2169 flags |= SCF_DO_STCLASS_OR;
2170 data->start_class->flags |= ANYOF_EOS;
2171 }
2172 } else { /* Non-zero len */
2173 if (flags & SCF_DO_STCLASS_OR) {
830247a4 2174 cl_or(pRExC_state, data->start_class, &this_class);
653099ff
GS
2175 cl_and(data->start_class, &and_with);
2176 }
2177 else if (flags & SCF_DO_STCLASS_AND)
2178 cl_and(data->start_class, &this_class);
2179 flags &= ~SCF_DO_STCLASS;
2180 }
c277df42
IZ
2181 if (!scan) /* It was not CURLYX, but CURLY. */
2182 scan = next;
041457d9
DM
2183 if ( /* ? quantifier ok, except for (?{ ... }) */
2184 (next_is_eval || !(mincount == 0 && maxcount == 1))
84037bb0 2185 && (minnext == 0) && (deltanext == 0)
99799961 2186 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
041457d9
DM
2187 && maxcount <= REG_INFTY/3 /* Complement check for big count */
2188 && ckWARN(WARN_REGEXP))
b45f050a 2189 {
830247a4 2190 vWARN(RExC_parse,
b45f050a
JF
2191 "Quantifier unexpected on zero-length expression");
2192 }
2193
c277df42 2194 min += minnext * mincount;
b81d288d 2195 is_inf_internal |= ((maxcount == REG_INFTY
155aba94
GS
2196 && (minnext + deltanext) > 0)
2197 || deltanext == I32_MAX);
aca2d497 2198 is_inf |= is_inf_internal;
c277df42
IZ
2199 delta += (minnext + deltanext) * maxcount - minnext * mincount;
2200
2201 /* Try powerful optimization CURLYX => CURLYN. */
b81d288d 2202 if ( OP(oscan) == CURLYX && data
c277df42
IZ
2203 && data->flags & SF_IN_PAR
2204 && !(data->flags & SF_HAS_EVAL)
2205 && !deltanext && minnext == 1 ) {
2206 /* Try to optimize to CURLYN. */
2207 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
497b47a8
JH
2208 regnode *nxt1 = nxt;
2209#ifdef DEBUGGING
2210 regnode *nxt2;
2211#endif
c277df42
IZ
2212
2213 /* Skip open. */
2214 nxt = regnext(nxt);
bfed75c6 2215 if (!strchr((const char*)PL_simple,OP(nxt))
22c35a8c 2216 && !(PL_regkind[(U8)OP(nxt)] == EXACT
b81d288d 2217 && STR_LEN(nxt) == 1))
c277df42 2218 goto nogo;
497b47a8 2219#ifdef DEBUGGING
c277df42 2220 nxt2 = nxt;
497b47a8 2221#endif
c277df42 2222 nxt = regnext(nxt);
b81d288d 2223 if (OP(nxt) != CLOSE)
c277df42
IZ
2224 goto nogo;
2225 /* Now we know that nxt2 is the only contents: */
eb160463 2226 oscan->flags = (U8)ARG(nxt);
c277df42
IZ
2227 OP(oscan) = CURLYN;
2228 OP(nxt1) = NOTHING; /* was OPEN. */
2229#ifdef DEBUGGING
2230 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
2231 NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
2232 NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
2233 OP(nxt) = OPTIMIZED; /* was CLOSE. */
2234 OP(nxt + 1) = OPTIMIZED; /* was count. */
2235 NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
b81d288d 2236#endif
c277df42 2237 }
c277df42
IZ
2238 nogo:
2239
2240 /* Try optimization CURLYX => CURLYM. */
b81d288d 2241 if ( OP(oscan) == CURLYX && data
c277df42 2242 && !(data->flags & SF_HAS_PAR)
c277df42 2243 && !(data->flags & SF_HAS_EVAL)
0e788c72
HS
2244 && !deltanext /* atom is fixed width */
2245 && minnext != 0 /* CURLYM can't handle zero width */
2246 ) {
c277df42
IZ
2247 /* XXXX How to optimize if data == 0? */
2248 /* Optimize to a simpler form. */
2249 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
2250 regnode *nxt2;
2251
2252 OP(oscan) = CURLYM;
2253 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
b81d288d 2254 && (OP(nxt2) != WHILEM))
c277df42
IZ
2255 nxt = nxt2;
2256 OP(nxt2) = SUCCEED; /* Whas WHILEM */
c277df42
IZ
2257 /* Need to optimize away parenths. */
2258 if (data->flags & SF_IN_PAR) {
2259 /* Set the parenth number. */
2260 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
2261
b81d288d 2262 if (OP(nxt) != CLOSE)
b45f050a 2263 FAIL("Panic opt close");
eb160463 2264 oscan->flags = (U8)ARG(nxt);
c277df42
IZ
2265 OP(nxt1) = OPTIMIZED; /* was OPEN. */
2266 OP(nxt) = OPTIMIZED; /* was CLOSE. */
2267#ifdef DEBUGGING
2268 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
2269 OP(nxt + 1) = OPTIMIZED; /* was count. */
2270 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
2271 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
b81d288d 2272#endif
c277df42
IZ
2273#if 0
2274 while ( nxt1 && (OP(nxt1) != WHILEM)) {
2275 regnode *nnxt = regnext(nxt1);
b81d288d 2276
c277df42
IZ
2277 if (nnxt == nxt) {
2278 if (reg_off_by_arg[OP(nxt1)])
2279 ARG_SET(nxt1, nxt2 - nxt1);
2280 else if (nxt2 - nxt1 < U16_MAX)
2281 NEXT_OFF(nxt1) = nxt2 - nxt1;
2282 else
2283 OP(nxt) = NOTHING; /* Cannot beautify */
2284 }
2285 nxt1 = nnxt;
2286 }
2287#endif
2288 /* Optimize again: */
b81d288d 2289 study_chunk(pRExC_state, &nxt1, &deltanext, nxt,
a3621e74 2290 NULL, 0,depth+1);
a0ed51b3
LW
2291 }
2292 else
c277df42 2293 oscan->flags = 0;
c277df42 2294 }
e1901655
IZ
2295 else if ((OP(oscan) == CURLYX)
2296 && (flags & SCF_WHILEM_VISITED_POS)
2297 /* See the comment on a similar expression above.
2298 However, this time it not a subexpression
2299 we care about, but the expression itself. */
2300 && (maxcount == REG_INFTY)
2301 && data && ++data->whilem_c < 16) {
2302 /* This stays as CURLYX, we can put the count/of pair. */
2c2d71f5
JH
2303 /* Find WHILEM (as in regexec.c) */
2304 regnode *nxt = oscan + NEXT_OFF(oscan);
2305
2306 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
2307 nxt += ARG(nxt);
eb160463
GS
2308 PREVOPER(nxt)->flags = (U8)(data->whilem_c
2309 | (RExC_whilem_seen << 4)); /* On WHILEM */
2c2d71f5 2310 }
b81d288d 2311 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
c277df42
IZ
2312 pars++;
2313 if (flags & SCF_DO_SUBSTR) {
c445ea15 2314 SV *last_str = NULL;
c277df42
IZ
2315 int counted = mincount != 0;
2316
2317 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
5d1c421c
JH
2318#if defined(SPARC64_GCC_WORKAROUND)
2319 I32 b = 0;
2320 STRLEN l = 0;
cfd0369c 2321 const char *s = NULL;
5d1c421c
JH
2322 I32 old = 0;
2323
2324 if (pos_before >= data->last_start_min)
2325 b = pos_before;
2326 else
2327 b = data->last_start_min;
2328
2329 l = 0;
cfd0369c 2330 s = SvPV_const(data->last_found, l);
5d1c421c
JH
2331 old = b - data->last_start_min;
2332
2333#else
b81d288d 2334 I32 b = pos_before >= data->last_start_min
c277df42
IZ
2335 ? pos_before : data->last_start_min;
2336 STRLEN l;
cfd0369c 2337 const char *s = SvPV_const(data->last_found, l);
a0ed51b3 2338 I32 old = b - data->last_start_min;
5d1c421c 2339#endif
a0ed51b3
LW
2340
2341 if (UTF)
2342 old = utf8_hop((U8*)s, old) - (U8*)s;
c277df42 2343
a0ed51b3 2344 l -= old;
c277df42 2345 /* Get the added string: */
79cb57f6 2346 last_str = newSVpvn(s + old, l);
0e933229
IH
2347 if (UTF)
2348 SvUTF8_on(last_str);
c277df42
IZ
2349 if (deltanext == 0 && pos_before == b) {
2350 /* What was added is a constant string */
2351 if (mincount > 1) {
2352 SvGROW(last_str, (mincount * l) + 1);
b81d288d 2353 repeatcpy(SvPVX(last_str) + l,
3f7c398e 2354 SvPVX_const(last_str), l, mincount - 1);
b162af07 2355 SvCUR_set(last_str, SvCUR(last_str) * mincount);
c277df42 2356 /* Add additional parts. */
b81d288d 2357 SvCUR_set(data->last_found,
c277df42
IZ
2358 SvCUR(data->last_found) - l);
2359 sv_catsv(data->last_found, last_str);
0eda9292
JH
2360 {
2361 SV * sv = data->last_found;
2362 MAGIC *mg =
2363 SvUTF8(sv) && SvMAGICAL(sv) ?
2364 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2365 if (mg && mg->mg_len >= 0)
2366 mg->mg_len += CHR_SVLEN(last_str);
2367 }
c277df42
IZ
2368 data->last_end += l * (mincount - 1);
2369 }
2a8d9689
HS
2370 } else {
2371 /* start offset must point into the last copy */
2372 data->last_start_min += minnext * (mincount - 1);
c152dc43
HS
2373 data->last_start_max += is_inf ? I32_MAX
2374 : (maxcount - 1) * (minnext + data->pos_delta);
c277df42
IZ
2375 }
2376 }
2377 /* It is counted once already... */
2378 data->pos_min += minnext * (mincount - counted);
2379 data->pos_delta += - counted * deltanext +
2380 (minnext + deltanext) * maxcount - minnext * mincount;
2381 if (mincount != maxcount) {
653099ff
GS
2382 /* Cannot extend fixed substrings found inside
2383 the group. */
830247a4 2384 scan_commit(pRExC_state,data);
c277df42
IZ
2385 if (mincount && last_str) {
2386 sv_setsv(data->last_found, last_str);
2387 data->last_end = data->pos_min;
b81d288d 2388 data->last_start_min =
a0ed51b3 2389 data->pos_min - CHR_SVLEN(last_str);
b81d288d
AB
2390 data->last_start_max = is_inf
2391 ? I32_MAX
c277df42 2392 : data->pos_min + data->pos_delta
a0ed51b3 2393 - CHR_SVLEN(last_str);
c277df42
IZ
2394 }
2395 data->longest = &(data->longest_float);
2396 }
aca2d497 2397 SvREFCNT_dec(last_str);
c277df42 2398 }
405ff068 2399 if (data && (fl & SF_HAS_EVAL))
c277df42
IZ
2400 data->flags |= SF_HAS_EVAL;
2401 optimize_curly_tail:
c277df42 2402 if (OP(oscan) != CURLYX) {
22c35a8c 2403 while (PL_regkind[(U8)OP(next = regnext(oscan))] == NOTHING
c277df42
IZ
2404 && NEXT_OFF(next))
2405 NEXT_OFF(oscan) += NEXT_OFF(next);
2406 }
c277df42 2407 continue;
653099ff 2408 default: /* REF and CLUMP only? */
c277df42 2409 if (flags & SCF_DO_SUBSTR) {
830247a4 2410 scan_commit(pRExC_state,data); /* Cannot expect anything... */
c277df42
IZ
2411 data->longest = &(data->longest_float);
2412 }
aca2d497 2413 is_inf = is_inf_internal = 1;
653099ff 2414 if (flags & SCF_DO_STCLASS_OR)
830247a4 2415 cl_anything(pRExC_state, data->start_class);
653099ff 2416 flags &= ~SCF_DO_STCLASS;
c277df42
IZ
2417 break;
2418 }
a0ed51b3 2419 }
bfed75c6 2420 else if (strchr((const char*)PL_simple,OP(scan))) {
9c5ffd7c 2421 int value = 0;
653099ff 2422
c277df42 2423 if (flags & SCF_DO_SUBSTR) {
830247a4 2424 scan_commit(pRExC_state,data);
c277df42
IZ
2425 data->pos_min++;
2426 }
2427 min++;
653099ff
GS
2428 if (flags & SCF_DO_STCLASS) {
2429 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
2430
2431 /* Some of the logic below assumes that switching
2432 locale on will only add false positives. */
2433 switch (PL_regkind[(U8)OP(scan)]) {
653099ff 2434 case SANY:
653099ff
GS
2435 default:
2436 do_default:
2437 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
2438 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
830247a4 2439 cl_anything(pRExC_state, data->start_class);
653099ff
GS
2440 break;
2441 case REG_ANY:
2442 if (OP(scan) == SANY)
2443 goto do_default;
2444 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
2445 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
2446 || (data->start_class->flags & ANYOF_CLASS));
830247a4 2447 cl_anything(pRExC_state, data->start_class);
653099ff
GS
2448 }
2449 if (flags & SCF_DO_STCLASS_AND || !value)
2450 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
2451 break;
2452 case ANYOF:
2453 if (flags & SCF_DO_STCLASS_AND)
2454 cl_and(data->start_class,
2455 (struct regnode_charclass_class*)scan);
2456 else
830247a4 2457 cl_or(pRExC_state, data->start_class,
653099ff
GS
2458 (struct regnode_charclass_class*)scan);
2459 break;
2460 case ALNUM:
2461 if (flags & SCF_DO_STCLASS_AND) {
2462 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2463 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
2464 for (value = 0; value < 256; value++)
2465 if (!isALNUM(value))
2466 ANYOF_BITMAP_CLEAR(data->start_class, value);
2467 }
2468 }
2469 else {
2470 if (data->start_class->flags & ANYOF_LOCALE)
2471 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
2472 else {
2473 for (value = 0; value < 256; value++)
2474 if (isALNUM(value))
b81d288d 2475 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
2476 }
2477 }
2478 break;
2479 case ALNUML:
2480 if (flags & SCF_DO_STCLASS_AND) {
2481 if (data->start_class->flags & ANYOF_LOCALE)
2482 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
2483 }
2484 else {
2485 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
2486 data->start_class->flags |= ANYOF_LOCALE;
2487 }
2488 break;
2489 case NALNUM:
2490 if (flags & SCF_DO_STCLASS_AND) {
2491 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2492 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
2493 for (value = 0; value < 256; value++)
2494 if (isALNUM(value))
2495 ANYOF_BITMAP_CLEAR(data->start_class, value);
2496 }
2497 }
2498 else {
2499 if (data->start_class->flags & ANYOF_LOCALE)
2500 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
2501 else {
2502 for (value = 0; value < 256; value++)
2503 if (!isALNUM(value))
b81d288d 2504 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
2505 }
2506 }
2507 break;
2508 case NALNUML:
2509 if (flags & SCF_DO_STCLASS_AND) {
2510 if (data->start_class->flags & ANYOF_LOCALE)
2511 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
2512 }
2513 else {
2514 data->start_class->flags |= ANYOF_LOCALE;
2515 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
2516 }
2517 break;
2518 case SPACE:
2519 if (flags & SCF_DO_STCLASS_AND) {
2520 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2521 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
2522 for (value = 0; value < 256; value++)
2523 if (!isSPACE(value))
2524 ANYOF_BITMAP_CLEAR(data->start_class, value);
2525 }
2526 }
2527 else {
2528 if (data->start_class->flags & ANYOF_LOCALE)
2529 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
2530 else {
2531 for (value = 0; value < 256; value++)
2532 if (isSPACE(value))
b81d288d 2533 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
2534 }
2535 }
2536 break;
2537 case SPACEL:
2538 if (flags & SCF_DO_STCLASS_AND) {
2539 if (data->start_class->flags & ANYOF_LOCALE)
2540 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
2541 }
2542 else {
2543 data->start_class->flags |= ANYOF_LOCALE;
2544 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
2545 }
2546 break;
2547 case NSPACE:
2548 if (flags & SCF_DO_STCLASS_AND) {
2549 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2550 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
2551 for (value = 0; value < 256; value++)
2552 if (isSPACE(value))
2553 ANYOF_BITMAP_CLEAR(data->start_class, value);
2554 }
2555 }
2556 else {
2557 if (data->start_class->flags & ANYOF_LOCALE)
2558 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
2559 else {
2560 for (value = 0; value < 256; value++)
2561 if (!isSPACE(value))
b81d288d 2562 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
2563 }
2564 }
2565 break;
2566 case NSPACEL:
2567 if (flags & SCF_DO_STCLASS_AND) {
2568 if (data->start_class->flags & ANYOF_LOCALE) {
2569 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
2570 for (value = 0; value < 256; value++)
2571 if (!isSPACE(value))
2572 ANYOF_BITMAP_CLEAR(data->start_class, value);
2573 }
2574 }
2575 else {
2576 data->start_class->flags |= ANYOF_LOCALE;
2577 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
2578 }
2579 break;
2580 case DIGIT:
2581 if (flags & SCF_DO_STCLASS_AND) {
2582 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
2583 for (value = 0; value < 256; value++)
2584 if (!isDIGIT(value))
2585 ANYOF_BITMAP_CLEAR(data->start_class, value);
2586 }
2587 else {
2588 if (data->start_class->flags & ANYOF_LOCALE)
2589 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
2590 else {
2591 for (value = 0; value < 256; value++)
2592 if (isDIGIT(value))
b81d288d 2593 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
2594 }
2595 }
2596 break;
2597 case NDIGIT:
2598 if (flags & SCF_DO_STCLASS_AND) {
2599 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
2600 for (value = 0; value < 256; value++)
2601 if (isDIGIT(value))
2602 ANYOF_BITMAP_CLEAR(data->start_class, value);
2603 }
2604 else {
2605 if (data->start_class->flags & ANYOF_LOCALE)
2606 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
2607 else {
2608 for (value = 0; value < 256; value++)
2609 if (!isDIGIT(value))
b81d288d 2610 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
2611 }
2612 }
2613 break;
2614 }
2615 if (flags & SCF_DO_STCLASS_OR)
2616 cl_and(data->start_class, &and_with);
2617 flags &= ~SCF_DO_STCLASS;
2618 }
a0ed51b3 2619 }
22c35a8c 2620 else if (PL_regkind[(U8)OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
c277df42
IZ
2621 data->flags |= (OP(scan) == MEOL
2622 ? SF_BEFORE_MEOL
2623 : SF_BEFORE_SEOL);
a0ed51b3 2624 }
653099ff
GS
2625 else if ( PL_regkind[(U8)OP(scan)] == BRANCHJ
2626 /* Lookbehind, or need to calculate parens/evals/stclass: */
2627 && (scan->flags || data || (flags & SCF_DO_STCLASS))
c277df42 2628 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
653099ff 2629 /* Lookahead/lookbehind */
cb434fcc 2630 I32 deltanext, minnext, fake = 0;
c277df42 2631 regnode *nscan;
653099ff
GS
2632 struct regnode_charclass_class intrnl;
2633 int f = 0;
c277df42
IZ
2634
2635 data_fake.flags = 0;
b81d288d 2636 if (data) {
2c2d71f5 2637 data_fake.whilem_c = data->whilem_c;
cb434fcc
IZ
2638 data_fake.last_closep = data->last_closep;
2639 }
2640 else
2641 data_fake.last_closep = &fake;
653099ff
GS
2642 if ( flags & SCF_DO_STCLASS && !scan->flags
2643 && OP(scan) == IFMATCH ) { /* Lookahead */
830247a4 2644 cl_init(pRExC_state, &intrnl);
653099ff 2645 data_fake.start_class = &intrnl;
e1901655 2646 f |= SCF_DO_STCLASS_AND;
653099ff 2647 }
e1901655
IZ
2648 if (flags & SCF_WHILEM_VISITED_POS)
2649 f |= SCF_WHILEM_VISITED_POS;
c277df42
IZ
2650 next = regnext(scan);
2651 nscan = NEXTOPER(NEXTOPER(scan));
a3621e74 2652 minnext = study_chunk(pRExC_state, &nscan, &deltanext, last, &data_fake, f,depth+1);
c277df42
IZ
2653 if (scan->flags) {
2654 if (deltanext) {
9baa0206 2655 vFAIL("Variable length lookbehind not implemented");
a0ed51b3
LW
2656 }
2657 else if (minnext > U8_MAX) {
9baa0206 2658 vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
c277df42 2659 }
eb160463 2660 scan->flags = (U8)minnext;
c277df42
IZ
2661 }
2662 if (data && data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
2663 pars++;
405ff068 2664 if (data && (data_fake.flags & SF_HAS_EVAL))
c277df42 2665 data->flags |= SF_HAS_EVAL;
2c2d71f5
JH
2666 if (data)
2667 data->whilem_c = data_fake.whilem_c;
e1901655 2668 if (f & SCF_DO_STCLASS_AND) {
a28509cc 2669 const int was = (data->start_class->flags & ANYOF_EOS);
653099ff
GS
2670
2671 cl_and(data->start_class, &intrnl);
2672 if (was)
2673 data->start_class->flags |= ANYOF_EOS;
2674 }
a0ed51b3
LW
2675 }
2676 else if (OP(scan) == OPEN) {
c277df42 2677 pars++;
a0ed51b3 2678 }
cb434fcc 2679 else if (OP(scan) == CLOSE) {
eb160463 2680 if ((I32)ARG(scan) == is_par) {
cb434fcc 2681 next = regnext(scan);
c277df42 2682
cb434fcc
IZ
2683 if ( next && (OP(next) != WHILEM) && next < last)
2684 is_par = 0; /* Disable optimization */
2685 }
2686 if (data)
2687 *(data->last_closep) = ARG(scan);
a0ed51b3
LW
2688 }
2689 else if (OP(scan) == EVAL) {
c277df42
IZ
2690 if (data)
2691 data->flags |= SF_HAS_EVAL;
2692 }
96776eda 2693 else if (OP(scan) == LOGICAL && scan->flags == 2) { /* Embedded follows */
0f5d15d6 2694 if (flags & SCF_DO_SUBSTR) {
830247a4 2695 scan_commit(pRExC_state,data);
0f5d15d6
IZ
2696 data->longest = &(data->longest_float);
2697 }
2698 is_inf = is_inf_internal = 1;
653099ff 2699 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
830247a4 2700 cl_anything(pRExC_state, data->start_class);
96776eda 2701 flags &= ~SCF_DO_STCLASS;
0f5d15d6 2702 }
c277df42
IZ
2703 /* Else: zero-length, ignore. */
2704 scan = regnext(scan);
2705 }
2706
2707 finish:
2708 *scanp = scan;
aca2d497 2709 *deltap = is_inf_internal ? I32_MAX : delta;
b81d288d 2710 if (flags & SCF_DO_SUBSTR && is_inf)
c277df42
IZ
2711 data->pos_delta = I32_MAX - data->pos_min;
2712 if (is_par > U8_MAX)
2713 is_par = 0;
2714 if (is_par && pars==1 && data) {
2715 data->flags |= SF_IN_PAR;
2716 data->flags &= ~SF_HAS_PAR;
a0ed51b3
LW
2717 }
2718 else if (pars && data) {
c277df42
IZ
2719 data->flags |= SF_HAS_PAR;
2720 data->flags &= ~SF_IN_PAR;
2721 }
653099ff
GS
2722 if (flags & SCF_DO_STCLASS_OR)
2723 cl_and(data->start_class, &and_with);
c277df42
IZ
2724 return min;
2725}
2726
76e3520e 2727STATIC I32
5f66b61c 2728S_add_data(RExC_state_t *pRExC_state, I32 n, const char *s)
c277df42 2729{
830247a4 2730 if (RExC_rx->data) {
b81d288d
AB
2731 Renewc(RExC_rx->data,
2732 sizeof(*RExC_rx->data) + sizeof(void*) * (RExC_rx->data->count + n - 1),
c277df42 2733 char, struct reg_data);
830247a4
IZ
2734 Renew(RExC_rx->data->what, RExC_rx->data->count + n, U8);
2735 RExC_rx->data->count += n;
a0ed51b3
LW
2736 }
2737 else {
a02a5408 2738 Newxc(RExC_rx->data, sizeof(*RExC_rx->data) + sizeof(void*) * (n - 1),
c277df42 2739 char, struct reg_data);
a02a5408 2740 Newx(RExC_rx->data->what, n, U8);
830247a4 2741 RExC_rx->data->count = n;
c277df42 2742 }
830247a4
IZ
2743 Copy(s, RExC_rx->data->what + RExC_rx->data->count - n, n, U8);
2744 return RExC_rx->data->count - n;
c277df42
IZ
2745}
2746
d88dccdf 2747void
864dbfa3 2748Perl_reginitcolors(pTHX)
d88dccdf 2749{
97aff369 2750 dVAR;
1df70142 2751 const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
d88dccdf 2752 if (s) {
1df70142
AL
2753 char *t = savepv(s);
2754 int i = 0;
2755 PL_colors[0] = t;
d88dccdf 2756 while (++i < 6) {
1df70142
AL
2757 t = strchr(t, '\t');
2758 if (t) {
2759 *t = '\0';
2760 PL_colors[i] = ++t;
d88dccdf
IZ
2761 }
2762 else
1df70142 2763 PL_colors[i] = t = (char *)"";
d88dccdf
IZ
2764 }
2765 } else {
1df70142 2766 int i = 0;
b81d288d 2767 while (i < 6)
06b5626a 2768 PL_colors[i++] = (char *)"";
d88dccdf
IZ
2769 }
2770 PL_colorset = 1;
2771}
2772
8615cb43 2773
a687059c 2774/*
e50aee73 2775 - pregcomp - compile a regular expression into internal code
a687059c
LW
2776 *
2777 * We can't allocate space until we know how big the compiled form will be,
2778 * but we can't compile it (and thus know how big it is) until we've got a
2779 * place to put the code. So we cheat: we compile it twice, once with code
2780 * generation turned off and size counting turned on, and once "for real".
2781 * This also means that we don't allocate space until we are sure that the
2782 * thing really will compile successfully, and we never have to move the
2783 * code and thus invalidate pointers into it. (Note that it has to be in
2784 * one piece because free() must be able to free it all.) [NB: not true in perl]
2785 *
2786 * Beware that the optimization-preparation code in here knows about some
2787 * of the structure of the compiled regexp. [I'll say.]
2788 */
2789regexp *
864dbfa3 2790Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
a687059c 2791{
97aff369 2792 dVAR;
a0d0e21e 2793 register regexp *r;
c277df42 2794 regnode *scan;
c277df42 2795 regnode *first;
a0d0e21e 2796 I32 flags;
a0d0e21e
LW
2797 I32 minlen = 0;
2798 I32 sawplus = 0;
2799 I32 sawopen = 0;
2c2d71f5 2800 scan_data_t data;
830247a4
IZ
2801 RExC_state_t RExC_state;
2802 RExC_state_t *pRExC_state = &RExC_state;
a0d0e21e 2803
a3621e74
YO
2804 GET_RE_DEBUG_FLAGS_DECL;
2805
a0d0e21e 2806 if (exp == NULL)
c277df42 2807 FAIL("NULL regexp argument");
a0d0e21e 2808
a5961de5 2809 RExC_utf8 = pm->op_pmdynflags & PMdf_CMP_UTF8;
a0ed51b3 2810
5cfc7842 2811 RExC_precomp = exp;
a3621e74
YO
2812 DEBUG_r(if (!PL_colorset) reginitcolors());
2813 DEBUG_COMPILE_r({
2814 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s \"%s%*s%s\"\n",
a5961de5
JH
2815 PL_colors[4],PL_colors[5],PL_colors[0],
2816 (int)(xend - exp), RExC_precomp, PL_colors[1]);
2817 });
e2509266 2818 RExC_flags = pm->op_pmflags;
830247a4 2819 RExC_sawback = 0;
bbce6d69 2820
830247a4
IZ
2821 RExC_seen = 0;
2822 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
2823 RExC_seen_evals = 0;
2824 RExC_extralen = 0;
c277df42 2825
bbce6d69 2826 /* First pass: determine size, legality. */
830247a4 2827 RExC_parse = exp;
fac92740 2828 RExC_start = exp;
830247a4
IZ
2829 RExC_end = xend;
2830 RExC_naughty = 0;
2831 RExC_npar = 1;
2832 RExC_size = 0L;
2833 RExC_emit = &PL_regdummy;
2834 RExC_whilem_seen = 0;
85ddcde9
JH
2835#if 0 /* REGC() is (currently) a NOP at the first pass.
2836 * Clever compilers notice this and complain. --jhi */
830247a4 2837 REGC((U8)REG_MAGIC, (char*)RExC_emit);
85ddcde9 2838#endif
830247a4 2839 if (reg(pRExC_state, 0, &flags) == NULL) {
c445ea15 2840 RExC_precomp = NULL;
a0d0e21e
LW
2841 return(NULL);
2842 }
a3621e74 2843 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "size %"IVdf" ", (IV)RExC_size));
c277df42 2844
c277df42
IZ
2845 /* Small enough for pointer-storage convention?
2846 If extralen==0, this means that we will not need long jumps. */
830247a4
IZ
2847 if (RExC_size >= 0x10000L && RExC_extralen)
2848 RExC_size += RExC_extralen;
c277df42 2849 else
830247a4
IZ
2850 RExC_extralen = 0;
2851 if (RExC_whilem_seen > 15)
2852 RExC_whilem_seen = 15;
a0d0e21e 2853
bbce6d69 2854 /* Allocate space and initialize. */
a02a5408 2855 Newxc(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode),
c277df42 2856 char, regexp);
a0d0e21e 2857 if (r == NULL)
b45f050a
JF
2858 FAIL("Regexp out of space");
2859
0f79a09d
GS
2860#ifdef DEBUGGING
2861 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
830247a4 2862 Zero(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode), char);
0f79a09d 2863#endif
c277df42 2864 r->refcnt = 1;
bbce6d69 2865 r->prelen = xend - exp;
5cfc7842 2866 r->precomp = savepvn(RExC_precomp, r->prelen);
cf93c79d 2867 r->subbeg = NULL;
f8c7b90f 2868#ifdef PERL_OLD_COPY_ON_WRITE
c445ea15 2869 r->saved_copy = NULL;
ed252734 2870#endif
cf93c79d 2871 r->reganch = pm->op_pmflags & PMf_COMPILETIME;
830247a4 2872 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
38d1b06f 2873 r->lastparen = 0; /* mg.c reads this. */
4327152a
IZ
2874
2875 r->substrs = 0; /* Useful during FAIL. */
2876 r->startp = 0; /* Useful during FAIL. */
2877 r->endp = 0; /* Useful during FAIL. */
2878
a02a5408 2879 Newxz(r->offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
fac92740 2880 if (r->offsets) {
2af232bd 2881 r->offsets[0] = RExC_size;
fac92740 2882 }
a3621e74 2883 DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
2af232bd
SS
2884 "%s %"UVuf" bytes for offset annotations.\n",
2885 r->offsets ? "Got" : "Couldn't get",
392fbf5d 2886 (UV)((2*RExC_size+1) * sizeof(U32))));
fac92740 2887
830247a4 2888 RExC_rx = r;
bbce6d69 2889
2890 /* Second pass: emit code. */
e2509266 2891 RExC_flags = pm->op_pmflags; /* don't let top level (?i) bleed */
830247a4
IZ
2892 RExC_parse = exp;
2893 RExC_end = xend;
2894 RExC_naughty = 0;
2895 RExC_npar = 1;
fac92740 2896 RExC_emit_start = r->program;
830247a4 2897 RExC_emit = r->program;
2cd61cdb 2898 /* Store the count of eval-groups for security checks: */
eb160463 2899 RExC_emit->next_off = (U16)((RExC_seen_evals > U16_MAX) ? U16_MAX : RExC_seen_evals);
830247a4 2900 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
c277df42 2901 r->data = 0;
830247a4 2902 if (reg(pRExC_state, 0, &flags) == NULL)
a0d0e21e
LW
2903 return(NULL);
2904
a3621e74 2905
a0d0e21e 2906 /* Dig out information for optimizations. */
cf93c79d 2907 r->reganch = pm->op_pmflags & PMf_COMPILETIME; /* Again? */
e2509266 2908 pm->op_pmflags = RExC_flags;
a0ed51b3 2909 if (UTF)
5ff6fc6d 2910 r->reganch |= ROPT_UTF8; /* Unicode in it? */
c277df42 2911 r->regstclass = NULL;
830247a4 2912 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
a0ed51b3 2913 r->reganch |= ROPT_NAUGHTY;
c277df42 2914 scan = r->program + 1; /* First BRANCH. */
2779dcf1
IZ
2915
2916 /* XXXX To minimize changes to RE engine we always allocate
2917 3-units-long substrs field. */
a02a5408 2918 Newxz(r->substrs, 1, struct reg_substr_data);
2779dcf1 2919
2c2d71f5 2920 StructCopy(&zero_scan_data, &data, scan_data_t);
653099ff 2921 /* XXXX Should not we check for something else? Usually it is OPEN1... */
c277df42 2922 if (OP(scan) != BRANCH) { /* Only one top-level choice. */
c277df42 2923 I32 fake;
c5254dd6 2924 STRLEN longest_float_length, longest_fixed_length;
653099ff
GS
2925 struct regnode_charclass_class ch_class;
2926 int stclass_flag;
cb434fcc 2927 I32 last_close = 0;
a0d0e21e
LW
2928
2929 first = scan;
c277df42 2930 /* Skip introductions and multiplicators >= 1. */
a0d0e21e 2931 while ((OP(first) == OPEN && (sawopen = 1)) ||
653099ff 2932 /* An OR of *one* alternative - should not happen now. */
a0d0e21e
LW
2933 (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) ||
2934 (OP(first) == PLUS) ||
2935 (OP(first) == MINMOD) ||
653099ff 2936 /* An {n,m} with n>0 */
22c35a8c 2937 (PL_regkind[(U8)OP(first)] == CURLY && ARG1(first) > 0) ) {
a0d0e21e
LW
2938 if (OP(first) == PLUS)
2939 sawplus = 1;
2940 else
2941 first += regarglen[(U8)OP(first)];
2942 first = NEXTOPER(first);
a687059c
LW
2943 }
2944
a0d0e21e
LW
2945 /* Starting-point info. */
2946 again:
653099ff 2947 if (PL_regkind[(U8)OP(first)] == EXACT) {
1aa99e6b 2948 if (OP(first) == EXACT)
bb263b4e 2949 /*EMPTY*/; /* Empty, get anchored substr later. */
1aa99e6b 2950 else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
b3c9acc1
IZ
2951 r->regstclass = first;
2952 }
bfed75c6 2953 else if (strchr((const char*)PL_simple,OP(first)))
a0d0e21e 2954 r->regstclass = first;
22c35a8c
GS
2955 else if (PL_regkind[(U8)OP(first)] == BOUND ||
2956 PL_regkind[(U8)OP(first)] == NBOUND)
a0d0e21e 2957 r->regstclass = first;
22c35a8c 2958 else if (PL_regkind[(U8)OP(first)] == BOL) {
cad2e5aa
JH
2959 r->reganch |= (OP(first) == MBOL
2960 ? ROPT_ANCH_MBOL
2961 : (OP(first) == SBOL
2962 ? ROPT_ANCH_SBOL
2963 : ROPT_ANCH_BOL));
a0d0e21e 2964 first = NEXTOPER(first);
774d564b 2965 goto again;
2966 }
2967 else if (OP(first) == GPOS) {
2968 r->reganch |= ROPT_ANCH_GPOS;
2969 first = NEXTOPER(first);
2970 goto again;
a0d0e21e 2971 }
e09294f4 2972 else if (!sawopen && (OP(first) == STAR &&
22c35a8c 2973 PL_regkind[(U8)OP(NEXTOPER(first))] == REG_ANY) &&
a0d0e21e
LW
2974 !(r->reganch & ROPT_ANCH) )
2975 {
2976 /* turn .* into ^.* with an implied $*=1 */
1df70142
AL
2977 const int type =
2978 (OP(NEXTOPER(first)) == REG_ANY)
2979 ? ROPT_ANCH_MBOL
2980 : ROPT_ANCH_SBOL;
cad2e5aa 2981 r->reganch |= type | ROPT_IMPLICIT;
a0d0e21e 2982 first = NEXTOPER(first);
774d564b 2983 goto again;
a0d0e21e 2984 }
b81d288d 2985 if (sawplus && (!sawopen || !RExC_sawback)
830247a4 2986 && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
cad2e5aa
JH
2987 /* x+ must match at the 1st pos of run of x's */
2988 r->reganch |= ROPT_SKIP;
a0d0e21e 2989
c277df42 2990 /* Scan is after the zeroth branch, first is atomic matcher. */
a3621e74 2991 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
b900a521 2992 (IV)(first - scan + 1)));
a0d0e21e
LW
2993 /*
2994 * If there's something expensive in the r.e., find the
2995 * longest literal string that must appear and make it the
2996 * regmust. Resolve ties in favor of later strings, since
2997 * the regstart check works with the beginning of the r.e.
2998 * and avoiding duplication strengthens checking. Not a
2999 * strong reason, but sufficient in the absence of others.
3000 * [Now we resolve ties in favor of the earlier string if
c277df42 3001 * it happens that c_offset_min has been invalidated, since the
a0d0e21e
LW
3002 * earlier string may buy us something the later one won't.]
3003 */
a0d0e21e 3004 minlen = 0;
a687059c 3005
396482e1
GA
3006 data.longest_fixed = newSVpvs("");
3007 data.longest_float = newSVpvs("");
3008 data.last_found = newSVpvs("");
c277df42
IZ
3009 data.longest = &(data.longest_fixed);
3010 first = scan;
653099ff 3011 if (!r->regstclass) {
830247a4 3012 cl_init(pRExC_state, &ch_class);
653099ff
GS
3013 data.start_class = &ch_class;
3014 stclass_flag = SCF_DO_STCLASS_AND;
3015 } else /* XXXX Check for BOUND? */
3016 stclass_flag = 0;
cb434fcc 3017 data.last_closep = &last_close;
653099ff 3018
830247a4 3019 minlen = study_chunk(pRExC_state, &first, &fake, scan + RExC_size, /* Up to end */
a3621e74 3020 &data, SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
830247a4 3021 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
b81d288d 3022 && data.last_start_min == 0 && data.last_end > 0
830247a4
IZ
3023 && !RExC_seen_zerolen
3024 && (!(RExC_seen & REG_SEEN_GPOS) || (r->reganch & ROPT_ANCH_GPOS)))
c277df42 3025 r->reganch |= ROPT_CHECK_ALL;
830247a4 3026 scan_commit(pRExC_state, &data);
c277df42
IZ
3027 SvREFCNT_dec(data.last_found);
3028
a0ed51b3 3029 longest_float_length = CHR_SVLEN(data.longest_float);
c5254dd6 3030 if (longest_float_length
c277df42
IZ
3031 || (data.flags & SF_FL_BEFORE_EOL
3032 && (!(data.flags & SF_FL_BEFORE_MEOL)
e2509266 3033 || (RExC_flags & PMf_MULTILINE)))) {
cf93c79d
IZ
3034 int t;
3035
a0ed51b3 3036 if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
aca2d497
IZ
3037 && data.offset_fixed == data.offset_float_min
3038 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
3039 goto remove_float; /* As in (a)+. */
3040
33b8afdf
JH
3041 if (SvUTF8(data.longest_float)) {
3042 r->float_utf8 = data.longest_float;
c445ea15 3043 r->float_substr = NULL;
33b8afdf
JH
3044 } else {
3045 r->float_substr = data.longest_float;
c445ea15 3046 r->float_utf8 = NULL;
33b8afdf 3047 }
c277df42
IZ
3048 r->float_min_offset = data.offset_float_min;
3049 r->float_max_offset = data.offset_float_max;
cf93c79d
IZ
3050 t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
3051 && (!(data.flags & SF_FL_BEFORE_MEOL)
e2509266 3052 || (RExC_flags & PMf_MULTILINE)));
33b8afdf 3053 fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
a0ed51b3
LW
3054 }
3055 else {
aca2d497 3056 remove_float:
c445ea15 3057 r->float_substr = r->float_utf8 = NULL;
c277df42 3058 SvREFCNT_dec(data.longest_float);
c5254dd6 3059 longest_float_length = 0;
a0d0e21e 3060 }
c277df42 3061
a0ed51b3 3062 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
c5254dd6 3063 if (longest_fixed_length
c277df42
IZ
3064 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
3065 && (!(data.flags & SF_FIX_BEFORE_MEOL)
e2509266 3066 || (RExC_flags & PMf_MULTILINE)))) {
cf93c79d
IZ
3067 int t;
3068
33b8afdf
JH
3069 if (SvUTF8(data.longest_fixed)) {
3070 r->anchored_utf8 = data.longest_fixed;
c445ea15 3071 r->anchored_substr = NULL;
33b8afdf
JH
3072 } else {
3073 r->anchored_substr = data.longest_fixed;
c445ea15 3074 r->anchored_utf8 = NULL;
33b8afdf 3075 }
c277df42 3076 r->anchored_offset = data.offset_fixed;
cf93c79d
IZ
3077 t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
3078 && (!(data.flags & SF_FIX_BEFORE_MEOL)
e2509266 3079 || (RExC_flags & PMf_MULTILINE)));
33b8afdf 3080 fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
a0ed51b3
LW
3081 }
3082 else {
c445ea15 3083 r->anchored_substr = r->anchored_utf8 = NULL;
c277df42 3084 SvREFCNT_dec(data.longest_fixed);
c5254dd6 3085 longest_fixed_length = 0;
a0d0e21e 3086 }
b81d288d 3087 if (r->regstclass
ffc61ed2 3088 && (OP(r->regstclass) == REG_ANY || OP(r->regstclass) == SANY))
653099ff 3089 r->regstclass = NULL;
33b8afdf
JH
3090 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
3091 && stclass_flag
653099ff 3092 && !(data.start_class->flags & ANYOF_EOS)
eb160463
GS
3093 && !cl_is_anything(data.start_class))
3094 {
1df70142 3095 const I32 n = add_data(pRExC_state, 1, "f");
653099ff 3096
a02a5408 3097 Newx(RExC_rx->data->data[n], 1,
653099ff
GS
3098 struct regnode_charclass_class);
3099 StructCopy(data.start_class,
830247a4 3100 (struct regnode_charclass_class*)RExC_rx->data->data[n],
653099ff 3101 struct regnode_charclass_class);
830247a4 3102 r->regstclass = (regnode*)RExC_rx->data->data[n];
653099ff 3103 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
ffc61ed2 3104 PL_regdata = r->data; /* for regprop() */
a3621e74 3105 DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
9c5ffd7c
JH
3106 regprop(sv, (regnode*)data.start_class);
3107 PerlIO_printf(Perl_debug_log,
a0288114 3108 "synthetic stclass \"%s\".\n",
3f7c398e 3109 SvPVX_const(sv));});
653099ff 3110 }
c277df42
IZ
3111
3112 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
c5254dd6 3113 if (longest_fixed_length > longest_float_length) {
c277df42 3114 r->check_substr = r->anchored_substr;
33b8afdf 3115 r->check_utf8 = r->anchored_utf8;
c277df42
IZ
3116 r->check_offset_min = r->check_offset_max = r->anchored_offset;
3117 if (r->reganch & ROPT_ANCH_SINGLE)
3118 r->reganch |= ROPT_NOSCAN;
a0ed51b3
LW
3119 }
3120 else {
c277df42 3121 r->check_substr = r->float_substr;
33b8afdf 3122 r->check_utf8 = r->float_utf8;
c277df42
IZ
3123 r->check_offset_min = data.offset_float_min;
3124 r->check_offset_max = data.offset_float_max;
a0d0e21e 3125 }
30382c73
IZ
3126 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
3127 This should be changed ASAP! */
33b8afdf 3128 if ((r->check_substr || r->check_utf8) && !(r->reganch & ROPT_ANCH_GPOS)) {
cad2e5aa 3129 r->reganch |= RE_USE_INTUIT;
33b8afdf 3130 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
cad2e5aa
JH
3131 r->reganch |= RE_INTUIT_TAIL;
3132 }
a0ed51b3
LW
3133 }
3134 else {
c277df42
IZ
3135 /* Several toplevels. Best we can is to set minlen. */
3136 I32 fake;
653099ff 3137 struct regnode_charclass_class ch_class;
cb434fcc 3138 I32 last_close = 0;
c277df42 3139
a3621e74 3140 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "\n"));
c277df42 3141 scan = r->program + 1;
830247a4 3142 cl_init(pRExC_state, &ch_class);
653099ff 3143 data.start_class = &ch_class;
cb434fcc 3144 data.last_closep = &last_close;
a3621e74 3145 minlen = study_chunk(pRExC_state, &scan, &fake, scan + RExC_size, &data, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
33b8afdf 3146 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
c445ea15 3147 = r->float_substr = r->float_utf8 = NULL;
653099ff 3148 if (!(data.start_class->flags & ANYOF_EOS)
eb160463
GS
3149 && !cl_is_anything(data.start_class))
3150 {
1df70142 3151 const I32 n = add_data(pRExC_state, 1, "f");
653099ff 3152
a02a5408 3153 Newx(RExC_rx->data->data[n], 1,
653099ff
GS
3154 struct regnode_charclass_class);
3155 StructCopy(data.start_class,
830247a4 3156 (struct regnode_charclass_class*)RExC_rx->data->data[n],
653099ff 3157 struct regnode_charclass_class);
830247a4 3158 r->regstclass = (regnode*)RExC_rx->data->data[n];
653099ff 3159 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
a3621e74 3160 DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
9c5ffd7c
JH
3161 regprop(sv, (regnode*)data.start_class);
3162 PerlIO_printf(Perl_debug_log,
a0288114 3163 "synthetic stclass \"%s\".\n",
3f7c398e 3164 SvPVX_const(sv));});
653099ff 3165 }
a0d0e21e
LW
3166 }
3167
a0d0e21e 3168 r->minlen = minlen;
b81d288d 3169 if (RExC_seen & REG_SEEN_GPOS)
c277df42 3170 r->reganch |= ROPT_GPOS_SEEN;
830247a4 3171 if (RExC_seen & REG_SEEN_LOOKBEHIND)
c277df42 3172 r->reganch |= ROPT_LOOKBEHIND_SEEN;
830247a4 3173 if (RExC_seen & REG_SEEN_EVAL)
ce862d02 3174 r->reganch |= ROPT_EVAL_SEEN;
f33976b4
DB
3175 if (RExC_seen & REG_SEEN_CANY)
3176 r->reganch |= ROPT_CANY_SEEN;
a02a5408
JC
3177 Newxz(r->startp, RExC_npar, I32);
3178 Newxz(r->endp, RExC_npar, I32);
ffc61ed2 3179 PL_regdata = r->data; /* for regprop() */
a3621e74 3180 DEBUG_COMPILE_r(regdump(r));
a0d0e21e 3181 return(r);
a687059c
LW
3182}
3183
3184/*
3185 - reg - regular expression, i.e. main body or parenthesized thing
3186 *
3187 * Caller must absorb opening parenthesis.
3188 *
3189 * Combining parenthesis handling with the base level of regular expression
3190 * is a trifle forced, but the need to tie the tails of the branches to what
3191 * follows makes it hard to avoid.
3192 */
76e3520e 3193STATIC regnode *
830247a4 3194S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp)
c277df42 3195 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
a687059c 3196{
27da23d5 3197 dVAR;
c277df42
IZ
3198 register regnode *ret; /* Will be the head of the group. */
3199 register regnode *br;
3200 register regnode *lastbr;
cbbf8932 3201 register regnode *ender = NULL;
a0d0e21e 3202 register I32 parno = 0;
cbbf8932
AL
3203 I32 flags;
3204 const I32 oregflags = RExC_flags;
6136c704
AL
3205 bool have_branch = 0;
3206 bool is_open = 0;
9d1d55b5
JP
3207
3208 /* for (?g), (?gc), and (?o) warnings; warning
3209 about (?c) will warn about (?g) -- japhy */
3210
6136c704
AL
3211#define WASTED_O 0x01
3212#define WASTED_G 0x02
3213#define WASTED_C 0x04
3214#define WASTED_GC (0x02|0x04)
cbbf8932 3215 I32 wastedflags = 0x00;
9d1d55b5 3216
fac92740 3217 char * parse_start = RExC_parse; /* MJD */
a28509cc 3218 char * const oregcomp_parse = RExC_parse;
a0d0e21e 3219
821b33a5 3220 *flagp = 0; /* Tentatively. */
a0d0e21e 3221
9d1d55b5 3222
a0d0e21e
LW
3223 /* Make an OPEN node, if parenthesized. */
3224 if (paren) {
fac92740 3225 if (*RExC_parse == '?') { /* (?...) */
2b36a5a0
JH
3226 U32 posflags = 0, negflags = 0;
3227 U32 *flagsp = &posflags;
6136c704 3228 bool is_logical = 0;
a28509cc 3229 const char * const seqstart = RExC_parse;
ca9dfc88 3230
830247a4
IZ
3231 RExC_parse++;
3232 paren = *RExC_parse++;
c277df42 3233 ret = NULL; /* For look-ahead/behind. */
a0d0e21e 3234 switch (paren) {
fac92740 3235 case '<': /* (?<...) */
830247a4 3236 RExC_seen |= REG_SEEN_LOOKBEHIND;
b81d288d 3237 if (*RExC_parse == '!')
c277df42 3238 paren = ',';
b81d288d 3239 if (*RExC_parse != '=' && *RExC_parse != '!')
c277df42 3240 goto unknown;
830247a4 3241 RExC_parse++;
fac92740
MJD
3242 case '=': /* (?=...) */
3243 case '!': /* (?!...) */
830247a4 3244 RExC_seen_zerolen++;
fac92740
MJD
3245 case ':': /* (?:...) */
3246 case '>': /* (?>...) */
a0d0e21e 3247 break;
fac92740
MJD
3248 case '$': /* (?$...) */
3249 case '@': /* (?@...) */
8615cb43 3250 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
a0d0e21e 3251 break;
fac92740 3252 case '#': /* (?#...) */
830247a4
IZ
3253 while (*RExC_parse && *RExC_parse != ')')
3254 RExC_parse++;
3255 if (*RExC_parse != ')')
c277df42 3256 FAIL("Sequence (?#... not terminated");
830247a4 3257 nextchar(pRExC_state);
a0d0e21e
LW
3258 *flagp = TRYAGAIN;
3259 return NULL;
fac92740 3260 case 'p': /* (?p...) */
9014280d 3261 if (SIZE_ONLY && ckWARN2(WARN_DEPRECATED, WARN_REGEXP))
f9373011 3262 vWARNdep(RExC_parse, "(?p{}) is deprecated - use (??{})");
8c8ad484 3263 /* FALL THROUGH*/
fac92740 3264 case '?': /* (??...) */
6136c704 3265 is_logical = 1;
438a3801
YST
3266 if (*RExC_parse != '{')
3267 goto unknown;
830247a4 3268 paren = *RExC_parse++;
0f5d15d6 3269 /* FALL THROUGH */
fac92740 3270 case '{': /* (?{...}) */
c277df42 3271 {
c277df42
IZ
3272 I32 count = 1, n = 0;
3273 char c;
830247a4 3274 char *s = RExC_parse;
c277df42 3275
830247a4
IZ
3276 RExC_seen_zerolen++;
3277 RExC_seen |= REG_SEEN_EVAL;
3278 while (count && (c = *RExC_parse)) {
6136c704
AL
3279 if (c == '\\') {
3280 if (RExC_parse[1])
3281 RExC_parse++;
3282 }
b81d288d 3283 else if (c == '{')
c277df42 3284 count++;
b81d288d 3285 else if (c == '}')
c277df42 3286 count--;
830247a4 3287 RExC_parse++;
c277df42 3288 }
6136c704 3289 if (*RExC_parse != ')') {
b81d288d 3290 RExC_parse = s;
b45f050a
JF
3291 vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
3292 }
c277df42 3293 if (!SIZE_ONLY) {
f3548bdc 3294 PAD *pad;
6136c704
AL
3295 OP_4tree *sop, *rop;
3296 SV * const sv = newSVpvn(s, RExC_parse - 1 - s);
c277df42 3297
569233ed
SB
3298 ENTER;
3299 Perl_save_re_context(aTHX);
f3548bdc 3300 rop = sv_compile_2op(sv, &sop, "re", &pad);
9b978d73
DM
3301 sop->op_private |= OPpREFCOUNTED;
3302 /* re_dup will OpREFCNT_inc */
3303 OpREFCNT_set(sop, 1);
569233ed 3304 LEAVE;
c277df42 3305
830247a4
IZ
3306 n = add_data(pRExC_state, 3, "nop");
3307 RExC_rx->data->data[n] = (void*)rop;
3308 RExC_rx->data->data[n+1] = (void*)sop;
f3548bdc 3309 RExC_rx->data->data[n+2] = (void*)pad;
c277df42 3310 SvREFCNT_dec(sv);
a0ed51b3 3311 }
e24b16f9 3312 else { /* First pass */
830247a4 3313 if (PL_reginterp_cnt < ++RExC_seen_evals
923e4eb5 3314 && IN_PERL_RUNTIME)
2cd61cdb
IZ
3315 /* No compiled RE interpolated, has runtime
3316 components ===> unsafe. */
3317 FAIL("Eval-group not allowed at runtime, use re 'eval'");
5b61d3f7 3318 if (PL_tainting && PL_tainted)
cc6b7395 3319 FAIL("Eval-group in insecure regular expression");
923e4eb5 3320 if (IN_PERL_COMPILETIME)
b5c19bd7 3321 PL_cv_has_eval = 1;
c277df42 3322 }
b5c19bd7 3323
830247a4 3324 nextchar(pRExC_state);
6136c704 3325 if (is_logical) {
830247a4 3326 ret = reg_node(pRExC_state, LOGICAL);
0f5d15d6
IZ
3327 if (!SIZE_ONLY)
3328 ret->flags = 2;
830247a4 3329 regtail(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
fac92740 3330 /* deal with the length of this later - MJD */
0f5d15d6
IZ
3331 return ret;
3332 }
ccb2c380
MP
3333 ret = reganode(pRExC_state, EVAL, n);
3334 Set_Node_Length(ret, RExC_parse - parse_start + 1);
3335 Set_Node_Offset(ret, parse_start);
3336 return ret;
c277df42 3337 }
fac92740 3338 case '(': /* (?(?{...})...) and (?(?=...)...) */
c277df42 3339 {
fac92740 3340 if (RExC_parse[0] == '?') { /* (?(?...)) */
b81d288d
AB
3341 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
3342 || RExC_parse[1] == '<'
830247a4 3343 || RExC_parse[1] == '{') { /* Lookahead or eval. */
c277df42
IZ
3344 I32 flag;
3345
830247a4 3346 ret = reg_node(pRExC_state, LOGICAL);
0f5d15d6
IZ
3347 if (!SIZE_ONLY)
3348 ret->flags = 1;
830247a4 3349 regtail(pRExC_state, ret, reg(pRExC_state, 1, &flag));
c277df42 3350 goto insert_if;
b81d288d 3351 }
a0ed51b3 3352 }
830247a4 3353 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
fac92740 3354 /* (?(1)...) */
6136c704 3355 char c;
830247a4 3356 parno = atoi(RExC_parse++);
c277df42 3357
830247a4
IZ
3358 while (isDIGIT(*RExC_parse))
3359 RExC_parse++;
fac92740 3360 ret = reganode(pRExC_state, GROUPP, parno);
2af232bd 3361
830247a4 3362 if ((c = *nextchar(pRExC_state)) != ')')
b45f050a 3363 vFAIL("Switch condition not recognized");
c277df42 3364 insert_if:
830247a4
IZ
3365 regtail(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
3366 br = regbranch(pRExC_state, &flags, 1);
c277df42 3367 if (br == NULL)
830247a4 3368 br = reganode(pRExC_state, LONGJMP, 0);
c277df42 3369 else
830247a4
IZ
3370 regtail(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
3371 c = *nextchar(pRExC_state);
d1b80229
IZ
3372 if (flags&HASWIDTH)
3373 *flagp |= HASWIDTH;
c277df42 3374 if (c == '|') {
830247a4
IZ
3375 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
3376 regbranch(pRExC_state, &flags, 1);
3377 regtail(pRExC_state, ret, lastbr);
d1b80229
IZ
3378 if (flags&HASWIDTH)
3379 *flagp |= HASWIDTH;
830247a4 3380 c = *nextchar(pRExC_state);
a0ed51b3
LW
3381 }
3382 else
c277df42
IZ
3383 lastbr = NULL;
3384 if (c != ')')
8615cb43 3385 vFAIL("Switch (?(condition)... contains too many branches");
830247a4
IZ
3386 ender = reg_node(pRExC_state, TAIL);
3387 regtail(pRExC_state, br, ender);
c277df42 3388 if (lastbr) {
830247a4
IZ
3389 regtail(pRExC_state, lastbr, ender);
3390 regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
a0ed51b3
LW
3391 }
3392 else
830247a4 3393 regtail(pRExC_state, ret, ender);
c277df42 3394 return ret;
a0ed51b3
LW
3395 }
3396 else {
830247a4 3397 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
c277df42
IZ
3398 }
3399 }
1b1626e4 3400 case 0:
830247a4 3401 RExC_parse--; /* for vFAIL to print correctly */
8615cb43 3402 vFAIL("Sequence (? incomplete");
1b1626e4 3403 break;
a0d0e21e 3404 default:
830247a4 3405 --RExC_parse;
fac92740 3406 parse_flags: /* (?i) */
830247a4 3407 while (*RExC_parse && strchr("iogcmsx", *RExC_parse)) {
9d1d55b5
JP
3408 /* (?g), (?gc) and (?o) are useless here
3409 and must be globally applied -- japhy */
3410
3411 if (*RExC_parse == 'o' || *RExC_parse == 'g') {
3412 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6136c704 3413 const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G;
9d1d55b5
JP
3414 if (! (wastedflags & wflagbit) ) {
3415 wastedflags |= wflagbit;
3416 vWARN5(
3417 RExC_parse + 1,
3418 "Useless (%s%c) - %suse /%c modifier",
3419 flagsp == &negflags ? "?-" : "?",
3420 *RExC_parse,
3421 flagsp == &negflags ? "don't " : "",
3422 *RExC_parse
3423 );
3424 }
3425 }
3426 }
3427 else if (*RExC_parse == 'c') {
3428 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6136c704
AL
3429 if (! (wastedflags & WASTED_C) ) {
3430 wastedflags |= WASTED_GC;
9d1d55b5
JP
3431 vWARN3(
3432 RExC_parse + 1,
3433 "Useless (%sc) - %suse /gc modifier",
3434 flagsp == &negflags ? "?-" : "?",
3435 flagsp == &negflags ? "don't " : ""
3436 );
3437 }
3438 }
3439 }
3440 else { pmflag(flagsp, *RExC_parse); }
3441
830247a4 3442 ++RExC_parse;
ca9dfc88 3443 }
830247a4 3444 if (*RExC_parse == '-') {
ca9dfc88 3445 flagsp = &negflags;
9d1d55b5 3446 wastedflags = 0; /* reset so (?g-c) warns twice */
830247a4 3447 ++RExC_parse;
ca9dfc88 3448 goto parse_flags;
48c036b1 3449 }
e2509266
JH
3450 RExC_flags |= posflags;
3451 RExC_flags &= ~negflags;
830247a4
IZ
3452 if (*RExC_parse == ':') {
3453 RExC_parse++;
ca9dfc88
IZ
3454 paren = ':';
3455 break;
3456 }
c277df42 3457 unknown:
830247a4
IZ
3458 if (*RExC_parse != ')') {
3459 RExC_parse++;
3460 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
b45f050a 3461 }
830247a4 3462 nextchar(pRExC_state);
a0d0e21e
LW
3463 *flagp = TRYAGAIN;
3464 return NULL;
3465 }
3466 }
fac92740 3467 else { /* (...) */
830247a4
IZ
3468 parno = RExC_npar;
3469 RExC_npar++;
3470 ret = reganode(pRExC_state, OPEN, parno);
fac92740
MJD
3471 Set_Node_Length(ret, 1); /* MJD */
3472 Set_Node_Offset(ret, RExC_parse); /* MJD */
6136c704 3473 is_open = 1;
a0d0e21e 3474 }
a0ed51b3 3475 }
fac92740 3476 else /* ! paren */
a0d0e21e
LW
3477 ret = NULL;
3478
3479 /* Pick up the branches, linking them together. */
fac92740 3480 parse_start = RExC_parse; /* MJD */
830247a4 3481 br = regbranch(pRExC_state, &flags, 1);
fac92740 3482 /* branch_len = (paren != 0); */
2af232bd 3483
a0d0e21e
LW
3484 if (br == NULL)
3485 return(NULL);
830247a4
IZ
3486 if (*RExC_parse == '|') {
3487 if (!SIZE_ONLY && RExC_extralen) {
3488 reginsert(pRExC_state, BRANCHJ, br);
a0ed51b3 3489 }
fac92740 3490 else { /* MJD */
830247a4 3491 reginsert(pRExC_state, BRANCH, br);
fac92740
MJD
3492 Set_Node_Length(br, paren != 0);
3493 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
3494 }
c277df42
IZ
3495 have_branch = 1;
3496 if (SIZE_ONLY)
830247a4 3497 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
a0ed51b3
LW
3498 }
3499 else if (paren == ':') {
c277df42
IZ
3500 *flagp |= flags&SIMPLE;
3501 }
6136c704 3502 if (is_open) { /* Starts with OPEN. */
830247a4 3503 regtail(pRExC_state, ret, br); /* OPEN -> first. */
a0ed51b3
LW
3504 }
3505 else if (paren != '?') /* Not Conditional */
a0d0e21e 3506 ret = br;
32a0ca98 3507 *flagp |= flags & (SPSTART | HASWIDTH);
c277df42 3508 lastbr = br;
830247a4
IZ
3509 while (*RExC_parse == '|') {
3510 if (!SIZE_ONLY && RExC_extralen) {
3511 ender = reganode(pRExC_state, LONGJMP,0);
3512 regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
c277df42
IZ
3513 }
3514 if (SIZE_ONLY)
830247a4
IZ
3515 RExC_extralen += 2; /* Account for LONGJMP. */
3516 nextchar(pRExC_state);
3517 br = regbranch(pRExC_state, &flags, 0);
2af232bd 3518
a687059c 3519 if (br == NULL)
a0d0e21e 3520 return(NULL);
830247a4 3521 regtail(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
c277df42 3522 lastbr = br;
821b33a5
IZ
3523 if (flags&HASWIDTH)
3524 *flagp |= HASWIDTH;
a687059c 3525 *flagp |= flags&SPSTART;
a0d0e21e
LW
3526 }
3527
c277df42
IZ
3528 if (have_branch || paren != ':') {
3529 /* Make a closing node, and hook it on the end. */
3530 switch (paren) {
3531 case ':':
830247a4 3532 ender = reg_node(pRExC_state, TAIL);
c277df42
IZ
3533 break;
3534 case 1:
830247a4 3535 ender = reganode(pRExC_state, CLOSE, parno);
fac92740
MJD
3536 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
3537 Set_Node_Length(ender,1); /* MJD */
c277df42
IZ
3538 break;
3539 case '<':
c277df42
IZ
3540 case ',':
3541 case '=':
3542 case '!':
c277df42 3543 *flagp &= ~HASWIDTH;
821b33a5
IZ
3544 /* FALL THROUGH */
3545 case '>':
830247a4 3546 ender = reg_node(pRExC_state, SUCCEED);
c277df42
IZ
3547 break;
3548 case 0:
830247a4 3549 ender = reg_node(pRExC_state, END);
c277df42
IZ
3550 break;
3551 }
830247a4 3552 regtail(pRExC_state, lastbr, ender);
a0d0e21e 3553
c277df42
IZ
3554 if (have_branch) {
3555 /* Hook the tails of the branches to the closing node. */
3556 for (br = ret; br != NULL; br = regnext(br)) {
830247a4 3557 regoptail(pRExC_state, br, ender);
c277df42
IZ
3558 }
3559 }
a0d0e21e 3560 }
c277df42
IZ
3561
3562 {
e1ec3a88
AL
3563 const char *p;
3564 static const char parens[] = "=!<,>";
c277df42
IZ
3565
3566 if (paren && (p = strchr(parens, paren))) {
eb160463 3567 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
c277df42
IZ
3568 int flag = (p - parens) > 1;
3569
3570 if (paren == '>')
3571 node = SUSPEND, flag = 0;
830247a4 3572 reginsert(pRExC_state, node,ret);
45948336
EP
3573 Set_Node_Cur_Length(ret);
3574 Set_Node_Offset(ret, parse_start + 1);
c277df42 3575 ret->flags = flag;
830247a4 3576 regtail(pRExC_state, ret, reg_node(pRExC_state, TAIL));
c277df42 3577 }
a0d0e21e
LW
3578 }
3579
3580 /* Check for proper termination. */
ce3e6498 3581 if (paren) {
e2509266 3582 RExC_flags = oregflags;
830247a4
IZ
3583 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
3584 RExC_parse = oregcomp_parse;
380a0633 3585 vFAIL("Unmatched (");
ce3e6498 3586 }
a0ed51b3 3587 }
830247a4
IZ
3588 else if (!paren && RExC_parse < RExC_end) {
3589 if (*RExC_parse == ')') {
3590 RExC_parse++;
380a0633 3591 vFAIL("Unmatched )");
a0ed51b3
LW
3592 }
3593 else
b45f050a 3594 FAIL("Junk on end of regexp"); /* "Can't happen". */
a0d0e21e
LW
3595 /* NOTREACHED */
3596 }
a687059c 3597
a0d0e21e 3598 return(ret);
a687059c
LW
3599}
3600
3601/*
3602 - regbranch - one alternative of an | operator
3603 *
3604 * Implements the concatenation operator.
3605 */
76e3520e 3606STATIC regnode *
830247a4 3607S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first)
a687059c 3608{
97aff369 3609 dVAR;
c277df42
IZ
3610 register regnode *ret;
3611 register regnode *chain = NULL;
3612 register regnode *latest;
3613 I32 flags = 0, c = 0;
a0d0e21e 3614
b81d288d 3615 if (first)
c277df42
IZ
3616 ret = NULL;
3617 else {
b81d288d 3618 if (!SIZE_ONLY && RExC_extralen)
830247a4 3619 ret = reganode(pRExC_state, BRANCHJ,0);
fac92740 3620 else {
830247a4 3621 ret = reg_node(pRExC_state, BRANCH);
fac92740
MJD
3622 Set_Node_Length(ret, 1);
3623 }
c277df42
IZ
3624 }
3625
b81d288d 3626 if (!first && SIZE_ONLY)
830247a4 3627 RExC_extralen += 1; /* BRANCHJ */
b81d288d 3628
c277df42 3629 *flagp = WORST; /* Tentatively. */
a0d0e21e 3630
830247a4
IZ
3631 RExC_parse--;
3632 nextchar(pRExC_state);
3633 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
a0d0e21e 3634 flags &= ~TRYAGAIN;
830247a4 3635 latest = regpiece(pRExC_state, &flags);
a0d0e21e
LW
3636 if (latest == NULL) {
3637 if (flags & TRYAGAIN)
3638 continue;
3639 return(NULL);
a0ed51b3
LW
3640 }
3641 else if (ret == NULL)
c277df42 3642 ret = latest;
a0d0e21e 3643 *flagp |= flags&HASWIDTH;
c277df42 3644 if (chain == NULL) /* First piece. */
a0d0e21e
LW
3645 *flagp |= flags&SPSTART;
3646 else {
830247a4
IZ
3647 RExC_naughty++;
3648 regtail(pRExC_state, chain, latest);
a687059c 3649 }
a0d0e21e 3650 chain = latest;
c277df42
IZ
3651 c++;
3652 }
3653 if (chain == NULL) { /* Loop ran zero times. */
830247a4 3654 chain = reg_node(pRExC_state, NOTHING);
c277df42
IZ
3655 if (ret == NULL)
3656 ret = chain;
3657 }
3658 if (c == 1) {
3659 *flagp |= flags&SIMPLE;
a0d0e21e 3660 }
a687059c 3661
a0d0e21e 3662 return(ret);
a687059c
LW
3663}
3664
3665/*
3666 - regpiece - something followed by possible [*+?]
3667 *
3668 * Note that the branching code sequences used for ? and the general cases
3669 * of * and + are somewhat optimized: they use the same NOTHING node as
3670 * both the endmarker for their branch list and the body of the last branch.
3671 * It might seem that this node could be dispensed with entirely, but the
3672 * endmarker role is not redundant.
3673 */
76e3520e 3674STATIC regnode *
830247a4 3675S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
a687059c 3676{
97aff369 3677 dVAR;
c277df42 3678 register regnode *ret;
a0d0e21e
LW
3679 register char op;
3680 register char *next;
3681 I32 flags;
1df70142 3682 const char * const origparse = RExC_parse;
a0d0e21e
LW
3683 char *maxpos;
3684 I32 min;
c277df42 3685 I32 max = REG_INFTY;
fac92740 3686 char *parse_start;
a0d0e21e 3687
830247a4 3688 ret = regatom(pRExC_state, &flags);
a0d0e21e
LW
3689 if (ret == NULL) {
3690 if (flags & TRYAGAIN)
3691 *flagp |= TRYAGAIN;
3692 return(NULL);
3693 }
3694
830247a4 3695 op = *RExC_parse;
a0d0e21e 3696
830247a4 3697 if (op == '{' && regcurly(RExC_parse)) {
fac92740 3698 parse_start = RExC_parse; /* MJD */
830247a4 3699 next = RExC_parse + 1;
c445ea15 3700 maxpos = NULL;
a0d0e21e
LW
3701 while (isDIGIT(*next) || *next == ',') {
3702 if (*next == ',') {
3703 if (maxpos)
3704 break;
3705 else
3706 maxpos = next;
a687059c 3707 }
a0d0e21e
LW
3708 next++;
3709 }
3710 if (*next == '}') { /* got one */
3711 if (!maxpos)
3712 maxpos = next;
830247a4
IZ
3713 RExC_parse++;
3714 min = atoi(RExC_parse);
a0d0e21e
LW
3715 if (*maxpos == ',')
3716 maxpos++;
3717 else
830247a4 3718 maxpos = RExC_parse;
a0d0e21e
LW
3719 max = atoi(maxpos);
3720 if (!max && *maxpos != '0')
c277df42
IZ
3721 max = REG_INFTY; /* meaning "infinity" */
3722 else if (max >= REG_INFTY)
8615cb43 3723 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
830247a4
IZ
3724 RExC_parse = next;
3725 nextchar(pRExC_state);
a0d0e21e
LW
3726
3727 do_curly:
3728 if ((flags&SIMPLE)) {
830247a4
IZ
3729 RExC_naughty += 2 + RExC_naughty / 2;
3730 reginsert(pRExC_state, CURLY, ret);
fac92740
MJD
3731 Set_Node_Offset(ret, parse_start+1); /* MJD */
3732 Set_Node_Cur_Length(ret);
a0d0e21e
LW
3733 }
3734 else {
830247a4 3735 regnode *w = reg_node(pRExC_state, WHILEM);
2c2d71f5
JH
3736
3737 w->flags = 0;
830247a4
IZ
3738 regtail(pRExC_state, ret, w);
3739 if (!SIZE_ONLY && RExC_extralen) {
3740 reginsert(pRExC_state, LONGJMP,ret);
3741 reginsert(pRExC_state, NOTHING,ret);
c277df42
IZ
3742 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
3743 }
830247a4 3744 reginsert(pRExC_state, CURLYX,ret);
fac92740
MJD
3745 /* MJD hk */
3746 Set_Node_Offset(ret, parse_start+1);
2af232bd 3747 Set_Node_Length(ret,
fac92740 3748 op == '{' ? (RExC_parse - parse_start) : 1);
2af232bd 3749
830247a4 3750 if (!SIZE_ONLY && RExC_extralen)
c277df42 3751 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
830247a4 3752 regtail(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
c277df42 3753 if (SIZE_ONLY)
830247a4
IZ
3754 RExC_whilem_seen++, RExC_extralen += 3;
3755 RExC_naughty += 4 + RExC_naughty; /* compound interest */
a0d0e21e 3756 }
c277df42 3757 ret->flags = 0;
a0d0e21e
LW
3758
3759 if (min > 0)
821b33a5
IZ
3760 *flagp = WORST;
3761 if (max > 0)
3762 *flagp |= HASWIDTH;
a0d0e21e 3763 if (max && max < min)
8615cb43 3764 vFAIL("Can't do {n,m} with n > m");
c277df42 3765 if (!SIZE_ONLY) {
eb160463
GS
3766 ARG1_SET(ret, (U16)min);
3767 ARG2_SET(ret, (U16)max);
a687059c 3768 }
a687059c 3769
a0d0e21e 3770 goto nest_check;
a687059c 3771 }
a0d0e21e 3772 }
a687059c 3773
a0d0e21e
LW
3774 if (!ISMULT1(op)) {
3775 *flagp = flags;
a687059c 3776 return(ret);
a0d0e21e 3777 }
bb20fd44 3778
c277df42 3779#if 0 /* Now runtime fix should be reliable. */
b45f050a
JF
3780
3781 /* if this is reinstated, don't forget to put this back into perldiag:
3782
3783 =item Regexp *+ operand could be empty at {#} in regex m/%s/
3784
3785 (F) The part of the regexp subject to either the * or + quantifier
3786 could match an empty string. The {#} shows in the regular
3787 expression about where the problem was discovered.
3788
3789 */
3790
bb20fd44 3791 if (!(flags&HASWIDTH) && op != '?')
b45f050a 3792 vFAIL("Regexp *+ operand could be empty");
b81d288d 3793#endif
bb20fd44 3794
fac92740 3795 parse_start = RExC_parse;
830247a4 3796 nextchar(pRExC_state);
a0d0e21e 3797
821b33a5 3798 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
a0d0e21e
LW
3799
3800 if (op == '*' && (flags&SIMPLE)) {
830247a4 3801 reginsert(pRExC_state, STAR, ret);
c277df42 3802 ret->flags = 0;
830247a4 3803 RExC_naughty += 4;
a0d0e21e
LW
3804 }
3805 else if (op == '*') {
3806 min = 0;
3807 goto do_curly;
a0ed51b3
LW
3808 }
3809 else if (op == '+' && (flags&SIMPLE)) {
830247a4 3810 reginsert(pRExC_state, PLUS, ret);
c277df42 3811 ret->flags = 0;
830247a4 3812 RExC_naughty += 3;
a0d0e21e
LW
3813 }
3814 else if (op == '+') {
3815 min = 1;
3816 goto do_curly;
a0ed51b3
LW
3817 }
3818 else if (op == '?') {
a0d0e21e
LW
3819 min = 0; max = 1;
3820 goto do_curly;
3821 }
3822 nest_check:
041457d9 3823 if (!SIZE_ONLY && !(flags&HASWIDTH) && max > REG_INFTY/3 && ckWARN(WARN_REGEXP)) {
830247a4 3824 vWARN3(RExC_parse,
b45f050a 3825 "%.*s matches null string many times",
afd78fd5 3826 (int)(RExC_parse >= origparse ? RExC_parse - origparse : 0),
b45f050a 3827 origparse);
a0d0e21e
LW
3828 }
3829
830247a4
IZ
3830 if (*RExC_parse == '?') {
3831 nextchar(pRExC_state);
3832 reginsert(pRExC_state, MINMOD, ret);
3833 regtail(pRExC_state, ret, ret + NODE_STEP_REGNODE);
a0d0e21e 3834 }
830247a4
IZ
3835 if (ISMULT2(RExC_parse)) {
3836 RExC_parse++;
b45f050a
JF
3837 vFAIL("Nested quantifiers");
3838 }
a0d0e21e
LW
3839
3840 return(ret);
a687059c
LW
3841}
3842
3843/*
3844 - regatom - the lowest level
3845 *
3846 * Optimization: gobbles an entire sequence of ordinary characters so that
3847 * it can turn them into a single node, which is smaller to store and
3848 * faster to run. Backslashed characters are exceptions, each becoming a
3849 * separate node; the code is simpler that way and it's not worth fixing.
3850 *
b45f050a 3851 * [Yes, it is worth fixing, some scripts can run twice the speed.] */
76e3520e 3852STATIC regnode *
830247a4 3853S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
a687059c 3854{
97aff369 3855 dVAR;
cbbf8932 3856 register regnode *ret = NULL;
a0d0e21e 3857 I32 flags;
45948336 3858 char *parse_start = RExC_parse;
a0d0e21e
LW
3859
3860 *flagp = WORST; /* Tentatively. */
3861
3862tryagain:
830247a4 3863 switch (*RExC_parse) {
a0d0e21e 3864 case '^':
830247a4
IZ
3865 RExC_seen_zerolen++;
3866 nextchar(pRExC_state);
e2509266 3867 if (RExC_flags & PMf_MULTILINE)
830247a4 3868 ret = reg_node(pRExC_state, MBOL);
e2509266 3869 else if (RExC_flags & PMf_SINGLELINE)
830247a4 3870 ret = reg_node(pRExC_state, SBOL);
a0d0e21e 3871 else
830247a4 3872 ret = reg_node(pRExC_state, BOL);
fac92740 3873 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
3874 break;
3875 case '$':
830247a4 3876 nextchar(pRExC_state);
b81d288d 3877 if (*RExC_parse)
830247a4 3878 RExC_seen_zerolen++;
e2509266 3879 if (RExC_flags & PMf_MULTILINE)
830247a4 3880 ret = reg_node(pRExC_state, MEOL);
e2509266 3881 else if (RExC_flags & PMf_SINGLELINE)
830247a4 3882 ret = reg_node(pRExC_state, SEOL);
a0d0e21e 3883 else
830247a4 3884 ret = reg_node(pRExC_state, EOL);
fac92740 3885 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
3886 break;
3887 case '.':
830247a4 3888 nextchar(pRExC_state);
e2509266 3889 if (RExC_flags & PMf_SINGLELINE)
ffc61ed2
JH
3890 ret = reg_node(pRExC_state, SANY);
3891 else
3892 ret = reg_node(pRExC_state, REG_ANY);
3893 *flagp |= HASWIDTH|SIMPLE;
830247a4 3894 RExC_naughty++;
fac92740 3895 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
3896 break;
3897 case '[':
b45f050a 3898 {
830247a4 3899 char *oregcomp_parse = ++RExC_parse;
ffc61ed2 3900 ret = regclass(pRExC_state);
830247a4
IZ
3901 if (*RExC_parse != ']') {
3902 RExC_parse = oregcomp_parse;
b45f050a
JF
3903 vFAIL("Unmatched [");
3904 }
830247a4 3905 nextchar(pRExC_state);
a0d0e21e 3906 *flagp |= HASWIDTH|SIMPLE;
fac92740 3907 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
a0d0e21e 3908 break;
b45f050a 3909 }
a0d0e21e 3910 case '(':
830247a4
IZ
3911 nextchar(pRExC_state);
3912 ret = reg(pRExC_state, 1, &flags);
a0d0e21e 3913 if (ret == NULL) {
bf93d4cc 3914 if (flags & TRYAGAIN) {
830247a4 3915 if (RExC_parse == RExC_end) {
bf93d4cc
GS
3916 /* Make parent create an empty node if needed. */
3917 *flagp |= TRYAGAIN;
3918 return(NULL);
3919 }
a0d0e21e 3920 goto tryagain;
bf93d4cc 3921 }
a0d0e21e
LW
3922 return(NULL);
3923 }
c277df42 3924 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE);
a0d0e21e
LW
3925 break;
3926 case '|':
3927 case ')':
3928 if (flags & TRYAGAIN) {
3929 *flagp |= TRYAGAIN;
3930 return NULL;
3931 }
b45f050a 3932 vFAIL("Internal urp");
a0d0e21e
LW
3933 /* Supposed to be caught earlier. */
3934 break;
85afd4ae 3935 case '{':
830247a4
IZ
3936 if (!regcurly(RExC_parse)) {
3937 RExC_parse++;
85afd4ae
CS
3938 goto defchar;
3939 }
3940 /* FALL THROUGH */
a0d0e21e
LW
3941 case '?':
3942 case '+':
3943 case '*':
830247a4 3944 RExC_parse++;
b45f050a 3945 vFAIL("Quantifier follows nothing");
a0d0e21e
LW
3946 break;
3947 case '\\':
830247a4 3948 switch (*++RExC_parse) {
a0d0e21e 3949 case 'A':
830247a4
IZ
3950 RExC_seen_zerolen++;
3951 ret = reg_node(pRExC_state, SBOL);
a0d0e21e 3952 *flagp |= SIMPLE;
830247a4 3953 nextchar(pRExC_state);
fac92740 3954 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
3955 break;
3956 case 'G':
830247a4
IZ
3957 ret = reg_node(pRExC_state, GPOS);
3958 RExC_seen |= REG_SEEN_GPOS;
a0d0e21e 3959 *flagp |= SIMPLE;
830247a4 3960 nextchar(pRExC_state);
fac92740 3961 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
3962 break;
3963 case 'Z':
830247a4 3964 ret = reg_node(pRExC_state, SEOL);
a0d0e21e 3965 *flagp |= SIMPLE;
a1917ab9 3966 RExC_seen_zerolen++; /* Do not optimize RE away */
830247a4 3967 nextchar(pRExC_state);
a0d0e21e 3968 break;
b85d18e9 3969 case 'z':
830247a4 3970 ret = reg_node(pRExC_state, EOS);
b85d18e9 3971 *flagp |= SIMPLE;
830247a4
IZ
3972 RExC_seen_zerolen++; /* Do not optimize RE away */
3973 nextchar(pRExC_state);
fac92740 3974 Set_Node_Length(ret, 2); /* MJD */
b85d18e9 3975 break;
4a2d328f 3976 case 'C':
f33976b4
DB
3977 ret = reg_node(pRExC_state, CANY);
3978 RExC_seen |= REG_SEEN_CANY;
a0ed51b3 3979 *flagp |= HASWIDTH|SIMPLE;
830247a4 3980 nextchar(pRExC_state);
fac92740 3981 Set_Node_Length(ret, 2); /* MJD */
a0ed51b3
LW
3982 break;
3983 case 'X':
830247a4 3984 ret = reg_node(pRExC_state, CLUMP);
a0ed51b3 3985 *flagp |= HASWIDTH;
830247a4 3986 nextchar(pRExC_state);
fac92740 3987 Set_Node_Length(ret, 2); /* MJD */
a0ed51b3 3988 break;
a0d0e21e 3989 case 'w':
eb160463 3990 ret = reg_node(pRExC_state, (U8)(LOC ? ALNUML : ALNUM));
a0d0e21e 3991 *flagp |= HASWIDTH|SIMPLE;
830247a4 3992 nextchar(pRExC_state);
fac92740 3993 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
3994 break;
3995 case 'W':
eb160463 3996 ret = reg_node(pRExC_state, (U8)(LOC ? NALNUML : NALNUM));
a0d0e21e 3997 *flagp |= HASWIDTH|SIMPLE;
830247a4 3998 nextchar(pRExC_state);
fac92740 3999 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
4000 break;
4001 case 'b':
830247a4
IZ
4002 RExC_seen_zerolen++;
4003 RExC_seen |= REG_SEEN_LOOKBEHIND;
eb160463 4004 ret = reg_node(pRExC_state, (U8)(LOC ? BOUNDL : BOUND));
a0d0e21e 4005 *flagp |= SIMPLE;
830247a4 4006 nextchar(pRExC_state);
fac92740 4007 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
4008 break;
4009 case 'B':
830247a4
IZ
4010 RExC_seen_zerolen++;
4011 RExC_seen |= REG_SEEN_LOOKBEHIND;
eb160463 4012 ret = reg_node(pRExC_state, (U8)(LOC ? NBOUNDL : NBOUND));
a0d0e21e 4013 *flagp |= SIMPLE;
830247a4 4014 nextchar(pRExC_state);
fac92740 4015 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
4016 break;
4017 case 's':
eb160463 4018 ret = reg_node(pRExC_state, (U8)(LOC ? SPACEL : SPACE));
a0d0e21e 4019 *flagp |= HASWIDTH|SIMPLE;
830247a4 4020 nextchar(pRExC_state);
fac92740 4021 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
4022 break;
4023 case 'S':
eb160463 4024 ret = reg_node(pRExC_state, (U8)(LOC ? NSPACEL : NSPACE));
a0d0e21e 4025 *flagp |= HASWIDTH|SIMPLE;
830247a4 4026 nextchar(pRExC_state);
fac92740 4027 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
4028 break;
4029 case 'd':
ffc61ed2 4030 ret = reg_node(pRExC_state, DIGIT);
a0d0e21e 4031 *flagp |= HASWIDTH|SIMPLE;
830247a4 4032 nextchar(pRExC_state);
fac92740 4033 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
4034 break;
4035 case 'D':
ffc61ed2 4036 ret = reg_node(pRExC_state, NDIGIT);
a0d0e21e 4037 *flagp |= HASWIDTH|SIMPLE;
830247a4 4038 nextchar(pRExC_state);
fac92740 4039 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e 4040 break;
a14b48bc
LW
4041 case 'p':
4042 case 'P':
3568d838 4043 {
830247a4 4044 char* oldregxend = RExC_end;
ccb2c380 4045 char* parse_start = RExC_parse - 2;
a14b48bc 4046
830247a4 4047 if (RExC_parse[1] == '{') {
3568d838 4048 /* a lovely hack--pretend we saw [\pX] instead */
830247a4
IZ
4049 RExC_end = strchr(RExC_parse, '}');
4050 if (!RExC_end) {
0da60cf5 4051 U8 c = (U8)*RExC_parse;
830247a4
IZ
4052 RExC_parse += 2;
4053 RExC_end = oldregxend;
0da60cf5 4054 vFAIL2("Missing right brace on \\%c{}", c);
b45f050a 4055 }
830247a4 4056 RExC_end++;
a14b48bc 4057 }
af6f566e 4058 else {
830247a4 4059 RExC_end = RExC_parse + 2;
af6f566e
HS
4060 if (RExC_end > oldregxend)
4061 RExC_end = oldregxend;
4062 }
830247a4 4063 RExC_parse--;
a14b48bc 4064
ffc61ed2 4065 ret = regclass(pRExC_state);
a14b48bc 4066
830247a4
IZ
4067 RExC_end = oldregxend;
4068 RExC_parse--;
ccb2c380
MP
4069
4070 Set_Node_Offset(ret, parse_start + 2);
4071 Set_Node_Cur_Length(ret);
830247a4 4072 nextchar(pRExC_state);
a14b48bc
LW
4073 *flagp |= HASWIDTH|SIMPLE;
4074 }
4075 break;
a0d0e21e
LW
4076 case 'n':
4077 case 'r':
4078 case 't':
4079 case 'f':
4080 case 'e':
4081 case 'a':
4082 case 'x':
4083 case 'c':
4084 case '0':
4085 goto defchar;
4086 case '1': case '2': case '3': case '4':
4087 case '5': case '6': case '7': case '8': case '9':
4088 {
1df70142 4089 const I32 num = atoi(RExC_parse);
a0d0e21e 4090
830247a4 4091 if (num > 9 && num >= RExC_npar)
a0d0e21e
LW
4092 goto defchar;
4093 else {
fac92740 4094 char * parse_start = RExC_parse - 1; /* MJD */
830247a4
IZ
4095 while (isDIGIT(*RExC_parse))
4096 RExC_parse++;
b45f050a 4097
eb160463 4098 if (!SIZE_ONLY && num > (I32)RExC_rx->nparens)
9baa0206 4099 vFAIL("Reference to nonexistent group");
830247a4 4100 RExC_sawback = 1;
eb160463
GS
4101 ret = reganode(pRExC_state,
4102 (U8)(FOLD ? (LOC ? REFFL : REFF) : REF),
4103 num);
a0d0e21e 4104 *flagp |= HASWIDTH;
2af232bd 4105
fac92740 4106 /* override incorrect value set in reganode MJD */
2af232bd 4107 Set_Node_Offset(ret, parse_start+1);
fac92740 4108 Set_Node_Cur_Length(ret); /* MJD */
830247a4
IZ
4109 RExC_parse--;
4110 nextchar(pRExC_state);
a0d0e21e
LW
4111 }
4112 }
4113 break;
4114 case '\0':
830247a4 4115 if (RExC_parse >= RExC_end)
b45f050a 4116 FAIL("Trailing \\");
a0d0e21e
LW
4117 /* FALL THROUGH */
4118 default:
a0288114 4119 /* Do not generate "unrecognized" warnings here, we fall
c9f97d15 4120 back into the quick-grab loop below */
45948336 4121 parse_start--;
a0d0e21e
LW
4122 goto defchar;
4123 }
4124 break;
4633a7c4
LW
4125
4126 case '#':
e2509266 4127 if (RExC_flags & PMf_EXTENDED) {
830247a4
IZ
4128 while (RExC_parse < RExC_end && *RExC_parse != '\n') RExC_parse++;
4129 if (RExC_parse < RExC_end)
4633a7c4
LW
4130 goto tryagain;
4131 }
4132 /* FALL THROUGH */
4133
a0d0e21e 4134 default: {
ba210ebe 4135 register STRLEN len;
58ae7d3f 4136 register UV ender;
a0d0e21e 4137 register char *p;
c277df42 4138 char *oldp, *s;
80aecb99 4139 STRLEN foldlen;
89ebb4a3 4140 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
f06dbbb7
JH
4141
4142 parse_start = RExC_parse - 1;
a0d0e21e 4143
830247a4 4144 RExC_parse++;
a0d0e21e
LW
4145
4146 defchar:
58ae7d3f 4147 ender = 0;
eb160463
GS
4148 ret = reg_node(pRExC_state,
4149 (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
cd439c50 4150 s = STRING(ret);
830247a4
IZ
4151 for (len = 0, p = RExC_parse - 1;
4152 len < 127 && p < RExC_end;
a0d0e21e
LW
4153 len++)
4154 {
4155 oldp = p;
5b5a24f7 4156
e2509266 4157 if (RExC_flags & PMf_EXTENDED)
830247a4 4158 p = regwhite(p, RExC_end);
a0d0e21e
LW
4159 switch (*p) {
4160 case '^':
4161 case '$':
4162 case '.':
4163 case '[':
4164 case '(':
4165 case ')':
4166 case '|':
4167 goto loopdone;
4168 case '\\':
4169 switch (*++p) {
4170 case 'A':
1ed8eac0
JF
4171 case 'C':
4172 case 'X':
a0d0e21e
LW
4173 case 'G':
4174 case 'Z':
b85d18e9 4175 case 'z':
a0d0e21e
LW
4176 case 'w':
4177 case 'W':
4178 case 'b':
4179 case 'B':
4180 case 's':
4181 case 'S':
4182 case 'd':
4183 case 'D':
a14b48bc
LW
4184 case 'p':
4185 case 'P':
a0d0e21e
LW
4186 --p;
4187 goto loopdone;
4188 case 'n':
4189 ender = '\n';
4190 p++;
a687059c 4191 break;
a0d0e21e
LW
4192 case 'r':
4193 ender = '\r';
4194 p++;
a687059c 4195 break;
a0d0e21e
LW
4196 case 't':
4197 ender = '\t';
4198 p++;
a687059c 4199 break;
a0d0e21e
LW
4200 case 'f':
4201 ender = '\f';
4202 p++;
a687059c 4203 break;
a0d0e21e 4204 case 'e':
c7f1f016 4205 ender = ASCII_TO_NATIVE('\033');
a0d0e21e 4206 p++;
a687059c 4207 break;
a0d0e21e 4208 case 'a':
c7f1f016 4209 ender = ASCII_TO_NATIVE('\007');
a0d0e21e 4210 p++;
a687059c 4211 break;
a0d0e21e 4212 case 'x':
a0ed51b3 4213 if (*++p == '{') {
1df70142 4214 char* const e = strchr(p, '}');
b81d288d 4215
b45f050a 4216 if (!e) {
830247a4 4217 RExC_parse = p + 1;
b45f050a
JF
4218 vFAIL("Missing right brace on \\x{}");
4219 }
de5f0749 4220 else {
a4c04bdc
NC
4221 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
4222 | PERL_SCAN_DISALLOW_PREFIX;
1df70142 4223 STRLEN numlen = e - p - 1;
53305cf1 4224 ender = grok_hex(p + 1, &numlen, &flags, NULL);
aaa80028
JH
4225 if (ender > 0xff)
4226 RExC_utf8 = 1;
a0ed51b3
LW
4227 p = e + 1;
4228 }
a0ed51b3
LW
4229 }
4230 else {
a4c04bdc 4231 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
1df70142 4232 STRLEN numlen = 2;
53305cf1 4233 ender = grok_hex(p, &numlen, &flags, NULL);
a0ed51b3
LW
4234 p += numlen;
4235 }
a687059c 4236 break;
a0d0e21e
LW
4237 case 'c':
4238 p++;
bbce6d69 4239 ender = UCHARAT(p++);
4240 ender = toCTRL(ender);
a687059c 4241 break;
a0d0e21e
LW
4242 case '0': case '1': case '2': case '3':case '4':
4243 case '5': case '6': case '7': case '8':case '9':
4244 if (*p == '0' ||
830247a4 4245 (isDIGIT(p[1]) && atoi(p) >= RExC_npar) ) {
53305cf1 4246 I32 flags = 0;
1df70142 4247 STRLEN numlen = 3;
53305cf1 4248 ender = grok_oct(p, &numlen, &flags, NULL);
a0d0e21e
LW
4249 p += numlen;
4250 }
4251 else {
4252 --p;
4253 goto loopdone;
a687059c
LW
4254 }
4255 break;
a0d0e21e 4256 case '\0':
830247a4 4257 if (p >= RExC_end)
b45f050a 4258 FAIL("Trailing \\");
a687059c 4259 /* FALL THROUGH */
a0d0e21e 4260 default:
041457d9 4261 if (!SIZE_ONLY&& isALPHA(*p) && ckWARN(WARN_REGEXP))
4193bef7 4262 vWARN2(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
a0ed51b3 4263 goto normal_default;
a0d0e21e
LW
4264 }
4265 break;
a687059c 4266 default:
a0ed51b3 4267 normal_default:
fd400ab9 4268 if (UTF8_IS_START(*p) && UTF) {
1df70142 4269 STRLEN numlen;
5e12f4fb 4270 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
ba210ebe 4271 &numlen, 0);
a0ed51b3
LW
4272 p += numlen;
4273 }
4274 else
4275 ender = *p++;
a0d0e21e 4276 break;
a687059c 4277 }
e2509266 4278 if (RExC_flags & PMf_EXTENDED)
830247a4 4279 p = regwhite(p, RExC_end);
60a8b682
JH
4280 if (UTF && FOLD) {
4281 /* Prime the casefolded buffer. */
ac7e0132 4282 ender = toFOLD_uni(ender, tmpbuf, &foldlen);
60a8b682 4283 }
a0d0e21e
LW
4284 if (ISMULT2(p)) { /* Back off on ?+*. */
4285 if (len)
4286 p = oldp;
16ea2a2e 4287 else if (UTF) {
0ebc6274
JH
4288 STRLEN unilen;
4289
80aecb99 4290 if (FOLD) {
60a8b682 4291 /* Emit all the Unicode characters. */
1df70142 4292 STRLEN numlen;
80aecb99
JH
4293 for (foldbuf = tmpbuf;
4294 foldlen;
4295 foldlen -= numlen) {
4296 ender = utf8_to_uvchr(foldbuf, &numlen);
9dc45d57 4297 if (numlen > 0) {
0ebc6274
JH
4298 reguni(pRExC_state, ender, s, &unilen);
4299 s += unilen;
4300 len += unilen;
4301 /* In EBCDIC the numlen
4302 * and unilen can differ. */
9dc45d57 4303 foldbuf += numlen;
47654450
JH
4304 if (numlen >= foldlen)
4305 break;
9dc45d57
JH
4306 }
4307 else
4308 break; /* "Can't happen." */
80aecb99
JH
4309 }
4310 }
4311 else {
0ebc6274 4312 reguni(pRExC_state, ender, s, &unilen);
9ede7db1 4313 if (unilen > 0) {
0ebc6274
JH
4314 s += unilen;
4315 len += unilen;
9dc45d57 4316 }
80aecb99 4317 }
a0ed51b3 4318 }
a0d0e21e
LW
4319 else {
4320 len++;
eb160463 4321 REGC((char)ender, s++);
a0d0e21e
LW
4322 }
4323 break;
a687059c 4324 }
16ea2a2e 4325 if (UTF) {
0ebc6274
JH
4326 STRLEN unilen;
4327
80aecb99 4328 if (FOLD) {
60a8b682 4329 /* Emit all the Unicode characters. */
1df70142 4330 STRLEN numlen;
80aecb99
JH
4331 for (foldbuf = tmpbuf;
4332 foldlen;
4333 foldlen -= numlen) {
4334 ender = utf8_to_uvchr(foldbuf, &numlen);
9dc45d57 4335 if (numlen > 0) {
0ebc6274
JH
4336 reguni(pRExC_state, ender, s, &unilen);
4337 len += unilen;
4338 s += unilen;
4339 /* In EBCDIC the numlen
4340 * and unilen can differ. */
9dc45d57 4341 foldbuf += numlen;
47654450
JH
4342 if (numlen >= foldlen)
4343 break;
9dc45d57
JH
4344 }
4345 else
4346 break;
80aecb99
JH
4347 }
4348 }
4349 else {
0ebc6274 4350 reguni(pRExC_state, ender, s, &unilen);
9ede7db1 4351 if (unilen > 0) {
0ebc6274
JH
4352 s += unilen;
4353 len += unilen;
9dc45d57 4354 }
80aecb99
JH
4355 }
4356 len--;
a0ed51b3
LW
4357 }
4358 else
eb160463 4359 REGC((char)ender, s++);
a0d0e21e
LW
4360 }
4361 loopdone:
830247a4 4362 RExC_parse = p - 1;
fac92740 4363 Set_Node_Cur_Length(ret); /* MJD */
830247a4 4364 nextchar(pRExC_state);
793db0cb
JH
4365 {
4366 /* len is STRLEN which is unsigned, need to copy to signed */
4367 IV iv = len;
4368 if (iv < 0)
4369 vFAIL("Internal disaster");
4370 }
a0d0e21e
LW
4371 if (len > 0)
4372 *flagp |= HASWIDTH;
090f7165 4373 if (len == 1 && UNI_IS_INVARIANT(ender))
a0d0e21e 4374 *flagp |= SIMPLE;
c277df42 4375 if (!SIZE_ONLY)
cd439c50
IZ
4376 STR_LEN(ret) = len;
4377 if (SIZE_ONLY)
830247a4 4378 RExC_size += STR_SZ(len);
cd439c50 4379 else
830247a4 4380 RExC_emit += STR_SZ(len);
a687059c 4381 }
a0d0e21e
LW
4382 break;
4383 }
a687059c 4384
60a8b682
JH
4385 /* If the encoding pragma is in effect recode the text of
4386 * any EXACT-kind nodes. */
22c54be3 4387 if (PL_encoding && PL_regkind[(U8)OP(ret)] == EXACT) {
d0063567
DK
4388 STRLEN oldlen = STR_LEN(ret);
4389 SV *sv = sv_2mortal(newSVpvn(STRING(ret), oldlen));
4390
4391 if (RExC_utf8)
4392 SvUTF8_on(sv);
4393 if (sv_utf8_downgrade(sv, TRUE)) {
1df70142
AL
4394 const char * const s = sv_recode_to_utf8(sv, PL_encoding);
4395 const STRLEN newlen = SvCUR(sv);
d0063567
DK
4396
4397 if (SvUTF8(sv))
4398 RExC_utf8 = 1;
4399 if (!SIZE_ONLY) {
a3621e74
YO
4400 GET_RE_DEBUG_FLAGS_DECL;
4401 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "recode %*s to %*s\n",
d0063567
DK
4402 (int)oldlen, STRING(ret),
4403 (int)newlen, s));
4404 Copy(s, STRING(ret), newlen, char);
4405 STR_LEN(ret) += newlen - oldlen;
4406 RExC_emit += STR_SZ(newlen) - STR_SZ(oldlen);
4407 } else
4408 RExC_size += STR_SZ(newlen) - STR_SZ(oldlen);
4409 }
a72c7584
JH
4410 }
4411
a0d0e21e 4412 return(ret);
a687059c
LW
4413}
4414
873ef191 4415STATIC char *
5f66b61c 4416S_regwhite(char *p, const char *e)
5b5a24f7
CS
4417{
4418 while (p < e) {
4419 if (isSPACE(*p))
4420 ++p;
4421 else if (*p == '#') {
4422 do {
4423 p++;
4424 } while (p < e && *p != '\n');
4425 }
4426 else
4427 break;
4428 }
4429 return p;
4430}
4431
b8c5462f
JH
4432/* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
4433 Character classes ([:foo:]) can also be negated ([:^foo:]).
4434 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
4435 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
beeb77fc 4436 but trigger failures because they are currently unimplemented. */
9a86a77b
JH
4437
4438#define POSIXCC_DONE(c) ((c) == ':')
4439#define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
4440#define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
4441
b8c5462f 4442STATIC I32
830247a4 4443S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
620e46c5 4444{
97aff369 4445 dVAR;
936ed897 4446 I32 namedclass = OOB_NAMEDCLASS;
620e46c5 4447
830247a4 4448 if (value == '[' && RExC_parse + 1 < RExC_end &&
620e46c5 4449 /* I smell either [: or [= or [. -- POSIX has been here, right? */
9a86a77b 4450 POSIXCC(UCHARAT(RExC_parse))) {
1df70142 4451 const char c = UCHARAT(RExC_parse);
830247a4 4452 char* s = RExC_parse++;
b81d288d 4453
9a86a77b 4454 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
830247a4
IZ
4455 RExC_parse++;
4456 if (RExC_parse == RExC_end)
620e46c5 4457 /* Grandfather lone [:, [=, [. */
830247a4 4458 RExC_parse = s;
620e46c5 4459 else {
1df70142 4460 const char* t = RExC_parse++; /* skip over the c */
a28509cc 4461 const char *posixcc;
b8c5462f 4462
80916619
NC
4463 assert(*t == c);
4464
9a86a77b 4465 if (UCHARAT(RExC_parse) == ']') {
830247a4 4466 RExC_parse++; /* skip over the ending ] */
b8c5462f
JH
4467 posixcc = s + 1;
4468 if (*s == ':') {
1df70142
AL
4469 const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
4470 const I32 skip = t - posixcc;
80916619
NC
4471
4472 /* Initially switch on the length of the name. */
4473 switch (skip) {
4474 case 4:
4475 if (memEQ(posixcc, "word", 4)) {
4476 /* this is not POSIX, this is the Perl \w */;
4477 namedclass
4478 = complement ? ANYOF_NALNUM : ANYOF_ALNUM;
4479 }
cc4319de 4480 break;
80916619
NC
4481 case 5:
4482 /* Names all of length 5. */
4483 /* alnum alpha ascii blank cntrl digit graph lower
4484 print punct space upper */
4485 /* Offset 4 gives the best switch position. */
4486 switch (posixcc[4]) {
4487 case 'a':
4488 if (memEQ(posixcc, "alph", 4)) {
4489 /* a */
4490 namedclass
4491 = complement ? ANYOF_NALPHA : ANYOF_ALPHA;
4492 }
4493 break;
4494 case 'e':
4495 if (memEQ(posixcc, "spac", 4)) {
4496 /* e */
4497 namedclass
4498 = complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
4499 }
4500 break;
4501 case 'h':
4502 if (memEQ(posixcc, "grap", 4)) {
4503 /* h */
4504 namedclass
4505 = complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
4506 }
4507 break;
4508 case 'i':
4509 if (memEQ(posixcc, "asci", 4)) {
4510 /* i */
4511 namedclass
4512 = complement ? ANYOF_NASCII : ANYOF_ASCII;
4513 }
4514 break;
4515 case 'k':
4516 if (memEQ(posixcc, "blan", 4)) {
4517 /* k */
4518 namedclass
4519 = complement ? ANYOF_NBLANK : ANYOF_BLANK;
4520 }
4521 break;
4522 case 'l':
4523 if (memEQ(posixcc, "cntr", 4)) {
4524 /* l */
4525 namedclass
4526 = complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
4527 }
4528 break;
4529 case 'm':
4530 if (memEQ(posixcc, "alnu", 4)) {
4531 /* m */
4532 namedclass
4533 = complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
4534 }
4535 break;
4536 case 'r':
4537 if (memEQ(posixcc, "lowe", 4)) {
4538 /* r */
4539 namedclass
4540 = complement ? ANYOF_NLOWER : ANYOF_LOWER;
4541 }
4542 if (memEQ(posixcc, "uppe", 4)) {
8fdec511 4543 /* r */
80916619
NC
4544 namedclass
4545 = complement ? ANYOF_NUPPER : ANYOF_UPPER;
4546 }
4547 break;
4548 case 't':
4549 if (memEQ(posixcc, "digi", 4)) {
4550 /* t */
4551 namedclass
4552 = complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
4553 }
4554 if (memEQ(posixcc, "prin", 4)) {
8fdec511 4555 /* t */
80916619
NC
4556 namedclass
4557 = complement ? ANYOF_NPRINT : ANYOF_PRINT;
4558 }
4559 if (memEQ(posixcc, "punc", 4)) {
4560 /* t */
4561 namedclass
4562 = complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
4563 }
4564 break;
b8c5462f
JH
4565 }
4566 break;
80916619
NC
4567 case 6:
4568 if (memEQ(posixcc, "xdigit", 6)) {
4569 namedclass
4570 = complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
b8c5462f
JH
4571 }
4572 break;
4573 }
80916619
NC
4574
4575 if (namedclass == OOB_NAMEDCLASS)
b45f050a
JF
4576 {
4577 Simple_vFAIL3("POSIX class [:%.*s:] unknown",
4578 t - s - 1, s + 1);
4579 }
80916619
NC
4580 assert (posixcc[skip] == ':');
4581 assert (posixcc[skip+1] == ']');
b45f050a 4582 } else if (!SIZE_ONLY) {
b8c5462f 4583 /* [[=foo=]] and [[.foo.]] are still future. */
b45f050a 4584
830247a4 4585 /* adjust RExC_parse so the warning shows after
b45f050a 4586 the class closes */
9a86a77b 4587 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
830247a4 4588 RExC_parse++;
b45f050a
JF
4589 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
4590 }
b8c5462f
JH
4591 } else {
4592 /* Maternal grandfather:
4593 * "[:" ending in ":" but not in ":]" */
830247a4 4594 RExC_parse = s;
767d463e 4595 }
620e46c5
JH
4596 }
4597 }
4598
b8c5462f
JH
4599 return namedclass;
4600}
4601
4602STATIC void
830247a4 4603S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
b8c5462f 4604{
97aff369 4605 dVAR;
b938889d 4606 if (!SIZE_ONLY && POSIXCC(UCHARAT(RExC_parse))) {
1df70142
AL
4607 const char *s = RExC_parse;
4608 const char c = *s++;
b8c5462f
JH
4609
4610 while(*s && isALNUM(*s))
4611 s++;
4612 if (*s && c == *s && s[1] == ']') {
cd84f5b2
RGS
4613 if (ckWARN(WARN_REGEXP))
4614 vWARN3(s+2,
4615 "POSIX syntax [%c %c] belongs inside character classes",
4616 c, c);
b45f050a
JF
4617
4618 /* [[=foo=]] and [[.foo.]] are still future. */
9a86a77b 4619 if (POSIXCC_NOTYET(c)) {
830247a4 4620 /* adjust RExC_parse so the error shows after
b45f050a 4621 the class closes */
9a86a77b 4622 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
b45f050a
JF
4623 ;
4624 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
4625 }
b8c5462f
JH
4626 }
4627 }
620e46c5
JH
4628}
4629
76e3520e 4630STATIC regnode *
830247a4 4631S_regclass(pTHX_ RExC_state_t *pRExC_state)
a687059c 4632{
97aff369 4633 dVAR;
ffc61ed2 4634 register UV value;
9a86a77b 4635 register UV nextvalue;
3568d838 4636 register IV prevvalue = OOB_UNICODE;
ffc61ed2 4637 register IV range = 0;
c277df42 4638 register regnode *ret;
ba210ebe 4639 STRLEN numlen;
ffc61ed2 4640 IV namedclass;
cbbf8932 4641 char *rangebegin = NULL;
936ed897 4642 bool need_class = 0;
c445ea15 4643 SV *listsv = NULL;
ffc61ed2
JH
4644 register char *e;
4645 UV n;
9e55ce06 4646 bool optimize_invert = TRUE;
cbbf8932 4647 AV* unicode_alternate = NULL;
1b2d223b
JH
4648#ifdef EBCDIC
4649 UV literal_endpoint = 0;
4650#endif
ffc61ed2
JH
4651
4652 ret = reganode(pRExC_state, ANYOF, 0);
4653
4654 if (!SIZE_ONLY)
4655 ANYOF_FLAGS(ret) = 0;
4656
9a86a77b 4657 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
ffc61ed2
JH
4658 RExC_naughty++;
4659 RExC_parse++;
4660 if (!SIZE_ONLY)
4661 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
4662 }
a0d0e21e 4663
936ed897 4664 if (SIZE_ONLY)
830247a4 4665 RExC_size += ANYOF_SKIP;
936ed897 4666 else {
830247a4 4667 RExC_emit += ANYOF_SKIP;
936ed897
IZ
4668 if (FOLD)
4669 ANYOF_FLAGS(ret) |= ANYOF_FOLD;
4670 if (LOC)
4671 ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
ffc61ed2 4672 ANYOF_BITMAP_ZERO(ret);
396482e1 4673 listsv = newSVpvs("# comment\n");
a0d0e21e 4674 }
b8c5462f 4675
9a86a77b
JH
4676 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
4677
b938889d 4678 if (!SIZE_ONLY && POSIXCC(nextvalue))
830247a4 4679 checkposixcc(pRExC_state);
b8c5462f 4680
f064b6ad
HS
4681 /* allow 1st char to be ] (allowing it to be - is dealt with later) */
4682 if (UCHARAT(RExC_parse) == ']')
4683 goto charclassloop;
ffc61ed2 4684
9a86a77b 4685 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
ffc61ed2
JH
4686
4687 charclassloop:
4688
4689 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
4690
73b437c8 4691 if (!range)
830247a4 4692 rangebegin = RExC_parse;
ffc61ed2 4693 if (UTF) {
5e12f4fb 4694 value = utf8n_to_uvchr((U8*)RExC_parse,
3568d838
JH
4695 RExC_end - RExC_parse,
4696 &numlen, 0);
ffc61ed2
JH
4697 RExC_parse += numlen;
4698 }
4699 else
4700 value = UCHARAT(RExC_parse++);
9a86a77b
JH
4701 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
4702 if (value == '[' && POSIXCC(nextvalue))
830247a4 4703 namedclass = regpposixcc(pRExC_state, value);
620e46c5 4704 else if (value == '\\') {
ffc61ed2 4705 if (UTF) {
5e12f4fb 4706 value = utf8n_to_uvchr((U8*)RExC_parse,
ffc61ed2
JH
4707 RExC_end - RExC_parse,
4708 &numlen, 0);
4709 RExC_parse += numlen;
4710 }
4711 else
4712 value = UCHARAT(RExC_parse++);
470c3474 4713 /* Some compilers cannot handle switching on 64-bit integer
ffc61ed2 4714 * values, therefore value cannot be an UV. Yes, this will
e2962f66
JH
4715 * be a problem later if we want switch on Unicode.
4716 * A similar issue a little bit later when switching on
4717 * namedclass. --jhi */
ffc61ed2 4718 switch ((I32)value) {
b8c5462f
JH
4719 case 'w': namedclass = ANYOF_ALNUM; break;
4720 case 'W': namedclass = ANYOF_NALNUM; break;
4721 case 's': namedclass = ANYOF_SPACE; break;
4722 case 'S': namedclass = ANYOF_NSPACE; break;
4723 case 'd': namedclass = ANYOF_DIGIT; break;
4724 case 'D': namedclass = ANYOF_NDIGIT; break;
ffc61ed2
JH
4725 case 'p':
4726 case 'P':
af6f566e 4727 if (RExC_parse >= RExC_end)
2a4859cd 4728 vFAIL2("Empty \\%c{}", (U8)value);
ffc61ed2 4729 if (*RExC_parse == '{') {
1df70142 4730 const U8 c = (U8)value;
ffc61ed2
JH
4731 e = strchr(RExC_parse++, '}');
4732 if (!e)
0da60cf5 4733 vFAIL2("Missing right brace on \\%c{}", c);
ab13f0c7
JH
4734 while (isSPACE(UCHARAT(RExC_parse)))
4735 RExC_parse++;
4736 if (e == RExC_parse)
0da60cf5 4737 vFAIL2("Empty \\%c{}", c);
ffc61ed2 4738 n = e - RExC_parse;
ab13f0c7
JH
4739 while (isSPACE(UCHARAT(RExC_parse + n - 1)))
4740 n--;
ffc61ed2
JH
4741 }
4742 else {
4743 e = RExC_parse;
4744 n = 1;
4745 }
4746 if (!SIZE_ONLY) {
ab13f0c7
JH
4747 if (UCHARAT(RExC_parse) == '^') {
4748 RExC_parse++;
4749 n--;
4750 value = value == 'p' ? 'P' : 'p'; /* toggle */
4751 while (isSPACE(UCHARAT(RExC_parse))) {
4752 RExC_parse++;
4753 n--;
4754 }
4755 }
ffc61ed2 4756 if (value == 'p')
ab13f0c7
JH
4757 Perl_sv_catpvf(aTHX_ listsv,
4758 "+utf8::%.*s\n", (int)n, RExC_parse);
ffc61ed2 4759 else
ab13f0c7
JH
4760 Perl_sv_catpvf(aTHX_ listsv,
4761 "!utf8::%.*s\n", (int)n, RExC_parse);
ffc61ed2
JH
4762 }
4763 RExC_parse = e + 1;
4764 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
f81125e2
JP
4765 namedclass = ANYOF_MAX; /* no official name, but it's named */
4766 break;
b8c5462f
JH
4767 case 'n': value = '\n'; break;
4768 case 'r': value = '\r'; break;
4769 case 't': value = '\t'; break;
4770 case 'f': value = '\f'; break;
4771 case 'b': value = '\b'; break;
c7f1f016
NIS
4772 case 'e': value = ASCII_TO_NATIVE('\033');break;
4773 case 'a': value = ASCII_TO_NATIVE('\007');break;
b8c5462f 4774 case 'x':
ffc61ed2 4775 if (*RExC_parse == '{') {
a4c04bdc
NC
4776 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
4777 | PERL_SCAN_DISALLOW_PREFIX;
ffc61ed2 4778 e = strchr(RExC_parse++, '}');
b81d288d 4779 if (!e)
ffc61ed2 4780 vFAIL("Missing right brace on \\x{}");
53305cf1
NC
4781
4782 numlen = e - RExC_parse;
4783 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
ffc61ed2
JH
4784 RExC_parse = e + 1;
4785 }
4786 else {
a4c04bdc 4787 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
53305cf1
NC
4788 numlen = 2;
4789 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
ffc61ed2
JH
4790 RExC_parse += numlen;
4791 }
b8c5462f
JH
4792 break;
4793 case 'c':
830247a4 4794 value = UCHARAT(RExC_parse++);
b8c5462f
JH
4795 value = toCTRL(value);
4796 break;
4797 case '0': case '1': case '2': case '3': case '4':
4798 case '5': case '6': case '7': case '8': case '9':
53305cf1
NC
4799 {
4800 I32 flags = 0;
4801 numlen = 3;
4802 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
830247a4 4803 RExC_parse += numlen;
b8c5462f 4804 break;
53305cf1 4805 }
1028017a 4806 default:
041457d9 4807 if (!SIZE_ONLY && isALPHA(value) && ckWARN(WARN_REGEXP))
ffc61ed2
JH
4808 vWARN2(RExC_parse,
4809 "Unrecognized escape \\%c in character class passed through",
4810 (int)value);
1028017a 4811 break;
b8c5462f 4812 }
ffc61ed2 4813 } /* end of \blah */
1b2d223b
JH
4814#ifdef EBCDIC
4815 else
4816 literal_endpoint++;
4817#endif
ffc61ed2
JH
4818
4819 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
4820
4821 if (!SIZE_ONLY && !need_class)
936ed897 4822 ANYOF_CLASS_ZERO(ret);
ffc61ed2 4823
936ed897 4824 need_class = 1;
ffc61ed2
JH
4825
4826 /* a bad range like a-\d, a-[:digit:] ? */
4827 if (range) {
73b437c8 4828 if (!SIZE_ONLY) {
afd78fd5
JH
4829 if (ckWARN(WARN_REGEXP)) {
4830 int w =
4831 RExC_parse >= rangebegin ?
4832 RExC_parse - rangebegin : 0;
830247a4 4833 vWARN4(RExC_parse,
b45f050a 4834 "False [] range \"%*.*s\"",
afd78fd5
JH
4835 w,
4836 w,
b45f050a 4837 rangebegin);
afd78fd5 4838 }
3568d838
JH
4839 if (prevvalue < 256) {
4840 ANYOF_BITMAP_SET(ret, prevvalue);
ffc61ed2
JH
4841 ANYOF_BITMAP_SET(ret, '-');
4842 }
4843 else {
4844 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
4845 Perl_sv_catpvf(aTHX_ listsv,
3568d838 4846 "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
ffc61ed2 4847 }
b8c5462f 4848 }
ffc61ed2
JH
4849
4850 range = 0; /* this was not a true range */
73b437c8 4851 }
ffc61ed2 4852
73b437c8 4853 if (!SIZE_ONLY) {
c49a72a9
NC
4854 const char *what = NULL;
4855 char yesno = 0;
4856
3568d838
JH
4857 if (namedclass > OOB_NAMEDCLASS)
4858 optimize_invert = FALSE;
e2962f66
JH
4859 /* Possible truncation here but in some 64-bit environments
4860 * the compiler gets heartburn about switch on 64-bit values.
4861 * A similar issue a little earlier when switching on value.
98f323fa 4862 * --jhi */
e2962f66 4863 switch ((I32)namedclass) {
73b437c8
JH
4864 case ANYOF_ALNUM:
4865 if (LOC)
936ed897 4866 ANYOF_CLASS_SET(ret, ANYOF_ALNUM);
73b437c8
JH
4867 else {
4868 for (value = 0; value < 256; value++)
4869 if (isALNUM(value))
936ed897 4870 ANYOF_BITMAP_SET(ret, value);
73b437c8 4871 }
c49a72a9
NC
4872 yesno = '+';
4873 what = "Word";
73b437c8
JH
4874 break;
4875 case ANYOF_NALNUM:
4876 if (LOC)
936ed897 4877 ANYOF_CLASS_SET(ret, ANYOF_NALNUM);
73b437c8
JH
4878 else {
4879 for (value = 0; value < 256; value++)
4880 if (!isALNUM(value))
936ed897 4881 ANYOF_BITMAP_SET(ret, value);
73b437c8 4882 }
c49a72a9
NC
4883 yesno = '!';
4884 what = "Word";
73b437c8 4885 break;
ffc61ed2 4886 case ANYOF_ALNUMC:
73b437c8 4887 if (LOC)
ffc61ed2 4888 ANYOF_CLASS_SET(ret, ANYOF_ALNUMC);
73b437c8
JH
4889 else {
4890 for (value = 0; value < 256; value++)
ffc61ed2 4891 if (isALNUMC(value))
936ed897 4892 ANYOF_BITMAP_SET(ret, value);
73b437c8 4893 }
c49a72a9
NC
4894 yesno = '+';
4895 what = "Alnum";
73b437c8
JH
4896 break;
4897 case ANYOF_NALNUMC:
4898 if (LOC)
936ed897 4899 ANYOF_CLASS_SET(ret, ANYOF_NALNUMC);
73b437c8
JH
4900 else {
4901 for (value = 0; value < 256; value++)
4902 if (!isALNUMC(value))
936ed897 4903 ANYOF_BITMAP_SET(ret, value);
73b437c8 4904 }
c49a72a9
NC
4905 yesno = '!';
4906 what = "Alnum";
73b437c8
JH
4907 break;
4908 case ANYOF_ALPHA:
4909 if (LOC)
936ed897 4910 ANYOF_CLASS_SET(ret, ANYOF_ALPHA);
73b437c8
JH
4911 else {
4912 for (value = 0; value < 256; value++)
4913 if (isALPHA(value))
936ed897 4914 ANYOF_BITMAP_SET(ret, value);
73b437c8 4915 }
c49a72a9
NC
4916 yesno = '+';
4917 what = "Alpha";
73b437c8
JH
4918 break;
4919 case ANYOF_NALPHA:
4920 if (LOC)
936ed897 4921 ANYOF_CLASS_SET(ret, ANYOF_NALPHA);
73b437c8
JH
4922 else {
4923 for (value = 0; value < 256; value++)
4924 if (!isALPHA(value))
936ed897 4925 ANYOF_BITMAP_SET(ret, value);
73b437c8 4926 }
c49a72a9
NC
4927 yesno = '!';
4928 what = "Alpha";
73b437c8
JH
4929 break;
4930 case ANYOF_ASCII:
4931 if (LOC)
936ed897 4932 ANYOF_CLASS_SET(ret, ANYOF_ASCII);
73b437c8 4933 else {
c7f1f016 4934#ifndef EBCDIC
1ba5c669
JH
4935 for (value = 0; value < 128; value++)
4936 ANYOF_BITMAP_SET(ret, value);
4937#else /* EBCDIC */
ffbc6a93 4938 for (value = 0; value < 256; value++) {
3a3c4447
JH
4939 if (isASCII(value))
4940 ANYOF_BITMAP_SET(ret, value);
ffbc6a93 4941 }
1ba5c669 4942#endif /* EBCDIC */
73b437c8 4943 }
c49a72a9
NC
4944 yesno = '+';
4945 what = "ASCII";
73b437c8
JH
4946 break;
4947 case ANYOF_NASCII:
4948 if (LOC)
936ed897 4949 ANYOF_CLASS_SET(ret, ANYOF_NASCII);
73b437c8 4950 else {
c7f1f016 4951#ifndef EBCDIC
1ba5c669
JH
4952 for (value = 128; value < 256; value++)
4953 ANYOF_BITMAP_SET(ret, value);
4954#else /* EBCDIC */
ffbc6a93 4955 for (value = 0; value < 256; value++) {
3a3c4447
JH
4956 if (!isASCII(value))
4957 ANYOF_BITMAP_SET(ret, value);
ffbc6a93 4958 }
1ba5c669 4959#endif /* EBCDIC */
73b437c8 4960 }
c49a72a9
NC
4961 yesno = '!';
4962 what = "ASCII";
73b437c8 4963 break;
aaa51d5e
JF
4964 case ANYOF_BLANK:
4965 if (LOC)
4966 ANYOF_CLASS_SET(ret, ANYOF_BLANK);
4967 else {
4968 for (value = 0; value < 256; value++)
4969 if (isBLANK(value))
4970 ANYOF_BITMAP_SET(ret, value);
4971 }
c49a72a9
NC
4972 yesno = '+';
4973 what = "Blank";
aaa51d5e
JF
4974 break;
4975 case ANYOF_NBLANK:
4976 if (LOC)
4977 ANYOF_CLASS_SET(ret, ANYOF_NBLANK);
4978 else {
4979 for (value = 0; value < 256; value++)
4980 if (!isBLANK(value))
4981 ANYOF_BITMAP_SET(ret, value);
4982 }
c49a72a9
NC
4983 yesno = '!';
4984 what = "Blank";
aaa51d5e 4985 break;
73b437c8
JH
4986 case ANYOF_CNTRL:
4987 if (LOC)
936ed897 4988 ANYOF_CLASS_SET(ret, ANYOF_CNTRL);
73b437c8
JH
4989 else {
4990 for (value = 0; value < 256; value++)
4991 if (isCNTRL(value))
936ed897 4992 ANYOF_BITMAP_SET(ret, value);
73b437c8 4993 }
c49a72a9
NC
4994 yesno = '+';
4995 what = "Cntrl";
73b437c8
JH
4996 break;
4997 case ANYOF_NCNTRL:
4998 if (LOC)
936ed897 4999 ANYOF_CLASS_SET(ret, ANYOF_NCNTRL);
73b437c8
JH
5000 else {
5001 for (value = 0; value < 256; value++)
5002 if (!isCNTRL(value))
936ed897 5003 ANYOF_BITMAP_SET(ret, value);
73b437c8 5004 }
c49a72a9
NC
5005 yesno = '!';
5006 what = "Cntrl";
ffc61ed2
JH
5007 break;
5008 case ANYOF_DIGIT:
5009 if (LOC)
5010 ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
5011 else {
5012 /* consecutive digits assumed */
5013 for (value = '0'; value <= '9'; value++)
5014 ANYOF_BITMAP_SET(ret, value);
5015 }
c49a72a9
NC
5016 yesno = '+';
5017 what = "Digit";
ffc61ed2
JH
5018 break;
5019 case ANYOF_NDIGIT:
5020 if (LOC)
5021 ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
5022 else {
5023 /* consecutive digits assumed */
5024 for (value = 0; value < '0'; value++)
5025 ANYOF_BITMAP_SET(ret, value);
5026 for (value = '9' + 1; value < 256; value++)
5027 ANYOF_BITMAP_SET(ret, value);
5028 }
c49a72a9
NC
5029 yesno = '!';
5030 what = "Digit";
73b437c8
JH
5031 break;
5032 case ANYOF_GRAPH:
5033 if (LOC)
936ed897 5034 ANYOF_CLASS_SET(ret, ANYOF_GRAPH);
73b437c8
JH
5035 else {
5036 for (value = 0; value < 256; value++)
5037 if (isGRAPH(value))
936ed897 5038 ANYOF_BITMAP_SET(ret, value);
73b437c8 5039 }
c49a72a9
NC
5040 yesno = '+';
5041 what = "Graph";
73b437c8
JH
5042 break;
5043 case ANYOF_NGRAPH:
5044 if (LOC)
936ed897 5045 ANYOF_CLASS_SET(ret, ANYOF_NGRAPH);
73b437c8
JH
5046 else {
5047 for (value = 0; value < 256; value++)
5048 if (!isGRAPH(value))
936ed897 5049 ANYOF_BITMAP_SET(ret, value);
73b437c8 5050 }
c49a72a9
NC
5051 yesno = '!';
5052 what = "Graph";
73b437c8
JH
5053 break;
5054 case ANYOF_LOWER:
5055 if (LOC)
936ed897 5056 ANYOF_CLASS_SET(ret, ANYOF_LOWER);
73b437c8
JH
5057 else {
5058 for (value = 0; value < 256; value++)
5059 if (isLOWER(value))
936ed897 5060 ANYOF_BITMAP_SET(ret, value);
73b437c8 5061 }
c49a72a9
NC
5062 yesno = '+';
5063 what = "Lower";
73b437c8
JH
5064 break;
5065 case ANYOF_NLOWER:
5066 if (LOC)
936ed897 5067 ANYOF_CLASS_SET(ret, ANYOF_NLOWER);
73b437c8
JH
5068 else {
5069 for (value = 0; value < 256; value++)
5070 if (!isLOWER(value))
936ed897 5071 ANYOF_BITMAP_SET(ret, value);
73b437c8 5072 }
c49a72a9
NC
5073 yesno = '!';
5074 what = "Lower";
73b437c8
JH
5075 break;
5076 case ANYOF_PRINT:
5077 if (LOC)
936ed897 5078 ANYOF_CLASS_SET(ret, ANYOF_PRINT);
73b437c8
JH
5079 else {
5080 for (value = 0; value < 256; value++)
5081 if (isPRINT(value))
936ed897 5082 ANYOF_BITMAP_SET(ret, value);
73b437c8 5083 }
c49a72a9
NC
5084 yesno = '+';
5085 what = "Print";
73b437c8
JH
5086 break;
5087 case ANYOF_NPRINT:
5088 if (LOC)
936ed897 5089 ANYOF_CLASS_SET(ret, ANYOF_NPRINT);
73b437c8
JH
5090 else {
5091 for (value = 0; value < 256; value++)
5092 if (!isPRINT(value))
936ed897 5093 ANYOF_BITMAP_SET(ret, value);
73b437c8 5094 }
c49a72a9
NC
5095 yesno = '!';
5096 what = "Print";
73b437c8 5097 break;
aaa51d5e
JF
5098 case ANYOF_PSXSPC:
5099 if (LOC)
5100 ANYOF_CLASS_SET(ret, ANYOF_PSXSPC);
5101 else {
5102 for (value = 0; value < 256; value++)
5103 if (isPSXSPC(value))
5104 ANYOF_BITMAP_SET(ret, value);
5105 }
c49a72a9
NC
5106 yesno = '+';
5107 what = "Space";
aaa51d5e
JF
5108 break;
5109 case ANYOF_NPSXSPC:
5110 if (LOC)
5111 ANYOF_CLASS_SET(ret, ANYOF_NPSXSPC);
5112 else {
5113 for (value = 0; value < 256; value++)
5114 if (!isPSXSPC(value))
5115 ANYOF_BITMAP_SET(ret, value);
5116 }
c49a72a9
NC
5117 yesno = '!';
5118 what = "Space";
aaa51d5e 5119 break;
73b437c8
JH
5120 case ANYOF_PUNCT:
5121 if (LOC)
936ed897 5122 ANYOF_CLASS_SET(ret, ANYOF_PUNCT);
73b437c8
JH
5123 else {
5124 for (value = 0; value < 256; value++)
5125 if (isPUNCT(value))
936ed897 5126 ANYOF_BITMAP_SET(ret, value);
73b437c8 5127 }
c49a72a9
NC
5128 yesno = '+';
5129 what = "Punct";
73b437c8
JH
5130 break;
5131 case ANYOF_NPUNCT:
5132 if (LOC)
936ed897 5133 ANYOF_CLASS_SET(ret, ANYOF_NPUNCT);
73b437c8
JH
5134 else {
5135 for (value = 0; value < 256; value++)
5136 if (!isPUNCT(value))
936ed897 5137 ANYOF_BITMAP_SET(ret, value);
73b437c8 5138 }
c49a72a9
NC
5139 yesno = '!';
5140 what = "Punct";
ffc61ed2
JH
5141 break;
5142 case ANYOF_SPACE:
5143 if (LOC)
5144 ANYOF_CLASS_SET(ret, ANYOF_SPACE);
5145 else {
5146 for (value = 0; value < 256; value++)
5147 if (isSPACE(value))
5148 ANYOF_BITMAP_SET(ret, value);
5149 }
c49a72a9
NC
5150 yesno = '+';
5151 what = "SpacePerl";
ffc61ed2
JH
5152 break;
5153 case ANYOF_NSPACE:
5154 if (LOC)
5155 ANYOF_CLASS_SET(ret, ANYOF_NSPACE);
5156 else {
5157 for (value = 0; value < 256; value++)
5158 if (!isSPACE(value))
5159 ANYOF_BITMAP_SET(ret, value);
5160 }
c49a72a9
NC
5161 yesno = '!';
5162 what = "SpacePerl";
73b437c8
JH
5163 break;
5164 case ANYOF_UPPER:
5165 if (LOC)
936ed897 5166 ANYOF_CLASS_SET(ret, ANYOF_UPPER);
73b437c8
JH
5167 else {
5168 for (value = 0; value < 256; value++)
5169 if (isUPPER(value))
936ed897 5170 ANYOF_BITMAP_SET(ret, value);
73b437c8 5171 }
c49a72a9
NC
5172 yesno = '+';
5173 what = "Upper";
73b437c8
JH
5174 break;
5175 case ANYOF_NUPPER:
5176 if (LOC)
936ed897 5177 ANYOF_CLASS_SET(ret, ANYOF_NUPPER);
73b437c8
JH
5178 else {
5179 for (value = 0; value < 256; value++)
5180 if (!isUPPER(value))
936ed897 5181 ANYOF_BITMAP_SET(ret, value);
73b437c8 5182 }
c49a72a9
NC
5183 yesno = '!';
5184 what = "Upper";
73b437c8
JH
5185 break;
5186 case ANYOF_XDIGIT:
5187 if (LOC)
936ed897 5188 ANYOF_CLASS_SET(ret, ANYOF_XDIGIT);
73b437c8
JH
5189 else {
5190 for (value = 0; value < 256; value++)
5191 if (isXDIGIT(value))
936ed897 5192 ANYOF_BITMAP_SET(ret, value);
73b437c8 5193 }
c49a72a9
NC
5194 yesno = '+';
5195 what = "XDigit";
73b437c8
JH
5196 break;
5197 case ANYOF_NXDIGIT:
5198 if (LOC)
936ed897 5199 ANYOF_CLASS_SET(ret, ANYOF_NXDIGIT);
73b437c8
JH
5200 else {
5201 for (value = 0; value < 256; value++)
5202 if (!isXDIGIT(value))
936ed897 5203 ANYOF_BITMAP_SET(ret, value);
73b437c8 5204 }
c49a72a9
NC
5205 yesno = '!';
5206 what = "XDigit";
73b437c8 5207 break;
f81125e2
JP
5208 case ANYOF_MAX:
5209 /* this is to handle \p and \P */
5210 break;
73b437c8 5211 default:
b45f050a 5212 vFAIL("Invalid [::] class");
73b437c8 5213 break;
b8c5462f 5214 }
c49a72a9
NC
5215 if (what) {
5216 /* Strings such as "+utf8::isWord\n" */
5217 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::Is%s\n", yesno, what);
5218 }
b8c5462f 5219 if (LOC)
936ed897 5220 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
73b437c8 5221 continue;
a0d0e21e 5222 }
ffc61ed2
JH
5223 } /* end of namedclass \blah */
5224
a0d0e21e 5225 if (range) {
eb160463 5226 if (prevvalue > (IV)value) /* b-a */ {
b45f050a 5227 Simple_vFAIL4("Invalid [] range \"%*.*s\"",
830247a4
IZ
5228 RExC_parse - rangebegin,
5229 RExC_parse - rangebegin,
b45f050a 5230 rangebegin);
3568d838 5231 range = 0; /* not a valid range */
73b437c8 5232 }
a0d0e21e
LW
5233 }
5234 else {
3568d838 5235 prevvalue = value; /* save the beginning of the range */
830247a4
IZ
5236 if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
5237 RExC_parse[1] != ']') {
5238 RExC_parse++;
ffc61ed2
JH
5239
5240 /* a bad range like \w-, [:word:]- ? */
5241 if (namedclass > OOB_NAMEDCLASS) {
afd78fd5
JH
5242 if (ckWARN(WARN_REGEXP)) {
5243 int w =
5244 RExC_parse >= rangebegin ?
5245 RExC_parse - rangebegin : 0;
830247a4 5246 vWARN4(RExC_parse,
b45f050a 5247 "False [] range \"%*.*s\"",
afd78fd5
JH
5248 w,
5249 w,
b45f050a 5250 rangebegin);
afd78fd5 5251 }
73b437c8 5252 if (!SIZE_ONLY)
936ed897 5253 ANYOF_BITMAP_SET(ret, '-');
73b437c8 5254 } else
ffc61ed2
JH
5255 range = 1; /* yeah, it's a range! */
5256 continue; /* but do it the next time */
a0d0e21e 5257 }
a687059c 5258 }
ffc61ed2 5259
93733859 5260 /* now is the next time */
ae5c130c 5261 if (!SIZE_ONLY) {
3568d838
JH
5262 IV i;
5263
5264 if (prevvalue < 256) {
1df70142 5265 const IV ceilvalue = value < 256 ? value : 255;
3568d838
JH
5266
5267#ifdef EBCDIC
1b2d223b
JH
5268 /* In EBCDIC [\x89-\x91] should include
5269 * the \x8e but [i-j] should not. */
5270 if (literal_endpoint == 2 &&
5271 ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
5272 (isUPPER(prevvalue) && isUPPER(ceilvalue))))
ffc61ed2 5273 {
3568d838
JH
5274 if (isLOWER(prevvalue)) {
5275 for (i = prevvalue; i <= ceilvalue; i++)
ffc61ed2
JH
5276 if (isLOWER(i))
5277 ANYOF_BITMAP_SET(ret, i);
5278 } else {
3568d838 5279 for (i = prevvalue; i <= ceilvalue; i++)
ffc61ed2
JH
5280 if (isUPPER(i))
5281 ANYOF_BITMAP_SET(ret, i);
5282 }
8ada0baa 5283 }
ffc61ed2 5284 else
8ada0baa 5285#endif
a5961de5
JH
5286 for (i = prevvalue; i <= ceilvalue; i++)
5287 ANYOF_BITMAP_SET(ret, i);
3568d838 5288 }
a5961de5 5289 if (value > 255 || UTF) {
1df70142
AL
5290 const UV prevnatvalue = NATIVE_TO_UNI(prevvalue);
5291 const UV natvalue = NATIVE_TO_UNI(value);
b08decb7 5292
ffc61ed2 5293 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
b08decb7 5294 if (prevnatvalue < natvalue) { /* what about > ? */
ffc61ed2 5295 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
b08decb7
JH
5296 prevnatvalue, natvalue);
5297 }
5298 else if (prevnatvalue == natvalue) {
5299 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue);
09091399 5300 if (FOLD) {
89ebb4a3 5301 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
254ba52a 5302 STRLEN foldlen;
1df70142 5303 const UV f = to_uni_fold(natvalue, foldbuf, &foldlen);
254ba52a 5304
c840d2a2
JH
5305 /* If folding and foldable and a single
5306 * character, insert also the folded version
5307 * to the charclass. */
9e55ce06 5308 if (f != value) {
eb160463 5309 if (foldlen == (STRLEN)UNISKIP(f))
9e55ce06
JH
5310 Perl_sv_catpvf(aTHX_ listsv,
5311 "%04"UVxf"\n", f);
5312 else {
5313 /* Any multicharacter foldings
5314 * require the following transform:
5315 * [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst)
5316 * where E folds into "pq" and F folds
5317 * into "rst", all other characters
5318 * fold to single characters. We save
5319 * away these multicharacter foldings,
5320 * to be later saved as part of the
5321 * additional "s" data. */
5322 SV *sv;
5323
5324 if (!unicode_alternate)
5325 unicode_alternate = newAV();
5326 sv = newSVpvn((char*)foldbuf, foldlen);
5327 SvUTF8_on(sv);
5328 av_push(unicode_alternate, sv);
5329 }
5330 }
254ba52a 5331
60a8b682
JH
5332 /* If folding and the value is one of the Greek
5333 * sigmas insert a few more sigmas to make the
5334 * folding rules of the sigmas to work right.
5335 * Note that not all the possible combinations
5336 * are handled here: some of them are handled
9e55ce06
JH
5337 * by the standard folding rules, and some of
5338 * them (literal or EXACTF cases) are handled
5339 * during runtime in regexec.c:S_find_byclass(). */
09091399
JH
5340 if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) {
5341 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 5342 (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA);
09091399 5343 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 5344 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
09091399
JH
5345 }
5346 else if (value == UNICODE_GREEK_CAPITAL_LETTER_SIGMA)
5347 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 5348 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
09091399
JH
5349 }
5350 }
ffc61ed2 5351 }
1b2d223b
JH
5352#ifdef EBCDIC
5353 literal_endpoint = 0;
5354#endif
8ada0baa 5355 }
ffc61ed2
JH
5356
5357 range = 0; /* this range (if it was one) is done now */
a0d0e21e 5358 }
ffc61ed2 5359
936ed897 5360 if (need_class) {
4f66b38d 5361 ANYOF_FLAGS(ret) |= ANYOF_LARGE;
936ed897 5362 if (SIZE_ONLY)
830247a4 5363 RExC_size += ANYOF_CLASS_ADD_SKIP;
936ed897 5364 else
830247a4 5365 RExC_emit += ANYOF_CLASS_ADD_SKIP;
936ed897 5366 }
ffc61ed2 5367
ae5c130c 5368 /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
b8c5462f 5369 if (!SIZE_ONLY &&
ffc61ed2 5370 /* If the only flag is folding (plus possibly inversion). */
516a5887
JH
5371 ((ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD)
5372 ) {
a0ed51b3 5373 for (value = 0; value < 256; ++value) {
936ed897 5374 if (ANYOF_BITMAP_TEST(ret, value)) {
eb160463 5375 UV fold = PL_fold[value];
ffc61ed2
JH
5376
5377 if (fold != value)
5378 ANYOF_BITMAP_SET(ret, fold);
ae5c130c
GS
5379 }
5380 }
936ed897 5381 ANYOF_FLAGS(ret) &= ~ANYOF_FOLD;
ae5c130c 5382 }
ffc61ed2 5383
ae5c130c 5384 /* optimize inverted simple patterns (e.g. [^a-z]) */
3568d838 5385 if (!SIZE_ONLY && optimize_invert &&
ffc61ed2
JH
5386 /* If the only flag is inversion. */
5387 (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
b8c5462f 5388 for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
936ed897 5389 ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL;
1aa99e6b 5390 ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
ae5c130c 5391 }
a0d0e21e 5392
b81d288d 5393 if (!SIZE_ONLY) {
fde631ed 5394 AV *av = newAV();
ffc61ed2
JH
5395 SV *rv;
5396
9e55ce06 5397 /* The 0th element stores the character class description
6a0407ee 5398 * in its textual form: used later (regexec.c:Perl_regclass_swash())
9e55ce06
JH
5399 * to initialize the appropriate swash (which gets stored in
5400 * the 1st element), and also useful for dumping the regnode.
5401 * The 2nd element stores the multicharacter foldings,
6a0407ee 5402 * used later (regexec.c:S_reginclass()). */
ffc61ed2
JH
5403 av_store(av, 0, listsv);
5404 av_store(av, 1, NULL);
9e55ce06 5405 av_store(av, 2, (SV*)unicode_alternate);
ffc61ed2 5406 rv = newRV_noinc((SV*)av);
19860706 5407 n = add_data(pRExC_state, 1, "s");
830247a4 5408 RExC_rx->data->data[n] = (void*)rv;
ffc61ed2 5409 ARG_SET(ret, n);
a0ed51b3
LW
5410 }
5411
5412 return ret;
5413}
5414
76e3520e 5415STATIC char*
830247a4 5416S_nextchar(pTHX_ RExC_state_t *pRExC_state)
a0d0e21e 5417{
830247a4 5418 char* retval = RExC_parse++;
a0d0e21e 5419
4633a7c4 5420 for (;;) {
830247a4
IZ
5421 if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
5422 RExC_parse[2] == '#') {
e994fd66
AE
5423 while (*RExC_parse != ')') {
5424 if (RExC_parse == RExC_end)
5425 FAIL("Sequence (?#... not terminated");
830247a4 5426 RExC_parse++;
e994fd66 5427 }
830247a4 5428 RExC_parse++;
4633a7c4
LW
5429 continue;
5430 }
e2509266 5431 if (RExC_flags & PMf_EXTENDED) {
830247a4
IZ
5432 if (isSPACE(*RExC_parse)) {
5433 RExC_parse++;
748a9306
LW
5434 continue;
5435 }
830247a4 5436 else if (*RExC_parse == '#') {
e994fd66
AE
5437 while (RExC_parse < RExC_end)
5438 if (*RExC_parse++ == '\n') break;
748a9306
LW
5439 continue;
5440 }
748a9306 5441 }
4633a7c4 5442 return retval;
a0d0e21e 5443 }
a687059c
LW
5444}
5445
5446/*
c277df42 5447- reg_node - emit a node
a0d0e21e 5448*/
76e3520e 5449STATIC regnode * /* Location. */
830247a4 5450S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
a687059c 5451{
97aff369 5452 dVAR;
c277df42 5453 register regnode *ptr;
504618e9 5454 regnode * const ret = RExC_emit;
a687059c 5455
c277df42 5456 if (SIZE_ONLY) {
830247a4
IZ
5457 SIZE_ALIGN(RExC_size);
5458 RExC_size += 1;
a0d0e21e
LW
5459 return(ret);
5460 }
a687059c 5461
c277df42 5462 NODE_ALIGN_FILL(ret);
a0d0e21e 5463 ptr = ret;
c277df42 5464 FILL_ADVANCE_NODE(ptr, op);
fac92740 5465 if (RExC_offsets) { /* MJD */
ccb2c380 5466 MJD_OFFSET_DEBUG(("%s:%u: (op %s) %s %u <- %u (len %u) (max %u).\n",
fac92740
MJD
5467 "reg_node", __LINE__,
5468 reg_name[op],
5469 RExC_emit - RExC_emit_start > RExC_offsets[0]
5470 ? "Overwriting end of array!\n" : "OK",
5471 RExC_emit - RExC_emit_start,
5472 RExC_parse - RExC_start,
5473 RExC_offsets[0]));
ccb2c380 5474 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
fac92740
MJD
5475 }
5476
830247a4 5477 RExC_emit = ptr;
a687059c 5478
a0d0e21e 5479 return(ret);
a687059c
LW
5480}
5481
5482/*
a0d0e21e
LW
5483- reganode - emit a node with an argument
5484*/
76e3520e 5485STATIC regnode * /* Location. */
830247a4 5486S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
fe14fcc3 5487{
97aff369 5488 dVAR;
c277df42 5489 register regnode *ptr;
504618e9 5490 regnode * const ret = RExC_emit;
fe14fcc3 5491
c277df42 5492 if (SIZE_ONLY) {
830247a4
IZ
5493 SIZE_ALIGN(RExC_size);
5494 RExC_size += 2;
a0d0e21e
LW
5495 return(ret);
5496 }
fe14fcc3 5497
c277df42 5498 NODE_ALIGN_FILL(ret);
a0d0e21e 5499 ptr = ret;
c277df42 5500 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
fac92740 5501 if (RExC_offsets) { /* MJD */
ccb2c380 5502 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %u <- %u (max %u).\n",
fac92740 5503 "reganode",
ccb2c380
MP
5504 __LINE__,
5505 reg_name[op],
fac92740
MJD
5506 RExC_emit - RExC_emit_start > RExC_offsets[0] ?
5507 "Overwriting end of array!\n" : "OK",
5508 RExC_emit - RExC_emit_start,
5509 RExC_parse - RExC_start,
5510 RExC_offsets[0]));
ccb2c380 5511 Set_Cur_Node_Offset;
fac92740
MJD
5512 }
5513
830247a4 5514 RExC_emit = ptr;
fe14fcc3 5515
a0d0e21e 5516 return(ret);
fe14fcc3
LW
5517}
5518
5519/*
cd439c50 5520- reguni - emit (if appropriate) a Unicode character
a0ed51b3
LW
5521*/
5522STATIC void
a28509cc 5523S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s, STRLEN* lenp)
a0ed51b3 5524{
97aff369 5525 dVAR;
5e12f4fb 5526 *lenp = SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
a0ed51b3
LW
5527}
5528
5529/*
a0d0e21e
LW
5530- reginsert - insert an operator in front of already-emitted operand
5531*
5532* Means relocating the operand.
5533*/
76e3520e 5534STATIC void
830247a4 5535S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd)
a687059c 5536{
97aff369 5537 dVAR;
c277df42
IZ
5538 register regnode *src;
5539 register regnode *dst;
5540 register regnode *place;
504618e9 5541 const int offset = regarglen[(U8)op];
b81d288d 5542
22c35a8c 5543/* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
c277df42
IZ
5544
5545 if (SIZE_ONLY) {
830247a4 5546 RExC_size += NODE_STEP_REGNODE + offset;
a0d0e21e
LW
5547 return;
5548 }
a687059c 5549
830247a4
IZ
5550 src = RExC_emit;
5551 RExC_emit += NODE_STEP_REGNODE + offset;
5552 dst = RExC_emit;
fac92740 5553 while (src > opnd) {
c277df42 5554 StructCopy(--src, --dst, regnode);
fac92740 5555 if (RExC_offsets) { /* MJD 20010112 */
ccb2c380 5556 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %u -> %u (max %u).\n",
fac92740 5557 "reg_insert",
ccb2c380
MP
5558 __LINE__,
5559 reg_name[op],
fac92740
MJD
5560 dst - RExC_emit_start > RExC_offsets[0]
5561 ? "Overwriting end of array!\n" : "OK",
5562 src - RExC_emit_start,
5563 dst - RExC_emit_start,
5564 RExC_offsets[0]));
ccb2c380
MP
5565 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
5566 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
fac92740
MJD
5567 }
5568 }
5569
a0d0e21e
LW
5570
5571 place = opnd; /* Op node, where operand used to be. */
fac92740 5572 if (RExC_offsets) { /* MJD */
ccb2c380 5573 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %u <- %u (max %u).\n",
fac92740 5574 "reginsert",
ccb2c380
MP
5575 __LINE__,
5576 reg_name[op],
fac92740
MJD
5577 place - RExC_emit_start > RExC_offsets[0]
5578 ? "Overwriting end of array!\n" : "OK",
5579 place - RExC_emit_start,
5580 RExC_parse - RExC_start,
5581 RExC_offsets[0]));
ccb2c380 5582 Set_Node_Offset(place, RExC_parse);
45948336 5583 Set_Node_Length(place, 1);
fac92740 5584 }
c277df42
IZ
5585 src = NEXTOPER(place);
5586 FILL_ADVANCE_NODE(place, op);
5587 Zero(src, offset, regnode);
a687059c
LW
5588}
5589
5590/*
c277df42 5591- regtail - set the next-pointer at the end of a node chain of p to val.
a0d0e21e 5592*/
76e3520e 5593STATIC void
830247a4 5594S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, regnode *val)
a687059c 5595{
97aff369 5596 dVAR;
c277df42 5597 register regnode *scan;
a0d0e21e 5598
c277df42 5599 if (SIZE_ONLY)
a0d0e21e
LW
5600 return;
5601
5602 /* Find last node. */
5603 scan = p;
5604 for (;;) {
504618e9 5605 regnode * const temp = regnext(scan);
a0d0e21e
LW
5606 if (temp == NULL)
5607 break;
5608 scan = temp;
5609 }
a687059c 5610
c277df42
IZ
5611 if (reg_off_by_arg[OP(scan)]) {
5612 ARG_SET(scan, val - scan);
a0ed51b3
LW
5613 }
5614 else {
c277df42
IZ
5615 NEXT_OFF(scan) = val - scan;
5616 }
a687059c
LW
5617}
5618
5619/*
a0d0e21e
LW
5620- regoptail - regtail on operand of first argument; nop if operandless
5621*/
76e3520e 5622STATIC void
830247a4 5623S_regoptail(pTHX_ RExC_state_t *pRExC_state, regnode *p, regnode *val)
a687059c 5624{
97aff369 5625 dVAR;
a0d0e21e 5626 /* "Operandless" and "op != BRANCH" are synonymous in practice. */
c277df42
IZ
5627 if (p == NULL || SIZE_ONLY)
5628 return;
22c35a8c 5629 if (PL_regkind[(U8)OP(p)] == BRANCH) {
830247a4 5630 regtail(pRExC_state, NEXTOPER(p), val);
a0ed51b3 5631 }
22c35a8c 5632 else if ( PL_regkind[(U8)OP(p)] == BRANCHJ) {
830247a4 5633 regtail(pRExC_state, NEXTOPER(NEXTOPER(p)), val);
a0ed51b3
LW
5634 }
5635 else
a0d0e21e 5636 return;
a687059c
LW
5637}
5638
5639/*
5640 - regcurly - a little FSA that accepts {\d+,?\d*}
5641 */
79072805 5642STATIC I32
5f66b61c 5643S_regcurly(register const char *s)
a687059c
LW
5644{
5645 if (*s++ != '{')
5646 return FALSE;
f0fcb552 5647 if (!isDIGIT(*s))
a687059c 5648 return FALSE;
f0fcb552 5649 while (isDIGIT(*s))
a687059c
LW
5650 s++;
5651 if (*s == ',')
5652 s++;
f0fcb552 5653 while (isDIGIT(*s))
a687059c
LW
5654 s++;
5655 if (*s != '}')
5656 return FALSE;
5657 return TRUE;
5658}
5659
a687059c
LW
5660
5661/*
fd181c75 5662 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
a687059c
LW
5663 */
5664void
864dbfa3 5665Perl_regdump(pTHX_ regexp *r)
a687059c 5666{
35ff7856 5667#ifdef DEBUGGING
97aff369 5668 dVAR;
c445ea15 5669 SV * const sv = sv_newmortal();
a687059c 5670
c277df42 5671 (void)dumpuntil(r->program, r->program + 1, NULL, sv, 0);
a0d0e21e
LW
5672
5673 /* Header fields of interest. */
c277df42 5674 if (r->anchored_substr)
7b0972df 5675 PerlIO_printf(Perl_debug_log,
a0288114 5676 "anchored \"%s%.*s%s\"%s at %"IVdf" ",
3280af22 5677 PL_colors[0],
7b0972df 5678 (int)(SvCUR(r->anchored_substr) - (SvTAIL(r->anchored_substr)!=0)),
3f7c398e 5679 SvPVX_const(r->anchored_substr),
3280af22 5680 PL_colors[1],
c277df42 5681 SvTAIL(r->anchored_substr) ? "$" : "",
7b0972df 5682 (IV)r->anchored_offset);
33b8afdf
JH
5683 else if (r->anchored_utf8)
5684 PerlIO_printf(Perl_debug_log,
a0288114 5685 "anchored utf8 \"%s%.*s%s\"%s at %"IVdf" ",
33b8afdf
JH
5686 PL_colors[0],
5687 (int)(SvCUR(r->anchored_utf8) - (SvTAIL(r->anchored_utf8)!=0)),
3f7c398e 5688 SvPVX_const(r->anchored_utf8),
33b8afdf
JH
5689 PL_colors[1],
5690 SvTAIL(r->anchored_utf8) ? "$" : "",
5691 (IV)r->anchored_offset);
c277df42 5692 if (r->float_substr)
7b0972df 5693 PerlIO_printf(Perl_debug_log,
a0288114 5694 "floating \"%s%.*s%s\"%s at %"IVdf"..%"UVuf" ",
3280af22 5695 PL_colors[0],
b81d288d 5696 (int)(SvCUR(r->float_substr) - (SvTAIL(r->float_substr)!=0)),
3f7c398e 5697 SvPVX_const(r->float_substr),
3280af22 5698 PL_colors[1],
c277df42 5699 SvTAIL(r->float_substr) ? "$" : "",
7b0972df 5700 (IV)r->float_min_offset, (UV)r->float_max_offset);
33b8afdf
JH
5701 else if (r->float_utf8)
5702 PerlIO_printf(Perl_debug_log,
a0288114 5703 "floating utf8 \"%s%.*s%s\"%s at %"IVdf"..%"UVuf" ",
33b8afdf
JH
5704 PL_colors[0],
5705 (int)(SvCUR(r->float_utf8) - (SvTAIL(r->float_utf8)!=0)),
3f7c398e 5706 SvPVX_const(r->float_utf8),
33b8afdf
JH
5707 PL_colors[1],
5708 SvTAIL(r->float_utf8) ? "$" : "",
5709 (IV)r->float_min_offset, (UV)r->float_max_offset);
5710 if (r->check_substr || r->check_utf8)
b81d288d
AB
5711 PerlIO_printf(Perl_debug_log,
5712 r->check_substr == r->float_substr
33b8afdf 5713 && r->check_utf8 == r->float_utf8
c277df42
IZ
5714 ? "(checking floating" : "(checking anchored");
5715 if (r->reganch & ROPT_NOSCAN)
5716 PerlIO_printf(Perl_debug_log, " noscan");
5717 if (r->reganch & ROPT_CHECK_ALL)
5718 PerlIO_printf(Perl_debug_log, " isall");
33b8afdf 5719 if (r->check_substr || r->check_utf8)
c277df42
IZ
5720 PerlIO_printf(Perl_debug_log, ") ");
5721
46fc3d4c 5722 if (r->regstclass) {
5723 regprop(sv, r->regstclass);
3f7c398e 5724 PerlIO_printf(Perl_debug_log, "stclass \"%s\" ", SvPVX_const(sv));
46fc3d4c 5725 }
774d564b 5726 if (r->reganch & ROPT_ANCH) {
5727 PerlIO_printf(Perl_debug_log, "anchored");
5728 if (r->reganch & ROPT_ANCH_BOL)
5729 PerlIO_printf(Perl_debug_log, "(BOL)");
c277df42
IZ
5730 if (r->reganch & ROPT_ANCH_MBOL)
5731 PerlIO_printf(Perl_debug_log, "(MBOL)");
cad2e5aa
JH
5732 if (r->reganch & ROPT_ANCH_SBOL)
5733 PerlIO_printf(Perl_debug_log, "(SBOL)");
774d564b 5734 if (r->reganch & ROPT_ANCH_GPOS)
5735 PerlIO_printf(Perl_debug_log, "(GPOS)");
5736 PerlIO_putc(Perl_debug_log, ' ');
5737 }
c277df42
IZ
5738 if (r->reganch & ROPT_GPOS_SEEN)
5739 PerlIO_printf(Perl_debug_log, "GPOS ");
a0d0e21e 5740 if (r->reganch & ROPT_SKIP)
760ac839 5741 PerlIO_printf(Perl_debug_log, "plus ");
a0d0e21e 5742 if (r->reganch & ROPT_IMPLICIT)
760ac839 5743 PerlIO_printf(Perl_debug_log, "implicit ");
760ac839 5744 PerlIO_printf(Perl_debug_log, "minlen %ld ", (long) r->minlen);
ce862d02
IZ
5745 if (r->reganch & ROPT_EVAL_SEEN)
5746 PerlIO_printf(Perl_debug_log, "with eval ");
760ac839 5747 PerlIO_printf(Perl_debug_log, "\n");
fac92740 5748 if (r->offsets) {
e4584336 5749 const U32 len = r->offsets[0];
a3621e74
YO
5750 GET_RE_DEBUG_FLAGS_DECL;
5751 DEBUG_OFFSETS_r({
1df70142 5752 U32 i;
e4584336
RB
5753 PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)r->offsets[0]);
5754 for (i = 1; i <= len; i++)
5755 PerlIO_printf(Perl_debug_log, "%"UVuf"[%"UVuf"] ",
5756 (UV)r->offsets[i*2-1], (UV)r->offsets[i*2]);
5757 PerlIO_printf(Perl_debug_log, "\n");
a3621e74 5758 });
fac92740 5759 }
65e66c80 5760#else
96a5add6 5761 PERL_UNUSED_CONTEXT;
65e66c80 5762 PERL_UNUSED_ARG(r);
17c3b450 5763#endif /* DEBUGGING */
a687059c
LW
5764}
5765
5766/*
a0d0e21e
LW
5767- regprop - printable representation of opcode
5768*/
46fc3d4c 5769void
a3b680e6 5770Perl_regprop(pTHX_ SV *sv, const regnode *o)
a687059c 5771{
35ff7856 5772#ifdef DEBUGGING
97aff369 5773 dVAR;
9b155405 5774 register int k;
a0d0e21e 5775
54dc92de 5776 sv_setpvn(sv, "", 0);
9b155405 5777 if (OP(o) >= reg_num) /* regnode.type is unsigned */
830247a4
IZ
5778 /* It would be nice to FAIL() here, but this may be called from
5779 regexec.c, and it would be hard to supply pRExC_state. */
5780 Perl_croak(aTHX_ "Corrupted regexp opcode");
bfed75c6 5781 sv_catpv(sv, reg_name[OP(o)]); /* Take off const! */
9b155405
IZ
5782
5783 k = PL_regkind[(U8)OP(o)];
5784
2a782b5b 5785 if (k == EXACT) {
396482e1 5786 SV * const dsv = sv_2mortal(newSVpvs(""));
c728cb41
JH
5787 /* Using is_utf8_string() is a crude hack but it may
5788 * be the best for now since we have no flag "this EXACTish
5789 * node was UTF-8" --jhi */
1df70142 5790 const bool do_utf8 = is_utf8_string((U8*)STRING(o), STR_LEN(o));
2d03de9c 5791 const char * const s = do_utf8 ?
c728cb41
JH
5792 pv_uni_display(dsv, (U8*)STRING(o), STR_LEN(o), 60,
5793 UNI_DISPLAY_REGEX) :
2a782b5b 5794 STRING(o);
e1ec3a88 5795 const int len = do_utf8 ?
2a782b5b
JH
5796 strlen(s) :
5797 STR_LEN(o);
5798 Perl_sv_catpvf(aTHX_ sv, " <%s%.*s%s>",
5799 PL_colors[0],
5800 len, s,
5801 PL_colors[1]);
bb263b4e
AL
5802 } else if (k == TRIE) {
5803 /*EMPTY*/;
5804 /*
a3621e74
YO
5805 this isn't always safe, as Pl_regdata may not be for this regex yet
5806 (depending on where its called from) so its being moved to dumpuntil
5807 I32 n = ARG(o);
5808 reg_trie_data *trie=(reg_trie_data*)PL_regdata->data[n];
5809 Perl_sv_catpvf(aTHX_ sv, " (W:%d L:%d C:%d S:%d)",
5810 trie->wordcount,
5811 trie->charcount,
5812 trie->uniquecharcount,
5813 trie->laststate);
5814 */
5815 } else if (k == CURLY) {
cb434fcc 5816 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
cea2e8a9
GS
5817 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
5818 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
a0d0e21e 5819 }
2c2d71f5
JH
5820 else if (k == WHILEM && o->flags) /* Ordinal/of */
5821 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
9b155405 5822 else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP )
894356b3 5823 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
9b155405 5824 else if (k == LOGICAL)
04ebc1ab 5825 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
653099ff
GS
5826 else if (k == ANYOF) {
5827 int i, rangestart = -1;
2d03de9c 5828 const U8 flags = ANYOF_FLAGS(o);
0bd48802
AL
5829
5830 /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
5831 static const char * const anyofs[] = {
653099ff
GS
5832 "\\w",
5833 "\\W",
5834 "\\s",
5835 "\\S",
5836 "\\d",
5837 "\\D",
5838 "[:alnum:]",
5839 "[:^alnum:]",
5840 "[:alpha:]",
5841 "[:^alpha:]",
5842 "[:ascii:]",
5843 "[:^ascii:]",
5844 "[:ctrl:]",
5845 "[:^ctrl:]",
5846 "[:graph:]",
5847 "[:^graph:]",
5848 "[:lower:]",
5849 "[:^lower:]",
5850 "[:print:]",
5851 "[:^print:]",
5852 "[:punct:]",
5853 "[:^punct:]",
5854 "[:upper:]",
aaa51d5e 5855 "[:^upper:]",
653099ff 5856 "[:xdigit:]",
aaa51d5e
JF
5857 "[:^xdigit:]",
5858 "[:space:]",
5859 "[:^space:]",
5860 "[:blank:]",
5861 "[:^blank:]"
653099ff
GS
5862 };
5863
19860706 5864 if (flags & ANYOF_LOCALE)
396482e1 5865 sv_catpvs(sv, "{loc}");
19860706 5866 if (flags & ANYOF_FOLD)
396482e1 5867 sv_catpvs(sv, "{i}");
653099ff 5868 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
19860706 5869 if (flags & ANYOF_INVERT)
396482e1 5870 sv_catpvs(sv, "^");
ffc61ed2
JH
5871 for (i = 0; i <= 256; i++) {
5872 if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
5873 if (rangestart == -1)
5874 rangestart = i;
5875 } else if (rangestart != -1) {
5876 if (i <= rangestart + 3)
5877 for (; rangestart < i; rangestart++)
653099ff 5878 put_byte(sv, rangestart);
ffc61ed2
JH
5879 else {
5880 put_byte(sv, rangestart);
396482e1 5881 sv_catpvs(sv, "-");
ffc61ed2 5882 put_byte(sv, i - 1);
653099ff 5883 }
ffc61ed2 5884 rangestart = -1;
653099ff 5885 }
847a199f 5886 }
ffc61ed2
JH
5887
5888 if (o->flags & ANYOF_CLASS)
5889 for (i = 0; i < sizeof(anyofs)/sizeof(char*); i++)
5890 if (ANYOF_CLASS_TEST(o,i))
5891 sv_catpv(sv, anyofs[i]);
5892
5893 if (flags & ANYOF_UNICODE)
396482e1 5894 sv_catpvs(sv, "{unicode}");
1aa99e6b 5895 else if (flags & ANYOF_UNICODE_ALL)
396482e1 5896 sv_catpvs(sv, "{unicode_all}");
ffc61ed2
JH
5897
5898 {
5899 SV *lv;
2d03de9c 5900 SV * const sw = regclass_swash(o, FALSE, &lv, 0);
b81d288d 5901
ffc61ed2
JH
5902 if (lv) {
5903 if (sw) {
89ebb4a3 5904 U8 s[UTF8_MAXBYTES_CASE+1];
b81d288d 5905
ffc61ed2 5906 for (i = 0; i <= 256; i++) { /* just the first 256 */
1df70142 5907 uvchr_to_utf8(s, i);
ffc61ed2 5908
3568d838 5909 if (i < 256 && swash_fetch(sw, s, TRUE)) {
ffc61ed2
JH
5910 if (rangestart == -1)
5911 rangestart = i;
5912 } else if (rangestart != -1) {
ffc61ed2
JH
5913 if (i <= rangestart + 3)
5914 for (; rangestart < i; rangestart++) {
2d03de9c
AL
5915 const U8 * const e = uvchr_to_utf8(s,rangestart);
5916 U8 *p;
5917 for(p = s; p < e; p++)
ffc61ed2
JH
5918 put_byte(sv, *p);
5919 }
5920 else {
2d03de9c
AL
5921 const U8 *e = uvchr_to_utf8(s,rangestart);
5922 U8 *p;
5923 for (p = s; p < e; p++)
ffc61ed2 5924 put_byte(sv, *p);
396482e1 5925 sv_catpvs(sv, "-");
2d03de9c
AL
5926 e = uvchr_to_utf8(s, i-1);
5927 for (p = s; p < e; p++)
1df70142 5928 put_byte(sv, *p);
ffc61ed2
JH
5929 }
5930 rangestart = -1;
5931 }
19860706 5932 }
ffc61ed2 5933
396482e1 5934 sv_catpvs(sv, "..."); /* et cetera */
19860706 5935 }
fde631ed 5936
ffc61ed2 5937 {
2e0de35c 5938 char *s = savesvpv(lv);
c445ea15 5939 char * const origs = s;
b81d288d 5940
ffc61ed2 5941 while(*s && *s != '\n') s++;
b81d288d 5942
ffc61ed2 5943 if (*s == '\n') {
2d03de9c 5944 const char * const t = ++s;
ffc61ed2
JH
5945
5946 while (*s) {
5947 if (*s == '\n')
5948 *s = ' ';
5949 s++;
5950 }
5951 if (s[-1] == ' ')
5952 s[-1] = 0;
5953
5954 sv_catpv(sv, t);
fde631ed 5955 }
b81d288d 5956
ffc61ed2 5957 Safefree(origs);
fde631ed
JH
5958 }
5959 }
653099ff 5960 }
ffc61ed2 5961
653099ff
GS
5962 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
5963 }
9b155405 5964 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
cea2e8a9 5965 Perl_sv_catpvf(aTHX_ sv, "[-%d]", o->flags);
65e66c80 5966#else
96a5add6 5967 PERL_UNUSED_CONTEXT;
65e66c80
SP
5968 PERL_UNUSED_ARG(sv);
5969 PERL_UNUSED_ARG(o);
17c3b450 5970#endif /* DEBUGGING */
35ff7856 5971}
a687059c 5972
cad2e5aa
JH
5973SV *
5974Perl_re_intuit_string(pTHX_ regexp *prog)
5975{ /* Assume that RE_INTUIT is set */
97aff369 5976 dVAR;
a3621e74 5977 GET_RE_DEBUG_FLAGS_DECL;
96a5add6
AL
5978 PERL_UNUSED_CONTEXT;
5979
a3621e74 5980 DEBUG_COMPILE_r(
cfd0369c 5981 {
2d03de9c 5982 const char * const s = SvPV_nolen_const(prog->check_substr
cfd0369c 5983 ? prog->check_substr : prog->check_utf8);
cad2e5aa
JH
5984
5985 if (!PL_colorset) reginitcolors();
5986 PerlIO_printf(Perl_debug_log,
a0288114 5987 "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
33b8afdf
JH
5988 PL_colors[4],
5989 prog->check_substr ? "" : "utf8 ",
5990 PL_colors[5],PL_colors[0],
cad2e5aa
JH
5991 s,
5992 PL_colors[1],
5993 (strlen(s) > 60 ? "..." : ""));
5994 } );
5995
33b8afdf 5996 return prog->check_substr ? prog->check_substr : prog->check_utf8;
cad2e5aa
JH
5997}
5998
2b69d0c2 5999void
864dbfa3 6000Perl_pregfree(pTHX_ struct regexp *r)
a687059c 6001{
27da23d5 6002 dVAR;
9e55ce06 6003#ifdef DEBUGGING
c445ea15
AL
6004 SV * const dsv = PERL_DEBUG_PAD_ZERO(0);
6005 SV * const re_debug_flags=get_sv(RE_DEBUG_FLAGS,0);
9e55ce06 6006#endif
7821416a 6007
a3621e74 6008
7821416a
IZ
6009 if (!r || (--r->refcnt > 0))
6010 return;
a3621e74 6011 DEBUG_r(if (re_debug_flags && (SvIV(re_debug_flags) & RE_DEBUG_COMPILE)) {
c445ea15 6012 const char * const s = (r->reganch & ROPT_UTF8)
e1ec3a88 6013 ? pv_uni_display(dsv, (U8*)r->precomp, r->prelen, 60, UNI_DISPLAY_REGEX)
9f369894 6014 : pv_display(dsv, r->precomp, r->prelen, 0, 60);
e1ec3a88 6015 const int len = SvCUR(dsv);
9e55ce06
JH
6016 if (!PL_colorset)
6017 reginitcolors();
6018 PerlIO_printf(Perl_debug_log,
a3621e74 6019 "%sFreeing REx:%s %s%*.*s%s%s\n",
9e55ce06
JH
6020 PL_colors[4],PL_colors[5],PL_colors[0],
6021 len, len, s,
6022 PL_colors[1],
6023 len > 60 ? "..." : "");
6024 });
cad2e5aa 6025
43c5f42d
NC
6026 /* gcov results gave these as non-null 100% of the time, so there's no
6027 optimisation in checking them before calling Safefree */
6028 Safefree(r->precomp);
6029 Safefree(r->offsets); /* 20010421 MJD */
ed252734 6030 RX_MATCH_COPY_FREE(r);
f8c7b90f 6031#ifdef PERL_OLD_COPY_ON_WRITE
ed252734
NC
6032 if (r->saved_copy)
6033 SvREFCNT_dec(r->saved_copy);
6034#endif
a193d654
GS
6035 if (r->substrs) {
6036 if (r->anchored_substr)
6037 SvREFCNT_dec(r->anchored_substr);
33b8afdf
JH
6038 if (r->anchored_utf8)
6039 SvREFCNT_dec(r->anchored_utf8);
a193d654
GS
6040 if (r->float_substr)
6041 SvREFCNT_dec(r->float_substr);
33b8afdf
JH
6042 if (r->float_utf8)
6043 SvREFCNT_dec(r->float_utf8);
2779dcf1 6044 Safefree(r->substrs);
a193d654 6045 }
c277df42
IZ
6046 if (r->data) {
6047 int n = r->data->count;
f3548bdc
DM
6048 PAD* new_comppad = NULL;
6049 PAD* old_comppad;
4026c95a 6050 PADOFFSET refcnt;
dfad63ad 6051
c277df42 6052 while (--n >= 0) {
261faec3 6053 /* If you add a ->what type here, update the comment in regcomp.h */
c277df42
IZ
6054 switch (r->data->what[n]) {
6055 case 's':
6056 SvREFCNT_dec((SV*)r->data->data[n]);
6057 break;
653099ff
GS
6058 case 'f':
6059 Safefree(r->data->data[n]);
6060 break;
dfad63ad
HS
6061 case 'p':
6062 new_comppad = (AV*)r->data->data[n];
6063 break;
c277df42 6064 case 'o':
dfad63ad 6065 if (new_comppad == NULL)
cea2e8a9 6066 Perl_croak(aTHX_ "panic: pregfree comppad");
f3548bdc
DM
6067 PAD_SAVE_LOCAL(old_comppad,
6068 /* Watch out for global destruction's random ordering. */
c445ea15 6069 (SvTYPE(new_comppad) == SVt_PVAV) ? new_comppad : NULL
f3548bdc 6070 );
b34c0dd4 6071 OP_REFCNT_LOCK;
4026c95a
SH
6072 refcnt = OpREFCNT_dec((OP_4tree*)r->data->data[n]);
6073 OP_REFCNT_UNLOCK;
6074 if (!refcnt)
9b978d73 6075 op_free((OP_4tree*)r->data->data[n]);
9b978d73 6076
f3548bdc 6077 PAD_RESTORE_LOCAL(old_comppad);
dfad63ad
HS
6078 SvREFCNT_dec((SV*)new_comppad);
6079 new_comppad = NULL;
c277df42
IZ
6080 break;
6081 case 'n':
9e55ce06 6082 break;
a3621e74
YO
6083 case 't':
6084 {
c445ea15 6085 reg_trie_data * const trie=(reg_trie_data*)r->data->data[n];
a3621e74
YO
6086 U32 refcount;
6087 OP_REFCNT_LOCK;
e27afef8 6088 refcount = --trie->refcount;
a3621e74
YO
6089 OP_REFCNT_UNLOCK;
6090 if ( !refcount ) {
43c5f42d 6091 Safefree(trie->charmap);
a3621e74
YO
6092 if (trie->widecharmap)
6093 SvREFCNT_dec((SV*)trie->widecharmap);
43c5f42d
NC
6094 Safefree(trie->states);
6095 Safefree(trie->trans);
a3621e74
YO
6096#ifdef DEBUGGING
6097 if (trie->words)
6098 SvREFCNT_dec((SV*)trie->words);
6099 if (trie->revcharmap)
6100 SvREFCNT_dec((SV*)trie->revcharmap);
6101#endif
6102 Safefree(r->data->data[n]); /* do this last!!!! */
6103 }
6104 break;
6105 }
c277df42 6106 default:
830247a4 6107 Perl_croak(aTHX_ "panic: regfree data code '%c'", r->data->what[n]);
c277df42
IZ
6108 }
6109 }
6110 Safefree(r->data->what);
6111 Safefree(r->data);
a0d0e21e
LW
6112 }
6113 Safefree(r->startp);
6114 Safefree(r->endp);
6115 Safefree(r);
a687059c 6116}
c277df42
IZ
6117
6118/*
6119 - regnext - dig the "next" pointer out of a node
c277df42
IZ
6120 */
6121regnode *
864dbfa3 6122Perl_regnext(pTHX_ register regnode *p)
c277df42 6123{
97aff369 6124 dVAR;
c277df42
IZ
6125 register I32 offset;
6126
3280af22 6127 if (p == &PL_regdummy)
c277df42
IZ
6128 return(NULL);
6129
6130 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
6131 if (offset == 0)
6132 return(NULL);
6133
c277df42 6134 return(p+offset);
c277df42
IZ
6135}
6136
01f988be 6137STATIC void
cea2e8a9 6138S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
c277df42
IZ
6139{
6140 va_list args;
6141 STRLEN l1 = strlen(pat1);
6142 STRLEN l2 = strlen(pat2);
6143 char buf[512];
06bf62c7 6144 SV *msv;
73d840c0 6145 const char *message;
c277df42
IZ
6146
6147 if (l1 > 510)
6148 l1 = 510;
6149 if (l1 + l2 > 510)
6150 l2 = 510 - l1;
6151 Copy(pat1, buf, l1 , char);
6152 Copy(pat2, buf + l1, l2 , char);
3b818b81
GS
6153 buf[l1 + l2] = '\n';
6154 buf[l1 + l2 + 1] = '\0';
8736538c
AS
6155#ifdef I_STDARG
6156 /* ANSI variant takes additional second argument */
c277df42 6157 va_start(args, pat2);
8736538c
AS
6158#else
6159 va_start(args);
6160#endif
5a844595 6161 msv = vmess(buf, &args);
c277df42 6162 va_end(args);
cfd0369c 6163 message = SvPV_const(msv,l1);
c277df42
IZ
6164 if (l1 > 512)
6165 l1 = 512;
6166 Copy(message, buf, l1 , char);
197cf9b9 6167 buf[l1-1] = '\0'; /* Overwrite \n */
cea2e8a9 6168 Perl_croak(aTHX_ "%s", buf);
c277df42 6169}
a0ed51b3
LW
6170
6171/* XXX Here's a total kludge. But we need to re-enter for swash routines. */
6172
6173void
864dbfa3 6174Perl_save_re_context(pTHX)
b81d288d 6175{
97aff369 6176 dVAR;
830247a4 6177 SAVEI32(PL_reg_flags); /* from regexec.c */
a0ed51b3 6178 SAVEPPTR(PL_bostr);
a0ed51b3
LW
6179 SAVEPPTR(PL_reginput); /* String-input pointer. */
6180 SAVEPPTR(PL_regbol); /* Beginning of input, for ^ check. */
6181 SAVEPPTR(PL_regeol); /* End of input, for $ check. */
7766f137
GS
6182 SAVEVPTR(PL_regstartp); /* Pointer to startp array. */
6183 SAVEVPTR(PL_regendp); /* Ditto for endp. */
6184 SAVEVPTR(PL_reglastparen); /* Similarly for lastparen. */
a5db57d6 6185 SAVEVPTR(PL_reglastcloseparen); /* Similarly for lastcloseparen. */
a0ed51b3 6186 SAVEPPTR(PL_regtill); /* How far we are required to go. */
b81d288d 6187 SAVEGENERICPV(PL_reg_start_tmp); /* from regexec.c */
a0ed51b3 6188 PL_reg_start_tmp = 0;
a0ed51b3
LW
6189 SAVEI32(PL_reg_start_tmpl); /* from regexec.c */
6190 PL_reg_start_tmpl = 0;
7766f137 6191 SAVEVPTR(PL_regdata);
a0ed51b3
LW
6192 SAVEI32(PL_reg_eval_set); /* from regexec.c */
6193 SAVEI32(PL_regnarrate); /* from regexec.c */
7766f137 6194 SAVEVPTR(PL_regprogram); /* from regexec.c */
a0ed51b3 6195 SAVEINT(PL_regindent); /* from regexec.c */
7766f137
GS
6196 SAVEVPTR(PL_regcc); /* from regexec.c */
6197 SAVEVPTR(PL_curcop);
7766f137
GS
6198 SAVEVPTR(PL_reg_call_cc); /* from regexec.c */
6199 SAVEVPTR(PL_reg_re); /* from regexec.c */
54b6e2fa
IZ
6200 SAVEPPTR(PL_reg_ganch); /* from regexec.c */
6201 SAVESPTR(PL_reg_sv); /* from regexec.c */
9febdf04 6202 SAVEBOOL(PL_reg_match_utf8); /* from regexec.c */
7766f137 6203 SAVEVPTR(PL_reg_magic); /* from regexec.c */
54b6e2fa 6204 SAVEI32(PL_reg_oldpos); /* from regexec.c */
7766f137
GS
6205 SAVEVPTR(PL_reg_oldcurpm); /* from regexec.c */
6206 SAVEVPTR(PL_reg_curpm); /* from regexec.c */
a5db57d6 6207 SAVEPPTR(PL_reg_oldsaved); /* old saved substr during match */
c445ea15 6208 PL_reg_oldsaved = NULL;
a5db57d6
GS
6209 SAVEI32(PL_reg_oldsavedlen); /* old length of saved substr during match */
6210 PL_reg_oldsavedlen = 0;
f8c7b90f 6211#ifdef PERL_OLD_COPY_ON_WRITE
ed252734 6212 SAVESPTR(PL_nrs);
c445ea15 6213 PL_nrs = NULL;
ed252734 6214#endif
a5db57d6
GS
6215 SAVEI32(PL_reg_maxiter); /* max wait until caching pos */
6216 PL_reg_maxiter = 0;
6217 SAVEI32(PL_reg_leftiter); /* wait until caching pos */
6218 PL_reg_leftiter = 0;
6219 SAVEGENERICPV(PL_reg_poscache); /* cache of pos of WHILEM */
c445ea15 6220 PL_reg_poscache = NULL;
a5db57d6
GS
6221 SAVEI32(PL_reg_poscache_size); /* size of pos cache of WHILEM */
6222 PL_reg_poscache_size = 0;
6223 SAVEPPTR(PL_regprecomp); /* uncompiled string. */
5fb7366e 6224 SAVEI32(PL_regnpar); /* () count. */
e49a9654 6225 SAVEI32(PL_regsize); /* from regexec.c */
ada6e8a9 6226
c445ea15
AL
6227 /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
6228 if (PL_curpm) {
6229 const REGEXP * const rx = PM_GETRE(PL_curpm);
6230 if (rx) {
1df70142 6231 U32 i;
ada6e8a9 6232 for (i = 1; i <= rx->nparens; i++) {
1df70142 6233 char digits[TYPE_CHARS(long)];
e5105eda 6234 const STRLEN len = my_sprintf(digits, "%lu", (long)i);
49f27e4b
NC
6235 GV *const *const gvp
6236 = (GV**)hv_fetch(PL_defstash, digits, len, 0);
6237
b37c2d43
AL
6238 if (gvp) {
6239 GV * const gv = *gvp;
6240 if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
6241 save_scalar(gv);
49f27e4b 6242 }
ada6e8a9
AMS
6243 }
6244 }
6245 }
6246
54b6e2fa 6247#ifdef DEBUGGING
b81d288d 6248 SAVEPPTR(PL_reg_starttry); /* from regexec.c */
54b6e2fa 6249#endif
a0ed51b3 6250}
51371543 6251
51371543 6252static void
acfe0abc 6253clear_re(pTHX_ void *r)
51371543 6254{
97aff369 6255 dVAR;
51371543
GS
6256 ReREFCNT_dec((regexp *)r);
6257}
ffbc6a93 6258
a28509cc
AL
6259#ifdef DEBUGGING
6260
6261STATIC void
6262S_put_byte(pTHX_ SV *sv, int c)
6263{
6264 if (isCNTRL(c) || c == 255 || !isPRINT(c))
6265 Perl_sv_catpvf(aTHX_ sv, "\\%o", c);
6266 else if (c == '-' || c == ']' || c == '\\' || c == '^')
6267 Perl_sv_catpvf(aTHX_ sv, "\\%c", c);
6268 else
6269 Perl_sv_catpvf(aTHX_ sv, "%c", c);
6270}
6271
6272
6273STATIC regnode *
6274S_dumpuntil(pTHX_ regnode *start, regnode *node, regnode *last, SV* sv, I32 l)
6275{
97aff369 6276 dVAR;
a28509cc
AL
6277 register U8 op = EXACT; /* Arbitrary non-END op. */
6278 register regnode *next;
6279
6280 while (op != END && (!last || node < last)) {
6281 /* While that wasn't END last time... */
6282
6283 NODE_ALIGN(node);
6284 op = OP(node);
6285 if (op == CLOSE)
6286 l--;
6287 next = regnext(node);
6288 /* Where, what. */
6289 if (OP(node) == OPTIMIZED)
6290 goto after_print;
6291 regprop(sv, node);
6292 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
6293 (int)(2*l + 1), "", SvPVX_const(sv));
6294 if (next == NULL) /* Next ptr. */
6295 PerlIO_printf(Perl_debug_log, "(0)");
6296 else
6297 PerlIO_printf(Perl_debug_log, "(%"IVdf")", (IV)(next - start));
6298 (void)PerlIO_putc(Perl_debug_log, '\n');
6299 after_print:
6300 if (PL_regkind[(U8)op] == BRANCHJ) {
6301 register regnode *nnode = (OP(next) == LONGJMP
6302 ? regnext(next)
6303 : next);
6304 if (last && nnode > last)
6305 nnode = last;
6306 node = dumpuntil(start, NEXTOPER(NEXTOPER(node)), nnode, sv, l + 1);
6307 }
6308 else if (PL_regkind[(U8)op] == BRANCH) {
6309 node = dumpuntil(start, NEXTOPER(node), next, sv, l + 1);
6310 }
6311 else if ( PL_regkind[(U8)op] == TRIE ) {
6312 const I32 n = ARG(node);
6313 const reg_trie_data * const trie = (reg_trie_data*)PL_regdata->data[n];
6314 const I32 arry_len = av_len(trie->words)+1;
6315 I32 word_idx;
6316 PerlIO_printf(Perl_debug_log,
6317 "%*s[Words:%d Chars Stored:%d Unique Chars:%d States:%"IVdf"%s]\n",
6318 (int)(2*(l+3)),
6319 "",
6320 trie->wordcount,
6321 (int)trie->charcount,
6322 trie->uniquecharcount,
6323 (IV)trie->laststate-1,
6324 node->flags ? " EVAL mode" : "");
6325
6326 for (word_idx=0; word_idx < arry_len; word_idx++) {
6327 SV **elem_ptr=av_fetch(trie->words,word_idx,0);
6328 if (elem_ptr) {
6329 PerlIO_printf(Perl_debug_log, "%*s<%s%s%s>\n",
6330 (int)(2*(l+4)), "",
6331 PL_colors[0],
cfd0369c 6332 SvPV_nolen_const(*elem_ptr),
a28509cc
AL
6333 PL_colors[1]
6334 );
6335 /*
6336 if (next == NULL)
6337 PerlIO_printf(Perl_debug_log, "(0)\n");
6338 else
6339 PerlIO_printf(Perl_debug_log, "(%"IVdf")\n", (IV)(next - start));
6340 */
6341 }
6342
6343 }
6344
6345 node = NEXTOPER(node);
6346 node += regarglen[(U8)op];
6347
6348 }
6349 else if ( op == CURLY) { /* "next" might be very big: optimizer */
6350 node = dumpuntil(start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
6351 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1, sv, l + 1);
6352 }
6353 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
6354 node = dumpuntil(start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
6355 next, sv, l + 1);
6356 }
6357 else if ( op == PLUS || op == STAR) {
6358 node = dumpuntil(start, NEXTOPER(node), NEXTOPER(node) + 1, sv, l + 1);
6359 }
6360 else if (op == ANYOF) {
6361 /* arglen 1 + class block */
6362 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_LARGE)
6363 ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
6364 node = NEXTOPER(node);
6365 }
6366 else if (PL_regkind[(U8)op] == EXACT) {
6367 /* Literal string, where present. */
6368 node += NODE_SZ_STR(node) - 1;
6369 node = NEXTOPER(node);
6370 }
6371 else {
6372 node = NEXTOPER(node);
6373 node += regarglen[(U8)op];
6374 }
6375 if (op == CURLYX || op == OPEN)
6376 l++;
6377 else if (op == WHILEM)
6378 l--;
6379 }
6380 return node;
6381}
6382
6383#endif /* DEBUGGING */
6384
241d1a3b
NC
6385/*
6386 * Local variables:
6387 * c-indentation-style: bsd
6388 * c-basic-offset: 4
6389 * indent-tabs-mode: t
6390 * End:
6391 *
37442d52
RGS
6392 * ex: set ts=8 sts=4 sw=4 noet:
6393 */