This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Note that PVIOs probably aren't worth shrinking. Note 3 possible
[perl5.git] / regcomp.c
CommitLineData
a0d0e21e
LW
1/* regcomp.c
2 */
3
4/*
5 * "A fair jaw-cracker dwarf-language must be." --Samwise Gamgee
6 */
7
61296642
DM
8/* This file contains functions for compiling a regular expression. See
9 * also regexec.c which funnily enough, contains functions for executing
166f8a29 10 * a regular expression.
e4a054ea
DM
11 *
12 * This file is also copied at build time to ext/re/re_comp.c, where
13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14 * This causes the main functions to be compiled under new names and with
15 * debugging support added, which makes "use re 'debug'" work.
166f8a29
DM
16 */
17
a687059c
LW
18/* NOTE: this is derived from Henry Spencer's regexp code, and should not
19 * confused with the original package (see point 3 below). Thanks, Henry!
20 */
21
22/* Additional note: this code is very heavily munged from Henry's version
23 * in places. In some spots I've traded clarity for efficiency, so don't
24 * blame Henry for some of the lack of readability.
25 */
26
e50aee73
AD
27/* The names of the functions have been changed from regcomp and
28 * regexec to pregcomp and pregexec in order to avoid conflicts
29 * with the POSIX routines of the same names.
30*/
31
b9d5759e
AD
32#ifdef PERL_EXT_RE_BUILD
33/* need to replace pregcomp et al, so enable that */
34# ifndef PERL_IN_XSUB_RE
35# define PERL_IN_XSUB_RE
36# endif
37/* need access to debugger hooks */
cad2e5aa 38# if defined(PERL_EXT_RE_DEBUG) && !defined(DEBUGGING)
b9d5759e
AD
39# define DEBUGGING
40# endif
41#endif
42
43#ifdef PERL_IN_XSUB_RE
d06ea78c 44/* We *really* need to overwrite these symbols: */
56953603
IZ
45# define Perl_pregcomp my_regcomp
46# define Perl_regdump my_regdump
47# define Perl_regprop my_regprop
d06ea78c 48# define Perl_pregfree my_regfree
cad2e5aa
JH
49# define Perl_re_intuit_string my_re_intuit_string
50/* *These* symbols are masked to allow static link. */
d06ea78c 51# define Perl_regnext my_regnext
f0b8d043 52# define Perl_save_re_context my_save_re_context
b81d288d 53# define Perl_reginitcolors my_reginitcolors
c5be433b
GS
54
55# define PERL_NO_GET_CONTEXT
b81d288d 56#endif
56953603 57
a687059c 58/*
e50aee73 59 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
60 *
61 * Copyright (c) 1986 by University of Toronto.
62 * Written by Henry Spencer. Not derived from licensed software.
63 *
64 * Permission is granted to anyone to use this software for any
65 * purpose on any computer system, and to redistribute it freely,
66 * subject to the following restrictions:
67 *
68 * 1. The author is not responsible for the consequences of use of
69 * this software, no matter how awful, even if they arise
70 * from defects in it.
71 *
72 * 2. The origin of this software must not be misrepresented, either
73 * by explicit claim or by omission.
74 *
75 * 3. Altered versions must be plainly marked as such, and must not
76 * be misrepresented as being the original software.
77 *
78 *
79 **** Alterations to Henry's code are...
80 ****
4bb101f2 81 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
b94e2f88 82 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, by Larry Wall and others
a687059c 83 ****
9ef589d8
LW
84 **** You may distribute under the terms of either the GNU General Public
85 **** License or the Artistic License, as specified in the README file.
86
a687059c
LW
87 *
88 * Beware that some of this code is subtly aware of the way operator
89 * precedence is structured in regular expressions. Serious changes in
90 * regular-expression syntax might require a total rethink.
91 */
92#include "EXTERN.h"
864dbfa3 93#define PERL_IN_REGCOMP_C
a687059c 94#include "perl.h"
d06ea78c 95
acfe0abc 96#ifndef PERL_IN_XSUB_RE
d06ea78c
GS
97# include "INTERN.h"
98#endif
c277df42
IZ
99
100#define REG_COMP_C
a687059c
LW
101#include "regcomp.h"
102
d4cce5f1 103#ifdef op
11343788 104#undef op
d4cce5f1 105#endif /* op */
11343788 106
fe14fcc3 107#ifdef MSDOS
7e4e8c89 108# if defined(BUGGY_MSC6)
fe14fcc3 109 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
7e4e8c89 110# pragma optimize("a",off)
fe14fcc3 111 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
7e4e8c89
NC
112# pragma optimize("w",on )
113# endif /* BUGGY_MSC6 */
fe14fcc3
LW
114#endif /* MSDOS */
115
a687059c
LW
116#ifndef STATIC
117#define STATIC static
118#endif
119
830247a4 120typedef struct RExC_state_t {
e2509266 121 U32 flags; /* are we folding, multilining? */
830247a4
IZ
122 char *precomp; /* uncompiled string. */
123 regexp *rx;
fac92740 124 char *start; /* Start of input for compile */
830247a4
IZ
125 char *end; /* End of input for compile */
126 char *parse; /* Input-scan pointer. */
127 I32 whilem_seen; /* number of WHILEM in this expr */
fac92740 128 regnode *emit_start; /* Start of emitted-code area */
ffc61ed2 129 regnode *emit; /* Code-emit pointer; &regdummy = don't = compiling */
830247a4
IZ
130 I32 naughty; /* How bad is this pattern? */
131 I32 sawback; /* Did we see \1, ...? */
132 U32 seen;
133 I32 size; /* Code size. */
134 I32 npar; /* () count. */
135 I32 extralen;
136 I32 seen_zerolen;
137 I32 seen_evals;
1aa99e6b 138 I32 utf8;
830247a4
IZ
139#if ADD_TO_REGEXEC
140 char *starttry; /* -Dr: where regtry was called. */
141#define RExC_starttry (pRExC_state->starttry)
142#endif
143} RExC_state_t;
144
e2509266 145#define RExC_flags (pRExC_state->flags)
830247a4
IZ
146#define RExC_precomp (pRExC_state->precomp)
147#define RExC_rx (pRExC_state->rx)
fac92740 148#define RExC_start (pRExC_state->start)
830247a4
IZ
149#define RExC_end (pRExC_state->end)
150#define RExC_parse (pRExC_state->parse)
151#define RExC_whilem_seen (pRExC_state->whilem_seen)
fac92740 152#define RExC_offsets (pRExC_state->rx->offsets) /* I am not like the others */
830247a4 153#define RExC_emit (pRExC_state->emit)
fac92740 154#define RExC_emit_start (pRExC_state->emit_start)
830247a4
IZ
155#define RExC_naughty (pRExC_state->naughty)
156#define RExC_sawback (pRExC_state->sawback)
157#define RExC_seen (pRExC_state->seen)
158#define RExC_size (pRExC_state->size)
159#define RExC_npar (pRExC_state->npar)
160#define RExC_extralen (pRExC_state->extralen)
161#define RExC_seen_zerolen (pRExC_state->seen_zerolen)
162#define RExC_seen_evals (pRExC_state->seen_evals)
1aa99e6b 163#define RExC_utf8 (pRExC_state->utf8)
830247a4 164
a687059c
LW
165#define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
166#define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
167 ((*s) == '{' && regcurly(s)))
a687059c 168
35c8bce7
LW
169#ifdef SPSTART
170#undef SPSTART /* dratted cpp namespace... */
171#endif
a687059c
LW
172/*
173 * Flags to be passed up and down.
174 */
a687059c 175#define WORST 0 /* Worst case. */
821b33a5 176#define HASWIDTH 0x1 /* Known to match non-null strings. */
a0d0e21e
LW
177#define SIMPLE 0x2 /* Simple enough to be STAR/PLUS operand. */
178#define SPSTART 0x4 /* Starts with * or +. */
179#define TRYAGAIN 0x8 /* Weeded out a declaration. */
a687059c 180
2c2d71f5
JH
181/* Length of a variant. */
182
183typedef struct scan_data_t {
184 I32 len_min;
185 I32 len_delta;
186 I32 pos_min;
187 I32 pos_delta;
188 SV *last_found;
189 I32 last_end; /* min value, <0 unless valid. */
190 I32 last_start_min;
191 I32 last_start_max;
192 SV **longest; /* Either &l_fixed, or &l_float. */
193 SV *longest_fixed;
194 I32 offset_fixed;
195 SV *longest_float;
196 I32 offset_float_min;
197 I32 offset_float_max;
198 I32 flags;
199 I32 whilem_c;
cb434fcc 200 I32 *last_closep;
653099ff 201 struct regnode_charclass_class *start_class;
2c2d71f5
JH
202} scan_data_t;
203
a687059c 204/*
e50aee73 205 * Forward declarations for pregcomp()'s friends.
a687059c 206 */
a0d0e21e 207
27da23d5
JH
208static const scan_data_t zero_scan_data =
209 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
c277df42
IZ
210
211#define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
212#define SF_BEFORE_SEOL 0x1
213#define SF_BEFORE_MEOL 0x2
214#define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
215#define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
216
09b7f37c
CB
217#ifdef NO_UNARY_PLUS
218# define SF_FIX_SHIFT_EOL (0+2)
219# define SF_FL_SHIFT_EOL (0+4)
220#else
221# define SF_FIX_SHIFT_EOL (+2)
222# define SF_FL_SHIFT_EOL (+4)
223#endif
c277df42
IZ
224
225#define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
226#define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
227
228#define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
229#define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
230#define SF_IS_INF 0x40
231#define SF_HAS_PAR 0x80
232#define SF_IN_PAR 0x100
233#define SF_HAS_EVAL 0x200
4bfe0158 234#define SCF_DO_SUBSTR 0x400
653099ff
GS
235#define SCF_DO_STCLASS_AND 0x0800
236#define SCF_DO_STCLASS_OR 0x1000
237#define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
e1901655 238#define SCF_WHILEM_VISITED_POS 0x2000
c277df42 239
eb160463 240#define UTF (RExC_utf8 != 0)
e2509266
JH
241#define LOC ((RExC_flags & PMf_LOCALE) != 0)
242#define FOLD ((RExC_flags & PMf_FOLD) != 0)
a0ed51b3 243
ffc61ed2 244#define OOB_UNICODE 12345678
93733859 245#define OOB_NAMEDCLASS -1
b8c5462f 246
a0ed51b3
LW
247#define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
248#define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
249
8615cb43 250
b45f050a
JF
251/* length of regex to show in messages that don't mark a position within */
252#define RegexLengthToShowInErrorMessages 127
253
254/*
255 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
256 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
257 * op/pragma/warn/regcomp.
258 */
7253e4e3
RK
259#define MARKER1 "<-- HERE" /* marker as it appears in the description */
260#define MARKER2 " <-- HERE " /* marker as it appears within the regex */
b81d288d 261
7253e4e3 262#define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
b45f050a
JF
263
264/*
265 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
266 * arg. Show regex, up to a maximum length. If it's too long, chop and add
267 * "...".
268 */
ccb2c380 269#define FAIL(msg) STMT_START { \
bfed75c6 270 const char *ellipses = ""; \
ccb2c380
MP
271 IV len = RExC_end - RExC_precomp; \
272 \
273 if (!SIZE_ONLY) \
274 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
275 if (len > RegexLengthToShowInErrorMessages) { \
276 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
277 len = RegexLengthToShowInErrorMessages - 10; \
278 ellipses = "..."; \
279 } \
280 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
281 msg, (int)len, RExC_precomp, ellipses); \
282} STMT_END
8615cb43 283
b45f050a 284/*
b45f050a
JF
285 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
286 */
ccb2c380 287#define Simple_vFAIL(m) STMT_START { \
a28509cc 288 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
289 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
290 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
291} STMT_END
b45f050a
JF
292
293/*
294 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
295 */
ccb2c380
MP
296#define vFAIL(m) STMT_START { \
297 if (!SIZE_ONLY) \
298 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
299 Simple_vFAIL(m); \
300} STMT_END
b45f050a
JF
301
302/*
303 * Like Simple_vFAIL(), but accepts two arguments.
304 */
ccb2c380 305#define Simple_vFAIL2(m,a1) STMT_START { \
a28509cc 306 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
307 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
308 (int)offset, RExC_precomp, RExC_precomp + offset); \
309} STMT_END
b45f050a
JF
310
311/*
312 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
313 */
ccb2c380
MP
314#define vFAIL2(m,a1) STMT_START { \
315 if (!SIZE_ONLY) \
316 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
317 Simple_vFAIL2(m, a1); \
318} STMT_END
b45f050a
JF
319
320
321/*
322 * Like Simple_vFAIL(), but accepts three arguments.
323 */
ccb2c380 324#define Simple_vFAIL3(m, a1, a2) STMT_START { \
a28509cc 325 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
326 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
327 (int)offset, RExC_precomp, RExC_precomp + offset); \
328} STMT_END
b45f050a
JF
329
330/*
331 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
332 */
ccb2c380
MP
333#define vFAIL3(m,a1,a2) STMT_START { \
334 if (!SIZE_ONLY) \
335 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
336 Simple_vFAIL3(m, a1, a2); \
337} STMT_END
b45f050a
JF
338
339/*
340 * Like Simple_vFAIL(), but accepts four arguments.
341 */
ccb2c380 342#define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
a28509cc 343 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
344 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
345 (int)offset, RExC_precomp, RExC_precomp + offset); \
346} STMT_END
b45f050a 347
ccb2c380 348#define vWARN(loc,m) STMT_START { \
a28509cc 349 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
350 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \
351 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
352} STMT_END
353
354#define vWARNdep(loc,m) STMT_START { \
a28509cc 355 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
356 Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
357 "%s" REPORT_LOCATION, \
358 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
359} STMT_END
360
361
362#define vWARN2(loc, m, a1) STMT_START { \
a28509cc 363 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
364 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
365 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
366} STMT_END
367
368#define vWARN3(loc, m, a1, a2) STMT_START { \
a28509cc 369 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
370 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
371 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
372} STMT_END
373
374#define vWARN4(loc, m, a1, a2, a3) STMT_START { \
a28509cc 375 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
376 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
377 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
378} STMT_END
379
380#define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
a28509cc 381 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
382 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
383 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
384} STMT_END
9d1d55b5 385
8615cb43 386
cd439c50 387/* Allow for side effects in s */
ccb2c380
MP
388#define REGC(c,s) STMT_START { \
389 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
390} STMT_END
cd439c50 391
fac92740
MJD
392/* Macros for recording node offsets. 20001227 mjd@plover.com
393 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
394 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
395 * Element 0 holds the number n.
396 */
397
398#define MJD_OFFSET_DEBUG(x)
a3621e74 399/* #define MJD_OFFSET_DEBUG(x) DEBUG_r(Perl_warn_nocontext x) */
ccb2c380
MP
400
401
402#define Set_Node_Offset_To_R(node,byte) STMT_START { \
403 if (! SIZE_ONLY) { \
404 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
405 __LINE__, (node), (byte))); \
406 if((node) < 0) { \
551405c4 407 Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
ccb2c380
MP
408 } else { \
409 RExC_offsets[2*(node)-1] = (byte); \
410 } \
411 } \
412} STMT_END
413
414#define Set_Node_Offset(node,byte) \
415 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
416#define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
417
418#define Set_Node_Length_To_R(node,len) STMT_START { \
419 if (! SIZE_ONLY) { \
420 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
551405c4 421 __LINE__, (int)(node), (int)(len))); \
ccb2c380 422 if((node) < 0) { \
551405c4 423 Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
ccb2c380
MP
424 } else { \
425 RExC_offsets[2*(node)] = (len); \
426 } \
427 } \
428} STMT_END
429
430#define Set_Node_Length(node,len) \
431 Set_Node_Length_To_R((node)-RExC_emit_start, len)
432#define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
433#define Set_Node_Cur_Length(node) \
434 Set_Node_Length(node, RExC_parse - parse_start)
fac92740
MJD
435
436/* Get offsets and lengths */
437#define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
438#define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
439
acfe0abc 440static void clear_re(pTHX_ void *r);
4327152a 441
653099ff
GS
442/* Mark that we cannot extend a found fixed substring at this point.
443 Updata the longest found anchored substring and the longest found
444 floating substrings if needed. */
445
4327152a 446STATIC void
097eb12c 447S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data)
c277df42 448{
e1ec3a88
AL
449 const STRLEN l = CHR_SVLEN(data->last_found);
450 const STRLEN old_l = CHR_SVLEN(*data->longest);
b81d288d 451
c277df42 452 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
6b43b216 453 SvSetMagicSV(*data->longest, data->last_found);
c277df42
IZ
454 if (*data->longest == data->longest_fixed) {
455 data->offset_fixed = l ? data->last_start_min : data->pos_min;
456 if (data->flags & SF_BEFORE_EOL)
b81d288d 457 data->flags
c277df42
IZ
458 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
459 else
460 data->flags &= ~SF_FIX_BEFORE_EOL;
a0ed51b3
LW
461 }
462 else {
c277df42 463 data->offset_float_min = l ? data->last_start_min : data->pos_min;
b81d288d
AB
464 data->offset_float_max = (l
465 ? data->last_start_max
c277df42 466 : data->pos_min + data->pos_delta);
9051bda5
HS
467 if ((U32)data->offset_float_max > (U32)I32_MAX)
468 data->offset_float_max = I32_MAX;
c277df42 469 if (data->flags & SF_BEFORE_EOL)
b81d288d 470 data->flags
c277df42
IZ
471 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
472 else
473 data->flags &= ~SF_FL_BEFORE_EOL;
474 }
475 }
476 SvCUR_set(data->last_found, 0);
0eda9292 477 {
a28509cc 478 SV * const sv = data->last_found;
097eb12c
AL
479 if (SvUTF8(sv) && SvMAGICAL(sv)) {
480 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
481 if (mg)
482 mg->mg_len = 0;
483 }
0eda9292 484 }
c277df42
IZ
485 data->last_end = -1;
486 data->flags &= ~SF_BEFORE_EOL;
487}
488
653099ff
GS
489/* Can match anything (initialization) */
490STATIC void
097eb12c 491S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 492{
653099ff 493 ANYOF_CLASS_ZERO(cl);
f8bef550 494 ANYOF_BITMAP_SETALL(cl);
1aa99e6b 495 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
653099ff
GS
496 if (LOC)
497 cl->flags |= ANYOF_LOCALE;
498}
499
500/* Can match anything (initialization) */
501STATIC int
5f66b61c 502S_cl_is_anything(const struct regnode_charclass_class *cl)
653099ff
GS
503{
504 int value;
505
aaa51d5e 506 for (value = 0; value <= ANYOF_MAX; value += 2)
653099ff
GS
507 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
508 return 1;
1aa99e6b
IH
509 if (!(cl->flags & ANYOF_UNICODE_ALL))
510 return 0;
f8bef550
NC
511 if (!ANYOF_BITMAP_TESTALLSET(cl))
512 return 0;
653099ff
GS
513 return 1;
514}
515
516/* Can match anything (initialization) */
517STATIC void
097eb12c 518S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 519{
8ecf7187 520 Zero(cl, 1, struct regnode_charclass_class);
653099ff 521 cl->type = ANYOF;
830247a4 522 cl_anything(pRExC_state, cl);
653099ff
GS
523}
524
525STATIC void
097eb12c 526S_cl_init_zero(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 527{
8ecf7187 528 Zero(cl, 1, struct regnode_charclass_class);
653099ff 529 cl->type = ANYOF;
830247a4 530 cl_anything(pRExC_state, cl);
653099ff
GS
531 if (LOC)
532 cl->flags |= ANYOF_LOCALE;
533}
534
535/* 'And' a given class with another one. Can create false positives */
536/* We assume that cl is not inverted */
537STATIC void
5f66b61c 538S_cl_and(struct regnode_charclass_class *cl,
a28509cc 539 const struct regnode_charclass_class *and_with)
653099ff 540{
653099ff
GS
541 if (!(and_with->flags & ANYOF_CLASS)
542 && !(cl->flags & ANYOF_CLASS)
543 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
544 && !(and_with->flags & ANYOF_FOLD)
545 && !(cl->flags & ANYOF_FOLD)) {
546 int i;
547
548 if (and_with->flags & ANYOF_INVERT)
549 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
550 cl->bitmap[i] &= ~and_with->bitmap[i];
551 else
552 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
553 cl->bitmap[i] &= and_with->bitmap[i];
554 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
555 if (!(and_with->flags & ANYOF_EOS))
556 cl->flags &= ~ANYOF_EOS;
1aa99e6b 557
14ebb1a2
JH
558 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE &&
559 !(and_with->flags & ANYOF_INVERT)) {
1aa99e6b
IH
560 cl->flags &= ~ANYOF_UNICODE_ALL;
561 cl->flags |= ANYOF_UNICODE;
562 ARG_SET(cl, ARG(and_with));
563 }
14ebb1a2
JH
564 if (!(and_with->flags & ANYOF_UNICODE_ALL) &&
565 !(and_with->flags & ANYOF_INVERT))
1aa99e6b 566 cl->flags &= ~ANYOF_UNICODE_ALL;
14ebb1a2
JH
567 if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)) &&
568 !(and_with->flags & ANYOF_INVERT))
1aa99e6b 569 cl->flags &= ~ANYOF_UNICODE;
653099ff
GS
570}
571
572/* 'OR' a given class with another one. Can create false positives */
573/* We assume that cl is not inverted */
574STATIC void
097eb12c 575S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
653099ff 576{
653099ff
GS
577 if (or_with->flags & ANYOF_INVERT) {
578 /* We do not use
579 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
580 * <= (B1 | !B2) | (CL1 | !CL2)
581 * which is wasteful if CL2 is small, but we ignore CL2:
582 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
583 * XXXX Can we handle case-fold? Unclear:
584 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
585 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
586 */
587 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
588 && !(or_with->flags & ANYOF_FOLD)
589 && !(cl->flags & ANYOF_FOLD) ) {
590 int i;
591
592 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
593 cl->bitmap[i] |= ~or_with->bitmap[i];
594 } /* XXXX: logic is complicated otherwise */
595 else {
830247a4 596 cl_anything(pRExC_state, cl);
653099ff
GS
597 }
598 } else {
599 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
600 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
b81d288d 601 && (!(or_with->flags & ANYOF_FOLD)
653099ff
GS
602 || (cl->flags & ANYOF_FOLD)) ) {
603 int i;
604
605 /* OR char bitmap and class bitmap separately */
606 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
607 cl->bitmap[i] |= or_with->bitmap[i];
608 if (or_with->flags & ANYOF_CLASS) {
609 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
610 cl->classflags[i] |= or_with->classflags[i];
611 cl->flags |= ANYOF_CLASS;
612 }
613 }
614 else { /* XXXX: logic is complicated, leave it along for a moment. */
830247a4 615 cl_anything(pRExC_state, cl);
653099ff
GS
616 }
617 }
618 if (or_with->flags & ANYOF_EOS)
619 cl->flags |= ANYOF_EOS;
1aa99e6b
IH
620
621 if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
622 ARG(cl) != ARG(or_with)) {
623 cl->flags |= ANYOF_UNICODE_ALL;
624 cl->flags &= ~ANYOF_UNICODE;
625 }
626 if (or_with->flags & ANYOF_UNICODE_ALL) {
627 cl->flags |= ANYOF_UNICODE_ALL;
628 cl->flags &= ~ANYOF_UNICODE;
629 }
653099ff
GS
630}
631
5d1c421c 632/*
a3621e74
YO
633
634 make_trie(startbranch,first,last,tail,flags)
635 startbranch: the first branch in the whole branch sequence
636 first : start branch of sequence of branch-exact nodes.
637 May be the same as startbranch
638 last : Thing following the last branch.
639 May be the same as tail.
640 tail : item following the branch sequence
641 flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/
642
643Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
644
645A trie is an N'ary tree where the branches are determined by digital
646decomposition of the key. IE, at the root node you look up the 1st character and
647follow that branch repeat until you find the end of the branches. Nodes can be
648marked as "accepting" meaning they represent a complete word. Eg:
649
650 /he|she|his|hers/
651
652would convert into the following structure. Numbers represent states, letters
653following numbers represent valid transitions on the letter from that state, if
654the number is in square brackets it represents an accepting state, otherwise it
655will be in parenthesis.
656
657 +-h->+-e->[3]-+-r->(8)-+-s->[9]
658 | |
659 | (2)
660 | |
661 (1) +-i->(6)-+-s->[7]
662 |
663 +-s->(3)-+-h->(4)-+-e->[5]
664
665 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
666
667This shows that when matching against the string 'hers' we will begin at state 1
668read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
669then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
670is also accepting. Thus we know that we can match both 'he' and 'hers' with a
671single traverse. We store a mapping from accepting to state to which word was
672matched, and then when we have multiple possibilities we try to complete the
673rest of the regex in the order in which they occured in the alternation.
674
675The only prior NFA like behaviour that would be changed by the TRIE support is
676the silent ignoring of duplicate alternations which are of the form:
677
678 / (DUPE|DUPE) X? (?{ ... }) Y /x
679
680Thus EVAL blocks follwing a trie may be called a different number of times with
681and without the optimisation. With the optimisations dupes will be silently
682ignored. This inconsistant behaviour of EVAL type nodes is well established as
683the following demonstrates:
684
685 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
686
687which prints out 'word' three times, but
688
689 'words'=~/(word|word|word)(?{ print $1 })S/
690
691which doesnt print it out at all. This is due to other optimisations kicking in.
692
693Example of what happens on a structural level:
694
695The regexp /(ac|ad|ab)+/ will produce the folowing debug output:
696
697 1: CURLYM[1] {1,32767}(18)
698 5: BRANCH(8)
699 6: EXACT <ac>(16)
700 8: BRANCH(11)
701 9: EXACT <ad>(16)
702 11: BRANCH(14)
703 12: EXACT <ab>(16)
704 16: SUCCEED(0)
705 17: NOTHING(18)
706 18: END(0)
707
708This would be optimizable with startbranch=5, first=5, last=16, tail=16
709and should turn into:
710
711 1: CURLYM[1] {1,32767}(18)
712 5: TRIE(16)
713 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
714 <ac>
715 <ad>
716 <ab>
717 16: SUCCEED(0)
718 17: NOTHING(18)
719 18: END(0)
720
721Cases where tail != last would be like /(?foo|bar)baz/:
722
723 1: BRANCH(4)
724 2: EXACT <foo>(8)
725 4: BRANCH(7)
726 5: EXACT <bar>(8)
727 7: TAIL(8)
728 8: EXACT <baz>(10)
729 10: END(0)
730
731which would be optimizable with startbranch=1, first=1, last=7, tail=8
732and would end up looking like:
733
734 1: TRIE(8)
735 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
736 <foo>
737 <bar>
738 7: TAIL(8)
739 8: EXACT <baz>(10)
740 10: END(0)
741
742*/
743
744#define TRIE_DEBUG_CHAR \
745 DEBUG_TRIE_COMPILE_r({ \
746 SV *tmp; \
747 if ( UTF ) { \
6136c704 748 tmp = newSVpvs( "" ); \
a3621e74
YO
749 pv_uni_display( tmp, uc, len, 60, UNI_DISPLAY_REGEX ); \
750 } else { \
e4584336 751 tmp = Perl_newSVpvf_nocontext( "%c", (int)uvc ); \
a3621e74
YO
752 } \
753 av_push( trie->revcharmap, tmp ); \
754 })
755
756#define TRIE_READ_CHAR STMT_START { \
757 if ( UTF ) { \
758 if ( folder ) { \
759 if ( foldlen > 0 ) { \
760 uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags ); \
761 foldlen -= len; \
762 scan += len; \
763 len = 0; \
764 } else { \
e1ec3a88 765 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
a3621e74
YO
766 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
767 foldlen -= UNISKIP( uvc ); \
768 scan = foldbuf + UNISKIP( uvc ); \
769 } \
770 } else { \
e1ec3a88 771 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
a3621e74
YO
772 } \
773 } else { \
774 uvc = (U32)*uc; \
775 len = 1; \
776 } \
777} STMT_END
778
779
780#define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
781#define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
782#define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
783#define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
784
785#define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
786 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
787 TRIE_LIST_LEN( state ) *= 2; \
788 Renew( trie->states[ state ].trans.list, \
789 TRIE_LIST_LEN( state ), reg_trie_trans_le ); \
790 } \
791 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
792 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
793 TRIE_LIST_CUR( state )++; \
794} STMT_END
795
796#define TRIE_LIST_NEW(state) STMT_START { \
a02a5408 797 Newxz( trie->states[ state ].trans.list, \
a3621e74
YO
798 4, reg_trie_trans_le ); \
799 TRIE_LIST_CUR( state ) = 1; \
800 TRIE_LIST_LEN( state ) = 4; \
801} STMT_END
802
803STATIC I32
804S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 flags)
805{
27da23d5 806 dVAR;
a3621e74
YO
807 /* first pass, loop through and scan words */
808 reg_trie_data *trie;
809 regnode *cur;
9f7f3913 810 const U32 uniflags = UTF8_ALLOW_DEFAULT;
a3621e74
YO
811 STRLEN len = 0;
812 UV uvc = 0;
813 U16 curword = 0;
814 U32 next_alloc = 0;
815 /* we just use folder as a flag in utf8 */
e1ec3a88 816 const U8 * const folder = ( flags == EXACTF
a3621e74
YO
817 ? PL_fold
818 : ( flags == EXACTFL
819 ? PL_fold_locale
820 : NULL
821 )
822 );
823
e1ec3a88 824 const U32 data_slot = add_data( pRExC_state, 1, "t" );
a3621e74
YO
825 SV *re_trie_maxbuff;
826
827 GET_RE_DEBUG_FLAGS_DECL;
828
a02a5408 829 Newxz( trie, 1, reg_trie_data );
a3621e74
YO
830 trie->refcount = 1;
831 RExC_rx->data->data[ data_slot ] = (void*)trie;
a02a5408 832 Newxz( trie->charmap, 256, U16 );
a3621e74
YO
833 DEBUG_r({
834 trie->words = newAV();
835 trie->revcharmap = newAV();
836 });
837
838
0111c4fd 839 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
a3621e74 840 if (!SvIOK(re_trie_maxbuff)) {
0111c4fd 841 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
a3621e74
YO
842 }
843
844 /* -- First loop and Setup --
845
846 We first traverse the branches and scan each word to determine if it
847 contains widechars, and how many unique chars there are, this is
848 important as we have to build a table with at least as many columns as we
849 have unique chars.
850
851 We use an array of integers to represent the character codes 0..255
852 (trie->charmap) and we use a an HV* to store unicode characters. We use the
853 native representation of the character value as the key and IV's for the
854 coded index.
855
856 *TODO* If we keep track of how many times each character is used we can
857 remap the columns so that the table compression later on is more
858 efficient in terms of memory by ensuring most common value is in the
859 middle and the least common are on the outside. IMO this would be better
860 than a most to least common mapping as theres a decent chance the most
861 common letter will share a node with the least common, meaning the node
862 will not be compressable. With a middle is most common approach the worst
863 case is when we have the least common nodes twice.
864
865 */
866
867
868 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
c445ea15 869 regnode * const noper = NEXTOPER( cur );
e1ec3a88 870 const U8 *uc = (U8*)STRING( noper );
a28509cc 871 const U8 * const e = uc + STR_LEN( noper );
a3621e74
YO
872 STRLEN foldlen = 0;
873 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2af232bd 874 const U8 *scan = (U8*)NULL;
a3621e74
YO
875
876 for ( ; uc < e ; uc += len ) {
877 trie->charcount++;
878 TRIE_READ_CHAR;
879 if ( uvc < 256 ) {
880 if ( !trie->charmap[ uvc ] ) {
881 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
882 if ( folder )
883 trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
884 TRIE_DEBUG_CHAR;
885 }
886 } else {
887 SV** svpp;
888 if ( !trie->widecharmap )
889 trie->widecharmap = newHV();
890
891 svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 1 );
892
893 if ( !svpp )
e4584336 894 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
a3621e74
YO
895
896 if ( !SvTRUE( *svpp ) ) {
897 sv_setiv( *svpp, ++trie->uniquecharcount );
898 TRIE_DEBUG_CHAR;
899 }
900 }
901 }
902 trie->wordcount++;
903 } /* end first pass */
904 DEBUG_TRIE_COMPILE_r(
905 PerlIO_printf( Perl_debug_log, "TRIE(%s): W:%d C:%d Uq:%d \n",
906 ( trie->widecharmap ? "UTF8" : "NATIVE" ), trie->wordcount,
5d7488b2 907 (int)trie->charcount, trie->uniquecharcount )
a3621e74
YO
908 );
909
910
911 /*
912 We now know what we are dealing with in terms of unique chars and
913 string sizes so we can calculate how much memory a naive
0111c4fd
RGS
914 representation using a flat table will take. If it's over a reasonable
915 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
a3621e74
YO
916 conservative but potentially much slower representation using an array
917 of lists.
918
919 At the end we convert both representations into the same compressed
920 form that will be used in regexec.c for matching with. The latter
921 is a form that cannot be used to construct with but has memory
922 properties similar to the list form and access properties similar
923 to the table form making it both suitable for fast searches and
924 small enough that its feasable to store for the duration of a program.
925
926 See the comment in the code where the compressed table is produced
927 inplace from the flat tabe representation for an explanation of how
928 the compression works.
929
930 */
931
932
933 if ( (IV)( ( trie->charcount + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
934 /*
935 Second Pass -- Array Of Lists Representation
936
937 Each state will be represented by a list of charid:state records
938 (reg_trie_trans_le) the first such element holds the CUR and LEN
939 points of the allocated array. (See defines above).
940
941 We build the initial structure using the lists, and then convert
942 it into the compressed table form which allows faster lookups
943 (but cant be modified once converted).
944
945
946 */
947
948
949 STRLEN transcount = 1;
950
a02a5408 951 Newxz( trie->states, trie->charcount + 2, reg_trie_state );
a3621e74
YO
952 TRIE_LIST_NEW(1);
953 next_alloc = 2;
954
955 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
956
c445ea15
AL
957 regnode * const noper = NEXTOPER( cur );
958 U8 *uc = (U8*)STRING( noper );
959 const U8 * const e = uc + STR_LEN( noper );
960 U32 state = 1; /* required init */
961 U16 charid = 0; /* sanity init */
962 U8 *scan = (U8*)NULL; /* sanity init */
963 STRLEN foldlen = 0; /* required init */
964 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
965
966 for ( ; uc < e ; uc += len ) {
967
968 TRIE_READ_CHAR;
969
970 if ( uvc < 256 ) {
971 charid = trie->charmap[ uvc ];
972 } else {
973 SV** const svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 0);
974 if ( !svpp ) {
975 charid = 0;
976 } else {
977 charid=(U16)SvIV( *svpp );
978 }
979 }
980 if ( charid ) {
a3621e74 981
c445ea15
AL
982 U16 check;
983 U32 newstate = 0;
a3621e74 984
c445ea15
AL
985 charid--;
986 if ( !trie->states[ state ].trans.list ) {
987 TRIE_LIST_NEW( state );
988 }
989 for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
990 if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
991 newstate = TRIE_LIST_ITEM( state, check ).newstate;
992 break;
993 }
994 }
995 if ( ! newstate ) {
996 newstate = next_alloc++;
997 TRIE_LIST_PUSH( state, charid, newstate );
998 transcount++;
999 }
1000 state = newstate;
1001 } else {
1002 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
a28509cc 1003 }
c445ea15
AL
1004 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1005 }
a3621e74 1006
c445ea15
AL
1007 if ( !trie->states[ state ].wordnum ) {
1008 /* we havent inserted this word into the structure yet. */
1009 trie->states[ state ].wordnum = ++curword;
a3621e74 1010
c445ea15
AL
1011 DEBUG_r({
1012 /* store the word for dumping */
1013 SV* tmp = newSVpvn( STRING( noper ), STR_LEN( noper ) );
1014 if ( UTF ) SvUTF8_on( tmp );
1015 av_push( trie->words, tmp );
1016 });
a3621e74 1017
c445ea15 1018 } else {
bb263b4e 1019 /*EMPTY*/; /* It's a dupe. So ignore it. */
c445ea15 1020 }
a3621e74
YO
1021
1022 } /* end second pass */
1023
1024 trie->laststate = next_alloc;
1025 Renew( trie->states, next_alloc, reg_trie_state );
1026
1027 DEBUG_TRIE_COMPILE_MORE_r({
1028 U32 state;
a3621e74 1029
a28509cc 1030 /* print out the table precompression. */
a3621e74
YO
1031
1032 PerlIO_printf( Perl_debug_log, "\nState :Word | Transition Data\n" );
1033 PerlIO_printf( Perl_debug_log, "------:-----+-----------------" );
1034
1035 for( state=1 ; state < next_alloc ; state ++ ) {
a28509cc 1036 U16 charid;
a3621e74 1037
e4584336 1038 PerlIO_printf( Perl_debug_log, "\n %04"UVXf" :", (UV)state );
a3621e74
YO
1039 if ( ! trie->states[ state ].wordnum ) {
1040 PerlIO_printf( Perl_debug_log, "%5s| ","");
1041 } else {
e4584336 1042 PerlIO_printf( Perl_debug_log, "W%04x| ",
a3621e74
YO
1043 trie->states[ state ].wordnum
1044 );
1045 }
1046 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
1047 SV **tmp = av_fetch( trie->revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
e4584336 1048 PerlIO_printf( Perl_debug_log, "%s:%3X=%04"UVXf" | ",
cfd0369c 1049 SvPV_nolen_const( *tmp ),
a3621e74 1050 TRIE_LIST_ITEM(state,charid).forid,
e4584336 1051 (UV)TRIE_LIST_ITEM(state,charid).newstate
a3621e74
YO
1052 );
1053 }
1054
1055 }
1056 PerlIO_printf( Perl_debug_log, "\n\n" );
1057 });
1058
a02a5408 1059 Newxz( trie->trans, transcount ,reg_trie_trans );
a3621e74
YO
1060 {
1061 U32 state;
a3621e74
YO
1062 U32 tp = 0;
1063 U32 zp = 0;
1064
1065
1066 for( state=1 ; state < next_alloc ; state ++ ) {
1067 U32 base=0;
1068
1069 /*
1070 DEBUG_TRIE_COMPILE_MORE_r(
1071 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1072 );
1073 */
1074
1075 if (trie->states[state].trans.list) {
1076 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1077 U16 maxid=minid;
a28509cc 1078 U16 idx;
a3621e74
YO
1079
1080 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
c445ea15
AL
1081 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1082 if ( forid < minid ) {
1083 minid=forid;
1084 } else if ( forid > maxid ) {
1085 maxid=forid;
1086 }
a3621e74
YO
1087 }
1088 if ( transcount < tp + maxid - minid + 1) {
1089 transcount *= 2;
1090 Renew( trie->trans, transcount, reg_trie_trans );
1091 Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1092 }
1093 base = trie->uniquecharcount + tp - minid;
1094 if ( maxid == minid ) {
1095 U32 set = 0;
1096 for ( ; zp < tp ; zp++ ) {
1097 if ( ! trie->trans[ zp ].next ) {
1098 base = trie->uniquecharcount + zp - minid;
1099 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1100 trie->trans[ zp ].check = state;
1101 set = 1;
1102 break;
1103 }
1104 }
1105 if ( !set ) {
1106 trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1107 trie->trans[ tp ].check = state;
1108 tp++;
1109 zp = tp;
1110 }
1111 } else {
1112 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
c445ea15 1113 const U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
a3621e74
YO
1114 trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1115 trie->trans[ tid ].check = state;
1116 }
1117 tp += ( maxid - minid + 1 );
1118 }
1119 Safefree(trie->states[ state ].trans.list);
1120 }
1121 /*
1122 DEBUG_TRIE_COMPILE_MORE_r(
1123 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1124 );
1125 */
1126 trie->states[ state ].trans.base=base;
1127 }
cc601c31 1128 trie->lasttrans = tp + 1;
a3621e74
YO
1129 }
1130 } else {
1131 /*
1132 Second Pass -- Flat Table Representation.
1133
1134 we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1135 We know that we will need Charcount+1 trans at most to store the data
1136 (one row per char at worst case) So we preallocate both structures
1137 assuming worst case.
1138
1139 We then construct the trie using only the .next slots of the entry
1140 structs.
1141
1142 We use the .check field of the first entry of the node temporarily to
1143 make compression both faster and easier by keeping track of how many non
1144 zero fields are in the node.
1145
1146 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1147 transition.
1148
1149 There are two terms at use here: state as a TRIE_NODEIDX() which is a
1150 number representing the first entry of the node, and state as a
1151 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1152 TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1153 are 2 entrys per node. eg:
1154
1155 A B A B
1156 1. 2 4 1. 3 7
1157 2. 0 3 3. 0 5
1158 3. 0 0 5. 0 0
1159 4. 0 0 7. 0 0
1160
1161 The table is internally in the right hand, idx form. However as we also
1162 have to deal with the states array which is indexed by nodenum we have to
1163 use TRIE_NODENUM() to convert.
1164
1165 */
1166
a02a5408 1167 Newxz( trie->trans, ( trie->charcount + 1 ) * trie->uniquecharcount + 1,
a3621e74 1168 reg_trie_trans );
a02a5408 1169 Newxz( trie->states, trie->charcount + 2, reg_trie_state );
a3621e74
YO
1170 next_alloc = trie->uniquecharcount + 1;
1171
1172 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1173
c445ea15 1174 regnode * const noper = NEXTOPER( cur );
a28509cc
AL
1175 const U8 *uc = (U8*)STRING( noper );
1176 const U8 * const e = uc + STR_LEN( noper );
a3621e74
YO
1177
1178 U32 state = 1; /* required init */
1179
1180 U16 charid = 0; /* sanity init */
1181 U32 accept_state = 0; /* sanity init */
1182 U8 *scan = (U8*)NULL; /* sanity init */
1183
1184 STRLEN foldlen = 0; /* required init */
1185 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1186
1187
1188 for ( ; uc < e ; uc += len ) {
1189
1190 TRIE_READ_CHAR;
1191
1192 if ( uvc < 256 ) {
1193 charid = trie->charmap[ uvc ];
1194 } else {
c445ea15
AL
1195 SV* const * const svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 0);
1196 charid = svpp ? (U16)SvIV(*svpp) : 0;
a3621e74
YO
1197 }
1198 if ( charid ) {
1199 charid--;
1200 if ( !trie->trans[ state + charid ].next ) {
1201 trie->trans[ state + charid ].next = next_alloc;
1202 trie->trans[ state ].check++;
1203 next_alloc += trie->uniquecharcount;
1204 }
1205 state = trie->trans[ state + charid ].next;
1206 } else {
e4584336 1207 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
a3621e74
YO
1208 }
1209 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1210 }
1211
1212 accept_state = TRIE_NODENUM( state );
1213 if ( !trie->states[ accept_state ].wordnum ) {
1214 /* we havent inserted this word into the structure yet. */
1215 trie->states[ accept_state ].wordnum = ++curword;
1216
1217 DEBUG_r({
1218 /* store the word for dumping */
1219 SV* tmp = newSVpvn( STRING( noper ), STR_LEN( noper ) );
1220 if ( UTF ) SvUTF8_on( tmp );
1221 av_push( trie->words, tmp );
1222 });
1223
1224 } else {
bb263b4e 1225 /*EMPTY*/; /* Its a dupe. So ignore it. */
a3621e74
YO
1226 }
1227
1228 } /* end second pass */
1229
1230 DEBUG_TRIE_COMPILE_MORE_r({
1231 /*
1232 print out the table precompression so that we can do a visual check
1233 that they are identical.
1234 */
1235 U32 state;
1236 U16 charid;
1237 PerlIO_printf( Perl_debug_log, "\nChar : " );
1238
1239 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1240 SV **tmp = av_fetch( trie->revcharmap, charid, 0);
1241 if ( tmp ) {
cfd0369c 1242 PerlIO_printf( Perl_debug_log, "%4.4s ", SvPV_nolen_const( *tmp ) );
a3621e74
YO
1243 }
1244 }
1245
1246 PerlIO_printf( Perl_debug_log, "\nState+-" );
1247
1248 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
1249 PerlIO_printf( Perl_debug_log, "%4s-", "----" );
1250 }
1251
1252 PerlIO_printf( Perl_debug_log, "\n" );
1253
1254 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1255
e4584336 1256 PerlIO_printf( Perl_debug_log, "%04"UVXf" : ", (UV)TRIE_NODENUM( state ) );
a3621e74
YO
1257
1258 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
e4584336
RB
1259 PerlIO_printf( Perl_debug_log, "%04"UVXf" ",
1260 (UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next ) );
a3621e74
YO
1261 }
1262 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
e4584336 1263 PerlIO_printf( Perl_debug_log, " (%04"UVXf")\n", (UV)trie->trans[ state ].check );
a3621e74 1264 } else {
e4584336 1265 PerlIO_printf( Perl_debug_log, " (%04"UVXf") W%04X\n", (UV)trie->trans[ state ].check,
a3621e74
YO
1266 trie->states[ TRIE_NODENUM( state ) ].wordnum );
1267 }
1268 }
1269 PerlIO_printf( Perl_debug_log, "\n\n" );
1270 });
1271 {
1272 /*
1273 * Inplace compress the table.*
1274
1275 For sparse data sets the table constructed by the trie algorithm will
1276 be mostly 0/FAIL transitions or to put it another way mostly empty.
1277 (Note that leaf nodes will not contain any transitions.)
1278
1279 This algorithm compresses the tables by eliminating most such
1280 transitions, at the cost of a modest bit of extra work during lookup:
1281
1282 - Each states[] entry contains a .base field which indicates the
1283 index in the state[] array wheres its transition data is stored.
1284
1285 - If .base is 0 there are no valid transitions from that node.
1286
1287 - If .base is nonzero then charid is added to it to find an entry in
1288 the trans array.
1289
1290 -If trans[states[state].base+charid].check!=state then the
1291 transition is taken to be a 0/Fail transition. Thus if there are fail
1292 transitions at the front of the node then the .base offset will point
1293 somewhere inside the previous nodes data (or maybe even into a node
1294 even earlier), but the .check field determines if the transition is
1295 valid.
1296
1297 The following process inplace converts the table to the compressed
1298 table: We first do not compress the root node 1,and mark its all its
1299 .check pointers as 1 and set its .base pointer as 1 as well. This
1300 allows to do a DFA construction from the compressed table later, and
1301 ensures that any .base pointers we calculate later are greater than
1302 0.
1303
1304 - We set 'pos' to indicate the first entry of the second node.
1305
1306 - We then iterate over the columns of the node, finding the first and
1307 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
1308 and set the .check pointers accordingly, and advance pos
1309 appropriately and repreat for the next node. Note that when we copy
1310 the next pointers we have to convert them from the original
1311 NODEIDX form to NODENUM form as the former is not valid post
1312 compression.
1313
1314 - If a node has no transitions used we mark its base as 0 and do not
1315 advance the pos pointer.
1316
1317 - If a node only has one transition we use a second pointer into the
1318 structure to fill in allocated fail transitions from other states.
1319 This pointer is independent of the main pointer and scans forward
1320 looking for null transitions that are allocated to a state. When it
1321 finds one it writes the single transition into the "hole". If the
1322 pointer doesnt find one the single transition is appeneded as normal.
1323
1324 - Once compressed we can Renew/realloc the structures to release the
1325 excess space.
1326
1327 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
1328 specifically Fig 3.47 and the associated pseudocode.
1329
1330 demq
1331 */
a3b680e6 1332 const U32 laststate = TRIE_NODENUM( next_alloc );
a28509cc 1333 U32 state, charid;
a3621e74
YO
1334 U32 pos = 0, zp=0;
1335 trie->laststate = laststate;
1336
1337 for ( state = 1 ; state < laststate ; state++ ) {
1338 U8 flag = 0;
a28509cc
AL
1339 const U32 stateidx = TRIE_NODEIDX( state );
1340 const U32 o_used = trie->trans[ stateidx ].check;
1341 U32 used = trie->trans[ stateidx ].check;
a3621e74
YO
1342 trie->trans[ stateidx ].check = 0;
1343
1344 for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
1345 if ( flag || trie->trans[ stateidx + charid ].next ) {
1346 if ( trie->trans[ stateidx + charid ].next ) {
1347 if (o_used == 1) {
1348 for ( ; zp < pos ; zp++ ) {
1349 if ( ! trie->trans[ zp ].next ) {
1350 break;
1351 }
1352 }
1353 trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
1354 trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1355 trie->trans[ zp ].check = state;
1356 if ( ++zp > pos ) pos = zp;
1357 break;
1358 }
1359 used--;
1360 }
1361 if ( !flag ) {
1362 flag = 1;
1363 trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
1364 }
1365 trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1366 trie->trans[ pos ].check = state;
1367 pos++;
1368 }
1369 }
1370 }
cc601c31 1371 trie->lasttrans = pos + 1;
a3621e74
YO
1372 Renew( trie->states, laststate + 1, reg_trie_state);
1373 DEBUG_TRIE_COMPILE_MORE_r(
e4584336
RB
1374 PerlIO_printf( Perl_debug_log,
1375 " Alloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
5d7488b2
AL
1376 (int)( ( trie->charcount + 1 ) * trie->uniquecharcount + 1 ),
1377 (IV)next_alloc,
1378 (IV)pos,
a3621e74
YO
1379 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
1380 );
1381
1382 } /* end table compress */
1383 }
cc601c31
YO
1384 /* resize the trans array to remove unused space */
1385 Renew( trie->trans, trie->lasttrans, reg_trie_trans);
a3621e74
YO
1386
1387 DEBUG_TRIE_COMPILE_r({
1388 U32 state;
1389 /*
1390 Now we print it out again, in a slightly different form as there is additional
1391 info we want to be able to see when its compressed. They are close enough for
1392 visual comparison though.
1393 */
1394 PerlIO_printf( Perl_debug_log, "\nChar : %-6s%-6s%-4s ","Match","Base","Ofs" );
1395
1396 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
1397 SV **tmp = av_fetch( trie->revcharmap, state, 0);
1398 if ( tmp ) {
cfd0369c 1399 PerlIO_printf( Perl_debug_log, "%4.4s ", SvPV_nolen_const( *tmp ) );
a3621e74
YO
1400 }
1401 }
1402 PerlIO_printf( Perl_debug_log, "\n-----:-----------------------");
cc601c31 1403
a3621e74
YO
1404 for( state = 0 ; state < trie->uniquecharcount ; state++ )
1405 PerlIO_printf( Perl_debug_log, "-----");
1406 PerlIO_printf( Perl_debug_log, "\n");
cc601c31 1407
a3621e74 1408 for( state = 1 ; state < trie->laststate ; state++ ) {
a28509cc 1409 const U32 base = trie->states[ state ].trans.base;
a3621e74 1410
e4584336 1411 PerlIO_printf( Perl_debug_log, "#%04"UVXf" ", (UV)state);
a3621e74
YO
1412
1413 if ( trie->states[ state ].wordnum ) {
1414 PerlIO_printf( Perl_debug_log, " W%04X", trie->states[ state ].wordnum );
1415 } else {
1416 PerlIO_printf( Perl_debug_log, "%6s", "" );
1417 }
1418
e4584336 1419 PerlIO_printf( Perl_debug_log, " @%04"UVXf" ", (UV)base );
a3621e74
YO
1420
1421 if ( base ) {
1422 U32 ofs = 0;
1423
cc601c31
YO
1424 while( ( base + ofs < trie->uniquecharcount ) ||
1425 ( base + ofs - trie->uniquecharcount < trie->lasttrans
1426 && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
a3621e74
YO
1427 ofs++;
1428
e4584336 1429 PerlIO_printf( Perl_debug_log, "+%02"UVXf"[ ", (UV)ofs);
a3621e74
YO
1430
1431 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
cc601c31
YO
1432 if ( ( base + ofs >= trie->uniquecharcount ) &&
1433 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
a3621e74
YO
1434 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1435 {
e4584336
RB
1436 PerlIO_printf( Perl_debug_log, "%04"UVXf" ",
1437 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
a3621e74
YO
1438 } else {
1439 PerlIO_printf( Perl_debug_log, "%4s "," 0" );
1440 }
1441 }
1442
e4584336 1443 PerlIO_printf( Perl_debug_log, "]");
a3621e74
YO
1444
1445 }
1446 PerlIO_printf( Perl_debug_log, "\n" );
1447 }
1448 });
1449
1450 {
1451 /* now finally we "stitch in" the new TRIE node
1452 This means we convert either the first branch or the first Exact,
1453 depending on whether the thing following (in 'last') is a branch
1454 or not and whther first is the startbranch (ie is it a sub part of
1455 the alternation or is it the whole thing.)
1456 Assuming its a sub part we conver the EXACT otherwise we convert
1457 the whole branch sequence, including the first.
1458 */
1459 regnode *convert;
1460
1461
1462
1463
1464 if ( first == startbranch && OP( last ) != BRANCH ) {
1465 convert = first;
1466 } else {
1467 convert = NEXTOPER( first );
1468 NEXT_OFF( first ) = (U16)(last - first);
1469 }
1470
1471 OP( convert ) = TRIE + (U8)( flags - EXACT );
1472 NEXT_OFF( convert ) = (U16)(tail - convert);
1473 ARG_SET( convert, data_slot );
1474
1475 /* tells us if we need to handle accept buffers specially */
1476 convert->flags = ( RExC_seen_evals ? 1 : 0 );
1477
1478
1479 /* needed for dumping*/
1480 DEBUG_r({
1481 regnode *optimize = convert + NODE_STEP_REGNODE + regarglen[ TRIE ];
1482 /* We now need to mark all of the space originally used by the
1483 branches as optimized away. This keeps the dumpuntil from
1484 throwing a wobbly as it doesnt use regnext() to traverse the
1485 opcodes.
1486 */
1487 while( optimize < last ) {
1488 OP( optimize ) = OPTIMIZED;
1489 optimize++;
1490 }
1491 });
1492 } /* end node insert */
1493 return 1;
1494}
1495
1496
1497
1498/*
5d1c421c
JH
1499 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
1500 * These need to be revisited when a newer toolchain becomes available.
1501 */
1502#if defined(__sparc64__) && defined(__GNUC__)
1503# if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
1504# undef SPARC64_GCC_WORKAROUND
1505# define SPARC64_GCC_WORKAROUND 1
1506# endif
1507#endif
1508
653099ff
GS
1509/* REx optimizer. Converts nodes into quickier variants "in place".
1510 Finds fixed substrings. */
1511
a0288114 1512/* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
c277df42
IZ
1513 to the position after last scanned or to NULL. */
1514
a3621e74 1515
76e3520e 1516STATIC I32
9a957fbc
AL
1517S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap,
1518 regnode *last, scan_data_t *data, U32 flags, U32 depth)
c277df42
IZ
1519 /* scanp: Start here (read-write). */
1520 /* deltap: Write maxlen-minlen here. */
1521 /* last: Stop before this one. */
1522{
97aff369 1523 dVAR;
c277df42
IZ
1524 I32 min = 0, pars = 0, code;
1525 regnode *scan = *scanp, *next;
1526 I32 delta = 0;
1527 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
aca2d497 1528 int is_inf_internal = 0; /* The studied chunk is infinite */
c277df42
IZ
1529 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
1530 scan_data_t data_fake;
653099ff 1531 struct regnode_charclass_class and_with; /* Valid if flags & SCF_DO_STCLASS_OR */
a3621e74
YO
1532 SV *re_trie_maxbuff = NULL;
1533
1534 GET_RE_DEBUG_FLAGS_DECL;
b81d288d 1535
c277df42
IZ
1536 while (scan && OP(scan) != END && scan < last) {
1537 /* Peephole optimizer: */
a3621e74 1538 DEBUG_OPTIMISE_r({
c445ea15 1539 SV * const mysv=sv_newmortal();
32fc9b6a 1540 regprop(RExC_rx, mysv, scan);
e4584336 1541 PerlIO_printf(Perl_debug_log, "%*speep: %s (0x%08"UVXf")\n",
cfd0369c 1542 (int)depth*2, "", SvPV_nolen_const(mysv), PTR2UV(scan));
a3621e74 1543 });
c277df42 1544
22c35a8c 1545 if (PL_regkind[(U8)OP(scan)] == EXACT) {
653099ff 1546 /* Merge several consecutive EXACTish nodes into one. */
c277df42
IZ
1547 regnode *n = regnext(scan);
1548 U32 stringok = 1;
1549#ifdef DEBUGGING
1550 regnode *stop = scan;
b81d288d 1551#endif
c277df42 1552
cd439c50 1553 next = scan + NODE_SZ_STR(scan);
c277df42
IZ
1554 /* Skip NOTHING, merge EXACT*. */
1555 while (n &&
b81d288d 1556 ( PL_regkind[(U8)OP(n)] == NOTHING ||
c277df42
IZ
1557 (stringok && (OP(n) == OP(scan))))
1558 && NEXT_OFF(n)
1559 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
1560 if (OP(n) == TAIL || n > next)
1561 stringok = 0;
22c35a8c 1562 if (PL_regkind[(U8)OP(n)] == NOTHING) {
c277df42
IZ
1563 NEXT_OFF(scan) += NEXT_OFF(n);
1564 next = n + NODE_STEP_REGNODE;
1565#ifdef DEBUGGING
1566 if (stringok)
1567 stop = n;
b81d288d 1568#endif
c277df42 1569 n = regnext(n);
a0ed51b3 1570 }
f49d4d0f 1571 else if (stringok) {
a3b680e6 1572 const int oldl = STR_LEN(scan);
c445ea15 1573 regnode * const nnext = regnext(n);
f49d4d0f 1574
b81d288d 1575 if (oldl + STR_LEN(n) > U8_MAX)
c277df42
IZ
1576 break;
1577 NEXT_OFF(scan) += NEXT_OFF(n);
cd439c50
IZ
1578 STR_LEN(scan) += STR_LEN(n);
1579 next = n + NODE_SZ_STR(n);
c277df42 1580 /* Now we can overwrite *n : */
f49d4d0f 1581 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
c277df42 1582#ifdef DEBUGGING
f49d4d0f 1583 stop = next - 1;
b81d288d 1584#endif
c277df42
IZ
1585 n = nnext;
1586 }
1587 }
61a36c01 1588
a3621e74 1589 if (UTF && ( OP(scan) == EXACTF ) && ( STR_LEN(scan) >= 6 ) ) {
61a36c01
JH
1590/*
1591 Two problematic code points in Unicode casefolding of EXACT nodes:
1592
1593 U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
1594 U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
1595
1596 which casefold to
1597
1598 Unicode UTF-8
1599
1600 U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
1601 U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
1602
1603 This means that in case-insensitive matching (or "loose matching",
1604 as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
1605 length of the above casefolded versions) can match a target string
1606 of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
1607 This would rather mess up the minimum length computation.
1608
1609 What we'll do is to look for the tail four bytes, and then peek
1610 at the preceding two bytes to see whether we need to decrease
1611 the minimum length by four (six minus two).
1612
1613 Thanks to the design of UTF-8, there cannot be false matches:
1614 A sequence of valid UTF-8 bytes cannot be a subsequence of
1615 another valid sequence of UTF-8 bytes.
1616
1617*/
c445ea15
AL
1618 char * const s0 = STRING(scan), *s, *t;
1619 char * const s1 = s0 + STR_LEN(scan) - 1;
1620 char * const s2 = s1 - 4;
d4c19fe8 1621 const char t0[] = "\xcc\x88\xcc\x81";
a28509cc 1622 const char * const t1 = t0 + 3;
2af232bd 1623
61a36c01
JH
1624 for (s = s0 + 2;
1625 s < s2 && (t = ninstr(s, s1, t0, t1));
1626 s = t + 4) {
1627 if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
1628 ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
1629 min -= 4;
1630 }
1631 }
1632
c277df42
IZ
1633#ifdef DEBUGGING
1634 /* Allow dumping */
cd439c50 1635 n = scan + NODE_SZ_STR(scan);
c277df42 1636 while (n <= stop) {
22c35a8c 1637 if (PL_regkind[(U8)OP(n)] != NOTHING || OP(n) == NOTHING) {
c277df42
IZ
1638 OP(n) = OPTIMIZED;
1639 NEXT_OFF(n) = 0;
1640 }
1641 n++;
1642 }
653099ff 1643#endif
c277df42 1644 }
a3621e74
YO
1645
1646
1647
653099ff
GS
1648 /* Follow the next-chain of the current node and optimize
1649 away all the NOTHINGs from it. */
c277df42 1650 if (OP(scan) != CURLYX) {
a3b680e6 1651 const int max = (reg_off_by_arg[OP(scan)]
048cfca1
GS
1652 ? I32_MAX
1653 /* I32 may be smaller than U16 on CRAYs! */
1654 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
c277df42
IZ
1655 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
1656 int noff;
1657 regnode *n = scan;
b81d288d 1658
c277df42
IZ
1659 /* Skip NOTHING and LONGJMP. */
1660 while ((n = regnext(n))
22c35a8c 1661 && ((PL_regkind[(U8)OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
c277df42
IZ
1662 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
1663 && off + noff < max)
1664 off += noff;
1665 if (reg_off_by_arg[OP(scan)])
1666 ARG(scan) = off;
b81d288d 1667 else
c277df42
IZ
1668 NEXT_OFF(scan) = off;
1669 }
a3621e74 1670
653099ff
GS
1671 /* The principal pseudo-switch. Cannot be a switch, since we
1672 look into several different things. */
b81d288d 1673 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
c277df42
IZ
1674 || OP(scan) == IFTHEN || OP(scan) == SUSPEND) {
1675 next = regnext(scan);
1676 code = OP(scan);
a3621e74 1677 /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
b81d288d
AB
1678
1679 if (OP(next) == code || code == IFTHEN || code == SUSPEND) {
c277df42 1680 I32 max1 = 0, min1 = I32_MAX, num = 0;
653099ff 1681 struct regnode_charclass_class accum;
d4c19fe8 1682 regnode * const startbranch=scan;
c277df42 1683
653099ff 1684 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
830247a4 1685 scan_commit(pRExC_state, data); /* Cannot merge strings after this. */
653099ff 1686 if (flags & SCF_DO_STCLASS)
830247a4 1687 cl_init_zero(pRExC_state, &accum);
a3621e74 1688
c277df42 1689 while (OP(scan) == code) {
830247a4 1690 I32 deltanext, minnext, f = 0, fake;
653099ff 1691 struct regnode_charclass_class this_class;
c277df42
IZ
1692
1693 num++;
1694 data_fake.flags = 0;
b81d288d 1695 if (data) {
2c2d71f5 1696 data_fake.whilem_c = data->whilem_c;
cb434fcc
IZ
1697 data_fake.last_closep = data->last_closep;
1698 }
1699 else
1700 data_fake.last_closep = &fake;
c277df42
IZ
1701 next = regnext(scan);
1702 scan = NEXTOPER(scan);
1703 if (code != BRANCH)
1704 scan = NEXTOPER(scan);
653099ff 1705 if (flags & SCF_DO_STCLASS) {
830247a4 1706 cl_init(pRExC_state, &this_class);
653099ff
GS
1707 data_fake.start_class = &this_class;
1708 f = SCF_DO_STCLASS_AND;
b81d288d 1709 }
e1901655
IZ
1710 if (flags & SCF_WHILEM_VISITED_POS)
1711 f |= SCF_WHILEM_VISITED_POS;
a3621e74 1712
653099ff 1713 /* we suppose the run is continuous, last=next...*/
830247a4 1714 minnext = study_chunk(pRExC_state, &scan, &deltanext,
a3621e74 1715 next, &data_fake, f,depth+1);
b81d288d 1716 if (min1 > minnext)
c277df42
IZ
1717 min1 = minnext;
1718 if (max1 < minnext + deltanext)
1719 max1 = minnext + deltanext;
1720 if (deltanext == I32_MAX)
aca2d497 1721 is_inf = is_inf_internal = 1;
c277df42
IZ
1722 scan = next;
1723 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
1724 pars++;
405ff068 1725 if (data && (data_fake.flags & SF_HAS_EVAL))
c277df42 1726 data->flags |= SF_HAS_EVAL;
2c2d71f5
JH
1727 if (data)
1728 data->whilem_c = data_fake.whilem_c;
653099ff 1729 if (flags & SCF_DO_STCLASS)
830247a4 1730 cl_or(pRExC_state, &accum, &this_class);
b81d288d 1731 if (code == SUSPEND)
c277df42
IZ
1732 break;
1733 }
1734 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
1735 min1 = 0;
5d3527ba
JH
1736 if ((flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) && !data)
1737 Perl_croak(aTHX_ "panic: null data in study_chunk");
c277df42
IZ
1738 if (flags & SCF_DO_SUBSTR) {
1739 data->pos_min += min1;
1740 data->pos_delta += max1 - min1;
1741 if (max1 != min1 || is_inf)
1742 data->longest = &(data->longest_float);
1743 }
1744 min += min1;
1745 delta += max1 - min1;
653099ff 1746 if (flags & SCF_DO_STCLASS_OR) {
830247a4 1747 cl_or(pRExC_state, data->start_class, &accum);
653099ff
GS
1748 if (min1) {
1749 cl_and(data->start_class, &and_with);
1750 flags &= ~SCF_DO_STCLASS;
1751 }
1752 }
1753 else if (flags & SCF_DO_STCLASS_AND) {
de0c8cb8
GS
1754 if (min1) {
1755 cl_and(data->start_class, &accum);
653099ff 1756 flags &= ~SCF_DO_STCLASS;
de0c8cb8
GS
1757 }
1758 else {
b81d288d 1759 /* Switch to OR mode: cache the old value of
de0c8cb8
GS
1760 * data->start_class */
1761 StructCopy(data->start_class, &and_with,
1762 struct regnode_charclass_class);
1763 flags &= ~SCF_DO_STCLASS_AND;
1764 StructCopy(&accum, data->start_class,
1765 struct regnode_charclass_class);
1766 flags |= SCF_DO_STCLASS_OR;
1767 data->start_class->flags |= ANYOF_EOS;
1768 }
653099ff 1769 }
a3621e74
YO
1770
1771 /* demq.
1772
1773 Assuming this was/is a branch we are dealing with: 'scan' now
1774 points at the item that follows the branch sequence, whatever
1775 it is. We now start at the beginning of the sequence and look
1776 for subsequences of
1777
1778 BRANCH->EXACT=>X
1779 BRANCH->EXACT=>X
1780
1781 which would be constructed from a pattern like /A|LIST|OF|WORDS/
1782
1783 If we can find such a subseqence we need to turn the first
1784 element into a trie and then add the subsequent branch exact
1785 strings to the trie.
1786
1787 We have two cases
1788
1789 1. patterns where the whole set of branch can be converted to a trie,
1790
1791 2. patterns where only a subset of the alternations can be
1792 converted to a trie.
1793
1794 In case 1 we can replace the whole set with a single regop
1795 for the trie. In case 2 we need to keep the start and end
1796 branchs so
1797
1798 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
1799 becomes BRANCH TRIE; BRANCH X;
1800
1801 Hypthetically when we know the regex isnt anchored we can
1802 turn a case 1 into a DFA and let it rip... Every time it finds a match
1803 it would just call its tail, no WHILEM/CURLY needed.
1804
1805 */
0111c4fd
RGS
1806 if (DO_TRIE) {
1807 if (!re_trie_maxbuff) {
1808 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
1809 if (!SvIOK(re_trie_maxbuff))
1810 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
1811 }
a3621e74
YO
1812 if ( SvIV(re_trie_maxbuff)>=0 && OP( startbranch )==BRANCH ) {
1813 regnode *cur;
1814 regnode *first = (regnode *)NULL;
1815 regnode *last = (regnode *)NULL;
1816 regnode *tail = scan;
1817 U8 optype = 0;
1818 U32 count=0;
1819
1820#ifdef DEBUGGING
c445ea15 1821 SV * const mysv = sv_newmortal(); /* for dumping */
a3621e74
YO
1822#endif
1823 /* var tail is used because there may be a TAIL
1824 regop in the way. Ie, the exacts will point to the
1825 thing following the TAIL, but the last branch will
1826 point at the TAIL. So we advance tail. If we
1827 have nested (?:) we may have to move through several
1828 tails.
1829 */
1830
1831 while ( OP( tail ) == TAIL ) {
1832 /* this is the TAIL generated by (?:) */
1833 tail = regnext( tail );
1834 }
1835
1836 DEBUG_OPTIMISE_r({
32fc9b6a 1837 regprop(RExC_rx, mysv, tail );
a3621e74 1838 PerlIO_printf( Perl_debug_log, "%*s%s%s%s\n",
cfd0369c 1839 (int)depth * 2 + 2, "", "Tail node is:", SvPV_nolen_const( mysv ),
a3621e74
YO
1840 (RExC_seen_evals) ? "[EVAL]" : ""
1841 );
1842 });
1843 /*
1844
1845 step through the branches, cur represents each
1846 branch, noper is the first thing to be matched
1847 as part of that branch and noper_next is the
1848 regnext() of that node. if noper is an EXACT
1849 and noper_next is the same as scan (our current
1850 position in the regex) then the EXACT branch is
1851 a possible optimization target. Once we have
1852 two or more consequetive such branches we can
1853 create a trie of the EXACT's contents and stich
1854 it in place. If the sequence represents all of
1855 the branches we eliminate the whole thing and
1856 replace it with a single TRIE. If it is a
1857 subsequence then we need to stitch it in. This
1858 means the first branch has to remain, and needs
1859 to be repointed at the item on the branch chain
1860 following the last branch optimized. This could
1861 be either a BRANCH, in which case the
1862 subsequence is internal, or it could be the
1863 item following the branch sequence in which
1864 case the subsequence is at the end.
1865
1866 */
1867
1868 /* dont use tail as the end marker for this traverse */
1869 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
aec46f14
AL
1870 regnode * const noper = NEXTOPER( cur );
1871 regnode * const noper_next = regnext( noper );
a3621e74 1872
a3621e74 1873 DEBUG_OPTIMISE_r({
32fc9b6a 1874 regprop(RExC_rx, mysv, cur);
a3621e74 1875 PerlIO_printf( Perl_debug_log, "%*s%s",
cfd0369c 1876 (int)depth * 2 + 2," ", SvPV_nolen_const( mysv ) );
a3621e74 1877
32fc9b6a 1878 regprop(RExC_rx, mysv, noper);
a3621e74 1879 PerlIO_printf( Perl_debug_log, " -> %s",
cfd0369c 1880 SvPV_nolen_const(mysv));
a3621e74
YO
1881
1882 if ( noper_next ) {
32fc9b6a 1883 regprop(RExC_rx, mysv, noper_next );
a3621e74 1884 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
cfd0369c 1885 SvPV_nolen_const(mysv));
a3621e74
YO
1886 }
1887 PerlIO_printf( Perl_debug_log, "0x%p,0x%p,0x%p)\n",
1888 first, last, cur );
1889 });
1890 if ( ( first ? OP( noper ) == optype
1891 : PL_regkind[ (U8)OP( noper ) ] == EXACT )
1892 && noper_next == tail && count<U16_MAX)
1893 {
1894 count++;
1895 if ( !first ) {
1896 first = cur;
1897 optype = OP( noper );
1898 } else {
1899 DEBUG_OPTIMISE_r(
1900 if (!last ) {
32fc9b6a 1901 regprop(RExC_rx, mysv, first);
a3621e74 1902 PerlIO_printf( Perl_debug_log, "%*s%s",
cfd0369c 1903 (int)depth * 2 + 2, "F:", SvPV_nolen_const( mysv ) );
32fc9b6a 1904 regprop(RExC_rx, mysv, NEXTOPER(first) );
a3621e74 1905 PerlIO_printf( Perl_debug_log, " -> %s\n",
cfd0369c 1906 SvPV_nolen_const( mysv ) );
a3621e74
YO
1907 }
1908 );
1909 last = cur;
1910 DEBUG_OPTIMISE_r({
32fc9b6a 1911 regprop(RExC_rx, mysv, cur);
a3621e74 1912 PerlIO_printf( Perl_debug_log, "%*s%s",
cfd0369c 1913 (int)depth * 2 + 2, "N:", SvPV_nolen_const( mysv ) );
32fc9b6a 1914 regprop(RExC_rx, mysv, noper );
a3621e74 1915 PerlIO_printf( Perl_debug_log, " -> %s\n",
cfd0369c 1916 SvPV_nolen_const( mysv ) );
a3621e74
YO
1917 });
1918 }
1919 } else {
1920 if ( last ) {
1921 DEBUG_OPTIMISE_r(
1922 PerlIO_printf( Perl_debug_log, "%*s%s\n",
e4584336 1923 (int)depth * 2 + 2, "E:", "**END**" );
a3621e74
YO
1924 );
1925 make_trie( pRExC_state, startbranch, first, cur, tail, optype );
1926 }
1927 if ( PL_regkind[ (U8)OP( noper ) ] == EXACT
1928 && noper_next == tail )
1929 {
1930 count = 1;
1931 first = cur;
1932 optype = OP( noper );
1933 } else {
1934 count = 0;
1935 first = NULL;
1936 optype = 0;
1937 }
1938 last = NULL;
1939 }
1940 }
1941 DEBUG_OPTIMISE_r({
32fc9b6a 1942 regprop(RExC_rx, mysv, cur);
a3621e74 1943 PerlIO_printf( Perl_debug_log,
e4584336 1944 "%*s%s\t(0x%p,0x%p,0x%p)\n", (int)depth * 2 + 2,
cfd0369c 1945 " ", SvPV_nolen_const( mysv ), first, last, cur);
a3621e74
YO
1946
1947 });
1948 if ( last ) {
1949 DEBUG_OPTIMISE_r(
1950 PerlIO_printf( Perl_debug_log, "%*s%s\n",
e4584336 1951 (int)depth * 2 + 2, "E:", "==END==" );
a3621e74
YO
1952 );
1953 make_trie( pRExC_state, startbranch, first, scan, tail, optype );
1954 }
1955 }
1956 }
a0ed51b3 1957 }
a3621e74 1958 else if ( code == BRANCHJ ) { /* single branch is optimized. */
c277df42 1959 scan = NEXTOPER(NEXTOPER(scan));
a3621e74 1960 } else /* single branch is optimized. */
c277df42
IZ
1961 scan = NEXTOPER(scan);
1962 continue;
a0ed51b3
LW
1963 }
1964 else if (OP(scan) == EXACT) {
cd439c50 1965 I32 l = STR_LEN(scan);
c445ea15 1966 UV uc;
a0ed51b3 1967 if (UTF) {
a3b680e6 1968 const U8 * const s = (U8*)STRING(scan);
1aa99e6b 1969 l = utf8_length(s, s + l);
9041c2e3 1970 uc = utf8_to_uvchr(s, NULL);
c445ea15
AL
1971 } else {
1972 uc = *((U8*)STRING(scan));
a0ed51b3
LW
1973 }
1974 min += l;
c277df42 1975 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
c277df42
IZ
1976 /* The code below prefers earlier match for fixed
1977 offset, later match for variable offset. */
1978 if (data->last_end == -1) { /* Update the start info. */
1979 data->last_start_min = data->pos_min;
1980 data->last_start_max = is_inf
b81d288d 1981 ? I32_MAX : data->pos_min + data->pos_delta;
c277df42 1982 }
cd439c50 1983 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
45f47268
NC
1984 if (UTF)
1985 SvUTF8_on(data->last_found);
0eda9292 1986 {
9a957fbc 1987 SV * const sv = data->last_found;
a28509cc 1988 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
0eda9292
JH
1989 mg_find(sv, PERL_MAGIC_utf8) : NULL;
1990 if (mg && mg->mg_len >= 0)
5e43f467
JH
1991 mg->mg_len += utf8_length((U8*)STRING(scan),
1992 (U8*)STRING(scan)+STR_LEN(scan));
0eda9292 1993 }
c277df42
IZ
1994 data->last_end = data->pos_min + l;
1995 data->pos_min += l; /* As in the first entry. */
1996 data->flags &= ~SF_BEFORE_EOL;
1997 }
653099ff
GS
1998 if (flags & SCF_DO_STCLASS_AND) {
1999 /* Check whether it is compatible with what we know already! */
2000 int compat = 1;
2001
1aa99e6b 2002 if (uc >= 0x100 ||
516a5887 2003 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
1aa99e6b 2004 && !ANYOF_BITMAP_TEST(data->start_class, uc)
653099ff 2005 && (!(data->start_class->flags & ANYOF_FOLD)
1aa99e6b 2006 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
516a5887 2007 )
653099ff
GS
2008 compat = 0;
2009 ANYOF_CLASS_ZERO(data->start_class);
2010 ANYOF_BITMAP_ZERO(data->start_class);
2011 if (compat)
1aa99e6b 2012 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff 2013 data->start_class->flags &= ~ANYOF_EOS;
9b877dbb
IH
2014 if (uc < 0x100)
2015 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
653099ff
GS
2016 }
2017 else if (flags & SCF_DO_STCLASS_OR) {
2018 /* false positive possible if the class is case-folded */
1aa99e6b 2019 if (uc < 0x100)
9b877dbb
IH
2020 ANYOF_BITMAP_SET(data->start_class, uc);
2021 else
2022 data->start_class->flags |= ANYOF_UNICODE_ALL;
653099ff
GS
2023 data->start_class->flags &= ~ANYOF_EOS;
2024 cl_and(data->start_class, &and_with);
2025 }
2026 flags &= ~SCF_DO_STCLASS;
a0ed51b3 2027 }
653099ff 2028 else if (PL_regkind[(U8)OP(scan)] == EXACT) { /* But OP != EXACT! */
cd439c50 2029 I32 l = STR_LEN(scan);
1aa99e6b 2030 UV uc = *((U8*)STRING(scan));
653099ff
GS
2031
2032 /* Search for fixed substrings supports EXACT only. */
b81d288d 2033 if (flags & SCF_DO_SUBSTR)
830247a4 2034 scan_commit(pRExC_state, data);
a0ed51b3 2035 if (UTF) {
6136c704 2036 const U8 * const s = (U8 *)STRING(scan);
1aa99e6b 2037 l = utf8_length(s, s + l);
9041c2e3 2038 uc = utf8_to_uvchr(s, NULL);
a0ed51b3
LW
2039 }
2040 min += l;
c277df42 2041 if (data && (flags & SCF_DO_SUBSTR))
a0ed51b3 2042 data->pos_min += l;
653099ff
GS
2043 if (flags & SCF_DO_STCLASS_AND) {
2044 /* Check whether it is compatible with what we know already! */
2045 int compat = 1;
2046
1aa99e6b 2047 if (uc >= 0x100 ||
516a5887 2048 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
1aa99e6b 2049 && !ANYOF_BITMAP_TEST(data->start_class, uc)
516a5887 2050 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
653099ff
GS
2051 compat = 0;
2052 ANYOF_CLASS_ZERO(data->start_class);
2053 ANYOF_BITMAP_ZERO(data->start_class);
2054 if (compat) {
1aa99e6b 2055 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff
GS
2056 data->start_class->flags &= ~ANYOF_EOS;
2057 data->start_class->flags |= ANYOF_FOLD;
2058 if (OP(scan) == EXACTFL)
2059 data->start_class->flags |= ANYOF_LOCALE;
2060 }
2061 }
2062 else if (flags & SCF_DO_STCLASS_OR) {
2063 if (data->start_class->flags & ANYOF_FOLD) {
2064 /* false positive possible if the class is case-folded.
2065 Assume that the locale settings are the same... */
1aa99e6b
IH
2066 if (uc < 0x100)
2067 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff
GS
2068 data->start_class->flags &= ~ANYOF_EOS;
2069 }
2070 cl_and(data->start_class, &and_with);
2071 }
2072 flags &= ~SCF_DO_STCLASS;
a0ed51b3 2073 }
bfed75c6 2074 else if (strchr((const char*)PL_varies,OP(scan))) {
9c5ffd7c 2075 I32 mincount, maxcount, minnext, deltanext, fl = 0;
aa7a4b56 2076 I32 f = flags, pos_before = 0;
d4c19fe8 2077 regnode * const oscan = scan;
653099ff
GS
2078 struct regnode_charclass_class this_class;
2079 struct regnode_charclass_class *oclass = NULL;
727f22e3 2080 I32 next_is_eval = 0;
653099ff 2081
22c35a8c 2082 switch (PL_regkind[(U8)OP(scan)]) {
653099ff 2083 case WHILEM: /* End of (?:...)* . */
c277df42
IZ
2084 scan = NEXTOPER(scan);
2085 goto finish;
2086 case PLUS:
653099ff 2087 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
c277df42 2088 next = NEXTOPER(scan);
653099ff 2089 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
b81d288d
AB
2090 mincount = 1;
2091 maxcount = REG_INFTY;
c277df42
IZ
2092 next = regnext(scan);
2093 scan = NEXTOPER(scan);
2094 goto do_curly;
2095 }
2096 }
2097 if (flags & SCF_DO_SUBSTR)
2098 data->pos_min++;
2099 min++;
2100 /* Fall through. */
2101 case STAR:
653099ff
GS
2102 if (flags & SCF_DO_STCLASS) {
2103 mincount = 0;
b81d288d 2104 maxcount = REG_INFTY;
653099ff
GS
2105 next = regnext(scan);
2106 scan = NEXTOPER(scan);
2107 goto do_curly;
2108 }
b81d288d 2109 is_inf = is_inf_internal = 1;
c277df42
IZ
2110 scan = regnext(scan);
2111 if (flags & SCF_DO_SUBSTR) {
830247a4 2112 scan_commit(pRExC_state, data); /* Cannot extend fixed substrings */
c277df42
IZ
2113 data->longest = &(data->longest_float);
2114 }
2115 goto optimize_curly_tail;
2116 case CURLY:
b81d288d 2117 mincount = ARG1(scan);
c277df42
IZ
2118 maxcount = ARG2(scan);
2119 next = regnext(scan);
cb434fcc
IZ
2120 if (OP(scan) == CURLYX) {
2121 I32 lp = (data ? *(data->last_closep) : 0);
a3621e74 2122 scan->flags = ((lp <= U8_MAX) ? (U8)lp : U8_MAX);
cb434fcc 2123 }
c277df42 2124 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
727f22e3 2125 next_is_eval = (OP(scan) == EVAL);
c277df42
IZ
2126 do_curly:
2127 if (flags & SCF_DO_SUBSTR) {
830247a4 2128 if (mincount == 0) scan_commit(pRExC_state,data); /* Cannot extend fixed substrings */
c277df42
IZ
2129 pos_before = data->pos_min;
2130 }
2131 if (data) {
2132 fl = data->flags;
2133 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
2134 if (is_inf)
2135 data->flags |= SF_IS_INF;
2136 }
653099ff 2137 if (flags & SCF_DO_STCLASS) {
830247a4 2138 cl_init(pRExC_state, &this_class);
653099ff
GS
2139 oclass = data->start_class;
2140 data->start_class = &this_class;
2141 f |= SCF_DO_STCLASS_AND;
2142 f &= ~SCF_DO_STCLASS_OR;
2143 }
e1901655
IZ
2144 /* These are the cases when once a subexpression
2145 fails at a particular position, it cannot succeed
2146 even after backtracking at the enclosing scope.
b81d288d 2147
e1901655
IZ
2148 XXXX what if minimal match and we are at the
2149 initial run of {n,m}? */
2150 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
2151 f &= ~SCF_WHILEM_VISITED_POS;
653099ff 2152
c277df42 2153 /* This will finish on WHILEM, setting scan, or on NULL: */
b81d288d 2154 minnext = study_chunk(pRExC_state, &scan, &deltanext, last, data,
a3621e74
YO
2155 (mincount == 0
2156 ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
653099ff
GS
2157
2158 if (flags & SCF_DO_STCLASS)
2159 data->start_class = oclass;
2160 if (mincount == 0 || minnext == 0) {
2161 if (flags & SCF_DO_STCLASS_OR) {
830247a4 2162 cl_or(pRExC_state, data->start_class, &this_class);
653099ff
GS
2163 }
2164 else if (flags & SCF_DO_STCLASS_AND) {
b81d288d 2165 /* Switch to OR mode: cache the old value of
653099ff
GS
2166 * data->start_class */
2167 StructCopy(data->start_class, &and_with,
2168 struct regnode_charclass_class);
2169 flags &= ~SCF_DO_STCLASS_AND;
2170 StructCopy(&this_class, data->start_class,
2171 struct regnode_charclass_class);
2172 flags |= SCF_DO_STCLASS_OR;
2173 data->start_class->flags |= ANYOF_EOS;
2174 }
2175 } else { /* Non-zero len */
2176 if (flags & SCF_DO_STCLASS_OR) {
830247a4 2177 cl_or(pRExC_state, data->start_class, &this_class);
653099ff
GS
2178 cl_and(data->start_class, &and_with);
2179 }
2180 else if (flags & SCF_DO_STCLASS_AND)
2181 cl_and(data->start_class, &this_class);
2182 flags &= ~SCF_DO_STCLASS;
2183 }
c277df42
IZ
2184 if (!scan) /* It was not CURLYX, but CURLY. */
2185 scan = next;
041457d9
DM
2186 if ( /* ? quantifier ok, except for (?{ ... }) */
2187 (next_is_eval || !(mincount == 0 && maxcount == 1))
84037bb0 2188 && (minnext == 0) && (deltanext == 0)
99799961 2189 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
041457d9
DM
2190 && maxcount <= REG_INFTY/3 /* Complement check for big count */
2191 && ckWARN(WARN_REGEXP))
b45f050a 2192 {
830247a4 2193 vWARN(RExC_parse,
b45f050a
JF
2194 "Quantifier unexpected on zero-length expression");
2195 }
2196
c277df42 2197 min += minnext * mincount;
b81d288d 2198 is_inf_internal |= ((maxcount == REG_INFTY
155aba94
GS
2199 && (minnext + deltanext) > 0)
2200 || deltanext == I32_MAX);
aca2d497 2201 is_inf |= is_inf_internal;
c277df42
IZ
2202 delta += (minnext + deltanext) * maxcount - minnext * mincount;
2203
2204 /* Try powerful optimization CURLYX => CURLYN. */
b81d288d 2205 if ( OP(oscan) == CURLYX && data
c277df42
IZ
2206 && data->flags & SF_IN_PAR
2207 && !(data->flags & SF_HAS_EVAL)
2208 && !deltanext && minnext == 1 ) {
2209 /* Try to optimize to CURLYN. */
2210 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
d4c19fe8 2211 regnode * const nxt1 = nxt;
497b47a8
JH
2212#ifdef DEBUGGING
2213 regnode *nxt2;
2214#endif
c277df42
IZ
2215
2216 /* Skip open. */
2217 nxt = regnext(nxt);
bfed75c6 2218 if (!strchr((const char*)PL_simple,OP(nxt))
22c35a8c 2219 && !(PL_regkind[(U8)OP(nxt)] == EXACT
b81d288d 2220 && STR_LEN(nxt) == 1))
c277df42 2221 goto nogo;
497b47a8 2222#ifdef DEBUGGING
c277df42 2223 nxt2 = nxt;
497b47a8 2224#endif
c277df42 2225 nxt = regnext(nxt);
b81d288d 2226 if (OP(nxt) != CLOSE)
c277df42
IZ
2227 goto nogo;
2228 /* Now we know that nxt2 is the only contents: */
eb160463 2229 oscan->flags = (U8)ARG(nxt);
c277df42
IZ
2230 OP(oscan) = CURLYN;
2231 OP(nxt1) = NOTHING; /* was OPEN. */
2232#ifdef DEBUGGING
2233 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
2234 NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
2235 NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
2236 OP(nxt) = OPTIMIZED; /* was CLOSE. */
2237 OP(nxt + 1) = OPTIMIZED; /* was count. */
2238 NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
b81d288d 2239#endif
c277df42 2240 }
c277df42
IZ
2241 nogo:
2242
2243 /* Try optimization CURLYX => CURLYM. */
b81d288d 2244 if ( OP(oscan) == CURLYX && data
c277df42 2245 && !(data->flags & SF_HAS_PAR)
c277df42 2246 && !(data->flags & SF_HAS_EVAL)
0e788c72
HS
2247 && !deltanext /* atom is fixed width */
2248 && minnext != 0 /* CURLYM can't handle zero width */
2249 ) {
c277df42
IZ
2250 /* XXXX How to optimize if data == 0? */
2251 /* Optimize to a simpler form. */
2252 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
2253 regnode *nxt2;
2254
2255 OP(oscan) = CURLYM;
2256 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
b81d288d 2257 && (OP(nxt2) != WHILEM))
c277df42
IZ
2258 nxt = nxt2;
2259 OP(nxt2) = SUCCEED; /* Whas WHILEM */
c277df42
IZ
2260 /* Need to optimize away parenths. */
2261 if (data->flags & SF_IN_PAR) {
2262 /* Set the parenth number. */
2263 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
2264
b81d288d 2265 if (OP(nxt) != CLOSE)
b45f050a 2266 FAIL("Panic opt close");
eb160463 2267 oscan->flags = (U8)ARG(nxt);
c277df42
IZ
2268 OP(nxt1) = OPTIMIZED; /* was OPEN. */
2269 OP(nxt) = OPTIMIZED; /* was CLOSE. */
2270#ifdef DEBUGGING
2271 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
2272 OP(nxt + 1) = OPTIMIZED; /* was count. */
2273 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
2274 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
b81d288d 2275#endif
c277df42
IZ
2276#if 0
2277 while ( nxt1 && (OP(nxt1) != WHILEM)) {
2278 regnode *nnxt = regnext(nxt1);
b81d288d 2279
c277df42
IZ
2280 if (nnxt == nxt) {
2281 if (reg_off_by_arg[OP(nxt1)])
2282 ARG_SET(nxt1, nxt2 - nxt1);
2283 else if (nxt2 - nxt1 < U16_MAX)
2284 NEXT_OFF(nxt1) = nxt2 - nxt1;
2285 else
2286 OP(nxt) = NOTHING; /* Cannot beautify */
2287 }
2288 nxt1 = nnxt;
2289 }
2290#endif
2291 /* Optimize again: */
b81d288d 2292 study_chunk(pRExC_state, &nxt1, &deltanext, nxt,
a3621e74 2293 NULL, 0,depth+1);
a0ed51b3
LW
2294 }
2295 else
c277df42 2296 oscan->flags = 0;
c277df42 2297 }
e1901655
IZ
2298 else if ((OP(oscan) == CURLYX)
2299 && (flags & SCF_WHILEM_VISITED_POS)
2300 /* See the comment on a similar expression above.
2301 However, this time it not a subexpression
2302 we care about, but the expression itself. */
2303 && (maxcount == REG_INFTY)
2304 && data && ++data->whilem_c < 16) {
2305 /* This stays as CURLYX, we can put the count/of pair. */
2c2d71f5
JH
2306 /* Find WHILEM (as in regexec.c) */
2307 regnode *nxt = oscan + NEXT_OFF(oscan);
2308
2309 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
2310 nxt += ARG(nxt);
eb160463
GS
2311 PREVOPER(nxt)->flags = (U8)(data->whilem_c
2312 | (RExC_whilem_seen << 4)); /* On WHILEM */
2c2d71f5 2313 }
b81d288d 2314 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
c277df42
IZ
2315 pars++;
2316 if (flags & SCF_DO_SUBSTR) {
c445ea15 2317 SV *last_str = NULL;
c277df42
IZ
2318 int counted = mincount != 0;
2319
2320 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
5d1c421c
JH
2321#if defined(SPARC64_GCC_WORKAROUND)
2322 I32 b = 0;
2323 STRLEN l = 0;
cfd0369c 2324 const char *s = NULL;
5d1c421c
JH
2325 I32 old = 0;
2326
2327 if (pos_before >= data->last_start_min)
2328 b = pos_before;
2329 else
2330 b = data->last_start_min;
2331
2332 l = 0;
cfd0369c 2333 s = SvPV_const(data->last_found, l);
5d1c421c
JH
2334 old = b - data->last_start_min;
2335
2336#else
b81d288d 2337 I32 b = pos_before >= data->last_start_min
c277df42
IZ
2338 ? pos_before : data->last_start_min;
2339 STRLEN l;
d4c19fe8 2340 const char * const s = SvPV_const(data->last_found, l);
a0ed51b3 2341 I32 old = b - data->last_start_min;
5d1c421c 2342#endif
a0ed51b3
LW
2343
2344 if (UTF)
2345 old = utf8_hop((U8*)s, old) - (U8*)s;
c277df42 2346
a0ed51b3 2347 l -= old;
c277df42 2348 /* Get the added string: */
79cb57f6 2349 last_str = newSVpvn(s + old, l);
0e933229
IH
2350 if (UTF)
2351 SvUTF8_on(last_str);
c277df42
IZ
2352 if (deltanext == 0 && pos_before == b) {
2353 /* What was added is a constant string */
2354 if (mincount > 1) {
2355 SvGROW(last_str, (mincount * l) + 1);
b81d288d 2356 repeatcpy(SvPVX(last_str) + l,
3f7c398e 2357 SvPVX_const(last_str), l, mincount - 1);
b162af07 2358 SvCUR_set(last_str, SvCUR(last_str) * mincount);
c277df42 2359 /* Add additional parts. */
b81d288d 2360 SvCUR_set(data->last_found,
c277df42
IZ
2361 SvCUR(data->last_found) - l);
2362 sv_catsv(data->last_found, last_str);
0eda9292
JH
2363 {
2364 SV * sv = data->last_found;
2365 MAGIC *mg =
2366 SvUTF8(sv) && SvMAGICAL(sv) ?
2367 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2368 if (mg && mg->mg_len >= 0)
2369 mg->mg_len += CHR_SVLEN(last_str);
2370 }
c277df42
IZ
2371 data->last_end += l * (mincount - 1);
2372 }
2a8d9689
HS
2373 } else {
2374 /* start offset must point into the last copy */
2375 data->last_start_min += minnext * (mincount - 1);
c152dc43
HS
2376 data->last_start_max += is_inf ? I32_MAX
2377 : (maxcount - 1) * (minnext + data->pos_delta);
c277df42
IZ
2378 }
2379 }
2380 /* It is counted once already... */
2381 data->pos_min += minnext * (mincount - counted);
2382 data->pos_delta += - counted * deltanext +
2383 (minnext + deltanext) * maxcount - minnext * mincount;
2384 if (mincount != maxcount) {
653099ff
GS
2385 /* Cannot extend fixed substrings found inside
2386 the group. */
830247a4 2387 scan_commit(pRExC_state,data);
c277df42 2388 if (mincount && last_str) {
d4c19fe8
AL
2389 SV * const sv = data->last_found;
2390 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
45f47268
NC
2391 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2392
2393 if (mg)
2394 mg->mg_len = -1;
2395 sv_setsv(sv, last_str);
c277df42 2396 data->last_end = data->pos_min;
b81d288d 2397 data->last_start_min =
a0ed51b3 2398 data->pos_min - CHR_SVLEN(last_str);
b81d288d
AB
2399 data->last_start_max = is_inf
2400 ? I32_MAX
c277df42 2401 : data->pos_min + data->pos_delta
a0ed51b3 2402 - CHR_SVLEN(last_str);
c277df42
IZ
2403 }
2404 data->longest = &(data->longest_float);
2405 }
aca2d497 2406 SvREFCNT_dec(last_str);
c277df42 2407 }
405ff068 2408 if (data && (fl & SF_HAS_EVAL))
c277df42
IZ
2409 data->flags |= SF_HAS_EVAL;
2410 optimize_curly_tail:
c277df42 2411 if (OP(oscan) != CURLYX) {
22c35a8c 2412 while (PL_regkind[(U8)OP(next = regnext(oscan))] == NOTHING
c277df42
IZ
2413 && NEXT_OFF(next))
2414 NEXT_OFF(oscan) += NEXT_OFF(next);
2415 }
c277df42 2416 continue;
653099ff 2417 default: /* REF and CLUMP only? */
c277df42 2418 if (flags & SCF_DO_SUBSTR) {
830247a4 2419 scan_commit(pRExC_state,data); /* Cannot expect anything... */
c277df42
IZ
2420 data->longest = &(data->longest_float);
2421 }
aca2d497 2422 is_inf = is_inf_internal = 1;
653099ff 2423 if (flags & SCF_DO_STCLASS_OR)
830247a4 2424 cl_anything(pRExC_state, data->start_class);
653099ff 2425 flags &= ~SCF_DO_STCLASS;
c277df42
IZ
2426 break;
2427 }
a0ed51b3 2428 }
bfed75c6 2429 else if (strchr((const char*)PL_simple,OP(scan))) {
9c5ffd7c 2430 int value = 0;
653099ff 2431
c277df42 2432 if (flags & SCF_DO_SUBSTR) {
830247a4 2433 scan_commit(pRExC_state,data);
c277df42
IZ
2434 data->pos_min++;
2435 }
2436 min++;
653099ff
GS
2437 if (flags & SCF_DO_STCLASS) {
2438 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
2439
2440 /* Some of the logic below assumes that switching
2441 locale on will only add false positives. */
2442 switch (PL_regkind[(U8)OP(scan)]) {
653099ff 2443 case SANY:
653099ff
GS
2444 default:
2445 do_default:
2446 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
2447 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
830247a4 2448 cl_anything(pRExC_state, data->start_class);
653099ff
GS
2449 break;
2450 case REG_ANY:
2451 if (OP(scan) == SANY)
2452 goto do_default;
2453 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
2454 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
2455 || (data->start_class->flags & ANYOF_CLASS));
830247a4 2456 cl_anything(pRExC_state, data->start_class);
653099ff
GS
2457 }
2458 if (flags & SCF_DO_STCLASS_AND || !value)
2459 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
2460 break;
2461 case ANYOF:
2462 if (flags & SCF_DO_STCLASS_AND)
2463 cl_and(data->start_class,
2464 (struct regnode_charclass_class*)scan);
2465 else
830247a4 2466 cl_or(pRExC_state, data->start_class,
653099ff
GS
2467 (struct regnode_charclass_class*)scan);
2468 break;
2469 case ALNUM:
2470 if (flags & SCF_DO_STCLASS_AND) {
2471 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2472 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
2473 for (value = 0; value < 256; value++)
2474 if (!isALNUM(value))
2475 ANYOF_BITMAP_CLEAR(data->start_class, value);
2476 }
2477 }
2478 else {
2479 if (data->start_class->flags & ANYOF_LOCALE)
2480 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
2481 else {
2482 for (value = 0; value < 256; value++)
2483 if (isALNUM(value))
b81d288d 2484 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
2485 }
2486 }
2487 break;
2488 case ALNUML:
2489 if (flags & SCF_DO_STCLASS_AND) {
2490 if (data->start_class->flags & ANYOF_LOCALE)
2491 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
2492 }
2493 else {
2494 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
2495 data->start_class->flags |= ANYOF_LOCALE;
2496 }
2497 break;
2498 case NALNUM:
2499 if (flags & SCF_DO_STCLASS_AND) {
2500 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2501 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
2502 for (value = 0; value < 256; value++)
2503 if (isALNUM(value))
2504 ANYOF_BITMAP_CLEAR(data->start_class, value);
2505 }
2506 }
2507 else {
2508 if (data->start_class->flags & ANYOF_LOCALE)
2509 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
2510 else {
2511 for (value = 0; value < 256; value++)
2512 if (!isALNUM(value))
b81d288d 2513 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
2514 }
2515 }
2516 break;
2517 case NALNUML:
2518 if (flags & SCF_DO_STCLASS_AND) {
2519 if (data->start_class->flags & ANYOF_LOCALE)
2520 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
2521 }
2522 else {
2523 data->start_class->flags |= ANYOF_LOCALE;
2524 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
2525 }
2526 break;
2527 case SPACE:
2528 if (flags & SCF_DO_STCLASS_AND) {
2529 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2530 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
2531 for (value = 0; value < 256; value++)
2532 if (!isSPACE(value))
2533 ANYOF_BITMAP_CLEAR(data->start_class, value);
2534 }
2535 }
2536 else {
2537 if (data->start_class->flags & ANYOF_LOCALE)
2538 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
2539 else {
2540 for (value = 0; value < 256; value++)
2541 if (isSPACE(value))
b81d288d 2542 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
2543 }
2544 }
2545 break;
2546 case SPACEL:
2547 if (flags & SCF_DO_STCLASS_AND) {
2548 if (data->start_class->flags & ANYOF_LOCALE)
2549 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
2550 }
2551 else {
2552 data->start_class->flags |= ANYOF_LOCALE;
2553 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
2554 }
2555 break;
2556 case NSPACE:
2557 if (flags & SCF_DO_STCLASS_AND) {
2558 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2559 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
2560 for (value = 0; value < 256; value++)
2561 if (isSPACE(value))
2562 ANYOF_BITMAP_CLEAR(data->start_class, value);
2563 }
2564 }
2565 else {
2566 if (data->start_class->flags & ANYOF_LOCALE)
2567 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
2568 else {
2569 for (value = 0; value < 256; value++)
2570 if (!isSPACE(value))
b81d288d 2571 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
2572 }
2573 }
2574 break;
2575 case NSPACEL:
2576 if (flags & SCF_DO_STCLASS_AND) {
2577 if (data->start_class->flags & ANYOF_LOCALE) {
2578 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
2579 for (value = 0; value < 256; value++)
2580 if (!isSPACE(value))
2581 ANYOF_BITMAP_CLEAR(data->start_class, value);
2582 }
2583 }
2584 else {
2585 data->start_class->flags |= ANYOF_LOCALE;
2586 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
2587 }
2588 break;
2589 case DIGIT:
2590 if (flags & SCF_DO_STCLASS_AND) {
2591 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
2592 for (value = 0; value < 256; value++)
2593 if (!isDIGIT(value))
2594 ANYOF_BITMAP_CLEAR(data->start_class, value);
2595 }
2596 else {
2597 if (data->start_class->flags & ANYOF_LOCALE)
2598 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
2599 else {
2600 for (value = 0; value < 256; value++)
2601 if (isDIGIT(value))
b81d288d 2602 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
2603 }
2604 }
2605 break;
2606 case NDIGIT:
2607 if (flags & SCF_DO_STCLASS_AND) {
2608 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
2609 for (value = 0; value < 256; value++)
2610 if (isDIGIT(value))
2611 ANYOF_BITMAP_CLEAR(data->start_class, value);
2612 }
2613 else {
2614 if (data->start_class->flags & ANYOF_LOCALE)
2615 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
2616 else {
2617 for (value = 0; value < 256; value++)
2618 if (!isDIGIT(value))
b81d288d 2619 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
2620 }
2621 }
2622 break;
2623 }
2624 if (flags & SCF_DO_STCLASS_OR)
2625 cl_and(data->start_class, &and_with);
2626 flags &= ~SCF_DO_STCLASS;
2627 }
a0ed51b3 2628 }
22c35a8c 2629 else if (PL_regkind[(U8)OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
c277df42
IZ
2630 data->flags |= (OP(scan) == MEOL
2631 ? SF_BEFORE_MEOL
2632 : SF_BEFORE_SEOL);
a0ed51b3 2633 }
653099ff
GS
2634 else if ( PL_regkind[(U8)OP(scan)] == BRANCHJ
2635 /* Lookbehind, or need to calculate parens/evals/stclass: */
2636 && (scan->flags || data || (flags & SCF_DO_STCLASS))
c277df42 2637 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
653099ff 2638 /* Lookahead/lookbehind */
cb434fcc 2639 I32 deltanext, minnext, fake = 0;
c277df42 2640 regnode *nscan;
653099ff
GS
2641 struct regnode_charclass_class intrnl;
2642 int f = 0;
c277df42
IZ
2643
2644 data_fake.flags = 0;
b81d288d 2645 if (data) {
2c2d71f5 2646 data_fake.whilem_c = data->whilem_c;
cb434fcc
IZ
2647 data_fake.last_closep = data->last_closep;
2648 }
2649 else
2650 data_fake.last_closep = &fake;
653099ff
GS
2651 if ( flags & SCF_DO_STCLASS && !scan->flags
2652 && OP(scan) == IFMATCH ) { /* Lookahead */
830247a4 2653 cl_init(pRExC_state, &intrnl);
653099ff 2654 data_fake.start_class = &intrnl;
e1901655 2655 f |= SCF_DO_STCLASS_AND;
653099ff 2656 }
e1901655
IZ
2657 if (flags & SCF_WHILEM_VISITED_POS)
2658 f |= SCF_WHILEM_VISITED_POS;
c277df42
IZ
2659 next = regnext(scan);
2660 nscan = NEXTOPER(NEXTOPER(scan));
a3621e74 2661 minnext = study_chunk(pRExC_state, &nscan, &deltanext, last, &data_fake, f,depth+1);
c277df42
IZ
2662 if (scan->flags) {
2663 if (deltanext) {
9baa0206 2664 vFAIL("Variable length lookbehind not implemented");
a0ed51b3
LW
2665 }
2666 else if (minnext > U8_MAX) {
9baa0206 2667 vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
c277df42 2668 }
eb160463 2669 scan->flags = (U8)minnext;
c277df42
IZ
2670 }
2671 if (data && data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
2672 pars++;
405ff068 2673 if (data && (data_fake.flags & SF_HAS_EVAL))
c277df42 2674 data->flags |= SF_HAS_EVAL;
2c2d71f5
JH
2675 if (data)
2676 data->whilem_c = data_fake.whilem_c;
e1901655 2677 if (f & SCF_DO_STCLASS_AND) {
a28509cc 2678 const int was = (data->start_class->flags & ANYOF_EOS);
653099ff
GS
2679
2680 cl_and(data->start_class, &intrnl);
2681 if (was)
2682 data->start_class->flags |= ANYOF_EOS;
2683 }
a0ed51b3
LW
2684 }
2685 else if (OP(scan) == OPEN) {
c277df42 2686 pars++;
a0ed51b3 2687 }
cb434fcc 2688 else if (OP(scan) == CLOSE) {
eb160463 2689 if ((I32)ARG(scan) == is_par) {
cb434fcc 2690 next = regnext(scan);
c277df42 2691
cb434fcc
IZ
2692 if ( next && (OP(next) != WHILEM) && next < last)
2693 is_par = 0; /* Disable optimization */
2694 }
2695 if (data)
2696 *(data->last_closep) = ARG(scan);
a0ed51b3
LW
2697 }
2698 else if (OP(scan) == EVAL) {
c277df42
IZ
2699 if (data)
2700 data->flags |= SF_HAS_EVAL;
2701 }
96776eda 2702 else if (OP(scan) == LOGICAL && scan->flags == 2) { /* Embedded follows */
0f5d15d6 2703 if (flags & SCF_DO_SUBSTR) {
830247a4 2704 scan_commit(pRExC_state,data);
0f5d15d6
IZ
2705 data->longest = &(data->longest_float);
2706 }
2707 is_inf = is_inf_internal = 1;
653099ff 2708 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
830247a4 2709 cl_anything(pRExC_state, data->start_class);
96776eda 2710 flags &= ~SCF_DO_STCLASS;
0f5d15d6 2711 }
c277df42
IZ
2712 /* Else: zero-length, ignore. */
2713 scan = regnext(scan);
2714 }
2715
2716 finish:
2717 *scanp = scan;
aca2d497 2718 *deltap = is_inf_internal ? I32_MAX : delta;
b81d288d 2719 if (flags & SCF_DO_SUBSTR && is_inf)
c277df42
IZ
2720 data->pos_delta = I32_MAX - data->pos_min;
2721 if (is_par > U8_MAX)
2722 is_par = 0;
2723 if (is_par && pars==1 && data) {
2724 data->flags |= SF_IN_PAR;
2725 data->flags &= ~SF_HAS_PAR;
a0ed51b3
LW
2726 }
2727 else if (pars && data) {
c277df42
IZ
2728 data->flags |= SF_HAS_PAR;
2729 data->flags &= ~SF_IN_PAR;
2730 }
653099ff
GS
2731 if (flags & SCF_DO_STCLASS_OR)
2732 cl_and(data->start_class, &and_with);
c277df42
IZ
2733 return min;
2734}
2735
76e3520e 2736STATIC I32
5f66b61c 2737S_add_data(RExC_state_t *pRExC_state, I32 n, const char *s)
c277df42 2738{
830247a4 2739 if (RExC_rx->data) {
b81d288d
AB
2740 Renewc(RExC_rx->data,
2741 sizeof(*RExC_rx->data) + sizeof(void*) * (RExC_rx->data->count + n - 1),
c277df42 2742 char, struct reg_data);
830247a4
IZ
2743 Renew(RExC_rx->data->what, RExC_rx->data->count + n, U8);
2744 RExC_rx->data->count += n;
a0ed51b3
LW
2745 }
2746 else {
a02a5408 2747 Newxc(RExC_rx->data, sizeof(*RExC_rx->data) + sizeof(void*) * (n - 1),
c277df42 2748 char, struct reg_data);
a02a5408 2749 Newx(RExC_rx->data->what, n, U8);
830247a4 2750 RExC_rx->data->count = n;
c277df42 2751 }
830247a4
IZ
2752 Copy(s, RExC_rx->data->what + RExC_rx->data->count - n, n, U8);
2753 return RExC_rx->data->count - n;
c277df42
IZ
2754}
2755
d88dccdf 2756void
864dbfa3 2757Perl_reginitcolors(pTHX)
d88dccdf 2758{
97aff369 2759 dVAR;
1df70142 2760 const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
d88dccdf 2761 if (s) {
1df70142
AL
2762 char *t = savepv(s);
2763 int i = 0;
2764 PL_colors[0] = t;
d88dccdf 2765 while (++i < 6) {
1df70142
AL
2766 t = strchr(t, '\t');
2767 if (t) {
2768 *t = '\0';
2769 PL_colors[i] = ++t;
d88dccdf
IZ
2770 }
2771 else
1df70142 2772 PL_colors[i] = t = (char *)"";
d88dccdf
IZ
2773 }
2774 } else {
1df70142 2775 int i = 0;
b81d288d 2776 while (i < 6)
06b5626a 2777 PL_colors[i++] = (char *)"";
d88dccdf
IZ
2778 }
2779 PL_colorset = 1;
2780}
2781
8615cb43 2782
a687059c 2783/*
e50aee73 2784 - pregcomp - compile a regular expression into internal code
a687059c
LW
2785 *
2786 * We can't allocate space until we know how big the compiled form will be,
2787 * but we can't compile it (and thus know how big it is) until we've got a
2788 * place to put the code. So we cheat: we compile it twice, once with code
2789 * generation turned off and size counting turned on, and once "for real".
2790 * This also means that we don't allocate space until we are sure that the
2791 * thing really will compile successfully, and we never have to move the
2792 * code and thus invalidate pointers into it. (Note that it has to be in
2793 * one piece because free() must be able to free it all.) [NB: not true in perl]
2794 *
2795 * Beware that the optimization-preparation code in here knows about some
2796 * of the structure of the compiled regexp. [I'll say.]
2797 */
2798regexp *
864dbfa3 2799Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
a687059c 2800{
97aff369 2801 dVAR;
a0d0e21e 2802 register regexp *r;
c277df42 2803 regnode *scan;
c277df42 2804 regnode *first;
a0d0e21e 2805 I32 flags;
a0d0e21e
LW
2806 I32 minlen = 0;
2807 I32 sawplus = 0;
2808 I32 sawopen = 0;
2c2d71f5 2809 scan_data_t data;
830247a4
IZ
2810 RExC_state_t RExC_state;
2811 RExC_state_t *pRExC_state = &RExC_state;
a0d0e21e 2812
a3621e74
YO
2813 GET_RE_DEBUG_FLAGS_DECL;
2814
a0d0e21e 2815 if (exp == NULL)
c277df42 2816 FAIL("NULL regexp argument");
a0d0e21e 2817
a5961de5 2818 RExC_utf8 = pm->op_pmdynflags & PMdf_CMP_UTF8;
a0ed51b3 2819
5cfc7842 2820 RExC_precomp = exp;
a3621e74
YO
2821 DEBUG_r(if (!PL_colorset) reginitcolors());
2822 DEBUG_COMPILE_r({
2823 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s \"%s%*s%s\"\n",
a5961de5
JH
2824 PL_colors[4],PL_colors[5],PL_colors[0],
2825 (int)(xend - exp), RExC_precomp, PL_colors[1]);
2826 });
e2509266 2827 RExC_flags = pm->op_pmflags;
830247a4 2828 RExC_sawback = 0;
bbce6d69 2829
830247a4
IZ
2830 RExC_seen = 0;
2831 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
2832 RExC_seen_evals = 0;
2833 RExC_extralen = 0;
c277df42 2834
bbce6d69 2835 /* First pass: determine size, legality. */
830247a4 2836 RExC_parse = exp;
fac92740 2837 RExC_start = exp;
830247a4
IZ
2838 RExC_end = xend;
2839 RExC_naughty = 0;
2840 RExC_npar = 1;
2841 RExC_size = 0L;
2842 RExC_emit = &PL_regdummy;
2843 RExC_whilem_seen = 0;
85ddcde9
JH
2844#if 0 /* REGC() is (currently) a NOP at the first pass.
2845 * Clever compilers notice this and complain. --jhi */
830247a4 2846 REGC((U8)REG_MAGIC, (char*)RExC_emit);
85ddcde9 2847#endif
830247a4 2848 if (reg(pRExC_state, 0, &flags) == NULL) {
c445ea15 2849 RExC_precomp = NULL;
a0d0e21e
LW
2850 return(NULL);
2851 }
a3621e74 2852 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "size %"IVdf" ", (IV)RExC_size));
c277df42 2853
c277df42
IZ
2854 /* Small enough for pointer-storage convention?
2855 If extralen==0, this means that we will not need long jumps. */
830247a4
IZ
2856 if (RExC_size >= 0x10000L && RExC_extralen)
2857 RExC_size += RExC_extralen;
c277df42 2858 else
830247a4
IZ
2859 RExC_extralen = 0;
2860 if (RExC_whilem_seen > 15)
2861 RExC_whilem_seen = 15;
a0d0e21e 2862
bbce6d69 2863 /* Allocate space and initialize. */
a02a5408 2864 Newxc(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode),
c277df42 2865 char, regexp);
a0d0e21e 2866 if (r == NULL)
b45f050a
JF
2867 FAIL("Regexp out of space");
2868
0f79a09d
GS
2869#ifdef DEBUGGING
2870 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
830247a4 2871 Zero(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode), char);
0f79a09d 2872#endif
c277df42 2873 r->refcnt = 1;
bbce6d69 2874 r->prelen = xend - exp;
5cfc7842 2875 r->precomp = savepvn(RExC_precomp, r->prelen);
cf93c79d 2876 r->subbeg = NULL;
f8c7b90f 2877#ifdef PERL_OLD_COPY_ON_WRITE
c445ea15 2878 r->saved_copy = NULL;
ed252734 2879#endif
cf93c79d 2880 r->reganch = pm->op_pmflags & PMf_COMPILETIME;
830247a4 2881 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
38d1b06f 2882 r->lastparen = 0; /* mg.c reads this. */
4327152a
IZ
2883
2884 r->substrs = 0; /* Useful during FAIL. */
2885 r->startp = 0; /* Useful during FAIL. */
2886 r->endp = 0; /* Useful during FAIL. */
2887
a02a5408 2888 Newxz(r->offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
fac92740 2889 if (r->offsets) {
2af232bd 2890 r->offsets[0] = RExC_size;
fac92740 2891 }
a3621e74 2892 DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
2af232bd
SS
2893 "%s %"UVuf" bytes for offset annotations.\n",
2894 r->offsets ? "Got" : "Couldn't get",
392fbf5d 2895 (UV)((2*RExC_size+1) * sizeof(U32))));
fac92740 2896
830247a4 2897 RExC_rx = r;
bbce6d69 2898
2899 /* Second pass: emit code. */
e2509266 2900 RExC_flags = pm->op_pmflags; /* don't let top level (?i) bleed */
830247a4
IZ
2901 RExC_parse = exp;
2902 RExC_end = xend;
2903 RExC_naughty = 0;
2904 RExC_npar = 1;
fac92740 2905 RExC_emit_start = r->program;
830247a4 2906 RExC_emit = r->program;
2cd61cdb 2907 /* Store the count of eval-groups for security checks: */
eb160463 2908 RExC_emit->next_off = (U16)((RExC_seen_evals > U16_MAX) ? U16_MAX : RExC_seen_evals);
830247a4 2909 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
c277df42 2910 r->data = 0;
830247a4 2911 if (reg(pRExC_state, 0, &flags) == NULL)
a0d0e21e
LW
2912 return(NULL);
2913
a3621e74 2914
a0d0e21e 2915 /* Dig out information for optimizations. */
cf93c79d 2916 r->reganch = pm->op_pmflags & PMf_COMPILETIME; /* Again? */
e2509266 2917 pm->op_pmflags = RExC_flags;
a0ed51b3 2918 if (UTF)
5ff6fc6d 2919 r->reganch |= ROPT_UTF8; /* Unicode in it? */
c277df42 2920 r->regstclass = NULL;
830247a4 2921 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
a0ed51b3 2922 r->reganch |= ROPT_NAUGHTY;
c277df42 2923 scan = r->program + 1; /* First BRANCH. */
2779dcf1
IZ
2924
2925 /* XXXX To minimize changes to RE engine we always allocate
2926 3-units-long substrs field. */
a02a5408 2927 Newxz(r->substrs, 1, struct reg_substr_data);
2779dcf1 2928
2c2d71f5 2929 StructCopy(&zero_scan_data, &data, scan_data_t);
653099ff 2930 /* XXXX Should not we check for something else? Usually it is OPEN1... */
c277df42 2931 if (OP(scan) != BRANCH) { /* Only one top-level choice. */
c277df42 2932 I32 fake;
c5254dd6 2933 STRLEN longest_float_length, longest_fixed_length;
653099ff
GS
2934 struct regnode_charclass_class ch_class;
2935 int stclass_flag;
cb434fcc 2936 I32 last_close = 0;
a0d0e21e
LW
2937
2938 first = scan;
c277df42 2939 /* Skip introductions and multiplicators >= 1. */
a0d0e21e 2940 while ((OP(first) == OPEN && (sawopen = 1)) ||
653099ff 2941 /* An OR of *one* alternative - should not happen now. */
a0d0e21e
LW
2942 (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) ||
2943 (OP(first) == PLUS) ||
2944 (OP(first) == MINMOD) ||
653099ff 2945 /* An {n,m} with n>0 */
22c35a8c 2946 (PL_regkind[(U8)OP(first)] == CURLY && ARG1(first) > 0) ) {
a0d0e21e
LW
2947 if (OP(first) == PLUS)
2948 sawplus = 1;
2949 else
2950 first += regarglen[(U8)OP(first)];
2951 first = NEXTOPER(first);
a687059c
LW
2952 }
2953
a0d0e21e
LW
2954 /* Starting-point info. */
2955 again:
653099ff 2956 if (PL_regkind[(U8)OP(first)] == EXACT) {
1aa99e6b 2957 if (OP(first) == EXACT)
bb263b4e 2958 /*EMPTY*/; /* Empty, get anchored substr later. */
1aa99e6b 2959 else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
b3c9acc1
IZ
2960 r->regstclass = first;
2961 }
bfed75c6 2962 else if (strchr((const char*)PL_simple,OP(first)))
a0d0e21e 2963 r->regstclass = first;
22c35a8c
GS
2964 else if (PL_regkind[(U8)OP(first)] == BOUND ||
2965 PL_regkind[(U8)OP(first)] == NBOUND)
a0d0e21e 2966 r->regstclass = first;
22c35a8c 2967 else if (PL_regkind[(U8)OP(first)] == BOL) {
cad2e5aa
JH
2968 r->reganch |= (OP(first) == MBOL
2969 ? ROPT_ANCH_MBOL
2970 : (OP(first) == SBOL
2971 ? ROPT_ANCH_SBOL
2972 : ROPT_ANCH_BOL));
a0d0e21e 2973 first = NEXTOPER(first);
774d564b 2974 goto again;
2975 }
2976 else if (OP(first) == GPOS) {
2977 r->reganch |= ROPT_ANCH_GPOS;
2978 first = NEXTOPER(first);
2979 goto again;
a0d0e21e 2980 }
e09294f4 2981 else if (!sawopen && (OP(first) == STAR &&
22c35a8c 2982 PL_regkind[(U8)OP(NEXTOPER(first))] == REG_ANY) &&
a0d0e21e
LW
2983 !(r->reganch & ROPT_ANCH) )
2984 {
2985 /* turn .* into ^.* with an implied $*=1 */
1df70142
AL
2986 const int type =
2987 (OP(NEXTOPER(first)) == REG_ANY)
2988 ? ROPT_ANCH_MBOL
2989 : ROPT_ANCH_SBOL;
cad2e5aa 2990 r->reganch |= type | ROPT_IMPLICIT;
a0d0e21e 2991 first = NEXTOPER(first);
774d564b 2992 goto again;
a0d0e21e 2993 }
b81d288d 2994 if (sawplus && (!sawopen || !RExC_sawback)
830247a4 2995 && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
cad2e5aa
JH
2996 /* x+ must match at the 1st pos of run of x's */
2997 r->reganch |= ROPT_SKIP;
a0d0e21e 2998
c277df42 2999 /* Scan is after the zeroth branch, first is atomic matcher. */
a3621e74 3000 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
b900a521 3001 (IV)(first - scan + 1)));
a0d0e21e
LW
3002 /*
3003 * If there's something expensive in the r.e., find the
3004 * longest literal string that must appear and make it the
3005 * regmust. Resolve ties in favor of later strings, since
3006 * the regstart check works with the beginning of the r.e.
3007 * and avoiding duplication strengthens checking. Not a
3008 * strong reason, but sufficient in the absence of others.
3009 * [Now we resolve ties in favor of the earlier string if
c277df42 3010 * it happens that c_offset_min has been invalidated, since the
a0d0e21e
LW
3011 * earlier string may buy us something the later one won't.]
3012 */
a0d0e21e 3013 minlen = 0;
a687059c 3014
396482e1
GA
3015 data.longest_fixed = newSVpvs("");
3016 data.longest_float = newSVpvs("");
3017 data.last_found = newSVpvs("");
c277df42
IZ
3018 data.longest = &(data.longest_fixed);
3019 first = scan;
653099ff 3020 if (!r->regstclass) {
830247a4 3021 cl_init(pRExC_state, &ch_class);
653099ff
GS
3022 data.start_class = &ch_class;
3023 stclass_flag = SCF_DO_STCLASS_AND;
3024 } else /* XXXX Check for BOUND? */
3025 stclass_flag = 0;
cb434fcc 3026 data.last_closep = &last_close;
653099ff 3027
830247a4 3028 minlen = study_chunk(pRExC_state, &first, &fake, scan + RExC_size, /* Up to end */
a3621e74 3029 &data, SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
830247a4 3030 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
b81d288d 3031 && data.last_start_min == 0 && data.last_end > 0
830247a4
IZ
3032 && !RExC_seen_zerolen
3033 && (!(RExC_seen & REG_SEEN_GPOS) || (r->reganch & ROPT_ANCH_GPOS)))
c277df42 3034 r->reganch |= ROPT_CHECK_ALL;
830247a4 3035 scan_commit(pRExC_state, &data);
c277df42
IZ
3036 SvREFCNT_dec(data.last_found);
3037
a0ed51b3 3038 longest_float_length = CHR_SVLEN(data.longest_float);
c5254dd6 3039 if (longest_float_length
c277df42
IZ
3040 || (data.flags & SF_FL_BEFORE_EOL
3041 && (!(data.flags & SF_FL_BEFORE_MEOL)
e2509266 3042 || (RExC_flags & PMf_MULTILINE)))) {
cf93c79d
IZ
3043 int t;
3044
a0ed51b3 3045 if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
aca2d497
IZ
3046 && data.offset_fixed == data.offset_float_min
3047 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
3048 goto remove_float; /* As in (a)+. */
3049
33b8afdf
JH
3050 if (SvUTF8(data.longest_float)) {
3051 r->float_utf8 = data.longest_float;
c445ea15 3052 r->float_substr = NULL;
33b8afdf
JH
3053 } else {
3054 r->float_substr = data.longest_float;
c445ea15 3055 r->float_utf8 = NULL;
33b8afdf 3056 }
c277df42
IZ
3057 r->float_min_offset = data.offset_float_min;
3058 r->float_max_offset = data.offset_float_max;
cf93c79d
IZ
3059 t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
3060 && (!(data.flags & SF_FL_BEFORE_MEOL)
e2509266 3061 || (RExC_flags & PMf_MULTILINE)));
33b8afdf 3062 fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
a0ed51b3
LW
3063 }
3064 else {
aca2d497 3065 remove_float:
c445ea15 3066 r->float_substr = r->float_utf8 = NULL;
c277df42 3067 SvREFCNT_dec(data.longest_float);
c5254dd6 3068 longest_float_length = 0;
a0d0e21e 3069 }
c277df42 3070
a0ed51b3 3071 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
c5254dd6 3072 if (longest_fixed_length
c277df42
IZ
3073 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
3074 && (!(data.flags & SF_FIX_BEFORE_MEOL)
e2509266 3075 || (RExC_flags & PMf_MULTILINE)))) {
cf93c79d
IZ
3076 int t;
3077
33b8afdf
JH
3078 if (SvUTF8(data.longest_fixed)) {
3079 r->anchored_utf8 = data.longest_fixed;
c445ea15 3080 r->anchored_substr = NULL;
33b8afdf
JH
3081 } else {
3082 r->anchored_substr = data.longest_fixed;
c445ea15 3083 r->anchored_utf8 = NULL;
33b8afdf 3084 }
c277df42 3085 r->anchored_offset = data.offset_fixed;
cf93c79d
IZ
3086 t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
3087 && (!(data.flags & SF_FIX_BEFORE_MEOL)
e2509266 3088 || (RExC_flags & PMf_MULTILINE)));
33b8afdf 3089 fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
a0ed51b3
LW
3090 }
3091 else {
c445ea15 3092 r->anchored_substr = r->anchored_utf8 = NULL;
c277df42 3093 SvREFCNT_dec(data.longest_fixed);
c5254dd6 3094 longest_fixed_length = 0;
a0d0e21e 3095 }
b81d288d 3096 if (r->regstclass
ffc61ed2 3097 && (OP(r->regstclass) == REG_ANY || OP(r->regstclass) == SANY))
653099ff 3098 r->regstclass = NULL;
33b8afdf
JH
3099 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
3100 && stclass_flag
653099ff 3101 && !(data.start_class->flags & ANYOF_EOS)
eb160463
GS
3102 && !cl_is_anything(data.start_class))
3103 {
1df70142 3104 const I32 n = add_data(pRExC_state, 1, "f");
653099ff 3105
a02a5408 3106 Newx(RExC_rx->data->data[n], 1,
653099ff
GS
3107 struct regnode_charclass_class);
3108 StructCopy(data.start_class,
830247a4 3109 (struct regnode_charclass_class*)RExC_rx->data->data[n],
653099ff 3110 struct regnode_charclass_class);
830247a4 3111 r->regstclass = (regnode*)RExC_rx->data->data[n];
653099ff 3112 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
a3621e74 3113 DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
32fc9b6a 3114 regprop(r, sv, (regnode*)data.start_class);
9c5ffd7c 3115 PerlIO_printf(Perl_debug_log,
a0288114 3116 "synthetic stclass \"%s\".\n",
3f7c398e 3117 SvPVX_const(sv));});
653099ff 3118 }
c277df42
IZ
3119
3120 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
c5254dd6 3121 if (longest_fixed_length > longest_float_length) {
c277df42 3122 r->check_substr = r->anchored_substr;
33b8afdf 3123 r->check_utf8 = r->anchored_utf8;
c277df42
IZ
3124 r->check_offset_min = r->check_offset_max = r->anchored_offset;
3125 if (r->reganch & ROPT_ANCH_SINGLE)
3126 r->reganch |= ROPT_NOSCAN;
a0ed51b3
LW
3127 }
3128 else {
c277df42 3129 r->check_substr = r->float_substr;
33b8afdf 3130 r->check_utf8 = r->float_utf8;
c277df42
IZ
3131 r->check_offset_min = data.offset_float_min;
3132 r->check_offset_max = data.offset_float_max;
a0d0e21e 3133 }
30382c73
IZ
3134 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
3135 This should be changed ASAP! */
33b8afdf 3136 if ((r->check_substr || r->check_utf8) && !(r->reganch & ROPT_ANCH_GPOS)) {
cad2e5aa 3137 r->reganch |= RE_USE_INTUIT;
33b8afdf 3138 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
cad2e5aa
JH
3139 r->reganch |= RE_INTUIT_TAIL;
3140 }
a0ed51b3
LW
3141 }
3142 else {
c277df42
IZ
3143 /* Several toplevels. Best we can is to set minlen. */
3144 I32 fake;
653099ff 3145 struct regnode_charclass_class ch_class;
cb434fcc 3146 I32 last_close = 0;
c277df42 3147
a3621e74 3148 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "\n"));
c277df42 3149 scan = r->program + 1;
830247a4 3150 cl_init(pRExC_state, &ch_class);
653099ff 3151 data.start_class = &ch_class;
cb434fcc 3152 data.last_closep = &last_close;
a3621e74 3153 minlen = study_chunk(pRExC_state, &scan, &fake, scan + RExC_size, &data, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
33b8afdf 3154 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
c445ea15 3155 = r->float_substr = r->float_utf8 = NULL;
653099ff 3156 if (!(data.start_class->flags & ANYOF_EOS)
eb160463
GS
3157 && !cl_is_anything(data.start_class))
3158 {
1df70142 3159 const I32 n = add_data(pRExC_state, 1, "f");
653099ff 3160
a02a5408 3161 Newx(RExC_rx->data->data[n], 1,
653099ff
GS
3162 struct regnode_charclass_class);
3163 StructCopy(data.start_class,
830247a4 3164 (struct regnode_charclass_class*)RExC_rx->data->data[n],
653099ff 3165 struct regnode_charclass_class);
830247a4 3166 r->regstclass = (regnode*)RExC_rx->data->data[n];
653099ff 3167 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
a3621e74 3168 DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
32fc9b6a 3169 regprop(r, sv, (regnode*)data.start_class);
9c5ffd7c 3170 PerlIO_printf(Perl_debug_log,
a0288114 3171 "synthetic stclass \"%s\".\n",
3f7c398e 3172 SvPVX_const(sv));});
653099ff 3173 }
a0d0e21e
LW
3174 }
3175
a0d0e21e 3176 r->minlen = minlen;
b81d288d 3177 if (RExC_seen & REG_SEEN_GPOS)
c277df42 3178 r->reganch |= ROPT_GPOS_SEEN;
830247a4 3179 if (RExC_seen & REG_SEEN_LOOKBEHIND)
c277df42 3180 r->reganch |= ROPT_LOOKBEHIND_SEEN;
830247a4 3181 if (RExC_seen & REG_SEEN_EVAL)
ce862d02 3182 r->reganch |= ROPT_EVAL_SEEN;
f33976b4
DB
3183 if (RExC_seen & REG_SEEN_CANY)
3184 r->reganch |= ROPT_CANY_SEEN;
a02a5408
JC
3185 Newxz(r->startp, RExC_npar, I32);
3186 Newxz(r->endp, RExC_npar, I32);
a3621e74 3187 DEBUG_COMPILE_r(regdump(r));
a0d0e21e 3188 return(r);
a687059c
LW
3189}
3190
3191/*
3192 - reg - regular expression, i.e. main body or parenthesized thing
3193 *
3194 * Caller must absorb opening parenthesis.
3195 *
3196 * Combining parenthesis handling with the base level of regular expression
3197 * is a trifle forced, but the need to tie the tails of the branches to what
3198 * follows makes it hard to avoid.
3199 */
76e3520e 3200STATIC regnode *
830247a4 3201S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp)
c277df42 3202 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
a687059c 3203{
27da23d5 3204 dVAR;
c277df42
IZ
3205 register regnode *ret; /* Will be the head of the group. */
3206 register regnode *br;
3207 register regnode *lastbr;
cbbf8932 3208 register regnode *ender = NULL;
a0d0e21e 3209 register I32 parno = 0;
cbbf8932
AL
3210 I32 flags;
3211 const I32 oregflags = RExC_flags;
6136c704
AL
3212 bool have_branch = 0;
3213 bool is_open = 0;
9d1d55b5
JP
3214
3215 /* for (?g), (?gc), and (?o) warnings; warning
3216 about (?c) will warn about (?g) -- japhy */
3217
6136c704
AL
3218#define WASTED_O 0x01
3219#define WASTED_G 0x02
3220#define WASTED_C 0x04
3221#define WASTED_GC (0x02|0x04)
cbbf8932 3222 I32 wastedflags = 0x00;
9d1d55b5 3223
fac92740 3224 char * parse_start = RExC_parse; /* MJD */
a28509cc 3225 char * const oregcomp_parse = RExC_parse;
a0d0e21e 3226
821b33a5 3227 *flagp = 0; /* Tentatively. */
a0d0e21e 3228
9d1d55b5 3229
a0d0e21e
LW
3230 /* Make an OPEN node, if parenthesized. */
3231 if (paren) {
fac92740 3232 if (*RExC_parse == '?') { /* (?...) */
2b36a5a0
JH
3233 U32 posflags = 0, negflags = 0;
3234 U32 *flagsp = &posflags;
6136c704 3235 bool is_logical = 0;
a28509cc 3236 const char * const seqstart = RExC_parse;
ca9dfc88 3237
830247a4
IZ
3238 RExC_parse++;
3239 paren = *RExC_parse++;
c277df42 3240 ret = NULL; /* For look-ahead/behind. */
a0d0e21e 3241 switch (paren) {
fac92740 3242 case '<': /* (?<...) */
830247a4 3243 RExC_seen |= REG_SEEN_LOOKBEHIND;
b81d288d 3244 if (*RExC_parse == '!')
c277df42 3245 paren = ',';
b81d288d 3246 if (*RExC_parse != '=' && *RExC_parse != '!')
c277df42 3247 goto unknown;
830247a4 3248 RExC_parse++;
fac92740
MJD
3249 case '=': /* (?=...) */
3250 case '!': /* (?!...) */
830247a4 3251 RExC_seen_zerolen++;
fac92740
MJD
3252 case ':': /* (?:...) */
3253 case '>': /* (?>...) */
a0d0e21e 3254 break;
fac92740
MJD
3255 case '$': /* (?$...) */
3256 case '@': /* (?@...) */
8615cb43 3257 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
a0d0e21e 3258 break;
fac92740 3259 case '#': /* (?#...) */
830247a4
IZ
3260 while (*RExC_parse && *RExC_parse != ')')
3261 RExC_parse++;
3262 if (*RExC_parse != ')')
c277df42 3263 FAIL("Sequence (?#... not terminated");
830247a4 3264 nextchar(pRExC_state);
a0d0e21e
LW
3265 *flagp = TRYAGAIN;
3266 return NULL;
fac92740 3267 case 'p': /* (?p...) */
9014280d 3268 if (SIZE_ONLY && ckWARN2(WARN_DEPRECATED, WARN_REGEXP))
f9373011 3269 vWARNdep(RExC_parse, "(?p{}) is deprecated - use (??{})");
8c8ad484 3270 /* FALL THROUGH*/
fac92740 3271 case '?': /* (??...) */
6136c704 3272 is_logical = 1;
438a3801
YST
3273 if (*RExC_parse != '{')
3274 goto unknown;
830247a4 3275 paren = *RExC_parse++;
0f5d15d6 3276 /* FALL THROUGH */
fac92740 3277 case '{': /* (?{...}) */
c277df42 3278 {
c277df42
IZ
3279 I32 count = 1, n = 0;
3280 char c;
830247a4 3281 char *s = RExC_parse;
c277df42 3282
830247a4
IZ
3283 RExC_seen_zerolen++;
3284 RExC_seen |= REG_SEEN_EVAL;
3285 while (count && (c = *RExC_parse)) {
6136c704
AL
3286 if (c == '\\') {
3287 if (RExC_parse[1])
3288 RExC_parse++;
3289 }
b81d288d 3290 else if (c == '{')
c277df42 3291 count++;
b81d288d 3292 else if (c == '}')
c277df42 3293 count--;
830247a4 3294 RExC_parse++;
c277df42 3295 }
6136c704 3296 if (*RExC_parse != ')') {
b81d288d 3297 RExC_parse = s;
b45f050a
JF
3298 vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
3299 }
c277df42 3300 if (!SIZE_ONLY) {
f3548bdc 3301 PAD *pad;
6136c704
AL
3302 OP_4tree *sop, *rop;
3303 SV * const sv = newSVpvn(s, RExC_parse - 1 - s);
c277df42 3304
569233ed
SB
3305 ENTER;
3306 Perl_save_re_context(aTHX);
f3548bdc 3307 rop = sv_compile_2op(sv, &sop, "re", &pad);
9b978d73
DM
3308 sop->op_private |= OPpREFCOUNTED;
3309 /* re_dup will OpREFCNT_inc */
3310 OpREFCNT_set(sop, 1);
569233ed 3311 LEAVE;
c277df42 3312
830247a4
IZ
3313 n = add_data(pRExC_state, 3, "nop");
3314 RExC_rx->data->data[n] = (void*)rop;
3315 RExC_rx->data->data[n+1] = (void*)sop;
f3548bdc 3316 RExC_rx->data->data[n+2] = (void*)pad;
c277df42 3317 SvREFCNT_dec(sv);
a0ed51b3 3318 }
e24b16f9 3319 else { /* First pass */
830247a4 3320 if (PL_reginterp_cnt < ++RExC_seen_evals
923e4eb5 3321 && IN_PERL_RUNTIME)
2cd61cdb
IZ
3322 /* No compiled RE interpolated, has runtime
3323 components ===> unsafe. */
3324 FAIL("Eval-group not allowed at runtime, use re 'eval'");
5b61d3f7 3325 if (PL_tainting && PL_tainted)
cc6b7395 3326 FAIL("Eval-group in insecure regular expression");
923e4eb5 3327 if (IN_PERL_COMPILETIME)
b5c19bd7 3328 PL_cv_has_eval = 1;
c277df42 3329 }
b5c19bd7 3330
830247a4 3331 nextchar(pRExC_state);
6136c704 3332 if (is_logical) {
830247a4 3333 ret = reg_node(pRExC_state, LOGICAL);
0f5d15d6
IZ
3334 if (!SIZE_ONLY)
3335 ret->flags = 2;
830247a4 3336 regtail(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
fac92740 3337 /* deal with the length of this later - MJD */
0f5d15d6
IZ
3338 return ret;
3339 }
ccb2c380
MP
3340 ret = reganode(pRExC_state, EVAL, n);
3341 Set_Node_Length(ret, RExC_parse - parse_start + 1);
3342 Set_Node_Offset(ret, parse_start);
3343 return ret;
c277df42 3344 }
fac92740 3345 case '(': /* (?(?{...})...) and (?(?=...)...) */
c277df42 3346 {
fac92740 3347 if (RExC_parse[0] == '?') { /* (?(?...)) */
b81d288d
AB
3348 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
3349 || RExC_parse[1] == '<'
830247a4 3350 || RExC_parse[1] == '{') { /* Lookahead or eval. */
c277df42
IZ
3351 I32 flag;
3352
830247a4 3353 ret = reg_node(pRExC_state, LOGICAL);
0f5d15d6
IZ
3354 if (!SIZE_ONLY)
3355 ret->flags = 1;
830247a4 3356 regtail(pRExC_state, ret, reg(pRExC_state, 1, &flag));
c277df42 3357 goto insert_if;
b81d288d 3358 }
a0ed51b3 3359 }
830247a4 3360 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
fac92740 3361 /* (?(1)...) */
6136c704 3362 char c;
830247a4 3363 parno = atoi(RExC_parse++);
c277df42 3364
830247a4
IZ
3365 while (isDIGIT(*RExC_parse))
3366 RExC_parse++;
fac92740 3367 ret = reganode(pRExC_state, GROUPP, parno);
2af232bd 3368
830247a4 3369 if ((c = *nextchar(pRExC_state)) != ')')
b45f050a 3370 vFAIL("Switch condition not recognized");
c277df42 3371 insert_if:
830247a4
IZ
3372 regtail(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
3373 br = regbranch(pRExC_state, &flags, 1);
c277df42 3374 if (br == NULL)
830247a4 3375 br = reganode(pRExC_state, LONGJMP, 0);
c277df42 3376 else
830247a4
IZ
3377 regtail(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
3378 c = *nextchar(pRExC_state);
d1b80229
IZ
3379 if (flags&HASWIDTH)
3380 *flagp |= HASWIDTH;
c277df42 3381 if (c == '|') {
830247a4
IZ
3382 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
3383 regbranch(pRExC_state, &flags, 1);
3384 regtail(pRExC_state, ret, lastbr);
d1b80229
IZ
3385 if (flags&HASWIDTH)
3386 *flagp |= HASWIDTH;
830247a4 3387 c = *nextchar(pRExC_state);
a0ed51b3
LW
3388 }
3389 else
c277df42
IZ
3390 lastbr = NULL;
3391 if (c != ')')
8615cb43 3392 vFAIL("Switch (?(condition)... contains too many branches");
830247a4
IZ
3393 ender = reg_node(pRExC_state, TAIL);
3394 regtail(pRExC_state, br, ender);
c277df42 3395 if (lastbr) {
830247a4
IZ
3396 regtail(pRExC_state, lastbr, ender);
3397 regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
a0ed51b3
LW
3398 }
3399 else
830247a4 3400 regtail(pRExC_state, ret, ender);
c277df42 3401 return ret;
a0ed51b3
LW
3402 }
3403 else {
830247a4 3404 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
c277df42
IZ
3405 }
3406 }
1b1626e4 3407 case 0:
830247a4 3408 RExC_parse--; /* for vFAIL to print correctly */
8615cb43 3409 vFAIL("Sequence (? incomplete");
1b1626e4 3410 break;
a0d0e21e 3411 default:
830247a4 3412 --RExC_parse;
fac92740 3413 parse_flags: /* (?i) */
830247a4 3414 while (*RExC_parse && strchr("iogcmsx", *RExC_parse)) {
9d1d55b5
JP
3415 /* (?g), (?gc) and (?o) are useless here
3416 and must be globally applied -- japhy */
3417
3418 if (*RExC_parse == 'o' || *RExC_parse == 'g') {
3419 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6136c704 3420 const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G;
9d1d55b5
JP
3421 if (! (wastedflags & wflagbit) ) {
3422 wastedflags |= wflagbit;
3423 vWARN5(
3424 RExC_parse + 1,
3425 "Useless (%s%c) - %suse /%c modifier",
3426 flagsp == &negflags ? "?-" : "?",
3427 *RExC_parse,
3428 flagsp == &negflags ? "don't " : "",
3429 *RExC_parse
3430 );
3431 }
3432 }
3433 }
3434 else if (*RExC_parse == 'c') {
3435 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6136c704
AL
3436 if (! (wastedflags & WASTED_C) ) {
3437 wastedflags |= WASTED_GC;
9d1d55b5
JP
3438 vWARN3(
3439 RExC_parse + 1,
3440 "Useless (%sc) - %suse /gc modifier",
3441 flagsp == &negflags ? "?-" : "?",
3442 flagsp == &negflags ? "don't " : ""
3443 );
3444 }
3445 }
3446 }
3447 else { pmflag(flagsp, *RExC_parse); }
3448
830247a4 3449 ++RExC_parse;
ca9dfc88 3450 }
830247a4 3451 if (*RExC_parse == '-') {
ca9dfc88 3452 flagsp = &negflags;
9d1d55b5 3453 wastedflags = 0; /* reset so (?g-c) warns twice */
830247a4 3454 ++RExC_parse;
ca9dfc88 3455 goto parse_flags;
48c036b1 3456 }
e2509266
JH
3457 RExC_flags |= posflags;
3458 RExC_flags &= ~negflags;
830247a4
IZ
3459 if (*RExC_parse == ':') {
3460 RExC_parse++;
ca9dfc88
IZ
3461 paren = ':';
3462 break;
3463 }
c277df42 3464 unknown:
830247a4
IZ
3465 if (*RExC_parse != ')') {
3466 RExC_parse++;
3467 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
b45f050a 3468 }
830247a4 3469 nextchar(pRExC_state);
a0d0e21e
LW
3470 *flagp = TRYAGAIN;
3471 return NULL;
3472 }
3473 }
fac92740 3474 else { /* (...) */
830247a4
IZ
3475 parno = RExC_npar;
3476 RExC_npar++;
3477 ret = reganode(pRExC_state, OPEN, parno);
fac92740
MJD
3478 Set_Node_Length(ret, 1); /* MJD */
3479 Set_Node_Offset(ret, RExC_parse); /* MJD */
6136c704 3480 is_open = 1;
a0d0e21e 3481 }
a0ed51b3 3482 }
fac92740 3483 else /* ! paren */
a0d0e21e
LW
3484 ret = NULL;
3485
3486 /* Pick up the branches, linking them together. */
fac92740 3487 parse_start = RExC_parse; /* MJD */
830247a4 3488 br = regbranch(pRExC_state, &flags, 1);
fac92740 3489 /* branch_len = (paren != 0); */
2af232bd 3490
a0d0e21e
LW
3491 if (br == NULL)
3492 return(NULL);
830247a4
IZ
3493 if (*RExC_parse == '|') {
3494 if (!SIZE_ONLY && RExC_extralen) {
3495 reginsert(pRExC_state, BRANCHJ, br);
a0ed51b3 3496 }
fac92740 3497 else { /* MJD */
830247a4 3498 reginsert(pRExC_state, BRANCH, br);
fac92740
MJD
3499 Set_Node_Length(br, paren != 0);
3500 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
3501 }
c277df42
IZ
3502 have_branch = 1;
3503 if (SIZE_ONLY)
830247a4 3504 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
a0ed51b3
LW
3505 }
3506 else if (paren == ':') {
c277df42
IZ
3507 *flagp |= flags&SIMPLE;
3508 }
6136c704 3509 if (is_open) { /* Starts with OPEN. */
830247a4 3510 regtail(pRExC_state, ret, br); /* OPEN -> first. */
a0ed51b3
LW
3511 }
3512 else if (paren != '?') /* Not Conditional */
a0d0e21e 3513 ret = br;
32a0ca98 3514 *flagp |= flags & (SPSTART | HASWIDTH);
c277df42 3515 lastbr = br;
830247a4
IZ
3516 while (*RExC_parse == '|') {
3517 if (!SIZE_ONLY && RExC_extralen) {
3518 ender = reganode(pRExC_state, LONGJMP,0);
3519 regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
c277df42
IZ
3520 }
3521 if (SIZE_ONLY)
830247a4
IZ
3522 RExC_extralen += 2; /* Account for LONGJMP. */
3523 nextchar(pRExC_state);
3524 br = regbranch(pRExC_state, &flags, 0);
2af232bd 3525
a687059c 3526 if (br == NULL)
a0d0e21e 3527 return(NULL);
830247a4 3528 regtail(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
c277df42 3529 lastbr = br;
821b33a5
IZ
3530 if (flags&HASWIDTH)
3531 *flagp |= HASWIDTH;
a687059c 3532 *flagp |= flags&SPSTART;
a0d0e21e
LW
3533 }
3534
c277df42
IZ
3535 if (have_branch || paren != ':') {
3536 /* Make a closing node, and hook it on the end. */
3537 switch (paren) {
3538 case ':':
830247a4 3539 ender = reg_node(pRExC_state, TAIL);
c277df42
IZ
3540 break;
3541 case 1:
830247a4 3542 ender = reganode(pRExC_state, CLOSE, parno);
fac92740
MJD
3543 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
3544 Set_Node_Length(ender,1); /* MJD */
c277df42
IZ
3545 break;
3546 case '<':
c277df42
IZ
3547 case ',':
3548 case '=':
3549 case '!':
c277df42 3550 *flagp &= ~HASWIDTH;
821b33a5
IZ
3551 /* FALL THROUGH */
3552 case '>':
830247a4 3553 ender = reg_node(pRExC_state, SUCCEED);
c277df42
IZ
3554 break;
3555 case 0:
830247a4 3556 ender = reg_node(pRExC_state, END);
c277df42
IZ
3557 break;
3558 }
830247a4 3559 regtail(pRExC_state, lastbr, ender);
a0d0e21e 3560
c277df42
IZ
3561 if (have_branch) {
3562 /* Hook the tails of the branches to the closing node. */
3563 for (br = ret; br != NULL; br = regnext(br)) {
830247a4 3564 regoptail(pRExC_state, br, ender);
c277df42
IZ
3565 }
3566 }
a0d0e21e 3567 }
c277df42
IZ
3568
3569 {
e1ec3a88
AL
3570 const char *p;
3571 static const char parens[] = "=!<,>";
c277df42
IZ
3572
3573 if (paren && (p = strchr(parens, paren))) {
eb160463 3574 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
c277df42
IZ
3575 int flag = (p - parens) > 1;
3576
3577 if (paren == '>')
3578 node = SUSPEND, flag = 0;
830247a4 3579 reginsert(pRExC_state, node,ret);
45948336
EP
3580 Set_Node_Cur_Length(ret);
3581 Set_Node_Offset(ret, parse_start + 1);
c277df42 3582 ret->flags = flag;
830247a4 3583 regtail(pRExC_state, ret, reg_node(pRExC_state, TAIL));
c277df42 3584 }
a0d0e21e
LW
3585 }
3586
3587 /* Check for proper termination. */
ce3e6498 3588 if (paren) {
e2509266 3589 RExC_flags = oregflags;
830247a4
IZ
3590 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
3591 RExC_parse = oregcomp_parse;
380a0633 3592 vFAIL("Unmatched (");
ce3e6498 3593 }
a0ed51b3 3594 }
830247a4
IZ
3595 else if (!paren && RExC_parse < RExC_end) {
3596 if (*RExC_parse == ')') {
3597 RExC_parse++;
380a0633 3598 vFAIL("Unmatched )");
a0ed51b3
LW
3599 }
3600 else
b45f050a 3601 FAIL("Junk on end of regexp"); /* "Can't happen". */
a0d0e21e
LW
3602 /* NOTREACHED */
3603 }
a687059c 3604
a0d0e21e 3605 return(ret);
a687059c
LW
3606}
3607
3608/*
3609 - regbranch - one alternative of an | operator
3610 *
3611 * Implements the concatenation operator.
3612 */
76e3520e 3613STATIC regnode *
830247a4 3614S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first)
a687059c 3615{
97aff369 3616 dVAR;
c277df42
IZ
3617 register regnode *ret;
3618 register regnode *chain = NULL;
3619 register regnode *latest;
3620 I32 flags = 0, c = 0;
a0d0e21e 3621
b81d288d 3622 if (first)
c277df42
IZ
3623 ret = NULL;
3624 else {
b81d288d 3625 if (!SIZE_ONLY && RExC_extralen)
830247a4 3626 ret = reganode(pRExC_state, BRANCHJ,0);
fac92740 3627 else {
830247a4 3628 ret = reg_node(pRExC_state, BRANCH);
fac92740
MJD
3629 Set_Node_Length(ret, 1);
3630 }
c277df42
IZ
3631 }
3632
b81d288d 3633 if (!first && SIZE_ONLY)
830247a4 3634 RExC_extralen += 1; /* BRANCHJ */
b81d288d 3635
c277df42 3636 *flagp = WORST; /* Tentatively. */
a0d0e21e 3637
830247a4
IZ
3638 RExC_parse--;
3639 nextchar(pRExC_state);
3640 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
a0d0e21e 3641 flags &= ~TRYAGAIN;
830247a4 3642 latest = regpiece(pRExC_state, &flags);
a0d0e21e
LW
3643 if (latest == NULL) {
3644 if (flags & TRYAGAIN)
3645 continue;
3646 return(NULL);
a0ed51b3
LW
3647 }
3648 else if (ret == NULL)
c277df42 3649 ret = latest;
a0d0e21e 3650 *flagp |= flags&HASWIDTH;
c277df42 3651 if (chain == NULL) /* First piece. */
a0d0e21e
LW
3652 *flagp |= flags&SPSTART;
3653 else {
830247a4
IZ
3654 RExC_naughty++;
3655 regtail(pRExC_state, chain, latest);
a687059c 3656 }
a0d0e21e 3657 chain = latest;
c277df42
IZ
3658 c++;
3659 }
3660 if (chain == NULL) { /* Loop ran zero times. */
830247a4 3661 chain = reg_node(pRExC_state, NOTHING);
c277df42
IZ
3662 if (ret == NULL)
3663 ret = chain;
3664 }
3665 if (c == 1) {
3666 *flagp |= flags&SIMPLE;
a0d0e21e 3667 }
a687059c 3668
d4c19fe8 3669 return ret;
a687059c
LW
3670}
3671
3672/*
3673 - regpiece - something followed by possible [*+?]
3674 *
3675 * Note that the branching code sequences used for ? and the general cases
3676 * of * and + are somewhat optimized: they use the same NOTHING node as
3677 * both the endmarker for their branch list and the body of the last branch.
3678 * It might seem that this node could be dispensed with entirely, but the
3679 * endmarker role is not redundant.
3680 */
76e3520e 3681STATIC regnode *
830247a4 3682S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
a687059c 3683{
97aff369 3684 dVAR;
c277df42 3685 register regnode *ret;
a0d0e21e
LW
3686 register char op;
3687 register char *next;
3688 I32 flags;
1df70142 3689 const char * const origparse = RExC_parse;
a0d0e21e
LW
3690 char *maxpos;
3691 I32 min;
c277df42 3692 I32 max = REG_INFTY;
fac92740 3693 char *parse_start;
a0d0e21e 3694
830247a4 3695 ret = regatom(pRExC_state, &flags);
a0d0e21e
LW
3696 if (ret == NULL) {
3697 if (flags & TRYAGAIN)
3698 *flagp |= TRYAGAIN;
3699 return(NULL);
3700 }
3701
830247a4 3702 op = *RExC_parse;
a0d0e21e 3703
830247a4 3704 if (op == '{' && regcurly(RExC_parse)) {
fac92740 3705 parse_start = RExC_parse; /* MJD */
830247a4 3706 next = RExC_parse + 1;
c445ea15 3707 maxpos = NULL;
a0d0e21e
LW
3708 while (isDIGIT(*next) || *next == ',') {
3709 if (*next == ',') {
3710 if (maxpos)
3711 break;
3712 else
3713 maxpos = next;
a687059c 3714 }
a0d0e21e
LW
3715 next++;
3716 }
3717 if (*next == '}') { /* got one */
3718 if (!maxpos)
3719 maxpos = next;
830247a4
IZ
3720 RExC_parse++;
3721 min = atoi(RExC_parse);
a0d0e21e
LW
3722 if (*maxpos == ',')
3723 maxpos++;
3724 else
830247a4 3725 maxpos = RExC_parse;
a0d0e21e
LW
3726 max = atoi(maxpos);
3727 if (!max && *maxpos != '0')
c277df42
IZ
3728 max = REG_INFTY; /* meaning "infinity" */
3729 else if (max >= REG_INFTY)
8615cb43 3730 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
830247a4
IZ
3731 RExC_parse = next;
3732 nextchar(pRExC_state);
a0d0e21e
LW
3733
3734 do_curly:
3735 if ((flags&SIMPLE)) {
830247a4
IZ
3736 RExC_naughty += 2 + RExC_naughty / 2;
3737 reginsert(pRExC_state, CURLY, ret);
fac92740
MJD
3738 Set_Node_Offset(ret, parse_start+1); /* MJD */
3739 Set_Node_Cur_Length(ret);
a0d0e21e
LW
3740 }
3741 else {
830247a4 3742 regnode *w = reg_node(pRExC_state, WHILEM);
2c2d71f5
JH
3743
3744 w->flags = 0;
830247a4
IZ
3745 regtail(pRExC_state, ret, w);
3746 if (!SIZE_ONLY && RExC_extralen) {
3747 reginsert(pRExC_state, LONGJMP,ret);
3748 reginsert(pRExC_state, NOTHING,ret);
c277df42
IZ
3749 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
3750 }
830247a4 3751 reginsert(pRExC_state, CURLYX,ret);
fac92740
MJD
3752 /* MJD hk */
3753 Set_Node_Offset(ret, parse_start+1);
2af232bd 3754 Set_Node_Length(ret,
fac92740 3755 op == '{' ? (RExC_parse - parse_start) : 1);
2af232bd 3756
830247a4 3757 if (!SIZE_ONLY && RExC_extralen)
c277df42 3758 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
830247a4 3759 regtail(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
c277df42 3760 if (SIZE_ONLY)
830247a4
IZ
3761 RExC_whilem_seen++, RExC_extralen += 3;
3762 RExC_naughty += 4 + RExC_naughty; /* compound interest */
a0d0e21e 3763 }
c277df42 3764 ret->flags = 0;
a0d0e21e
LW
3765
3766 if (min > 0)
821b33a5
IZ
3767 *flagp = WORST;
3768 if (max > 0)
3769 *flagp |= HASWIDTH;
a0d0e21e 3770 if (max && max < min)
8615cb43 3771 vFAIL("Can't do {n,m} with n > m");
c277df42 3772 if (!SIZE_ONLY) {
eb160463
GS
3773 ARG1_SET(ret, (U16)min);
3774 ARG2_SET(ret, (U16)max);
a687059c 3775 }
a687059c 3776
a0d0e21e 3777 goto nest_check;
a687059c 3778 }
a0d0e21e 3779 }
a687059c 3780
a0d0e21e
LW
3781 if (!ISMULT1(op)) {
3782 *flagp = flags;
a687059c 3783 return(ret);
a0d0e21e 3784 }
bb20fd44 3785
c277df42 3786#if 0 /* Now runtime fix should be reliable. */
b45f050a
JF
3787
3788 /* if this is reinstated, don't forget to put this back into perldiag:
3789
3790 =item Regexp *+ operand could be empty at {#} in regex m/%s/
3791
3792 (F) The part of the regexp subject to either the * or + quantifier
3793 could match an empty string. The {#} shows in the regular
3794 expression about where the problem was discovered.
3795
3796 */
3797
bb20fd44 3798 if (!(flags&HASWIDTH) && op != '?')
b45f050a 3799 vFAIL("Regexp *+ operand could be empty");
b81d288d 3800#endif
bb20fd44 3801
fac92740 3802 parse_start = RExC_parse;
830247a4 3803 nextchar(pRExC_state);
a0d0e21e 3804
821b33a5 3805 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
a0d0e21e
LW
3806
3807 if (op == '*' && (flags&SIMPLE)) {
830247a4 3808 reginsert(pRExC_state, STAR, ret);
c277df42 3809 ret->flags = 0;
830247a4 3810 RExC_naughty += 4;
a0d0e21e
LW
3811 }
3812 else if (op == '*') {
3813 min = 0;
3814 goto do_curly;
a0ed51b3
LW
3815 }
3816 else if (op == '+' && (flags&SIMPLE)) {
830247a4 3817 reginsert(pRExC_state, PLUS, ret);
c277df42 3818 ret->flags = 0;
830247a4 3819 RExC_naughty += 3;
a0d0e21e
LW
3820 }
3821 else if (op == '+') {
3822 min = 1;
3823 goto do_curly;
a0ed51b3
LW
3824 }
3825 else if (op == '?') {
a0d0e21e
LW
3826 min = 0; max = 1;
3827 goto do_curly;
3828 }
3829 nest_check:
041457d9 3830 if (!SIZE_ONLY && !(flags&HASWIDTH) && max > REG_INFTY/3 && ckWARN(WARN_REGEXP)) {
830247a4 3831 vWARN3(RExC_parse,
b45f050a 3832 "%.*s matches null string many times",
afd78fd5 3833 (int)(RExC_parse >= origparse ? RExC_parse - origparse : 0),
b45f050a 3834 origparse);
a0d0e21e
LW
3835 }
3836
830247a4
IZ
3837 if (*RExC_parse == '?') {
3838 nextchar(pRExC_state);
3839 reginsert(pRExC_state, MINMOD, ret);
3840 regtail(pRExC_state, ret, ret + NODE_STEP_REGNODE);
a0d0e21e 3841 }
830247a4
IZ
3842 if (ISMULT2(RExC_parse)) {
3843 RExC_parse++;
b45f050a
JF
3844 vFAIL("Nested quantifiers");
3845 }
a0d0e21e
LW
3846
3847 return(ret);
a687059c
LW
3848}
3849
3850/*
3851 - regatom - the lowest level
3852 *
3853 * Optimization: gobbles an entire sequence of ordinary characters so that
3854 * it can turn them into a single node, which is smaller to store and
3855 * faster to run. Backslashed characters are exceptions, each becoming a
3856 * separate node; the code is simpler that way and it's not worth fixing.
3857 *
b45f050a 3858 * [Yes, it is worth fixing, some scripts can run twice the speed.] */
76e3520e 3859STATIC regnode *
830247a4 3860S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
a687059c 3861{
97aff369 3862 dVAR;
cbbf8932 3863 register regnode *ret = NULL;
a0d0e21e 3864 I32 flags;
45948336 3865 char *parse_start = RExC_parse;
a0d0e21e
LW
3866
3867 *flagp = WORST; /* Tentatively. */
3868
3869tryagain:
830247a4 3870 switch (*RExC_parse) {
a0d0e21e 3871 case '^':
830247a4
IZ
3872 RExC_seen_zerolen++;
3873 nextchar(pRExC_state);
e2509266 3874 if (RExC_flags & PMf_MULTILINE)
830247a4 3875 ret = reg_node(pRExC_state, MBOL);
e2509266 3876 else if (RExC_flags & PMf_SINGLELINE)
830247a4 3877 ret = reg_node(pRExC_state, SBOL);
a0d0e21e 3878 else
830247a4 3879 ret = reg_node(pRExC_state, BOL);
fac92740 3880 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
3881 break;
3882 case '$':
830247a4 3883 nextchar(pRExC_state);
b81d288d 3884 if (*RExC_parse)
830247a4 3885 RExC_seen_zerolen++;
e2509266 3886 if (RExC_flags & PMf_MULTILINE)
830247a4 3887 ret = reg_node(pRExC_state, MEOL);
e2509266 3888 else if (RExC_flags & PMf_SINGLELINE)
830247a4 3889 ret = reg_node(pRExC_state, SEOL);
a0d0e21e 3890 else
830247a4 3891 ret = reg_node(pRExC_state, EOL);
fac92740 3892 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
3893 break;
3894 case '.':
830247a4 3895 nextchar(pRExC_state);
e2509266 3896 if (RExC_flags & PMf_SINGLELINE)
ffc61ed2
JH
3897 ret = reg_node(pRExC_state, SANY);
3898 else
3899 ret = reg_node(pRExC_state, REG_ANY);
3900 *flagp |= HASWIDTH|SIMPLE;
830247a4 3901 RExC_naughty++;
fac92740 3902 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
3903 break;
3904 case '[':
b45f050a 3905 {
830247a4 3906 char *oregcomp_parse = ++RExC_parse;
ffc61ed2 3907 ret = regclass(pRExC_state);
830247a4
IZ
3908 if (*RExC_parse != ']') {
3909 RExC_parse = oregcomp_parse;
b45f050a
JF
3910 vFAIL("Unmatched [");
3911 }
830247a4 3912 nextchar(pRExC_state);
a0d0e21e 3913 *flagp |= HASWIDTH|SIMPLE;
fac92740 3914 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
a0d0e21e 3915 break;
b45f050a 3916 }
a0d0e21e 3917 case '(':
830247a4
IZ
3918 nextchar(pRExC_state);
3919 ret = reg(pRExC_state, 1, &flags);
a0d0e21e 3920 if (ret == NULL) {
bf93d4cc 3921 if (flags & TRYAGAIN) {
830247a4 3922 if (RExC_parse == RExC_end) {
bf93d4cc
GS
3923 /* Make parent create an empty node if needed. */
3924 *flagp |= TRYAGAIN;
3925 return(NULL);
3926 }
a0d0e21e 3927 goto tryagain;
bf93d4cc 3928 }
a0d0e21e
LW
3929 return(NULL);
3930 }
c277df42 3931 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE);
a0d0e21e
LW
3932 break;
3933 case '|':
3934 case ')':
3935 if (flags & TRYAGAIN) {
3936 *flagp |= TRYAGAIN;
3937 return NULL;
3938 }
b45f050a 3939 vFAIL("Internal urp");
a0d0e21e
LW
3940 /* Supposed to be caught earlier. */
3941 break;
85afd4ae 3942 case '{':
830247a4
IZ
3943 if (!regcurly(RExC_parse)) {
3944 RExC_parse++;
85afd4ae
CS
3945 goto defchar;
3946 }
3947 /* FALL THROUGH */
a0d0e21e
LW
3948 case '?':
3949 case '+':
3950 case '*':
830247a4 3951 RExC_parse++;
b45f050a 3952 vFAIL("Quantifier follows nothing");
a0d0e21e
LW
3953 break;
3954 case '\\':
830247a4 3955 switch (*++RExC_parse) {
a0d0e21e 3956 case 'A':
830247a4
IZ
3957 RExC_seen_zerolen++;
3958 ret = reg_node(pRExC_state, SBOL);
a0d0e21e 3959 *flagp |= SIMPLE;
830247a4 3960 nextchar(pRExC_state);
fac92740 3961 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
3962 break;
3963 case 'G':
830247a4
IZ
3964 ret = reg_node(pRExC_state, GPOS);
3965 RExC_seen |= REG_SEEN_GPOS;
a0d0e21e 3966 *flagp |= SIMPLE;
830247a4 3967 nextchar(pRExC_state);
fac92740 3968 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
3969 break;
3970 case 'Z':
830247a4 3971 ret = reg_node(pRExC_state, SEOL);
a0d0e21e 3972 *flagp |= SIMPLE;
a1917ab9 3973 RExC_seen_zerolen++; /* Do not optimize RE away */
830247a4 3974 nextchar(pRExC_state);
a0d0e21e 3975 break;
b85d18e9 3976 case 'z':
830247a4 3977 ret = reg_node(pRExC_state, EOS);
b85d18e9 3978 *flagp |= SIMPLE;
830247a4
IZ
3979 RExC_seen_zerolen++; /* Do not optimize RE away */
3980 nextchar(pRExC_state);
fac92740 3981 Set_Node_Length(ret, 2); /* MJD */
b85d18e9 3982 break;
4a2d328f 3983 case 'C':
f33976b4
DB
3984 ret = reg_node(pRExC_state, CANY);
3985 RExC_seen |= REG_SEEN_CANY;
a0ed51b3 3986 *flagp |= HASWIDTH|SIMPLE;
830247a4 3987 nextchar(pRExC_state);
fac92740 3988 Set_Node_Length(ret, 2); /* MJD */
a0ed51b3
LW
3989 break;
3990 case 'X':
830247a4 3991 ret = reg_node(pRExC_state, CLUMP);
a0ed51b3 3992 *flagp |= HASWIDTH;
830247a4 3993 nextchar(pRExC_state);
fac92740 3994 Set_Node_Length(ret, 2); /* MJD */
a0ed51b3 3995 break;
a0d0e21e 3996 case 'w':
eb160463 3997 ret = reg_node(pRExC_state, (U8)(LOC ? ALNUML : ALNUM));
a0d0e21e 3998 *flagp |= HASWIDTH|SIMPLE;
830247a4 3999 nextchar(pRExC_state);
fac92740 4000 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
4001 break;
4002 case 'W':
eb160463 4003 ret = reg_node(pRExC_state, (U8)(LOC ? NALNUML : NALNUM));
a0d0e21e 4004 *flagp |= HASWIDTH|SIMPLE;
830247a4 4005 nextchar(pRExC_state);
fac92740 4006 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
4007 break;
4008 case 'b':
830247a4
IZ
4009 RExC_seen_zerolen++;
4010 RExC_seen |= REG_SEEN_LOOKBEHIND;
eb160463 4011 ret = reg_node(pRExC_state, (U8)(LOC ? BOUNDL : BOUND));
a0d0e21e 4012 *flagp |= SIMPLE;
830247a4 4013 nextchar(pRExC_state);
fac92740 4014 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
4015 break;
4016 case 'B':
830247a4
IZ
4017 RExC_seen_zerolen++;
4018 RExC_seen |= REG_SEEN_LOOKBEHIND;
eb160463 4019 ret = reg_node(pRExC_state, (U8)(LOC ? NBOUNDL : NBOUND));
a0d0e21e 4020 *flagp |= SIMPLE;
830247a4 4021 nextchar(pRExC_state);
fac92740 4022 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
4023 break;
4024 case 's':
eb160463 4025 ret = reg_node(pRExC_state, (U8)(LOC ? SPACEL : SPACE));
a0d0e21e 4026 *flagp |= HASWIDTH|SIMPLE;
830247a4 4027 nextchar(pRExC_state);
fac92740 4028 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
4029 break;
4030 case 'S':
eb160463 4031 ret = reg_node(pRExC_state, (U8)(LOC ? NSPACEL : NSPACE));
a0d0e21e 4032 *flagp |= HASWIDTH|SIMPLE;
830247a4 4033 nextchar(pRExC_state);
fac92740 4034 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
4035 break;
4036 case 'd':
ffc61ed2 4037 ret = reg_node(pRExC_state, DIGIT);
a0d0e21e 4038 *flagp |= HASWIDTH|SIMPLE;
830247a4 4039 nextchar(pRExC_state);
fac92740 4040 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
4041 break;
4042 case 'D':
ffc61ed2 4043 ret = reg_node(pRExC_state, NDIGIT);
a0d0e21e 4044 *flagp |= HASWIDTH|SIMPLE;
830247a4 4045 nextchar(pRExC_state);
fac92740 4046 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e 4047 break;
a14b48bc
LW
4048 case 'p':
4049 case 'P':
3568d838 4050 {
830247a4 4051 char* oldregxend = RExC_end;
ccb2c380 4052 char* parse_start = RExC_parse - 2;
a14b48bc 4053
830247a4 4054 if (RExC_parse[1] == '{') {
3568d838 4055 /* a lovely hack--pretend we saw [\pX] instead */
830247a4
IZ
4056 RExC_end = strchr(RExC_parse, '}');
4057 if (!RExC_end) {
0da60cf5 4058 U8 c = (U8)*RExC_parse;
830247a4
IZ
4059 RExC_parse += 2;
4060 RExC_end = oldregxend;
0da60cf5 4061 vFAIL2("Missing right brace on \\%c{}", c);
b45f050a 4062 }
830247a4 4063 RExC_end++;
a14b48bc 4064 }
af6f566e 4065 else {
830247a4 4066 RExC_end = RExC_parse + 2;
af6f566e
HS
4067 if (RExC_end > oldregxend)
4068 RExC_end = oldregxend;
4069 }
830247a4 4070 RExC_parse--;
a14b48bc 4071
ffc61ed2 4072 ret = regclass(pRExC_state);
a14b48bc 4073
830247a4
IZ
4074 RExC_end = oldregxend;
4075 RExC_parse--;
ccb2c380
MP
4076
4077 Set_Node_Offset(ret, parse_start + 2);
4078 Set_Node_Cur_Length(ret);
830247a4 4079 nextchar(pRExC_state);
a14b48bc
LW
4080 *flagp |= HASWIDTH|SIMPLE;
4081 }
4082 break;
a0d0e21e
LW
4083 case 'n':
4084 case 'r':
4085 case 't':
4086 case 'f':
4087 case 'e':
4088 case 'a':
4089 case 'x':
4090 case 'c':
4091 case '0':
4092 goto defchar;
4093 case '1': case '2': case '3': case '4':
4094 case '5': case '6': case '7': case '8': case '9':
4095 {
1df70142 4096 const I32 num = atoi(RExC_parse);
a0d0e21e 4097
830247a4 4098 if (num > 9 && num >= RExC_npar)
a0d0e21e
LW
4099 goto defchar;
4100 else {
fac92740 4101 char * parse_start = RExC_parse - 1; /* MJD */
830247a4
IZ
4102 while (isDIGIT(*RExC_parse))
4103 RExC_parse++;
b45f050a 4104
eb160463 4105 if (!SIZE_ONLY && num > (I32)RExC_rx->nparens)
9baa0206 4106 vFAIL("Reference to nonexistent group");
830247a4 4107 RExC_sawback = 1;
eb160463
GS
4108 ret = reganode(pRExC_state,
4109 (U8)(FOLD ? (LOC ? REFFL : REFF) : REF),
4110 num);
a0d0e21e 4111 *flagp |= HASWIDTH;
2af232bd 4112
fac92740 4113 /* override incorrect value set in reganode MJD */
2af232bd 4114 Set_Node_Offset(ret, parse_start+1);
fac92740 4115 Set_Node_Cur_Length(ret); /* MJD */
830247a4
IZ
4116 RExC_parse--;
4117 nextchar(pRExC_state);
a0d0e21e
LW
4118 }
4119 }
4120 break;
4121 case '\0':
830247a4 4122 if (RExC_parse >= RExC_end)
b45f050a 4123 FAIL("Trailing \\");
a0d0e21e
LW
4124 /* FALL THROUGH */
4125 default:
a0288114 4126 /* Do not generate "unrecognized" warnings here, we fall
c9f97d15 4127 back into the quick-grab loop below */
45948336 4128 parse_start--;
a0d0e21e
LW
4129 goto defchar;
4130 }
4131 break;
4633a7c4
LW
4132
4133 case '#':
e2509266 4134 if (RExC_flags & PMf_EXTENDED) {
830247a4
IZ
4135 while (RExC_parse < RExC_end && *RExC_parse != '\n') RExC_parse++;
4136 if (RExC_parse < RExC_end)
4633a7c4
LW
4137 goto tryagain;
4138 }
4139 /* FALL THROUGH */
4140
a0d0e21e 4141 default: {
ba210ebe 4142 register STRLEN len;
58ae7d3f 4143 register UV ender;
a0d0e21e 4144 register char *p;
c277df42 4145 char *oldp, *s;
80aecb99 4146 STRLEN foldlen;
89ebb4a3 4147 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
f06dbbb7
JH
4148
4149 parse_start = RExC_parse - 1;
a0d0e21e 4150
830247a4 4151 RExC_parse++;
a0d0e21e
LW
4152
4153 defchar:
58ae7d3f 4154 ender = 0;
eb160463
GS
4155 ret = reg_node(pRExC_state,
4156 (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
cd439c50 4157 s = STRING(ret);
830247a4
IZ
4158 for (len = 0, p = RExC_parse - 1;
4159 len < 127 && p < RExC_end;
a0d0e21e
LW
4160 len++)
4161 {
4162 oldp = p;
5b5a24f7 4163
e2509266 4164 if (RExC_flags & PMf_EXTENDED)
830247a4 4165 p = regwhite(p, RExC_end);
a0d0e21e
LW
4166 switch (*p) {
4167 case '^':
4168 case '$':
4169 case '.':
4170 case '[':
4171 case '(':
4172 case ')':
4173 case '|':
4174 goto loopdone;
4175 case '\\':
4176 switch (*++p) {
4177 case 'A':
1ed8eac0
JF
4178 case 'C':
4179 case 'X':
a0d0e21e
LW
4180 case 'G':
4181 case 'Z':
b85d18e9 4182 case 'z':
a0d0e21e
LW
4183 case 'w':
4184 case 'W':
4185 case 'b':
4186 case 'B':
4187 case 's':
4188 case 'S':
4189 case 'd':
4190 case 'D':
a14b48bc
LW
4191 case 'p':
4192 case 'P':
a0d0e21e
LW
4193 --p;
4194 goto loopdone;
4195 case 'n':
4196 ender = '\n';
4197 p++;
a687059c 4198 break;
a0d0e21e
LW
4199 case 'r':
4200 ender = '\r';
4201 p++;
a687059c 4202 break;
a0d0e21e
LW
4203 case 't':
4204 ender = '\t';
4205 p++;
a687059c 4206 break;
a0d0e21e
LW
4207 case 'f':
4208 ender = '\f';
4209 p++;
a687059c 4210 break;
a0d0e21e 4211 case 'e':
c7f1f016 4212 ender = ASCII_TO_NATIVE('\033');
a0d0e21e 4213 p++;
a687059c 4214 break;
a0d0e21e 4215 case 'a':
c7f1f016 4216 ender = ASCII_TO_NATIVE('\007');
a0d0e21e 4217 p++;
a687059c 4218 break;
a0d0e21e 4219 case 'x':
a0ed51b3 4220 if (*++p == '{') {
1df70142 4221 char* const e = strchr(p, '}');
b81d288d 4222
b45f050a 4223 if (!e) {
830247a4 4224 RExC_parse = p + 1;
b45f050a
JF
4225 vFAIL("Missing right brace on \\x{}");
4226 }
de5f0749 4227 else {
a4c04bdc
NC
4228 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
4229 | PERL_SCAN_DISALLOW_PREFIX;
1df70142 4230 STRLEN numlen = e - p - 1;
53305cf1 4231 ender = grok_hex(p + 1, &numlen, &flags, NULL);
aaa80028
JH
4232 if (ender > 0xff)
4233 RExC_utf8 = 1;
a0ed51b3
LW
4234 p = e + 1;
4235 }
a0ed51b3
LW
4236 }
4237 else {
a4c04bdc 4238 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
1df70142 4239 STRLEN numlen = 2;
53305cf1 4240 ender = grok_hex(p, &numlen, &flags, NULL);
a0ed51b3
LW
4241 p += numlen;
4242 }
a687059c 4243 break;
a0d0e21e
LW
4244 case 'c':
4245 p++;
bbce6d69 4246 ender = UCHARAT(p++);
4247 ender = toCTRL(ender);
a687059c 4248 break;
a0d0e21e
LW
4249 case '0': case '1': case '2': case '3':case '4':
4250 case '5': case '6': case '7': case '8':case '9':
4251 if (*p == '0' ||
830247a4 4252 (isDIGIT(p[1]) && atoi(p) >= RExC_npar) ) {
53305cf1 4253 I32 flags = 0;
1df70142 4254 STRLEN numlen = 3;
53305cf1 4255 ender = grok_oct(p, &numlen, &flags, NULL);
a0d0e21e
LW
4256 p += numlen;
4257 }
4258 else {
4259 --p;
4260 goto loopdone;
a687059c
LW
4261 }
4262 break;
a0d0e21e 4263 case '\0':
830247a4 4264 if (p >= RExC_end)
b45f050a 4265 FAIL("Trailing \\");
a687059c 4266 /* FALL THROUGH */
a0d0e21e 4267 default:
041457d9 4268 if (!SIZE_ONLY&& isALPHA(*p) && ckWARN(WARN_REGEXP))
4193bef7 4269 vWARN2(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
a0ed51b3 4270 goto normal_default;
a0d0e21e
LW
4271 }
4272 break;
a687059c 4273 default:
a0ed51b3 4274 normal_default:
fd400ab9 4275 if (UTF8_IS_START(*p) && UTF) {
1df70142 4276 STRLEN numlen;
5e12f4fb 4277 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
9f7f3913 4278 &numlen, UTF8_ALLOW_DEFAULT);
a0ed51b3
LW
4279 p += numlen;
4280 }
4281 else
4282 ender = *p++;
a0d0e21e 4283 break;
a687059c 4284 }
e2509266 4285 if (RExC_flags & PMf_EXTENDED)
830247a4 4286 p = regwhite(p, RExC_end);
60a8b682
JH
4287 if (UTF && FOLD) {
4288 /* Prime the casefolded buffer. */
ac7e0132 4289 ender = toFOLD_uni(ender, tmpbuf, &foldlen);
60a8b682 4290 }
a0d0e21e
LW
4291 if (ISMULT2(p)) { /* Back off on ?+*. */
4292 if (len)
4293 p = oldp;
16ea2a2e 4294 else if (UTF) {
0ebc6274
JH
4295 STRLEN unilen;
4296
80aecb99 4297 if (FOLD) {
60a8b682 4298 /* Emit all the Unicode characters. */
1df70142 4299 STRLEN numlen;
80aecb99
JH
4300 for (foldbuf = tmpbuf;
4301 foldlen;
4302 foldlen -= numlen) {
4303 ender = utf8_to_uvchr(foldbuf, &numlen);
9dc45d57 4304 if (numlen > 0) {
0ebc6274
JH
4305 reguni(pRExC_state, ender, s, &unilen);
4306 s += unilen;
4307 len += unilen;
4308 /* In EBCDIC the numlen
4309 * and unilen can differ. */
9dc45d57 4310 foldbuf += numlen;
47654450
JH
4311 if (numlen >= foldlen)
4312 break;
9dc45d57
JH
4313 }
4314 else
4315 break; /* "Can't happen." */
80aecb99
JH
4316 }
4317 }
4318 else {
0ebc6274 4319 reguni(pRExC_state, ender, s, &unilen);
9ede7db1 4320 if (unilen > 0) {
0ebc6274
JH
4321 s += unilen;
4322 len += unilen;
9dc45d57 4323 }
80aecb99 4324 }
a0ed51b3 4325 }
a0d0e21e
LW
4326 else {
4327 len++;
eb160463 4328 REGC((char)ender, s++);
a0d0e21e
LW
4329 }
4330 break;
a687059c 4331 }
16ea2a2e 4332 if (UTF) {
0ebc6274
JH
4333 STRLEN unilen;
4334
80aecb99 4335 if (FOLD) {
60a8b682 4336 /* Emit all the Unicode characters. */
1df70142 4337 STRLEN numlen;
80aecb99
JH
4338 for (foldbuf = tmpbuf;
4339 foldlen;
4340 foldlen -= numlen) {
4341 ender = utf8_to_uvchr(foldbuf, &numlen);
9dc45d57 4342 if (numlen > 0) {
0ebc6274
JH
4343 reguni(pRExC_state, ender, s, &unilen);
4344 len += unilen;
4345 s += unilen;
4346 /* In EBCDIC the numlen
4347 * and unilen can differ. */
9dc45d57 4348 foldbuf += numlen;
47654450
JH
4349 if (numlen >= foldlen)
4350 break;
9dc45d57
JH
4351 }
4352 else
4353 break;
80aecb99
JH
4354 }
4355 }
4356 else {
0ebc6274 4357 reguni(pRExC_state, ender, s, &unilen);
9ede7db1 4358 if (unilen > 0) {
0ebc6274
JH
4359 s += unilen;
4360 len += unilen;
9dc45d57 4361 }
80aecb99
JH
4362 }
4363 len--;
a0ed51b3
LW
4364 }
4365 else
eb160463 4366 REGC((char)ender, s++);
a0d0e21e
LW
4367 }
4368 loopdone:
830247a4 4369 RExC_parse = p - 1;
fac92740 4370 Set_Node_Cur_Length(ret); /* MJD */
830247a4 4371 nextchar(pRExC_state);
793db0cb
JH
4372 {
4373 /* len is STRLEN which is unsigned, need to copy to signed */
4374 IV iv = len;
4375 if (iv < 0)
4376 vFAIL("Internal disaster");
4377 }
a0d0e21e
LW
4378 if (len > 0)
4379 *flagp |= HASWIDTH;
090f7165 4380 if (len == 1 && UNI_IS_INVARIANT(ender))
a0d0e21e 4381 *flagp |= SIMPLE;
c277df42 4382 if (!SIZE_ONLY)
cd439c50
IZ
4383 STR_LEN(ret) = len;
4384 if (SIZE_ONLY)
830247a4 4385 RExC_size += STR_SZ(len);
cd439c50 4386 else
830247a4 4387 RExC_emit += STR_SZ(len);
a687059c 4388 }
a0d0e21e
LW
4389 break;
4390 }
a687059c 4391
60a8b682
JH
4392 /* If the encoding pragma is in effect recode the text of
4393 * any EXACT-kind nodes. */
22c54be3 4394 if (PL_encoding && PL_regkind[(U8)OP(ret)] == EXACT) {
d0063567
DK
4395 STRLEN oldlen = STR_LEN(ret);
4396 SV *sv = sv_2mortal(newSVpvn(STRING(ret), oldlen));
4397
4398 if (RExC_utf8)
4399 SvUTF8_on(sv);
4400 if (sv_utf8_downgrade(sv, TRUE)) {
1df70142
AL
4401 const char * const s = sv_recode_to_utf8(sv, PL_encoding);
4402 const STRLEN newlen = SvCUR(sv);
d0063567
DK
4403
4404 if (SvUTF8(sv))
4405 RExC_utf8 = 1;
4406 if (!SIZE_ONLY) {
a3621e74
YO
4407 GET_RE_DEBUG_FLAGS_DECL;
4408 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "recode %*s to %*s\n",
d0063567
DK
4409 (int)oldlen, STRING(ret),
4410 (int)newlen, s));
4411 Copy(s, STRING(ret), newlen, char);
4412 STR_LEN(ret) += newlen - oldlen;
4413 RExC_emit += STR_SZ(newlen) - STR_SZ(oldlen);
4414 } else
4415 RExC_size += STR_SZ(newlen) - STR_SZ(oldlen);
4416 }
a72c7584
JH
4417 }
4418
a0d0e21e 4419 return(ret);
a687059c
LW
4420}
4421
873ef191 4422STATIC char *
5f66b61c 4423S_regwhite(char *p, const char *e)
5b5a24f7
CS
4424{
4425 while (p < e) {
4426 if (isSPACE(*p))
4427 ++p;
4428 else if (*p == '#') {
4429 do {
4430 p++;
4431 } while (p < e && *p != '\n');
4432 }
4433 else
4434 break;
4435 }
4436 return p;
4437}
4438
b8c5462f
JH
4439/* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
4440 Character classes ([:foo:]) can also be negated ([:^foo:]).
4441 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
4442 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
beeb77fc 4443 but trigger failures because they are currently unimplemented. */
9a86a77b
JH
4444
4445#define POSIXCC_DONE(c) ((c) == ':')
4446#define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
4447#define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
4448
b8c5462f 4449STATIC I32
830247a4 4450S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
620e46c5 4451{
97aff369 4452 dVAR;
936ed897 4453 I32 namedclass = OOB_NAMEDCLASS;
620e46c5 4454
830247a4 4455 if (value == '[' && RExC_parse + 1 < RExC_end &&
620e46c5 4456 /* I smell either [: or [= or [. -- POSIX has been here, right? */
9a86a77b 4457 POSIXCC(UCHARAT(RExC_parse))) {
1df70142 4458 const char c = UCHARAT(RExC_parse);
097eb12c 4459 char* const s = RExC_parse++;
b81d288d 4460
9a86a77b 4461 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
830247a4
IZ
4462 RExC_parse++;
4463 if (RExC_parse == RExC_end)
620e46c5 4464 /* Grandfather lone [:, [=, [. */
830247a4 4465 RExC_parse = s;
620e46c5 4466 else {
1df70142 4467 const char* t = RExC_parse++; /* skip over the c */
a28509cc 4468 const char *posixcc;
b8c5462f 4469
80916619
NC
4470 assert(*t == c);
4471
9a86a77b 4472 if (UCHARAT(RExC_parse) == ']') {
830247a4 4473 RExC_parse++; /* skip over the ending ] */
b8c5462f
JH
4474 posixcc = s + 1;
4475 if (*s == ':') {
1df70142
AL
4476 const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
4477 const I32 skip = t - posixcc;
80916619
NC
4478
4479 /* Initially switch on the length of the name. */
4480 switch (skip) {
4481 case 4:
4482 if (memEQ(posixcc, "word", 4)) {
4483 /* this is not POSIX, this is the Perl \w */;
4484 namedclass
4485 = complement ? ANYOF_NALNUM : ANYOF_ALNUM;
4486 }
cc4319de 4487 break;
80916619
NC
4488 case 5:
4489 /* Names all of length 5. */
4490 /* alnum alpha ascii blank cntrl digit graph lower
4491 print punct space upper */
4492 /* Offset 4 gives the best switch position. */
4493 switch (posixcc[4]) {
4494 case 'a':
4495 if (memEQ(posixcc, "alph", 4)) {
4496 /* a */
4497 namedclass
4498 = complement ? ANYOF_NALPHA : ANYOF_ALPHA;
4499 }
4500 break;
4501 case 'e':
4502 if (memEQ(posixcc, "spac", 4)) {
4503 /* e */
4504 namedclass
4505 = complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
4506 }
4507 break;
4508 case 'h':
4509 if (memEQ(posixcc, "grap", 4)) {
4510 /* h */
4511 namedclass
4512 = complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
4513 }
4514 break;
4515 case 'i':
4516 if (memEQ(posixcc, "asci", 4)) {
4517 /* i */
4518 namedclass
4519 = complement ? ANYOF_NASCII : ANYOF_ASCII;
4520 }
4521 break;
4522 case 'k':
4523 if (memEQ(posixcc, "blan", 4)) {
4524 /* k */
4525 namedclass
4526 = complement ? ANYOF_NBLANK : ANYOF_BLANK;
4527 }
4528 break;
4529 case 'l':
4530 if (memEQ(posixcc, "cntr", 4)) {
4531 /* l */
4532 namedclass
4533 = complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
4534 }
4535 break;
4536 case 'm':
4537 if (memEQ(posixcc, "alnu", 4)) {
4538 /* m */
4539 namedclass
4540 = complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
4541 }
4542 break;
4543 case 'r':
4544 if (memEQ(posixcc, "lowe", 4)) {
4545 /* r */
4546 namedclass
4547 = complement ? ANYOF_NLOWER : ANYOF_LOWER;
4548 }
4549 if (memEQ(posixcc, "uppe", 4)) {
8fdec511 4550 /* r */
80916619
NC
4551 namedclass
4552 = complement ? ANYOF_NUPPER : ANYOF_UPPER;
4553 }
4554 break;
4555 case 't':
4556 if (memEQ(posixcc, "digi", 4)) {
4557 /* t */
4558 namedclass
4559 = complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
4560 }
4561 if (memEQ(posixcc, "prin", 4)) {
8fdec511 4562 /* t */
80916619
NC
4563 namedclass
4564 = complement ? ANYOF_NPRINT : ANYOF_PRINT;
4565 }
4566 if (memEQ(posixcc, "punc", 4)) {
4567 /* t */
4568 namedclass
4569 = complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
4570 }
4571 break;
b8c5462f
JH
4572 }
4573 break;
80916619
NC
4574 case 6:
4575 if (memEQ(posixcc, "xdigit", 6)) {
4576 namedclass
4577 = complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
b8c5462f
JH
4578 }
4579 break;
4580 }
80916619
NC
4581
4582 if (namedclass == OOB_NAMEDCLASS)
b45f050a
JF
4583 {
4584 Simple_vFAIL3("POSIX class [:%.*s:] unknown",
4585 t - s - 1, s + 1);
4586 }
80916619
NC
4587 assert (posixcc[skip] == ':');
4588 assert (posixcc[skip+1] == ']');
b45f050a 4589 } else if (!SIZE_ONLY) {
b8c5462f 4590 /* [[=foo=]] and [[.foo.]] are still future. */
b45f050a 4591
830247a4 4592 /* adjust RExC_parse so the warning shows after
b45f050a 4593 the class closes */
9a86a77b 4594 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
830247a4 4595 RExC_parse++;
b45f050a
JF
4596 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
4597 }
b8c5462f
JH
4598 } else {
4599 /* Maternal grandfather:
4600 * "[:" ending in ":" but not in ":]" */
830247a4 4601 RExC_parse = s;
767d463e 4602 }
620e46c5
JH
4603 }
4604 }
4605
b8c5462f
JH
4606 return namedclass;
4607}
4608
4609STATIC void
830247a4 4610S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
b8c5462f 4611{
97aff369 4612 dVAR;
b938889d 4613 if (!SIZE_ONLY && POSIXCC(UCHARAT(RExC_parse))) {
1df70142
AL
4614 const char *s = RExC_parse;
4615 const char c = *s++;
b8c5462f
JH
4616
4617 while(*s && isALNUM(*s))
4618 s++;
4619 if (*s && c == *s && s[1] == ']') {
cd84f5b2
RGS
4620 if (ckWARN(WARN_REGEXP))
4621 vWARN3(s+2,
4622 "POSIX syntax [%c %c] belongs inside character classes",
4623 c, c);
b45f050a
JF
4624
4625 /* [[=foo=]] and [[.foo.]] are still future. */
9a86a77b 4626 if (POSIXCC_NOTYET(c)) {
830247a4 4627 /* adjust RExC_parse so the error shows after
b45f050a 4628 the class closes */
9a86a77b 4629 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
b45f050a
JF
4630 ;
4631 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
4632 }
b8c5462f
JH
4633 }
4634 }
620e46c5
JH
4635}
4636
76e3520e 4637STATIC regnode *
830247a4 4638S_regclass(pTHX_ RExC_state_t *pRExC_state)
a687059c 4639{
97aff369 4640 dVAR;
ffc61ed2 4641 register UV value;
9a86a77b 4642 register UV nextvalue;
3568d838 4643 register IV prevvalue = OOB_UNICODE;
ffc61ed2 4644 register IV range = 0;
c277df42 4645 register regnode *ret;
ba210ebe 4646 STRLEN numlen;
ffc61ed2 4647 IV namedclass;
cbbf8932 4648 char *rangebegin = NULL;
936ed897 4649 bool need_class = 0;
c445ea15 4650 SV *listsv = NULL;
ffc61ed2
JH
4651 register char *e;
4652 UV n;
9e55ce06 4653 bool optimize_invert = TRUE;
cbbf8932 4654 AV* unicode_alternate = NULL;
1b2d223b
JH
4655#ifdef EBCDIC
4656 UV literal_endpoint = 0;
4657#endif
ffc61ed2
JH
4658
4659 ret = reganode(pRExC_state, ANYOF, 0);
4660
4661 if (!SIZE_ONLY)
4662 ANYOF_FLAGS(ret) = 0;
4663
9a86a77b 4664 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
ffc61ed2
JH
4665 RExC_naughty++;
4666 RExC_parse++;
4667 if (!SIZE_ONLY)
4668 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
4669 }
a0d0e21e 4670
73060fc4 4671 if (SIZE_ONLY) {
830247a4 4672 RExC_size += ANYOF_SKIP;
73060fc4
JH
4673 listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
4674 }
936ed897 4675 else {
830247a4 4676 RExC_emit += ANYOF_SKIP;
936ed897
IZ
4677 if (FOLD)
4678 ANYOF_FLAGS(ret) |= ANYOF_FOLD;
4679 if (LOC)
4680 ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
ffc61ed2 4681 ANYOF_BITMAP_ZERO(ret);
396482e1 4682 listsv = newSVpvs("# comment\n");
a0d0e21e 4683 }
b8c5462f 4684
9a86a77b
JH
4685 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
4686
b938889d 4687 if (!SIZE_ONLY && POSIXCC(nextvalue))
830247a4 4688 checkposixcc(pRExC_state);
b8c5462f 4689
f064b6ad
HS
4690 /* allow 1st char to be ] (allowing it to be - is dealt with later) */
4691 if (UCHARAT(RExC_parse) == ']')
4692 goto charclassloop;
ffc61ed2 4693
9a86a77b 4694 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
ffc61ed2
JH
4695
4696 charclassloop:
4697
4698 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
4699
73b437c8 4700 if (!range)
830247a4 4701 rangebegin = RExC_parse;
ffc61ed2 4702 if (UTF) {
5e12f4fb 4703 value = utf8n_to_uvchr((U8*)RExC_parse,
3568d838 4704 RExC_end - RExC_parse,
9f7f3913 4705 &numlen, UTF8_ALLOW_DEFAULT);
ffc61ed2
JH
4706 RExC_parse += numlen;
4707 }
4708 else
4709 value = UCHARAT(RExC_parse++);
9a86a77b
JH
4710 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
4711 if (value == '[' && POSIXCC(nextvalue))
830247a4 4712 namedclass = regpposixcc(pRExC_state, value);
620e46c5 4713 else if (value == '\\') {
ffc61ed2 4714 if (UTF) {
5e12f4fb 4715 value = utf8n_to_uvchr((U8*)RExC_parse,
ffc61ed2 4716 RExC_end - RExC_parse,
9f7f3913 4717 &numlen, UTF8_ALLOW_DEFAULT);
ffc61ed2
JH
4718 RExC_parse += numlen;
4719 }
4720 else
4721 value = UCHARAT(RExC_parse++);
470c3474 4722 /* Some compilers cannot handle switching on 64-bit integer
ffc61ed2 4723 * values, therefore value cannot be an UV. Yes, this will
e2962f66
JH
4724 * be a problem later if we want switch on Unicode.
4725 * A similar issue a little bit later when switching on
4726 * namedclass. --jhi */
ffc61ed2 4727 switch ((I32)value) {
b8c5462f
JH
4728 case 'w': namedclass = ANYOF_ALNUM; break;
4729 case 'W': namedclass = ANYOF_NALNUM; break;
4730 case 's': namedclass = ANYOF_SPACE; break;
4731 case 'S': namedclass = ANYOF_NSPACE; break;
4732 case 'd': namedclass = ANYOF_DIGIT; break;
4733 case 'D': namedclass = ANYOF_NDIGIT; break;
ffc61ed2
JH
4734 case 'p':
4735 case 'P':
af6f566e 4736 if (RExC_parse >= RExC_end)
2a4859cd 4737 vFAIL2("Empty \\%c{}", (U8)value);
ffc61ed2 4738 if (*RExC_parse == '{') {
1df70142 4739 const U8 c = (U8)value;
ffc61ed2
JH
4740 e = strchr(RExC_parse++, '}');
4741 if (!e)
0da60cf5 4742 vFAIL2("Missing right brace on \\%c{}", c);
ab13f0c7
JH
4743 while (isSPACE(UCHARAT(RExC_parse)))
4744 RExC_parse++;
4745 if (e == RExC_parse)
0da60cf5 4746 vFAIL2("Empty \\%c{}", c);
ffc61ed2 4747 n = e - RExC_parse;
ab13f0c7
JH
4748 while (isSPACE(UCHARAT(RExC_parse + n - 1)))
4749 n--;
ffc61ed2
JH
4750 }
4751 else {
4752 e = RExC_parse;
4753 n = 1;
4754 }
4755 if (!SIZE_ONLY) {
ab13f0c7
JH
4756 if (UCHARAT(RExC_parse) == '^') {
4757 RExC_parse++;
4758 n--;
4759 value = value == 'p' ? 'P' : 'p'; /* toggle */
4760 while (isSPACE(UCHARAT(RExC_parse))) {
4761 RExC_parse++;
4762 n--;
4763 }
4764 }
097eb12c
AL
4765 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%.*s\n",
4766 (value=='p' ? '+' : '!'), (int)n, RExC_parse);
ffc61ed2
JH
4767 }
4768 RExC_parse = e + 1;
4769 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
f81125e2
JP
4770 namedclass = ANYOF_MAX; /* no official name, but it's named */
4771 break;
b8c5462f
JH
4772 case 'n': value = '\n'; break;
4773 case 'r': value = '\r'; break;
4774 case 't': value = '\t'; break;
4775 case 'f': value = '\f'; break;
4776 case 'b': value = '\b'; break;
c7f1f016
NIS
4777 case 'e': value = ASCII_TO_NATIVE('\033');break;
4778 case 'a': value = ASCII_TO_NATIVE('\007');break;
b8c5462f 4779 case 'x':
ffc61ed2 4780 if (*RExC_parse == '{') {
a4c04bdc
NC
4781 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
4782 | PERL_SCAN_DISALLOW_PREFIX;
ffc61ed2 4783 e = strchr(RExC_parse++, '}');
b81d288d 4784 if (!e)
ffc61ed2 4785 vFAIL("Missing right brace on \\x{}");
53305cf1
NC
4786
4787 numlen = e - RExC_parse;
4788 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
ffc61ed2
JH
4789 RExC_parse = e + 1;
4790 }
4791 else {
a4c04bdc 4792 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
53305cf1
NC
4793 numlen = 2;
4794 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
ffc61ed2
JH
4795 RExC_parse += numlen;
4796 }
b8c5462f
JH
4797 break;
4798 case 'c':
830247a4 4799 value = UCHARAT(RExC_parse++);
b8c5462f
JH
4800 value = toCTRL(value);
4801 break;
4802 case '0': case '1': case '2': case '3': case '4':
4803 case '5': case '6': case '7': case '8': case '9':
53305cf1
NC
4804 {
4805 I32 flags = 0;
4806 numlen = 3;
4807 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
830247a4 4808 RExC_parse += numlen;
b8c5462f 4809 break;
53305cf1 4810 }
1028017a 4811 default:
041457d9 4812 if (!SIZE_ONLY && isALPHA(value) && ckWARN(WARN_REGEXP))
ffc61ed2
JH
4813 vWARN2(RExC_parse,
4814 "Unrecognized escape \\%c in character class passed through",
4815 (int)value);
1028017a 4816 break;
b8c5462f 4817 }
ffc61ed2 4818 } /* end of \blah */
1b2d223b
JH
4819#ifdef EBCDIC
4820 else
4821 literal_endpoint++;
4822#endif
ffc61ed2
JH
4823
4824 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
4825
4826 if (!SIZE_ONLY && !need_class)
936ed897 4827 ANYOF_CLASS_ZERO(ret);
ffc61ed2 4828
936ed897 4829 need_class = 1;
ffc61ed2
JH
4830
4831 /* a bad range like a-\d, a-[:digit:] ? */
4832 if (range) {
73b437c8 4833 if (!SIZE_ONLY) {
afd78fd5 4834 if (ckWARN(WARN_REGEXP)) {
097eb12c 4835 const int w =
afd78fd5
JH
4836 RExC_parse >= rangebegin ?
4837 RExC_parse - rangebegin : 0;
830247a4 4838 vWARN4(RExC_parse,
b45f050a 4839 "False [] range \"%*.*s\"",
097eb12c 4840 w, w, rangebegin);
afd78fd5 4841 }
3568d838
JH
4842 if (prevvalue < 256) {
4843 ANYOF_BITMAP_SET(ret, prevvalue);
ffc61ed2
JH
4844 ANYOF_BITMAP_SET(ret, '-');
4845 }
4846 else {
4847 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
4848 Perl_sv_catpvf(aTHX_ listsv,
3568d838 4849 "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
ffc61ed2 4850 }
b8c5462f 4851 }
ffc61ed2
JH
4852
4853 range = 0; /* this was not a true range */
73b437c8 4854 }
ffc61ed2 4855
73b437c8 4856 if (!SIZE_ONLY) {
c49a72a9
NC
4857 const char *what = NULL;
4858 char yesno = 0;
4859
3568d838
JH
4860 if (namedclass > OOB_NAMEDCLASS)
4861 optimize_invert = FALSE;
e2962f66
JH
4862 /* Possible truncation here but in some 64-bit environments
4863 * the compiler gets heartburn about switch on 64-bit values.
4864 * A similar issue a little earlier when switching on value.
98f323fa 4865 * --jhi */
e2962f66 4866 switch ((I32)namedclass) {
73b437c8
JH
4867 case ANYOF_ALNUM:
4868 if (LOC)
936ed897 4869 ANYOF_CLASS_SET(ret, ANYOF_ALNUM);
73b437c8
JH
4870 else {
4871 for (value = 0; value < 256; value++)
4872 if (isALNUM(value))
936ed897 4873 ANYOF_BITMAP_SET(ret, value);
73b437c8 4874 }
c49a72a9
NC
4875 yesno = '+';
4876 what = "Word";
73b437c8
JH
4877 break;
4878 case ANYOF_NALNUM:
4879 if (LOC)
936ed897 4880 ANYOF_CLASS_SET(ret, ANYOF_NALNUM);
73b437c8
JH
4881 else {
4882 for (value = 0; value < 256; value++)
4883 if (!isALNUM(value))
936ed897 4884 ANYOF_BITMAP_SET(ret, value);
73b437c8 4885 }
c49a72a9
NC
4886 yesno = '!';
4887 what = "Word";
73b437c8 4888 break;
ffc61ed2 4889 case ANYOF_ALNUMC:
73b437c8 4890 if (LOC)
ffc61ed2 4891 ANYOF_CLASS_SET(ret, ANYOF_ALNUMC);
73b437c8
JH
4892 else {
4893 for (value = 0; value < 256; value++)
ffc61ed2 4894 if (isALNUMC(value))
936ed897 4895 ANYOF_BITMAP_SET(ret, value);
73b437c8 4896 }
c49a72a9
NC
4897 yesno = '+';
4898 what = "Alnum";
73b437c8
JH
4899 break;
4900 case ANYOF_NALNUMC:
4901 if (LOC)
936ed897 4902 ANYOF_CLASS_SET(ret, ANYOF_NALNUMC);
73b437c8
JH
4903 else {
4904 for (value = 0; value < 256; value++)
4905 if (!isALNUMC(value))
936ed897 4906 ANYOF_BITMAP_SET(ret, value);
73b437c8 4907 }
c49a72a9
NC
4908 yesno = '!';
4909 what = "Alnum";
73b437c8
JH
4910 break;
4911 case ANYOF_ALPHA:
4912 if (LOC)
936ed897 4913 ANYOF_CLASS_SET(ret, ANYOF_ALPHA);
73b437c8
JH
4914 else {
4915 for (value = 0; value < 256; value++)
4916 if (isALPHA(value))
936ed897 4917 ANYOF_BITMAP_SET(ret, value);
73b437c8 4918 }
c49a72a9
NC
4919 yesno = '+';
4920 what = "Alpha";
73b437c8
JH
4921 break;
4922 case ANYOF_NALPHA:
4923 if (LOC)
936ed897 4924 ANYOF_CLASS_SET(ret, ANYOF_NALPHA);
73b437c8
JH
4925 else {
4926 for (value = 0; value < 256; value++)
4927 if (!isALPHA(value))
936ed897 4928 ANYOF_BITMAP_SET(ret, value);
73b437c8 4929 }
c49a72a9
NC
4930 yesno = '!';
4931 what = "Alpha";
73b437c8
JH
4932 break;
4933 case ANYOF_ASCII:
4934 if (LOC)
936ed897 4935 ANYOF_CLASS_SET(ret, ANYOF_ASCII);
73b437c8 4936 else {
c7f1f016 4937#ifndef EBCDIC
1ba5c669
JH
4938 for (value = 0; value < 128; value++)
4939 ANYOF_BITMAP_SET(ret, value);
4940#else /* EBCDIC */
ffbc6a93 4941 for (value = 0; value < 256; value++) {
3a3c4447
JH
4942 if (isASCII(value))
4943 ANYOF_BITMAP_SET(ret, value);
ffbc6a93 4944 }
1ba5c669 4945#endif /* EBCDIC */
73b437c8 4946 }
c49a72a9
NC
4947 yesno = '+';
4948 what = "ASCII";
73b437c8
JH
4949 break;
4950 case ANYOF_NASCII:
4951 if (LOC)
936ed897 4952 ANYOF_CLASS_SET(ret, ANYOF_NASCII);
73b437c8 4953 else {
c7f1f016 4954#ifndef EBCDIC
1ba5c669
JH
4955 for (value = 128; value < 256; value++)
4956 ANYOF_BITMAP_SET(ret, value);
4957#else /* EBCDIC */
ffbc6a93 4958 for (value = 0; value < 256; value++) {
3a3c4447
JH
4959 if (!isASCII(value))
4960 ANYOF_BITMAP_SET(ret, value);
ffbc6a93 4961 }
1ba5c669 4962#endif /* EBCDIC */
73b437c8 4963 }
c49a72a9
NC
4964 yesno = '!';
4965 what = "ASCII";
73b437c8 4966 break;
aaa51d5e
JF
4967 case ANYOF_BLANK:
4968 if (LOC)
4969 ANYOF_CLASS_SET(ret, ANYOF_BLANK);
4970 else {
4971 for (value = 0; value < 256; value++)
4972 if (isBLANK(value))
4973 ANYOF_BITMAP_SET(ret, value);
4974 }
c49a72a9
NC
4975 yesno = '+';
4976 what = "Blank";
aaa51d5e
JF
4977 break;
4978 case ANYOF_NBLANK:
4979 if (LOC)
4980 ANYOF_CLASS_SET(ret, ANYOF_NBLANK);
4981 else {
4982 for (value = 0; value < 256; value++)
4983 if (!isBLANK(value))
4984 ANYOF_BITMAP_SET(ret, value);
4985 }
c49a72a9
NC
4986 yesno = '!';
4987 what = "Blank";
aaa51d5e 4988 break;
73b437c8
JH
4989 case ANYOF_CNTRL:
4990 if (LOC)
936ed897 4991 ANYOF_CLASS_SET(ret, ANYOF_CNTRL);
73b437c8
JH
4992 else {
4993 for (value = 0; value < 256; value++)
4994 if (isCNTRL(value))
936ed897 4995 ANYOF_BITMAP_SET(ret, value);
73b437c8 4996 }
c49a72a9
NC
4997 yesno = '+';
4998 what = "Cntrl";
73b437c8
JH
4999 break;
5000 case ANYOF_NCNTRL:
5001 if (LOC)
936ed897 5002 ANYOF_CLASS_SET(ret, ANYOF_NCNTRL);
73b437c8
JH
5003 else {
5004 for (value = 0; value < 256; value++)
5005 if (!isCNTRL(value))
936ed897 5006 ANYOF_BITMAP_SET(ret, value);
73b437c8 5007 }
c49a72a9
NC
5008 yesno = '!';
5009 what = "Cntrl";
ffc61ed2
JH
5010 break;
5011 case ANYOF_DIGIT:
5012 if (LOC)
5013 ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
5014 else {
5015 /* consecutive digits assumed */
5016 for (value = '0'; value <= '9'; value++)
5017 ANYOF_BITMAP_SET(ret, value);
5018 }
c49a72a9
NC
5019 yesno = '+';
5020 what = "Digit";
ffc61ed2
JH
5021 break;
5022 case ANYOF_NDIGIT:
5023 if (LOC)
5024 ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
5025 else {
5026 /* consecutive digits assumed */
5027 for (value = 0; value < '0'; value++)
5028 ANYOF_BITMAP_SET(ret, value);
5029 for (value = '9' + 1; value < 256; value++)
5030 ANYOF_BITMAP_SET(ret, value);
5031 }
c49a72a9
NC
5032 yesno = '!';
5033 what = "Digit";
73b437c8
JH
5034 break;
5035 case ANYOF_GRAPH:
5036 if (LOC)
936ed897 5037 ANYOF_CLASS_SET(ret, ANYOF_GRAPH);
73b437c8
JH
5038 else {
5039 for (value = 0; value < 256; value++)
5040 if (isGRAPH(value))
936ed897 5041 ANYOF_BITMAP_SET(ret, value);
73b437c8 5042 }
c49a72a9
NC
5043 yesno = '+';
5044 what = "Graph";
73b437c8
JH
5045 break;
5046 case ANYOF_NGRAPH:
5047 if (LOC)
936ed897 5048 ANYOF_CLASS_SET(ret, ANYOF_NGRAPH);
73b437c8
JH
5049 else {
5050 for (value = 0; value < 256; value++)
5051 if (!isGRAPH(value))
936ed897 5052 ANYOF_BITMAP_SET(ret, value);
73b437c8 5053 }
c49a72a9
NC
5054 yesno = '!';
5055 what = "Graph";
73b437c8
JH
5056 break;
5057 case ANYOF_LOWER:
5058 if (LOC)
936ed897 5059 ANYOF_CLASS_SET(ret, ANYOF_LOWER);
73b437c8
JH
5060 else {
5061 for (value = 0; value < 256; value++)
5062 if (isLOWER(value))
936ed897 5063 ANYOF_BITMAP_SET(ret, value);
73b437c8 5064 }
c49a72a9
NC
5065 yesno = '+';
5066 what = "Lower";
73b437c8
JH
5067 break;
5068 case ANYOF_NLOWER:
5069 if (LOC)
936ed897 5070 ANYOF_CLASS_SET(ret, ANYOF_NLOWER);
73b437c8
JH
5071 else {
5072 for (value = 0; value < 256; value++)
5073 if (!isLOWER(value))
936ed897 5074 ANYOF_BITMAP_SET(ret, value);
73b437c8 5075 }
c49a72a9
NC
5076 yesno = '!';
5077 what = "Lower";
73b437c8
JH
5078 break;
5079 case ANYOF_PRINT:
5080 if (LOC)
936ed897 5081 ANYOF_CLASS_SET(ret, ANYOF_PRINT);
73b437c8
JH
5082 else {
5083 for (value = 0; value < 256; value++)
5084 if (isPRINT(value))
936ed897 5085 ANYOF_BITMAP_SET(ret, value);
73b437c8 5086 }
c49a72a9
NC
5087 yesno = '+';
5088 what = "Print";
73b437c8
JH
5089 break;
5090 case ANYOF_NPRINT:
5091 if (LOC)
936ed897 5092 ANYOF_CLASS_SET(ret, ANYOF_NPRINT);
73b437c8
JH
5093 else {
5094 for (value = 0; value < 256; value++)
5095 if (!isPRINT(value))
936ed897 5096 ANYOF_BITMAP_SET(ret, value);
73b437c8 5097 }
c49a72a9
NC
5098 yesno = '!';
5099 what = "Print";
73b437c8 5100 break;
aaa51d5e
JF
5101 case ANYOF_PSXSPC:
5102 if (LOC)
5103 ANYOF_CLASS_SET(ret, ANYOF_PSXSPC);
5104 else {
5105 for (value = 0; value < 256; value++)
5106 if (isPSXSPC(value))
5107 ANYOF_BITMAP_SET(ret, value);
5108 }
c49a72a9
NC
5109 yesno = '+';
5110 what = "Space";
aaa51d5e
JF
5111 break;
5112 case ANYOF_NPSXSPC:
5113 if (LOC)
5114 ANYOF_CLASS_SET(ret, ANYOF_NPSXSPC);
5115 else {
5116 for (value = 0; value < 256; value++)
5117 if (!isPSXSPC(value))
5118 ANYOF_BITMAP_SET(ret, value);
5119 }
c49a72a9
NC
5120 yesno = '!';
5121 what = "Space";
aaa51d5e 5122 break;
73b437c8
JH
5123 case ANYOF_PUNCT:
5124 if (LOC)
936ed897 5125 ANYOF_CLASS_SET(ret, ANYOF_PUNCT);
73b437c8
JH
5126 else {
5127 for (value = 0; value < 256; value++)
5128 if (isPUNCT(value))
936ed897 5129 ANYOF_BITMAP_SET(ret, value);
73b437c8 5130 }
c49a72a9
NC
5131 yesno = '+';
5132 what = "Punct";
73b437c8
JH
5133 break;
5134 case ANYOF_NPUNCT:
5135 if (LOC)
936ed897 5136 ANYOF_CLASS_SET(ret, ANYOF_NPUNCT);
73b437c8
JH
5137 else {
5138 for (value = 0; value < 256; value++)
5139 if (!isPUNCT(value))
936ed897 5140 ANYOF_BITMAP_SET(ret, value);
73b437c8 5141 }
c49a72a9
NC
5142 yesno = '!';
5143 what = "Punct";
ffc61ed2
JH
5144 break;
5145 case ANYOF_SPACE:
5146 if (LOC)
5147 ANYOF_CLASS_SET(ret, ANYOF_SPACE);
5148 else {
5149 for (value = 0; value < 256; value++)
5150 if (isSPACE(value))
5151 ANYOF_BITMAP_SET(ret, value);
5152 }
c49a72a9
NC
5153 yesno = '+';
5154 what = "SpacePerl";
ffc61ed2
JH
5155 break;
5156 case ANYOF_NSPACE:
5157 if (LOC)
5158 ANYOF_CLASS_SET(ret, ANYOF_NSPACE);
5159 else {
5160 for (value = 0; value < 256; value++)
5161 if (!isSPACE(value))
5162 ANYOF_BITMAP_SET(ret, value);
5163 }
c49a72a9
NC
5164 yesno = '!';
5165 what = "SpacePerl";
73b437c8
JH
5166 break;
5167 case ANYOF_UPPER:
5168 if (LOC)
936ed897 5169 ANYOF_CLASS_SET(ret, ANYOF_UPPER);
73b437c8
JH
5170 else {
5171 for (value = 0; value < 256; value++)
5172 if (isUPPER(value))
936ed897 5173 ANYOF_BITMAP_SET(ret, value);
73b437c8 5174 }
c49a72a9
NC
5175 yesno = '+';
5176 what = "Upper";
73b437c8
JH
5177 break;
5178 case ANYOF_NUPPER:
5179 if (LOC)
936ed897 5180 ANYOF_CLASS_SET(ret, ANYOF_NUPPER);
73b437c8
JH
5181 else {
5182 for (value = 0; value < 256; value++)
5183 if (!isUPPER(value))
936ed897 5184 ANYOF_BITMAP_SET(ret, value);
73b437c8 5185 }
c49a72a9
NC
5186 yesno = '!';
5187 what = "Upper";
73b437c8
JH
5188 break;
5189 case ANYOF_XDIGIT:
5190 if (LOC)
936ed897 5191 ANYOF_CLASS_SET(ret, ANYOF_XDIGIT);
73b437c8
JH
5192 else {
5193 for (value = 0; value < 256; value++)
5194 if (isXDIGIT(value))
936ed897 5195 ANYOF_BITMAP_SET(ret, value);
73b437c8 5196 }
c49a72a9
NC
5197 yesno = '+';
5198 what = "XDigit";
73b437c8
JH
5199 break;
5200 case ANYOF_NXDIGIT:
5201 if (LOC)
936ed897 5202 ANYOF_CLASS_SET(ret, ANYOF_NXDIGIT);
73b437c8
JH
5203 else {
5204 for (value = 0; value < 256; value++)
5205 if (!isXDIGIT(value))
936ed897 5206 ANYOF_BITMAP_SET(ret, value);
73b437c8 5207 }
c49a72a9
NC
5208 yesno = '!';
5209 what = "XDigit";
73b437c8 5210 break;
f81125e2
JP
5211 case ANYOF_MAX:
5212 /* this is to handle \p and \P */
5213 break;
73b437c8 5214 default:
b45f050a 5215 vFAIL("Invalid [::] class");
73b437c8 5216 break;
b8c5462f 5217 }
c49a72a9
NC
5218 if (what) {
5219 /* Strings such as "+utf8::isWord\n" */
5220 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::Is%s\n", yesno, what);
5221 }
b8c5462f 5222 if (LOC)
936ed897 5223 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
73b437c8 5224 continue;
a0d0e21e 5225 }
ffc61ed2
JH
5226 } /* end of namedclass \blah */
5227
a0d0e21e 5228 if (range) {
eb160463 5229 if (prevvalue > (IV)value) /* b-a */ {
d4c19fe8
AL
5230 const int w = RExC_parse - rangebegin;
5231 Simple_vFAIL4("Invalid [] range \"%*.*s\"", w, w, rangebegin);
3568d838 5232 range = 0; /* not a valid range */
73b437c8 5233 }
a0d0e21e
LW
5234 }
5235 else {
3568d838 5236 prevvalue = value; /* save the beginning of the range */
830247a4
IZ
5237 if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
5238 RExC_parse[1] != ']') {
5239 RExC_parse++;
ffc61ed2
JH
5240
5241 /* a bad range like \w-, [:word:]- ? */
5242 if (namedclass > OOB_NAMEDCLASS) {
afd78fd5 5243 if (ckWARN(WARN_REGEXP)) {
d4c19fe8 5244 const int w =
afd78fd5
JH
5245 RExC_parse >= rangebegin ?
5246 RExC_parse - rangebegin : 0;
830247a4 5247 vWARN4(RExC_parse,
b45f050a 5248 "False [] range \"%*.*s\"",
097eb12c 5249 w, w, rangebegin);
afd78fd5 5250 }
73b437c8 5251 if (!SIZE_ONLY)
936ed897 5252 ANYOF_BITMAP_SET(ret, '-');
73b437c8 5253 } else
ffc61ed2
JH
5254 range = 1; /* yeah, it's a range! */
5255 continue; /* but do it the next time */
a0d0e21e 5256 }
a687059c 5257 }
ffc61ed2 5258
93733859 5259 /* now is the next time */
ae5c130c 5260 if (!SIZE_ONLY) {
3568d838
JH
5261 IV i;
5262
5263 if (prevvalue < 256) {
1df70142 5264 const IV ceilvalue = value < 256 ? value : 255;
3568d838
JH
5265
5266#ifdef EBCDIC
1b2d223b
JH
5267 /* In EBCDIC [\x89-\x91] should include
5268 * the \x8e but [i-j] should not. */
5269 if (literal_endpoint == 2 &&
5270 ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
5271 (isUPPER(prevvalue) && isUPPER(ceilvalue))))
ffc61ed2 5272 {
3568d838
JH
5273 if (isLOWER(prevvalue)) {
5274 for (i = prevvalue; i <= ceilvalue; i++)
ffc61ed2
JH
5275 if (isLOWER(i))
5276 ANYOF_BITMAP_SET(ret, i);
5277 } else {
3568d838 5278 for (i = prevvalue; i <= ceilvalue; i++)
ffc61ed2
JH
5279 if (isUPPER(i))
5280 ANYOF_BITMAP_SET(ret, i);
5281 }
8ada0baa 5282 }
ffc61ed2 5283 else
8ada0baa 5284#endif
a5961de5
JH
5285 for (i = prevvalue; i <= ceilvalue; i++)
5286 ANYOF_BITMAP_SET(ret, i);
3568d838 5287 }
a5961de5 5288 if (value > 255 || UTF) {
1df70142
AL
5289 const UV prevnatvalue = NATIVE_TO_UNI(prevvalue);
5290 const UV natvalue = NATIVE_TO_UNI(value);
b08decb7 5291
ffc61ed2 5292 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
b08decb7 5293 if (prevnatvalue < natvalue) { /* what about > ? */
ffc61ed2 5294 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
b08decb7
JH
5295 prevnatvalue, natvalue);
5296 }
5297 else if (prevnatvalue == natvalue) {
5298 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue);
09091399 5299 if (FOLD) {
89ebb4a3 5300 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
254ba52a 5301 STRLEN foldlen;
1df70142 5302 const UV f = to_uni_fold(natvalue, foldbuf, &foldlen);
254ba52a 5303
c840d2a2
JH
5304 /* If folding and foldable and a single
5305 * character, insert also the folded version
5306 * to the charclass. */
9e55ce06 5307 if (f != value) {
eb160463 5308 if (foldlen == (STRLEN)UNISKIP(f))
9e55ce06
JH
5309 Perl_sv_catpvf(aTHX_ listsv,
5310 "%04"UVxf"\n", f);
5311 else {
5312 /* Any multicharacter foldings
5313 * require the following transform:
5314 * [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst)
5315 * where E folds into "pq" and F folds
5316 * into "rst", all other characters
5317 * fold to single characters. We save
5318 * away these multicharacter foldings,
5319 * to be later saved as part of the
5320 * additional "s" data. */
5321 SV *sv;
5322
5323 if (!unicode_alternate)
5324 unicode_alternate = newAV();
5325 sv = newSVpvn((char*)foldbuf, foldlen);
5326 SvUTF8_on(sv);
5327 av_push(unicode_alternate, sv);
5328 }
5329 }
254ba52a 5330
60a8b682
JH
5331 /* If folding and the value is one of the Greek
5332 * sigmas insert a few more sigmas to make the
5333 * folding rules of the sigmas to work right.
5334 * Note that not all the possible combinations
5335 * are handled here: some of them are handled
9e55ce06
JH
5336 * by the standard folding rules, and some of
5337 * them (literal or EXACTF cases) are handled
5338 * during runtime in regexec.c:S_find_byclass(). */
09091399
JH
5339 if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) {
5340 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 5341 (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA);
09091399 5342 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 5343 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
09091399
JH
5344 }
5345 else if (value == UNICODE_GREEK_CAPITAL_LETTER_SIGMA)
5346 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 5347 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
09091399
JH
5348 }
5349 }
ffc61ed2 5350 }
1b2d223b
JH
5351#ifdef EBCDIC
5352 literal_endpoint = 0;
5353#endif
8ada0baa 5354 }
ffc61ed2
JH
5355
5356 range = 0; /* this range (if it was one) is done now */
a0d0e21e 5357 }
ffc61ed2 5358
936ed897 5359 if (need_class) {
4f66b38d 5360 ANYOF_FLAGS(ret) |= ANYOF_LARGE;
936ed897 5361 if (SIZE_ONLY)
830247a4 5362 RExC_size += ANYOF_CLASS_ADD_SKIP;
936ed897 5363 else
830247a4 5364 RExC_emit += ANYOF_CLASS_ADD_SKIP;
936ed897 5365 }
ffc61ed2 5366
ae5c130c 5367 /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
b8c5462f 5368 if (!SIZE_ONLY &&
ffc61ed2 5369 /* If the only flag is folding (plus possibly inversion). */
516a5887
JH
5370 ((ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD)
5371 ) {
a0ed51b3 5372 for (value = 0; value < 256; ++value) {
936ed897 5373 if (ANYOF_BITMAP_TEST(ret, value)) {
eb160463 5374 UV fold = PL_fold[value];
ffc61ed2
JH
5375
5376 if (fold != value)
5377 ANYOF_BITMAP_SET(ret, fold);
ae5c130c
GS
5378 }
5379 }
936ed897 5380 ANYOF_FLAGS(ret) &= ~ANYOF_FOLD;
ae5c130c 5381 }
ffc61ed2 5382
ae5c130c 5383 /* optimize inverted simple patterns (e.g. [^a-z]) */
3568d838 5384 if (!SIZE_ONLY && optimize_invert &&
ffc61ed2
JH
5385 /* If the only flag is inversion. */
5386 (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
b8c5462f 5387 for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
936ed897 5388 ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL;
1aa99e6b 5389 ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
ae5c130c 5390 }
a0d0e21e 5391
b81d288d 5392 if (!SIZE_ONLY) {
097eb12c 5393 AV * const av = newAV();
ffc61ed2
JH
5394 SV *rv;
5395
9e55ce06 5396 /* The 0th element stores the character class description
6a0407ee 5397 * in its textual form: used later (regexec.c:Perl_regclass_swash())
9e55ce06
JH
5398 * to initialize the appropriate swash (which gets stored in
5399 * the 1st element), and also useful for dumping the regnode.
5400 * The 2nd element stores the multicharacter foldings,
6a0407ee 5401 * used later (regexec.c:S_reginclass()). */
ffc61ed2
JH
5402 av_store(av, 0, listsv);
5403 av_store(av, 1, NULL);
9e55ce06 5404 av_store(av, 2, (SV*)unicode_alternate);
ffc61ed2 5405 rv = newRV_noinc((SV*)av);
19860706 5406 n = add_data(pRExC_state, 1, "s");
830247a4 5407 RExC_rx->data->data[n] = (void*)rv;
ffc61ed2 5408 ARG_SET(ret, n);
a0ed51b3
LW
5409 }
5410
5411 return ret;
5412}
5413
76e3520e 5414STATIC char*
830247a4 5415S_nextchar(pTHX_ RExC_state_t *pRExC_state)
a0d0e21e 5416{
097eb12c 5417 char* const retval = RExC_parse++;
a0d0e21e 5418
4633a7c4 5419 for (;;) {
830247a4
IZ
5420 if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
5421 RExC_parse[2] == '#') {
e994fd66
AE
5422 while (*RExC_parse != ')') {
5423 if (RExC_parse == RExC_end)
5424 FAIL("Sequence (?#... not terminated");
830247a4 5425 RExC_parse++;
e994fd66 5426 }
830247a4 5427 RExC_parse++;
4633a7c4
LW
5428 continue;
5429 }
e2509266 5430 if (RExC_flags & PMf_EXTENDED) {
830247a4
IZ
5431 if (isSPACE(*RExC_parse)) {
5432 RExC_parse++;
748a9306
LW
5433 continue;
5434 }
830247a4 5435 else if (*RExC_parse == '#') {
e994fd66
AE
5436 while (RExC_parse < RExC_end)
5437 if (*RExC_parse++ == '\n') break;
748a9306
LW
5438 continue;
5439 }
748a9306 5440 }
4633a7c4 5441 return retval;
a0d0e21e 5442 }
a687059c
LW
5443}
5444
5445/*
c277df42 5446- reg_node - emit a node
a0d0e21e 5447*/
76e3520e 5448STATIC regnode * /* Location. */
830247a4 5449S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
a687059c 5450{
97aff369 5451 dVAR;
c277df42 5452 register regnode *ptr;
504618e9 5453 regnode * const ret = RExC_emit;
a687059c 5454
c277df42 5455 if (SIZE_ONLY) {
830247a4
IZ
5456 SIZE_ALIGN(RExC_size);
5457 RExC_size += 1;
a0d0e21e
LW
5458 return(ret);
5459 }
a687059c 5460
c277df42 5461 NODE_ALIGN_FILL(ret);
a0d0e21e 5462 ptr = ret;
c277df42 5463 FILL_ADVANCE_NODE(ptr, op);
fac92740 5464 if (RExC_offsets) { /* MJD */
ccb2c380 5465 MJD_OFFSET_DEBUG(("%s:%u: (op %s) %s %u <- %u (len %u) (max %u).\n",
fac92740
MJD
5466 "reg_node", __LINE__,
5467 reg_name[op],
5468 RExC_emit - RExC_emit_start > RExC_offsets[0]
5469 ? "Overwriting end of array!\n" : "OK",
5470 RExC_emit - RExC_emit_start,
5471 RExC_parse - RExC_start,
5472 RExC_offsets[0]));
ccb2c380 5473 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
fac92740
MJD
5474 }
5475
830247a4 5476 RExC_emit = ptr;
a687059c 5477
a0d0e21e 5478 return(ret);
a687059c
LW
5479}
5480
5481/*
a0d0e21e
LW
5482- reganode - emit a node with an argument
5483*/
76e3520e 5484STATIC regnode * /* Location. */
830247a4 5485S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
fe14fcc3 5486{
97aff369 5487 dVAR;
c277df42 5488 register regnode *ptr;
504618e9 5489 regnode * const ret = RExC_emit;
fe14fcc3 5490
c277df42 5491 if (SIZE_ONLY) {
830247a4
IZ
5492 SIZE_ALIGN(RExC_size);
5493 RExC_size += 2;
a0d0e21e
LW
5494 return(ret);
5495 }
fe14fcc3 5496
c277df42 5497 NODE_ALIGN_FILL(ret);
a0d0e21e 5498 ptr = ret;
c277df42 5499 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
fac92740 5500 if (RExC_offsets) { /* MJD */
ccb2c380 5501 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %u <- %u (max %u).\n",
fac92740 5502 "reganode",
ccb2c380
MP
5503 __LINE__,
5504 reg_name[op],
fac92740
MJD
5505 RExC_emit - RExC_emit_start > RExC_offsets[0] ?
5506 "Overwriting end of array!\n" : "OK",
5507 RExC_emit - RExC_emit_start,
5508 RExC_parse - RExC_start,
5509 RExC_offsets[0]));
ccb2c380 5510 Set_Cur_Node_Offset;
fac92740
MJD
5511 }
5512
830247a4 5513 RExC_emit = ptr;
fe14fcc3 5514
a0d0e21e 5515 return(ret);
fe14fcc3
LW
5516}
5517
5518/*
cd439c50 5519- reguni - emit (if appropriate) a Unicode character
a0ed51b3
LW
5520*/
5521STATIC void
a28509cc 5522S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s, STRLEN* lenp)
a0ed51b3 5523{
97aff369 5524 dVAR;
5e12f4fb 5525 *lenp = SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
a0ed51b3
LW
5526}
5527
5528/*
a0d0e21e
LW
5529- reginsert - insert an operator in front of already-emitted operand
5530*
5531* Means relocating the operand.
5532*/
76e3520e 5533STATIC void
830247a4 5534S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd)
a687059c 5535{
97aff369 5536 dVAR;
c277df42
IZ
5537 register regnode *src;
5538 register regnode *dst;
5539 register regnode *place;
504618e9 5540 const int offset = regarglen[(U8)op];
b81d288d 5541
22c35a8c 5542/* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
c277df42
IZ
5543
5544 if (SIZE_ONLY) {
830247a4 5545 RExC_size += NODE_STEP_REGNODE + offset;
a0d0e21e
LW
5546 return;
5547 }
a687059c 5548
830247a4
IZ
5549 src = RExC_emit;
5550 RExC_emit += NODE_STEP_REGNODE + offset;
5551 dst = RExC_emit;
fac92740 5552 while (src > opnd) {
c277df42 5553 StructCopy(--src, --dst, regnode);
fac92740 5554 if (RExC_offsets) { /* MJD 20010112 */
ccb2c380 5555 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %u -> %u (max %u).\n",
fac92740 5556 "reg_insert",
ccb2c380
MP
5557 __LINE__,
5558 reg_name[op],
fac92740
MJD
5559 dst - RExC_emit_start > RExC_offsets[0]
5560 ? "Overwriting end of array!\n" : "OK",
5561 src - RExC_emit_start,
5562 dst - RExC_emit_start,
5563 RExC_offsets[0]));
ccb2c380
MP
5564 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
5565 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
fac92740
MJD
5566 }
5567 }
5568
a0d0e21e
LW
5569
5570 place = opnd; /* Op node, where operand used to be. */
fac92740 5571 if (RExC_offsets) { /* MJD */
ccb2c380 5572 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %u <- %u (max %u).\n",
fac92740 5573 "reginsert",
ccb2c380
MP
5574 __LINE__,
5575 reg_name[op],
fac92740
MJD
5576 place - RExC_emit_start > RExC_offsets[0]
5577 ? "Overwriting end of array!\n" : "OK",
5578 place - RExC_emit_start,
5579 RExC_parse - RExC_start,
5580 RExC_offsets[0]));
ccb2c380 5581 Set_Node_Offset(place, RExC_parse);
45948336 5582 Set_Node_Length(place, 1);
fac92740 5583 }
c277df42
IZ
5584 src = NEXTOPER(place);
5585 FILL_ADVANCE_NODE(place, op);
5586 Zero(src, offset, regnode);
a687059c
LW
5587}
5588
5589/*
c277df42 5590- regtail - set the next-pointer at the end of a node chain of p to val.
a0d0e21e 5591*/
097eb12c 5592/* TODO: All three parms should be const */
76e3520e 5593STATIC void
097eb12c 5594S_regtail(pTHX_ const RExC_state_t *pRExC_state, regnode *p, const regnode *val)
a687059c 5595{
97aff369 5596 dVAR;
c277df42 5597 register regnode *scan;
a0d0e21e 5598
c277df42 5599 if (SIZE_ONLY)
a0d0e21e
LW
5600 return;
5601
5602 /* Find last node. */
5603 scan = p;
5604 for (;;) {
504618e9 5605 regnode * const temp = regnext(scan);
a0d0e21e
LW
5606 if (temp == NULL)
5607 break;
5608 scan = temp;
5609 }
a687059c 5610
c277df42
IZ
5611 if (reg_off_by_arg[OP(scan)]) {
5612 ARG_SET(scan, val - scan);
a0ed51b3
LW
5613 }
5614 else {
c277df42
IZ
5615 NEXT_OFF(scan) = val - scan;
5616 }
a687059c
LW
5617}
5618
5619/*
a0d0e21e
LW
5620- regoptail - regtail on operand of first argument; nop if operandless
5621*/
097eb12c 5622/* TODO: All three parms should be const */
76e3520e 5623STATIC void
097eb12c 5624S_regoptail(pTHX_ const RExC_state_t *pRExC_state, regnode *p, const regnode *val)
a687059c 5625{
97aff369 5626 dVAR;
a0d0e21e 5627 /* "Operandless" and "op != BRANCH" are synonymous in practice. */
c277df42
IZ
5628 if (p == NULL || SIZE_ONLY)
5629 return;
22c35a8c 5630 if (PL_regkind[(U8)OP(p)] == BRANCH) {
830247a4 5631 regtail(pRExC_state, NEXTOPER(p), val);
a0ed51b3 5632 }
22c35a8c 5633 else if ( PL_regkind[(U8)OP(p)] == BRANCHJ) {
830247a4 5634 regtail(pRExC_state, NEXTOPER(NEXTOPER(p)), val);
a0ed51b3
LW
5635 }
5636 else
a0d0e21e 5637 return;
a687059c
LW
5638}
5639
5640/*
5641 - regcurly - a little FSA that accepts {\d+,?\d*}
5642 */
79072805 5643STATIC I32
5f66b61c 5644S_regcurly(register const char *s)
a687059c
LW
5645{
5646 if (*s++ != '{')
5647 return FALSE;
f0fcb552 5648 if (!isDIGIT(*s))
a687059c 5649 return FALSE;
f0fcb552 5650 while (isDIGIT(*s))
a687059c
LW
5651 s++;
5652 if (*s == ',')
5653 s++;
f0fcb552 5654 while (isDIGIT(*s))
a687059c
LW
5655 s++;
5656 if (*s != '}')
5657 return FALSE;
5658 return TRUE;
5659}
5660
a687059c
LW
5661
5662/*
fd181c75 5663 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
a687059c
LW
5664 */
5665void
097eb12c 5666Perl_regdump(pTHX_ const regexp *r)
a687059c 5667{
35ff7856 5668#ifdef DEBUGGING
97aff369 5669 dVAR;
c445ea15 5670 SV * const sv = sv_newmortal();
a687059c 5671
4f639d21 5672 (void)dumpuntil(r, r->program, r->program + 1, NULL, sv, 0);
a0d0e21e
LW
5673
5674 /* Header fields of interest. */
c277df42 5675 if (r->anchored_substr)
7b0972df 5676 PerlIO_printf(Perl_debug_log,
a0288114 5677 "anchored \"%s%.*s%s\"%s at %"IVdf" ",
3280af22 5678 PL_colors[0],
7b0972df 5679 (int)(SvCUR(r->anchored_substr) - (SvTAIL(r->anchored_substr)!=0)),
3f7c398e 5680 SvPVX_const(r->anchored_substr),
3280af22 5681 PL_colors[1],
c277df42 5682 SvTAIL(r->anchored_substr) ? "$" : "",
7b0972df 5683 (IV)r->anchored_offset);
33b8afdf
JH
5684 else if (r->anchored_utf8)
5685 PerlIO_printf(Perl_debug_log,
a0288114 5686 "anchored utf8 \"%s%.*s%s\"%s at %"IVdf" ",
33b8afdf
JH
5687 PL_colors[0],
5688 (int)(SvCUR(r->anchored_utf8) - (SvTAIL(r->anchored_utf8)!=0)),
3f7c398e 5689 SvPVX_const(r->anchored_utf8),
33b8afdf
JH
5690 PL_colors[1],
5691 SvTAIL(r->anchored_utf8) ? "$" : "",
5692 (IV)r->anchored_offset);
c277df42 5693 if (r->float_substr)
7b0972df 5694 PerlIO_printf(Perl_debug_log,
a0288114 5695 "floating \"%s%.*s%s\"%s at %"IVdf"..%"UVuf" ",
3280af22 5696 PL_colors[0],
b81d288d 5697 (int)(SvCUR(r->float_substr) - (SvTAIL(r->float_substr)!=0)),
3f7c398e 5698 SvPVX_const(r->float_substr),
3280af22 5699 PL_colors[1],
c277df42 5700 SvTAIL(r->float_substr) ? "$" : "",
7b0972df 5701 (IV)r->float_min_offset, (UV)r->float_max_offset);
33b8afdf
JH
5702 else if (r->float_utf8)
5703 PerlIO_printf(Perl_debug_log,
a0288114 5704 "floating utf8 \"%s%.*s%s\"%s at %"IVdf"..%"UVuf" ",
33b8afdf
JH
5705 PL_colors[0],
5706 (int)(SvCUR(r->float_utf8) - (SvTAIL(r->float_utf8)!=0)),
3f7c398e 5707 SvPVX_const(r->float_utf8),
33b8afdf
JH
5708 PL_colors[1],
5709 SvTAIL(r->float_utf8) ? "$" : "",
5710 (IV)r->float_min_offset, (UV)r->float_max_offset);
5711 if (r->check_substr || r->check_utf8)
b81d288d
AB
5712 PerlIO_printf(Perl_debug_log,
5713 r->check_substr == r->float_substr
33b8afdf 5714 && r->check_utf8 == r->float_utf8
c277df42
IZ
5715 ? "(checking floating" : "(checking anchored");
5716 if (r->reganch & ROPT_NOSCAN)
5717 PerlIO_printf(Perl_debug_log, " noscan");
5718 if (r->reganch & ROPT_CHECK_ALL)
5719 PerlIO_printf(Perl_debug_log, " isall");
33b8afdf 5720 if (r->check_substr || r->check_utf8)
c277df42
IZ
5721 PerlIO_printf(Perl_debug_log, ") ");
5722
46fc3d4c 5723 if (r->regstclass) {
32fc9b6a 5724 regprop(r, sv, r->regstclass);
3f7c398e 5725 PerlIO_printf(Perl_debug_log, "stclass \"%s\" ", SvPVX_const(sv));
46fc3d4c 5726 }
774d564b 5727 if (r->reganch & ROPT_ANCH) {
5728 PerlIO_printf(Perl_debug_log, "anchored");
5729 if (r->reganch & ROPT_ANCH_BOL)
5730 PerlIO_printf(Perl_debug_log, "(BOL)");
c277df42
IZ
5731 if (r->reganch & ROPT_ANCH_MBOL)
5732 PerlIO_printf(Perl_debug_log, "(MBOL)");
cad2e5aa
JH
5733 if (r->reganch & ROPT_ANCH_SBOL)
5734 PerlIO_printf(Perl_debug_log, "(SBOL)");
774d564b 5735 if (r->reganch & ROPT_ANCH_GPOS)
5736 PerlIO_printf(Perl_debug_log, "(GPOS)");
5737 PerlIO_putc(Perl_debug_log, ' ');
5738 }
c277df42
IZ
5739 if (r->reganch & ROPT_GPOS_SEEN)
5740 PerlIO_printf(Perl_debug_log, "GPOS ");
a0d0e21e 5741 if (r->reganch & ROPT_SKIP)
760ac839 5742 PerlIO_printf(Perl_debug_log, "plus ");
a0d0e21e 5743 if (r->reganch & ROPT_IMPLICIT)
760ac839 5744 PerlIO_printf(Perl_debug_log, "implicit ");
760ac839 5745 PerlIO_printf(Perl_debug_log, "minlen %ld ", (long) r->minlen);
ce862d02
IZ
5746 if (r->reganch & ROPT_EVAL_SEEN)
5747 PerlIO_printf(Perl_debug_log, "with eval ");
760ac839 5748 PerlIO_printf(Perl_debug_log, "\n");
fac92740 5749 if (r->offsets) {
e4584336 5750 const U32 len = r->offsets[0];
a3621e74
YO
5751 GET_RE_DEBUG_FLAGS_DECL;
5752 DEBUG_OFFSETS_r({
1df70142 5753 U32 i;
e4584336
RB
5754 PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)r->offsets[0]);
5755 for (i = 1; i <= len; i++)
5756 PerlIO_printf(Perl_debug_log, "%"UVuf"[%"UVuf"] ",
5757 (UV)r->offsets[i*2-1], (UV)r->offsets[i*2]);
5758 PerlIO_printf(Perl_debug_log, "\n");
a3621e74 5759 });
fac92740 5760 }
65e66c80 5761#else
96a5add6 5762 PERL_UNUSED_CONTEXT;
65e66c80 5763 PERL_UNUSED_ARG(r);
17c3b450 5764#endif /* DEBUGGING */
a687059c
LW
5765}
5766
5767/*
a0d0e21e
LW
5768- regprop - printable representation of opcode
5769*/
46fc3d4c 5770void
32fc9b6a 5771Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
a687059c 5772{
35ff7856 5773#ifdef DEBUGGING
97aff369 5774 dVAR;
9b155405 5775 register int k;
a0d0e21e 5776
54dc92de 5777 sv_setpvn(sv, "", 0);
9b155405 5778 if (OP(o) >= reg_num) /* regnode.type is unsigned */
830247a4
IZ
5779 /* It would be nice to FAIL() here, but this may be called from
5780 regexec.c, and it would be hard to supply pRExC_state. */
5781 Perl_croak(aTHX_ "Corrupted regexp opcode");
bfed75c6 5782 sv_catpv(sv, reg_name[OP(o)]); /* Take off const! */
9b155405
IZ
5783
5784 k = PL_regkind[(U8)OP(o)];
5785
2a782b5b 5786 if (k == EXACT) {
396482e1 5787 SV * const dsv = sv_2mortal(newSVpvs(""));
c728cb41
JH
5788 /* Using is_utf8_string() is a crude hack but it may
5789 * be the best for now since we have no flag "this EXACTish
5790 * node was UTF-8" --jhi */
1df70142 5791 const bool do_utf8 = is_utf8_string((U8*)STRING(o), STR_LEN(o));
2d03de9c 5792 const char * const s = do_utf8 ?
c728cb41
JH
5793 pv_uni_display(dsv, (U8*)STRING(o), STR_LEN(o), 60,
5794 UNI_DISPLAY_REGEX) :
2a782b5b 5795 STRING(o);
e1ec3a88 5796 const int len = do_utf8 ?
2a782b5b
JH
5797 strlen(s) :
5798 STR_LEN(o);
5799 Perl_sv_catpvf(aTHX_ sv, " <%s%.*s%s>",
5800 PL_colors[0],
5801 len, s,
5802 PL_colors[1]);
bb263b4e
AL
5803 } else if (k == TRIE) {
5804 /*EMPTY*/;
4f639d21
DM
5805 /* print the details od the trie in dumpuntil instead, as
5806 * prog->data isn't available here */
a3621e74 5807 } else if (k == CURLY) {
cb434fcc 5808 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
cea2e8a9
GS
5809 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
5810 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
a0d0e21e 5811 }
2c2d71f5
JH
5812 else if (k == WHILEM && o->flags) /* Ordinal/of */
5813 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
9b155405 5814 else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP )
894356b3 5815 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
9b155405 5816 else if (k == LOGICAL)
04ebc1ab 5817 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
653099ff
GS
5818 else if (k == ANYOF) {
5819 int i, rangestart = -1;
2d03de9c 5820 const U8 flags = ANYOF_FLAGS(o);
0bd48802
AL
5821
5822 /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
5823 static const char * const anyofs[] = {
653099ff
GS
5824 "\\w",
5825 "\\W",
5826 "\\s",
5827 "\\S",
5828 "\\d",
5829 "\\D",
5830 "[:alnum:]",
5831 "[:^alnum:]",
5832 "[:alpha:]",
5833 "[:^alpha:]",
5834 "[:ascii:]",
5835 "[:^ascii:]",
5836 "[:ctrl:]",
5837 "[:^ctrl:]",
5838 "[:graph:]",
5839 "[:^graph:]",
5840 "[:lower:]",
5841 "[:^lower:]",
5842 "[:print:]",
5843 "[:^print:]",
5844 "[:punct:]",
5845 "[:^punct:]",
5846 "[:upper:]",
aaa51d5e 5847 "[:^upper:]",
653099ff 5848 "[:xdigit:]",
aaa51d5e
JF
5849 "[:^xdigit:]",
5850 "[:space:]",
5851 "[:^space:]",
5852 "[:blank:]",
5853 "[:^blank:]"
653099ff
GS
5854 };
5855
19860706 5856 if (flags & ANYOF_LOCALE)
396482e1 5857 sv_catpvs(sv, "{loc}");
19860706 5858 if (flags & ANYOF_FOLD)
396482e1 5859 sv_catpvs(sv, "{i}");
653099ff 5860 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
19860706 5861 if (flags & ANYOF_INVERT)
396482e1 5862 sv_catpvs(sv, "^");
ffc61ed2
JH
5863 for (i = 0; i <= 256; i++) {
5864 if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
5865 if (rangestart == -1)
5866 rangestart = i;
5867 } else if (rangestart != -1) {
5868 if (i <= rangestart + 3)
5869 for (; rangestart < i; rangestart++)
653099ff 5870 put_byte(sv, rangestart);
ffc61ed2
JH
5871 else {
5872 put_byte(sv, rangestart);
396482e1 5873 sv_catpvs(sv, "-");
ffc61ed2 5874 put_byte(sv, i - 1);
653099ff 5875 }
ffc61ed2 5876 rangestart = -1;
653099ff 5877 }
847a199f 5878 }
ffc61ed2
JH
5879
5880 if (o->flags & ANYOF_CLASS)
5881 for (i = 0; i < sizeof(anyofs)/sizeof(char*); i++)
5882 if (ANYOF_CLASS_TEST(o,i))
5883 sv_catpv(sv, anyofs[i]);
5884
5885 if (flags & ANYOF_UNICODE)
396482e1 5886 sv_catpvs(sv, "{unicode}");
1aa99e6b 5887 else if (flags & ANYOF_UNICODE_ALL)
396482e1 5888 sv_catpvs(sv, "{unicode_all}");
ffc61ed2
JH
5889
5890 {
5891 SV *lv;
32fc9b6a 5892 SV * const sw = regclass_swash(prog, o, FALSE, &lv, 0);
b81d288d 5893
ffc61ed2
JH
5894 if (lv) {
5895 if (sw) {
89ebb4a3 5896 U8 s[UTF8_MAXBYTES_CASE+1];
b81d288d 5897
ffc61ed2 5898 for (i = 0; i <= 256; i++) { /* just the first 256 */
1df70142 5899 uvchr_to_utf8(s, i);
ffc61ed2 5900
3568d838 5901 if (i < 256 && swash_fetch(sw, s, TRUE)) {
ffc61ed2
JH
5902 if (rangestart == -1)
5903 rangestart = i;
5904 } else if (rangestart != -1) {
ffc61ed2
JH
5905 if (i <= rangestart + 3)
5906 for (; rangestart < i; rangestart++) {
2d03de9c
AL
5907 const U8 * const e = uvchr_to_utf8(s,rangestart);
5908 U8 *p;
5909 for(p = s; p < e; p++)
ffc61ed2
JH
5910 put_byte(sv, *p);
5911 }
5912 else {
2d03de9c
AL
5913 const U8 *e = uvchr_to_utf8(s,rangestart);
5914 U8 *p;
5915 for (p = s; p < e; p++)
ffc61ed2 5916 put_byte(sv, *p);
396482e1 5917 sv_catpvs(sv, "-");
2d03de9c
AL
5918 e = uvchr_to_utf8(s, i-1);
5919 for (p = s; p < e; p++)
1df70142 5920 put_byte(sv, *p);
ffc61ed2
JH
5921 }
5922 rangestart = -1;
5923 }
19860706 5924 }
ffc61ed2 5925
396482e1 5926 sv_catpvs(sv, "..."); /* et cetera */
19860706 5927 }
fde631ed 5928
ffc61ed2 5929 {
2e0de35c 5930 char *s = savesvpv(lv);
c445ea15 5931 char * const origs = s;
b81d288d 5932
ffc61ed2 5933 while(*s && *s != '\n') s++;
b81d288d 5934
ffc61ed2 5935 if (*s == '\n') {
2d03de9c 5936 const char * const t = ++s;
ffc61ed2
JH
5937
5938 while (*s) {
5939 if (*s == '\n')
5940 *s = ' ';
5941 s++;
5942 }
5943 if (s[-1] == ' ')
5944 s[-1] = 0;
5945
5946 sv_catpv(sv, t);
fde631ed 5947 }
b81d288d 5948
ffc61ed2 5949 Safefree(origs);
fde631ed
JH
5950 }
5951 }
653099ff 5952 }
ffc61ed2 5953
653099ff
GS
5954 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
5955 }
9b155405 5956 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
cea2e8a9 5957 Perl_sv_catpvf(aTHX_ sv, "[-%d]", o->flags);
65e66c80 5958#else
96a5add6 5959 PERL_UNUSED_CONTEXT;
65e66c80
SP
5960 PERL_UNUSED_ARG(sv);
5961 PERL_UNUSED_ARG(o);
17c3b450 5962#endif /* DEBUGGING */
35ff7856 5963}
a687059c 5964
cad2e5aa
JH
5965SV *
5966Perl_re_intuit_string(pTHX_ regexp *prog)
5967{ /* Assume that RE_INTUIT is set */
97aff369 5968 dVAR;
a3621e74 5969 GET_RE_DEBUG_FLAGS_DECL;
96a5add6
AL
5970 PERL_UNUSED_CONTEXT;
5971
a3621e74 5972 DEBUG_COMPILE_r(
cfd0369c 5973 {
2d03de9c 5974 const char * const s = SvPV_nolen_const(prog->check_substr
cfd0369c 5975 ? prog->check_substr : prog->check_utf8);
cad2e5aa
JH
5976
5977 if (!PL_colorset) reginitcolors();
5978 PerlIO_printf(Perl_debug_log,
a0288114 5979 "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
33b8afdf
JH
5980 PL_colors[4],
5981 prog->check_substr ? "" : "utf8 ",
5982 PL_colors[5],PL_colors[0],
cad2e5aa
JH
5983 s,
5984 PL_colors[1],
5985 (strlen(s) > 60 ? "..." : ""));
5986 } );
5987
33b8afdf 5988 return prog->check_substr ? prog->check_substr : prog->check_utf8;
cad2e5aa
JH
5989}
5990
2b69d0c2 5991void
864dbfa3 5992Perl_pregfree(pTHX_ struct regexp *r)
a687059c 5993{
27da23d5 5994 dVAR;
9e55ce06 5995#ifdef DEBUGGING
c445ea15
AL
5996 SV * const dsv = PERL_DEBUG_PAD_ZERO(0);
5997 SV * const re_debug_flags=get_sv(RE_DEBUG_FLAGS,0);
9e55ce06 5998#endif
7821416a 5999
a3621e74 6000
7821416a
IZ
6001 if (!r || (--r->refcnt > 0))
6002 return;
a3621e74 6003 DEBUG_r(if (re_debug_flags && (SvIV(re_debug_flags) & RE_DEBUG_COMPILE)) {
c445ea15 6004 const char * const s = (r->reganch & ROPT_UTF8)
e1ec3a88 6005 ? pv_uni_display(dsv, (U8*)r->precomp, r->prelen, 60, UNI_DISPLAY_REGEX)
9f369894 6006 : pv_display(dsv, r->precomp, r->prelen, 0, 60);
e1ec3a88 6007 const int len = SvCUR(dsv);
9e55ce06
JH
6008 if (!PL_colorset)
6009 reginitcolors();
6010 PerlIO_printf(Perl_debug_log,
a3621e74 6011 "%sFreeing REx:%s %s%*.*s%s%s\n",
9e55ce06
JH
6012 PL_colors[4],PL_colors[5],PL_colors[0],
6013 len, len, s,
6014 PL_colors[1],
6015 len > 60 ? "..." : "");
6016 });
cad2e5aa 6017
43c5f42d
NC
6018 /* gcov results gave these as non-null 100% of the time, so there's no
6019 optimisation in checking them before calling Safefree */
6020 Safefree(r->precomp);
6021 Safefree(r->offsets); /* 20010421 MJD */
ed252734 6022 RX_MATCH_COPY_FREE(r);
f8c7b90f 6023#ifdef PERL_OLD_COPY_ON_WRITE
ed252734
NC
6024 if (r->saved_copy)
6025 SvREFCNT_dec(r->saved_copy);
6026#endif
a193d654
GS
6027 if (r->substrs) {
6028 if (r->anchored_substr)
6029 SvREFCNT_dec(r->anchored_substr);
33b8afdf
JH
6030 if (r->anchored_utf8)
6031 SvREFCNT_dec(r->anchored_utf8);
a193d654
GS
6032 if (r->float_substr)
6033 SvREFCNT_dec(r->float_substr);
33b8afdf
JH
6034 if (r->float_utf8)
6035 SvREFCNT_dec(r->float_utf8);
2779dcf1 6036 Safefree(r->substrs);
a193d654 6037 }
c277df42
IZ
6038 if (r->data) {
6039 int n = r->data->count;
f3548bdc
DM
6040 PAD* new_comppad = NULL;
6041 PAD* old_comppad;
4026c95a 6042 PADOFFSET refcnt;
dfad63ad 6043
c277df42 6044 while (--n >= 0) {
261faec3 6045 /* If you add a ->what type here, update the comment in regcomp.h */
c277df42
IZ
6046 switch (r->data->what[n]) {
6047 case 's':
6048 SvREFCNT_dec((SV*)r->data->data[n]);
6049 break;
653099ff
GS
6050 case 'f':
6051 Safefree(r->data->data[n]);
6052 break;
dfad63ad
HS
6053 case 'p':
6054 new_comppad = (AV*)r->data->data[n];
6055 break;
c277df42 6056 case 'o':
dfad63ad 6057 if (new_comppad == NULL)
cea2e8a9 6058 Perl_croak(aTHX_ "panic: pregfree comppad");
f3548bdc
DM
6059 PAD_SAVE_LOCAL(old_comppad,
6060 /* Watch out for global destruction's random ordering. */
c445ea15 6061 (SvTYPE(new_comppad) == SVt_PVAV) ? new_comppad : NULL
f3548bdc 6062 );
b34c0dd4 6063 OP_REFCNT_LOCK;
4026c95a
SH
6064 refcnt = OpREFCNT_dec((OP_4tree*)r->data->data[n]);
6065 OP_REFCNT_UNLOCK;
6066 if (!refcnt)
9b978d73 6067 op_free((OP_4tree*)r->data->data[n]);
9b978d73 6068
f3548bdc 6069 PAD_RESTORE_LOCAL(old_comppad);
dfad63ad
HS
6070 SvREFCNT_dec((SV*)new_comppad);
6071 new_comppad = NULL;
c277df42
IZ
6072 break;
6073 case 'n':
9e55ce06 6074 break;
a3621e74
YO
6075 case 't':
6076 {
c445ea15 6077 reg_trie_data * const trie=(reg_trie_data*)r->data->data[n];
a3621e74
YO
6078 U32 refcount;
6079 OP_REFCNT_LOCK;
e27afef8 6080 refcount = --trie->refcount;
a3621e74
YO
6081 OP_REFCNT_UNLOCK;
6082 if ( !refcount ) {
43c5f42d 6083 Safefree(trie->charmap);
a3621e74
YO
6084 if (trie->widecharmap)
6085 SvREFCNT_dec((SV*)trie->widecharmap);
43c5f42d
NC
6086 Safefree(trie->states);
6087 Safefree(trie->trans);
a3621e74
YO
6088#ifdef DEBUGGING
6089 if (trie->words)
6090 SvREFCNT_dec((SV*)trie->words);
6091 if (trie->revcharmap)
6092 SvREFCNT_dec((SV*)trie->revcharmap);
6093#endif
6094 Safefree(r->data->data[n]); /* do this last!!!! */
6095 }
6096 break;
6097 }
c277df42 6098 default:
830247a4 6099 Perl_croak(aTHX_ "panic: regfree data code '%c'", r->data->what[n]);
c277df42
IZ
6100 }
6101 }
6102 Safefree(r->data->what);
6103 Safefree(r->data);
a0d0e21e
LW
6104 }
6105 Safefree(r->startp);
6106 Safefree(r->endp);
6107 Safefree(r);
a687059c 6108}
c277df42
IZ
6109
6110/*
6111 - regnext - dig the "next" pointer out of a node
c277df42
IZ
6112 */
6113regnode *
864dbfa3 6114Perl_regnext(pTHX_ register regnode *p)
c277df42 6115{
97aff369 6116 dVAR;
c277df42
IZ
6117 register I32 offset;
6118
3280af22 6119 if (p == &PL_regdummy)
c277df42
IZ
6120 return(NULL);
6121
6122 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
6123 if (offset == 0)
6124 return(NULL);
6125
c277df42 6126 return(p+offset);
c277df42
IZ
6127}
6128
01f988be 6129STATIC void
cea2e8a9 6130S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
c277df42
IZ
6131{
6132 va_list args;
6133 STRLEN l1 = strlen(pat1);
6134 STRLEN l2 = strlen(pat2);
6135 char buf[512];
06bf62c7 6136 SV *msv;
73d840c0 6137 const char *message;
c277df42
IZ
6138
6139 if (l1 > 510)
6140 l1 = 510;
6141 if (l1 + l2 > 510)
6142 l2 = 510 - l1;
6143 Copy(pat1, buf, l1 , char);
6144 Copy(pat2, buf + l1, l2 , char);
3b818b81
GS
6145 buf[l1 + l2] = '\n';
6146 buf[l1 + l2 + 1] = '\0';
8736538c
AS
6147#ifdef I_STDARG
6148 /* ANSI variant takes additional second argument */
c277df42 6149 va_start(args, pat2);
8736538c
AS
6150#else
6151 va_start(args);
6152#endif
5a844595 6153 msv = vmess(buf, &args);
c277df42 6154 va_end(args);
cfd0369c 6155 message = SvPV_const(msv,l1);
c277df42
IZ
6156 if (l1 > 512)
6157 l1 = 512;
6158 Copy(message, buf, l1 , char);
197cf9b9 6159 buf[l1-1] = '\0'; /* Overwrite \n */
cea2e8a9 6160 Perl_croak(aTHX_ "%s", buf);
c277df42 6161}
a0ed51b3
LW
6162
6163/* XXX Here's a total kludge. But we need to re-enter for swash routines. */
6164
6165void
864dbfa3 6166Perl_save_re_context(pTHX)
b81d288d 6167{
97aff369 6168 dVAR;
1ade1aa1
NC
6169
6170 struct re_save_state *state;
6171
6172 SAVEVPTR(PL_curcop);
6173 SSGROW(SAVESTACK_ALLOC_FOR_RE_SAVE_STATE + 1);
6174
6175 state = (struct re_save_state *)(PL_savestack + PL_savestack_ix);
6176 PL_savestack_ix += SAVESTACK_ALLOC_FOR_RE_SAVE_STATE;
6177 SSPUSHINT(SAVEt_RE_STATE);
6178
46ab3289 6179 Copy(&PL_reg_state, state, 1, struct re_save_state);
1ade1aa1 6180
a0ed51b3 6181 PL_reg_start_tmp = 0;
a0ed51b3 6182 PL_reg_start_tmpl = 0;
c445ea15 6183 PL_reg_oldsaved = NULL;
a5db57d6 6184 PL_reg_oldsavedlen = 0;
a5db57d6 6185 PL_reg_maxiter = 0;
a5db57d6 6186 PL_reg_leftiter = 0;
c445ea15 6187 PL_reg_poscache = NULL;
a5db57d6 6188 PL_reg_poscache_size = 0;
1ade1aa1
NC
6189#ifdef PERL_OLD_COPY_ON_WRITE
6190 PL_nrs = NULL;
6191#endif
ada6e8a9 6192
c445ea15
AL
6193 /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
6194 if (PL_curpm) {
6195 const REGEXP * const rx = PM_GETRE(PL_curpm);
6196 if (rx) {
1df70142 6197 U32 i;
ada6e8a9 6198 for (i = 1; i <= rx->nparens; i++) {
1df70142 6199 char digits[TYPE_CHARS(long)];
e5105eda 6200 const STRLEN len = my_sprintf(digits, "%lu", (long)i);
49f27e4b
NC
6201 GV *const *const gvp
6202 = (GV**)hv_fetch(PL_defstash, digits, len, 0);
6203
b37c2d43
AL
6204 if (gvp) {
6205 GV * const gv = *gvp;
6206 if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
6207 save_scalar(gv);
49f27e4b 6208 }
ada6e8a9
AMS
6209 }
6210 }
6211 }
a0ed51b3 6212}
51371543 6213
51371543 6214static void
acfe0abc 6215clear_re(pTHX_ void *r)
51371543 6216{
97aff369 6217 dVAR;
51371543
GS
6218 ReREFCNT_dec((regexp *)r);
6219}
ffbc6a93 6220
a28509cc
AL
6221#ifdef DEBUGGING
6222
6223STATIC void
6224S_put_byte(pTHX_ SV *sv, int c)
6225{
6226 if (isCNTRL(c) || c == 255 || !isPRINT(c))
6227 Perl_sv_catpvf(aTHX_ sv, "\\%o", c);
6228 else if (c == '-' || c == ']' || c == '\\' || c == '^')
6229 Perl_sv_catpvf(aTHX_ sv, "\\%c", c);
6230 else
6231 Perl_sv_catpvf(aTHX_ sv, "%c", c);
6232}
6233
6234
b5a2f8d8
NC
6235STATIC const regnode *
6236S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
6237 const regnode *last, SV* sv, I32 l)
a28509cc 6238{
97aff369 6239 dVAR;
a28509cc 6240 register U8 op = EXACT; /* Arbitrary non-END op. */
b5a2f8d8 6241 register const regnode *next;
a28509cc
AL
6242
6243 while (op != END && (!last || node < last)) {
6244 /* While that wasn't END last time... */
6245
6246 NODE_ALIGN(node);
6247 op = OP(node);
6248 if (op == CLOSE)
6249 l--;
b5a2f8d8 6250 next = regnext((regnode *)node);
a28509cc
AL
6251 /* Where, what. */
6252 if (OP(node) == OPTIMIZED)
6253 goto after_print;
32fc9b6a 6254 regprop(r, sv, node);
a28509cc
AL
6255 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
6256 (int)(2*l + 1), "", SvPVX_const(sv));
6257 if (next == NULL) /* Next ptr. */
6258 PerlIO_printf(Perl_debug_log, "(0)");
6259 else
6260 PerlIO_printf(Perl_debug_log, "(%"IVdf")", (IV)(next - start));
6261 (void)PerlIO_putc(Perl_debug_log, '\n');
6262 after_print:
6263 if (PL_regkind[(U8)op] == BRANCHJ) {
b5a2f8d8
NC
6264 register const regnode *nnode = (OP(next) == LONGJMP
6265 ? regnext((regnode *)next)
6266 : next);
a28509cc
AL
6267 if (last && nnode > last)
6268 nnode = last;
4f639d21 6269 node = dumpuntil(r, start, NEXTOPER(NEXTOPER(node)), nnode, sv, l + 1);
a28509cc
AL
6270 }
6271 else if (PL_regkind[(U8)op] == BRANCH) {
4f639d21 6272 node = dumpuntil(r, start, NEXTOPER(node), next, sv, l + 1);
a28509cc
AL
6273 }
6274 else if ( PL_regkind[(U8)op] == TRIE ) {
6275 const I32 n = ARG(node);
4f639d21 6276 const reg_trie_data * const trie = (reg_trie_data*)r->data->data[n];
a28509cc
AL
6277 const I32 arry_len = av_len(trie->words)+1;
6278 I32 word_idx;
6279 PerlIO_printf(Perl_debug_log,
6280 "%*s[Words:%d Chars Stored:%d Unique Chars:%d States:%"IVdf"%s]\n",
6281 (int)(2*(l+3)),
6282 "",
6283 trie->wordcount,
6284 (int)trie->charcount,
6285 trie->uniquecharcount,
6286 (IV)trie->laststate-1,
6287 node->flags ? " EVAL mode" : "");
6288
6289 for (word_idx=0; word_idx < arry_len; word_idx++) {
097eb12c 6290 SV ** const elem_ptr = av_fetch(trie->words,word_idx,0);
a28509cc
AL
6291 if (elem_ptr) {
6292 PerlIO_printf(Perl_debug_log, "%*s<%s%s%s>\n",
6293 (int)(2*(l+4)), "",
6294 PL_colors[0],
cfd0369c 6295 SvPV_nolen_const(*elem_ptr),
a28509cc
AL
6296 PL_colors[1]
6297 );
6298 /*
6299 if (next == NULL)
6300 PerlIO_printf(Perl_debug_log, "(0)\n");
6301 else
6302 PerlIO_printf(Perl_debug_log, "(%"IVdf")\n", (IV)(next - start));
6303 */
6304 }
6305
6306 }
6307
6308 node = NEXTOPER(node);
6309 node += regarglen[(U8)op];
6310
6311 }
6312 else if ( op == CURLY) { /* "next" might be very big: optimizer */
4f639d21 6313 node = dumpuntil(r, start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
a28509cc
AL
6314 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1, sv, l + 1);
6315 }
6316 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
4f639d21 6317 node = dumpuntil(r, start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
a28509cc
AL
6318 next, sv, l + 1);
6319 }
6320 else if ( op == PLUS || op == STAR) {
4f639d21 6321 node = dumpuntil(r, start, NEXTOPER(node), NEXTOPER(node) + 1, sv, l + 1);
a28509cc
AL
6322 }
6323 else if (op == ANYOF) {
6324 /* arglen 1 + class block */
6325 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_LARGE)
6326 ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
6327 node = NEXTOPER(node);
6328 }
6329 else if (PL_regkind[(U8)op] == EXACT) {
6330 /* Literal string, where present. */
6331 node += NODE_SZ_STR(node) - 1;
6332 node = NEXTOPER(node);
6333 }
6334 else {
6335 node = NEXTOPER(node);
6336 node += regarglen[(U8)op];
6337 }
6338 if (op == CURLYX || op == OPEN)
6339 l++;
6340 else if (op == WHILEM)
6341 l--;
6342 }
6343 return node;
6344}
6345
6346#endif /* DEBUGGING */
6347
241d1a3b
NC
6348/*
6349 * Local variables:
6350 * c-indentation-style: bsd
6351 * c-basic-offset: 4
6352 * indent-tabs-mode: t
6353 * End:
6354 *
37442d52
RGS
6355 * ex: set ts=8 sts=4 sw=4 noet:
6356 */