This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Fix implementation of threads->_handle on unixes.
[perl5.git] / regcomp.c
CommitLineData
a0d0e21e
LW
1/* regcomp.c
2 */
3
4/*
5 * "A fair jaw-cracker dwarf-language must be." --Samwise Gamgee
6 */
7
61296642
DM
8/* This file contains functions for compiling a regular expression. See
9 * also regexec.c which funnily enough, contains functions for executing
166f8a29 10 * a regular expression.
e4a054ea
DM
11 *
12 * This file is also copied at build time to ext/re/re_comp.c, where
13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14 * This causes the main functions to be compiled under new names and with
15 * debugging support added, which makes "use re 'debug'" work.
166f8a29
DM
16 */
17
a687059c
LW
18/* NOTE: this is derived from Henry Spencer's regexp code, and should not
19 * confused with the original package (see point 3 below). Thanks, Henry!
20 */
21
22/* Additional note: this code is very heavily munged from Henry's version
23 * in places. In some spots I've traded clarity for efficiency, so don't
24 * blame Henry for some of the lack of readability.
25 */
26
e50aee73
AD
27/* The names of the functions have been changed from regcomp and
28 * regexec to pregcomp and pregexec in order to avoid conflicts
29 * with the POSIX routines of the same names.
30*/
31
b9d5759e
AD
32#ifdef PERL_EXT_RE_BUILD
33/* need to replace pregcomp et al, so enable that */
34# ifndef PERL_IN_XSUB_RE
35# define PERL_IN_XSUB_RE
36# endif
37/* need access to debugger hooks */
cad2e5aa 38# if defined(PERL_EXT_RE_DEBUG) && !defined(DEBUGGING)
b9d5759e
AD
39# define DEBUGGING
40# endif
41#endif
42
43#ifdef PERL_IN_XSUB_RE
d06ea78c 44/* We *really* need to overwrite these symbols: */
56953603
IZ
45# define Perl_pregcomp my_regcomp
46# define Perl_regdump my_regdump
47# define Perl_regprop my_regprop
d06ea78c 48# define Perl_pregfree my_regfree
cad2e5aa
JH
49# define Perl_re_intuit_string my_re_intuit_string
50/* *These* symbols are masked to allow static link. */
d06ea78c 51# define Perl_regnext my_regnext
f0b8d043 52# define Perl_save_re_context my_save_re_context
b81d288d 53# define Perl_reginitcolors my_reginitcolors
c5be433b
GS
54
55# define PERL_NO_GET_CONTEXT
b81d288d 56#endif
56953603 57
a687059c 58/*
e50aee73 59 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
60 *
61 * Copyright (c) 1986 by University of Toronto.
62 * Written by Henry Spencer. Not derived from licensed software.
63 *
64 * Permission is granted to anyone to use this software for any
65 * purpose on any computer system, and to redistribute it freely,
66 * subject to the following restrictions:
67 *
68 * 1. The author is not responsible for the consequences of use of
69 * this software, no matter how awful, even if they arise
70 * from defects in it.
71 *
72 * 2. The origin of this software must not be misrepresented, either
73 * by explicit claim or by omission.
74 *
75 * 3. Altered versions must be plainly marked as such, and must not
76 * be misrepresented as being the original software.
77 *
78 *
79 **** Alterations to Henry's code are...
80 ****
4bb101f2 81 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
b94e2f88 82 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, by Larry Wall and others
a687059c 83 ****
9ef589d8
LW
84 **** You may distribute under the terms of either the GNU General Public
85 **** License or the Artistic License, as specified in the README file.
86
a687059c
LW
87 *
88 * Beware that some of this code is subtly aware of the way operator
89 * precedence is structured in regular expressions. Serious changes in
90 * regular-expression syntax might require a total rethink.
91 */
92#include "EXTERN.h"
864dbfa3 93#define PERL_IN_REGCOMP_C
a687059c 94#include "perl.h"
d06ea78c 95
acfe0abc 96#ifndef PERL_IN_XSUB_RE
d06ea78c
GS
97# include "INTERN.h"
98#endif
c277df42
IZ
99
100#define REG_COMP_C
a687059c
LW
101#include "regcomp.h"
102
d4cce5f1 103#ifdef op
11343788 104#undef op
d4cce5f1 105#endif /* op */
11343788 106
fe14fcc3 107#ifdef MSDOS
7e4e8c89 108# if defined(BUGGY_MSC6)
fe14fcc3 109 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
7e4e8c89 110# pragma optimize("a",off)
fe14fcc3 111 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
7e4e8c89
NC
112# pragma optimize("w",on )
113# endif /* BUGGY_MSC6 */
fe14fcc3
LW
114#endif /* MSDOS */
115
a687059c
LW
116#ifndef STATIC
117#define STATIC static
118#endif
119
830247a4 120typedef struct RExC_state_t {
e2509266 121 U32 flags; /* are we folding, multilining? */
830247a4
IZ
122 char *precomp; /* uncompiled string. */
123 regexp *rx;
fac92740 124 char *start; /* Start of input for compile */
830247a4
IZ
125 char *end; /* End of input for compile */
126 char *parse; /* Input-scan pointer. */
127 I32 whilem_seen; /* number of WHILEM in this expr */
fac92740 128 regnode *emit_start; /* Start of emitted-code area */
ffc61ed2 129 regnode *emit; /* Code-emit pointer; &regdummy = don't = compiling */
830247a4
IZ
130 I32 naughty; /* How bad is this pattern? */
131 I32 sawback; /* Did we see \1, ...? */
132 U32 seen;
133 I32 size; /* Code size. */
134 I32 npar; /* () count. */
135 I32 extralen;
136 I32 seen_zerolen;
137 I32 seen_evals;
1aa99e6b 138 I32 utf8;
830247a4
IZ
139#if ADD_TO_REGEXEC
140 char *starttry; /* -Dr: where regtry was called. */
141#define RExC_starttry (pRExC_state->starttry)
142#endif
143} RExC_state_t;
144
e2509266 145#define RExC_flags (pRExC_state->flags)
830247a4
IZ
146#define RExC_precomp (pRExC_state->precomp)
147#define RExC_rx (pRExC_state->rx)
fac92740 148#define RExC_start (pRExC_state->start)
830247a4
IZ
149#define RExC_end (pRExC_state->end)
150#define RExC_parse (pRExC_state->parse)
151#define RExC_whilem_seen (pRExC_state->whilem_seen)
fac92740 152#define RExC_offsets (pRExC_state->rx->offsets) /* I am not like the others */
830247a4 153#define RExC_emit (pRExC_state->emit)
fac92740 154#define RExC_emit_start (pRExC_state->emit_start)
830247a4
IZ
155#define RExC_naughty (pRExC_state->naughty)
156#define RExC_sawback (pRExC_state->sawback)
157#define RExC_seen (pRExC_state->seen)
158#define RExC_size (pRExC_state->size)
159#define RExC_npar (pRExC_state->npar)
160#define RExC_extralen (pRExC_state->extralen)
161#define RExC_seen_zerolen (pRExC_state->seen_zerolen)
162#define RExC_seen_evals (pRExC_state->seen_evals)
1aa99e6b 163#define RExC_utf8 (pRExC_state->utf8)
830247a4 164
a687059c
LW
165#define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
166#define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
167 ((*s) == '{' && regcurly(s)))
a687059c 168
35c8bce7
LW
169#ifdef SPSTART
170#undef SPSTART /* dratted cpp namespace... */
171#endif
a687059c
LW
172/*
173 * Flags to be passed up and down.
174 */
a687059c 175#define WORST 0 /* Worst case. */
821b33a5 176#define HASWIDTH 0x1 /* Known to match non-null strings. */
a0d0e21e
LW
177#define SIMPLE 0x2 /* Simple enough to be STAR/PLUS operand. */
178#define SPSTART 0x4 /* Starts with * or +. */
179#define TRYAGAIN 0x8 /* Weeded out a declaration. */
a687059c 180
2c2d71f5
JH
181/* Length of a variant. */
182
183typedef struct scan_data_t {
184 I32 len_min;
185 I32 len_delta;
186 I32 pos_min;
187 I32 pos_delta;
188 SV *last_found;
189 I32 last_end; /* min value, <0 unless valid. */
190 I32 last_start_min;
191 I32 last_start_max;
192 SV **longest; /* Either &l_fixed, or &l_float. */
193 SV *longest_fixed;
194 I32 offset_fixed;
195 SV *longest_float;
196 I32 offset_float_min;
197 I32 offset_float_max;
198 I32 flags;
199 I32 whilem_c;
cb434fcc 200 I32 *last_closep;
653099ff 201 struct regnode_charclass_class *start_class;
2c2d71f5
JH
202} scan_data_t;
203
a687059c 204/*
e50aee73 205 * Forward declarations for pregcomp()'s friends.
a687059c 206 */
a0d0e21e 207
27da23d5
JH
208static const scan_data_t zero_scan_data =
209 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
c277df42
IZ
210
211#define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
212#define SF_BEFORE_SEOL 0x1
213#define SF_BEFORE_MEOL 0x2
214#define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
215#define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
216
09b7f37c
CB
217#ifdef NO_UNARY_PLUS
218# define SF_FIX_SHIFT_EOL (0+2)
219# define SF_FL_SHIFT_EOL (0+4)
220#else
221# define SF_FIX_SHIFT_EOL (+2)
222# define SF_FL_SHIFT_EOL (+4)
223#endif
c277df42
IZ
224
225#define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
226#define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
227
228#define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
229#define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
230#define SF_IS_INF 0x40
231#define SF_HAS_PAR 0x80
232#define SF_IN_PAR 0x100
233#define SF_HAS_EVAL 0x200
4bfe0158 234#define SCF_DO_SUBSTR 0x400
653099ff
GS
235#define SCF_DO_STCLASS_AND 0x0800
236#define SCF_DO_STCLASS_OR 0x1000
237#define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
e1901655 238#define SCF_WHILEM_VISITED_POS 0x2000
c277df42 239
eb160463 240#define UTF (RExC_utf8 != 0)
e2509266
JH
241#define LOC ((RExC_flags & PMf_LOCALE) != 0)
242#define FOLD ((RExC_flags & PMf_FOLD) != 0)
a0ed51b3 243
ffc61ed2 244#define OOB_UNICODE 12345678
93733859 245#define OOB_NAMEDCLASS -1
b8c5462f 246
a0ed51b3
LW
247#define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
248#define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
249
8615cb43 250
b45f050a
JF
251/* length of regex to show in messages that don't mark a position within */
252#define RegexLengthToShowInErrorMessages 127
253
254/*
255 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
256 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
257 * op/pragma/warn/regcomp.
258 */
7253e4e3
RK
259#define MARKER1 "<-- HERE" /* marker as it appears in the description */
260#define MARKER2 " <-- HERE " /* marker as it appears within the regex */
b81d288d 261
7253e4e3 262#define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
b45f050a
JF
263
264/*
265 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
266 * arg. Show regex, up to a maximum length. If it's too long, chop and add
267 * "...".
268 */
ccb2c380 269#define FAIL(msg) STMT_START { \
bfed75c6 270 const char *ellipses = ""; \
ccb2c380
MP
271 IV len = RExC_end - RExC_precomp; \
272 \
273 if (!SIZE_ONLY) \
274 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
275 if (len > RegexLengthToShowInErrorMessages) { \
276 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
277 len = RegexLengthToShowInErrorMessages - 10; \
278 ellipses = "..."; \
279 } \
280 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
281 msg, (int)len, RExC_precomp, ellipses); \
282} STMT_END
8615cb43 283
b45f050a 284/*
b45f050a
JF
285 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
286 */
ccb2c380 287#define Simple_vFAIL(m) STMT_START { \
a28509cc 288 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
289 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
290 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
291} STMT_END
b45f050a
JF
292
293/*
294 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
295 */
ccb2c380
MP
296#define vFAIL(m) STMT_START { \
297 if (!SIZE_ONLY) \
298 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
299 Simple_vFAIL(m); \
300} STMT_END
b45f050a
JF
301
302/*
303 * Like Simple_vFAIL(), but accepts two arguments.
304 */
ccb2c380 305#define Simple_vFAIL2(m,a1) STMT_START { \
a28509cc 306 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
307 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
308 (int)offset, RExC_precomp, RExC_precomp + offset); \
309} STMT_END
b45f050a
JF
310
311/*
312 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
313 */
ccb2c380
MP
314#define vFAIL2(m,a1) STMT_START { \
315 if (!SIZE_ONLY) \
316 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
317 Simple_vFAIL2(m, a1); \
318} STMT_END
b45f050a
JF
319
320
321/*
322 * Like Simple_vFAIL(), but accepts three arguments.
323 */
ccb2c380 324#define Simple_vFAIL3(m, a1, a2) STMT_START { \
a28509cc 325 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
326 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
327 (int)offset, RExC_precomp, RExC_precomp + offset); \
328} STMT_END
b45f050a
JF
329
330/*
331 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
332 */
ccb2c380
MP
333#define vFAIL3(m,a1,a2) STMT_START { \
334 if (!SIZE_ONLY) \
335 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
336 Simple_vFAIL3(m, a1, a2); \
337} STMT_END
b45f050a
JF
338
339/*
340 * Like Simple_vFAIL(), but accepts four arguments.
341 */
ccb2c380 342#define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
a28509cc 343 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
344 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
345 (int)offset, RExC_precomp, RExC_precomp + offset); \
346} STMT_END
b45f050a 347
ccb2c380 348#define vWARN(loc,m) STMT_START { \
a28509cc 349 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
350 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \
351 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
352} STMT_END
353
354#define vWARNdep(loc,m) STMT_START { \
a28509cc 355 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
356 Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
357 "%s" REPORT_LOCATION, \
358 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
359} STMT_END
360
361
362#define vWARN2(loc, m, a1) STMT_START { \
a28509cc 363 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
364 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
365 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
366} STMT_END
367
368#define vWARN3(loc, m, a1, a2) STMT_START { \
a28509cc 369 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
370 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
371 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
372} STMT_END
373
374#define vWARN4(loc, m, a1, a2, a3) STMT_START { \
a28509cc 375 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
376 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
377 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
378} STMT_END
379
380#define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
a28509cc 381 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
382 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
383 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
384} STMT_END
9d1d55b5 385
8615cb43 386
cd439c50 387/* Allow for side effects in s */
ccb2c380
MP
388#define REGC(c,s) STMT_START { \
389 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
390} STMT_END
cd439c50 391
fac92740
MJD
392/* Macros for recording node offsets. 20001227 mjd@plover.com
393 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
394 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
395 * Element 0 holds the number n.
396 */
397
398#define MJD_OFFSET_DEBUG(x)
a3621e74 399/* #define MJD_OFFSET_DEBUG(x) DEBUG_r(Perl_warn_nocontext x) */
ccb2c380
MP
400
401
402#define Set_Node_Offset_To_R(node,byte) STMT_START { \
403 if (! SIZE_ONLY) { \
404 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
405 __LINE__, (node), (byte))); \
406 if((node) < 0) { \
551405c4 407 Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
ccb2c380
MP
408 } else { \
409 RExC_offsets[2*(node)-1] = (byte); \
410 } \
411 } \
412} STMT_END
413
414#define Set_Node_Offset(node,byte) \
415 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
416#define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
417
418#define Set_Node_Length_To_R(node,len) STMT_START { \
419 if (! SIZE_ONLY) { \
420 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
551405c4 421 __LINE__, (int)(node), (int)(len))); \
ccb2c380 422 if((node) < 0) { \
551405c4 423 Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
ccb2c380
MP
424 } else { \
425 RExC_offsets[2*(node)] = (len); \
426 } \
427 } \
428} STMT_END
429
430#define Set_Node_Length(node,len) \
431 Set_Node_Length_To_R((node)-RExC_emit_start, len)
432#define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
433#define Set_Node_Cur_Length(node) \
434 Set_Node_Length(node, RExC_parse - parse_start)
fac92740
MJD
435
436/* Get offsets and lengths */
437#define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
438#define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
439
acfe0abc 440static void clear_re(pTHX_ void *r);
4327152a 441
653099ff
GS
442/* Mark that we cannot extend a found fixed substring at this point.
443 Updata the longest found anchored substring and the longest found
444 floating substrings if needed. */
445
4327152a 446STATIC void
097eb12c 447S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data)
c277df42 448{
e1ec3a88
AL
449 const STRLEN l = CHR_SVLEN(data->last_found);
450 const STRLEN old_l = CHR_SVLEN(*data->longest);
b81d288d 451
c277df42 452 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
6b43b216 453 SvSetMagicSV(*data->longest, data->last_found);
c277df42
IZ
454 if (*data->longest == data->longest_fixed) {
455 data->offset_fixed = l ? data->last_start_min : data->pos_min;
456 if (data->flags & SF_BEFORE_EOL)
b81d288d 457 data->flags
c277df42
IZ
458 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
459 else
460 data->flags &= ~SF_FIX_BEFORE_EOL;
a0ed51b3
LW
461 }
462 else {
c277df42 463 data->offset_float_min = l ? data->last_start_min : data->pos_min;
b81d288d
AB
464 data->offset_float_max = (l
465 ? data->last_start_max
c277df42 466 : data->pos_min + data->pos_delta);
9051bda5
HS
467 if ((U32)data->offset_float_max > (U32)I32_MAX)
468 data->offset_float_max = I32_MAX;
c277df42 469 if (data->flags & SF_BEFORE_EOL)
b81d288d 470 data->flags
c277df42
IZ
471 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
472 else
473 data->flags &= ~SF_FL_BEFORE_EOL;
474 }
475 }
476 SvCUR_set(data->last_found, 0);
0eda9292 477 {
a28509cc 478 SV * const sv = data->last_found;
097eb12c
AL
479 if (SvUTF8(sv) && SvMAGICAL(sv)) {
480 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
481 if (mg)
482 mg->mg_len = 0;
483 }
0eda9292 484 }
c277df42
IZ
485 data->last_end = -1;
486 data->flags &= ~SF_BEFORE_EOL;
487}
488
653099ff
GS
489/* Can match anything (initialization) */
490STATIC void
097eb12c 491S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 492{
653099ff 493 ANYOF_CLASS_ZERO(cl);
f8bef550 494 ANYOF_BITMAP_SETALL(cl);
1aa99e6b 495 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
653099ff
GS
496 if (LOC)
497 cl->flags |= ANYOF_LOCALE;
498}
499
500/* Can match anything (initialization) */
501STATIC int
5f66b61c 502S_cl_is_anything(const struct regnode_charclass_class *cl)
653099ff
GS
503{
504 int value;
505
aaa51d5e 506 for (value = 0; value <= ANYOF_MAX; value += 2)
653099ff
GS
507 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
508 return 1;
1aa99e6b
IH
509 if (!(cl->flags & ANYOF_UNICODE_ALL))
510 return 0;
f8bef550
NC
511 if (!ANYOF_BITMAP_TESTALLSET(cl))
512 return 0;
653099ff
GS
513 return 1;
514}
515
516/* Can match anything (initialization) */
517STATIC void
097eb12c 518S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 519{
8ecf7187 520 Zero(cl, 1, struct regnode_charclass_class);
653099ff 521 cl->type = ANYOF;
830247a4 522 cl_anything(pRExC_state, cl);
653099ff
GS
523}
524
525STATIC void
097eb12c 526S_cl_init_zero(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 527{
8ecf7187 528 Zero(cl, 1, struct regnode_charclass_class);
653099ff 529 cl->type = ANYOF;
830247a4 530 cl_anything(pRExC_state, cl);
653099ff
GS
531 if (LOC)
532 cl->flags |= ANYOF_LOCALE;
533}
534
535/* 'And' a given class with another one. Can create false positives */
536/* We assume that cl is not inverted */
537STATIC void
5f66b61c 538S_cl_and(struct regnode_charclass_class *cl,
a28509cc 539 const struct regnode_charclass_class *and_with)
653099ff 540{
653099ff
GS
541 if (!(and_with->flags & ANYOF_CLASS)
542 && !(cl->flags & ANYOF_CLASS)
543 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
544 && !(and_with->flags & ANYOF_FOLD)
545 && !(cl->flags & ANYOF_FOLD)) {
546 int i;
547
548 if (and_with->flags & ANYOF_INVERT)
549 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
550 cl->bitmap[i] &= ~and_with->bitmap[i];
551 else
552 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
553 cl->bitmap[i] &= and_with->bitmap[i];
554 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
555 if (!(and_with->flags & ANYOF_EOS))
556 cl->flags &= ~ANYOF_EOS;
1aa99e6b 557
14ebb1a2
JH
558 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE &&
559 !(and_with->flags & ANYOF_INVERT)) {
1aa99e6b
IH
560 cl->flags &= ~ANYOF_UNICODE_ALL;
561 cl->flags |= ANYOF_UNICODE;
562 ARG_SET(cl, ARG(and_with));
563 }
14ebb1a2
JH
564 if (!(and_with->flags & ANYOF_UNICODE_ALL) &&
565 !(and_with->flags & ANYOF_INVERT))
1aa99e6b 566 cl->flags &= ~ANYOF_UNICODE_ALL;
14ebb1a2
JH
567 if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)) &&
568 !(and_with->flags & ANYOF_INVERT))
1aa99e6b 569 cl->flags &= ~ANYOF_UNICODE;
653099ff
GS
570}
571
572/* 'OR' a given class with another one. Can create false positives */
573/* We assume that cl is not inverted */
574STATIC void
097eb12c 575S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
653099ff 576{
653099ff
GS
577 if (or_with->flags & ANYOF_INVERT) {
578 /* We do not use
579 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
580 * <= (B1 | !B2) | (CL1 | !CL2)
581 * which is wasteful if CL2 is small, but we ignore CL2:
582 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
583 * XXXX Can we handle case-fold? Unclear:
584 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
585 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
586 */
587 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
588 && !(or_with->flags & ANYOF_FOLD)
589 && !(cl->flags & ANYOF_FOLD) ) {
590 int i;
591
592 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
593 cl->bitmap[i] |= ~or_with->bitmap[i];
594 } /* XXXX: logic is complicated otherwise */
595 else {
830247a4 596 cl_anything(pRExC_state, cl);
653099ff
GS
597 }
598 } else {
599 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
600 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
b81d288d 601 && (!(or_with->flags & ANYOF_FOLD)
653099ff
GS
602 || (cl->flags & ANYOF_FOLD)) ) {
603 int i;
604
605 /* OR char bitmap and class bitmap separately */
606 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
607 cl->bitmap[i] |= or_with->bitmap[i];
608 if (or_with->flags & ANYOF_CLASS) {
609 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
610 cl->classflags[i] |= or_with->classflags[i];
611 cl->flags |= ANYOF_CLASS;
612 }
613 }
614 else { /* XXXX: logic is complicated, leave it along for a moment. */
830247a4 615 cl_anything(pRExC_state, cl);
653099ff
GS
616 }
617 }
618 if (or_with->flags & ANYOF_EOS)
619 cl->flags |= ANYOF_EOS;
1aa99e6b
IH
620
621 if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
622 ARG(cl) != ARG(or_with)) {
623 cl->flags |= ANYOF_UNICODE_ALL;
624 cl->flags &= ~ANYOF_UNICODE;
625 }
626 if (or_with->flags & ANYOF_UNICODE_ALL) {
627 cl->flags |= ANYOF_UNICODE_ALL;
628 cl->flags &= ~ANYOF_UNICODE;
629 }
653099ff
GS
630}
631
5d1c421c 632/*
a3621e74
YO
633
634 make_trie(startbranch,first,last,tail,flags)
635 startbranch: the first branch in the whole branch sequence
636 first : start branch of sequence of branch-exact nodes.
637 May be the same as startbranch
638 last : Thing following the last branch.
639 May be the same as tail.
640 tail : item following the branch sequence
641 flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/
642
643Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
644
645A trie is an N'ary tree where the branches are determined by digital
646decomposition of the key. IE, at the root node you look up the 1st character and
647follow that branch repeat until you find the end of the branches. Nodes can be
648marked as "accepting" meaning they represent a complete word. Eg:
649
650 /he|she|his|hers/
651
652would convert into the following structure. Numbers represent states, letters
653following numbers represent valid transitions on the letter from that state, if
654the number is in square brackets it represents an accepting state, otherwise it
655will be in parenthesis.
656
657 +-h->+-e->[3]-+-r->(8)-+-s->[9]
658 | |
659 | (2)
660 | |
661 (1) +-i->(6)-+-s->[7]
662 |
663 +-s->(3)-+-h->(4)-+-e->[5]
664
665 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
666
667This shows that when matching against the string 'hers' we will begin at state 1
668read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
669then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
670is also accepting. Thus we know that we can match both 'he' and 'hers' with a
671single traverse. We store a mapping from accepting to state to which word was
672matched, and then when we have multiple possibilities we try to complete the
673rest of the regex in the order in which they occured in the alternation.
674
675The only prior NFA like behaviour that would be changed by the TRIE support is
676the silent ignoring of duplicate alternations which are of the form:
677
678 / (DUPE|DUPE) X? (?{ ... }) Y /x
679
680Thus EVAL blocks follwing a trie may be called a different number of times with
681and without the optimisation. With the optimisations dupes will be silently
682ignored. This inconsistant behaviour of EVAL type nodes is well established as
683the following demonstrates:
684
685 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
686
687which prints out 'word' three times, but
688
689 'words'=~/(word|word|word)(?{ print $1 })S/
690
691which doesnt print it out at all. This is due to other optimisations kicking in.
692
693Example of what happens on a structural level:
694
695The regexp /(ac|ad|ab)+/ will produce the folowing debug output:
696
697 1: CURLYM[1] {1,32767}(18)
698 5: BRANCH(8)
699 6: EXACT <ac>(16)
700 8: BRANCH(11)
701 9: EXACT <ad>(16)
702 11: BRANCH(14)
703 12: EXACT <ab>(16)
704 16: SUCCEED(0)
705 17: NOTHING(18)
706 18: END(0)
707
708This would be optimizable with startbranch=5, first=5, last=16, tail=16
709and should turn into:
710
711 1: CURLYM[1] {1,32767}(18)
712 5: TRIE(16)
713 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
714 <ac>
715 <ad>
716 <ab>
717 16: SUCCEED(0)
718 17: NOTHING(18)
719 18: END(0)
720
721Cases where tail != last would be like /(?foo|bar)baz/:
722
723 1: BRANCH(4)
724 2: EXACT <foo>(8)
725 4: BRANCH(7)
726 5: EXACT <bar>(8)
727 7: TAIL(8)
728 8: EXACT <baz>(10)
729 10: END(0)
730
731which would be optimizable with startbranch=1, first=1, last=7, tail=8
732and would end up looking like:
733
734 1: TRIE(8)
735 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
736 <foo>
737 <bar>
738 7: TAIL(8)
739 8: EXACT <baz>(10)
740 10: END(0)
741
742*/
743
744#define TRIE_DEBUG_CHAR \
745 DEBUG_TRIE_COMPILE_r({ \
746 SV *tmp; \
747 if ( UTF ) { \
6136c704 748 tmp = newSVpvs( "" ); \
a3621e74
YO
749 pv_uni_display( tmp, uc, len, 60, UNI_DISPLAY_REGEX ); \
750 } else { \
e4584336 751 tmp = Perl_newSVpvf_nocontext( "%c", (int)uvc ); \
a3621e74
YO
752 } \
753 av_push( trie->revcharmap, tmp ); \
754 })
755
756#define TRIE_READ_CHAR STMT_START { \
757 if ( UTF ) { \
758 if ( folder ) { \
759 if ( foldlen > 0 ) { \
760 uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags ); \
761 foldlen -= len; \
762 scan += len; \
763 len = 0; \
764 } else { \
e1ec3a88 765 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
a3621e74
YO
766 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
767 foldlen -= UNISKIP( uvc ); \
768 scan = foldbuf + UNISKIP( uvc ); \
769 } \
770 } else { \
e1ec3a88 771 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
a3621e74
YO
772 } \
773 } else { \
774 uvc = (U32)*uc; \
775 len = 1; \
776 } \
777} STMT_END
778
779
780#define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
781#define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
782#define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
783#define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
784
785#define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
786 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
787 TRIE_LIST_LEN( state ) *= 2; \
788 Renew( trie->states[ state ].trans.list, \
789 TRIE_LIST_LEN( state ), reg_trie_trans_le ); \
790 } \
791 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
792 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
793 TRIE_LIST_CUR( state )++; \
794} STMT_END
795
796#define TRIE_LIST_NEW(state) STMT_START { \
a02a5408 797 Newxz( trie->states[ state ].trans.list, \
a3621e74
YO
798 4, reg_trie_trans_le ); \
799 TRIE_LIST_CUR( state ) = 1; \
800 TRIE_LIST_LEN( state ) = 4; \
801} STMT_END
802
803STATIC I32
804S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 flags)
805{
27da23d5 806 dVAR;
a3621e74
YO
807 /* first pass, loop through and scan words */
808 reg_trie_data *trie;
809 regnode *cur;
9f7f3913 810 const U32 uniflags = UTF8_ALLOW_DEFAULT;
a3621e74
YO
811 STRLEN len = 0;
812 UV uvc = 0;
813 U16 curword = 0;
814 U32 next_alloc = 0;
815 /* we just use folder as a flag in utf8 */
e1ec3a88 816 const U8 * const folder = ( flags == EXACTF
a3621e74
YO
817 ? PL_fold
818 : ( flags == EXACTFL
819 ? PL_fold_locale
820 : NULL
821 )
822 );
823
e1ec3a88 824 const U32 data_slot = add_data( pRExC_state, 1, "t" );
a3621e74
YO
825 SV *re_trie_maxbuff;
826
827 GET_RE_DEBUG_FLAGS_DECL;
828
a02a5408 829 Newxz( trie, 1, reg_trie_data );
a3621e74
YO
830 trie->refcount = 1;
831 RExC_rx->data->data[ data_slot ] = (void*)trie;
a02a5408 832 Newxz( trie->charmap, 256, U16 );
a3621e74
YO
833 DEBUG_r({
834 trie->words = newAV();
835 trie->revcharmap = newAV();
836 });
837
838
0111c4fd 839 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
a3621e74 840 if (!SvIOK(re_trie_maxbuff)) {
0111c4fd 841 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
a3621e74
YO
842 }
843
844 /* -- First loop and Setup --
845
846 We first traverse the branches and scan each word to determine if it
847 contains widechars, and how many unique chars there are, this is
848 important as we have to build a table with at least as many columns as we
849 have unique chars.
850
851 We use an array of integers to represent the character codes 0..255
852 (trie->charmap) and we use a an HV* to store unicode characters. We use the
853 native representation of the character value as the key and IV's for the
854 coded index.
855
856 *TODO* If we keep track of how many times each character is used we can
857 remap the columns so that the table compression later on is more
858 efficient in terms of memory by ensuring most common value is in the
859 middle and the least common are on the outside. IMO this would be better
860 than a most to least common mapping as theres a decent chance the most
861 common letter will share a node with the least common, meaning the node
862 will not be compressable. With a middle is most common approach the worst
863 case is when we have the least common nodes twice.
864
865 */
866
867
868 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
c445ea15 869 regnode * const noper = NEXTOPER( cur );
e1ec3a88 870 const U8 *uc = (U8*)STRING( noper );
a28509cc 871 const U8 * const e = uc + STR_LEN( noper );
a3621e74
YO
872 STRLEN foldlen = 0;
873 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2af232bd 874 const U8 *scan = (U8*)NULL;
a3621e74
YO
875
876 for ( ; uc < e ; uc += len ) {
877 trie->charcount++;
878 TRIE_READ_CHAR;
879 if ( uvc < 256 ) {
880 if ( !trie->charmap[ uvc ] ) {
881 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
882 if ( folder )
883 trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
884 TRIE_DEBUG_CHAR;
885 }
886 } else {
887 SV** svpp;
888 if ( !trie->widecharmap )
889 trie->widecharmap = newHV();
890
891 svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 1 );
892
893 if ( !svpp )
e4584336 894 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
a3621e74
YO
895
896 if ( !SvTRUE( *svpp ) ) {
897 sv_setiv( *svpp, ++trie->uniquecharcount );
898 TRIE_DEBUG_CHAR;
899 }
900 }
901 }
902 trie->wordcount++;
903 } /* end first pass */
904 DEBUG_TRIE_COMPILE_r(
905 PerlIO_printf( Perl_debug_log, "TRIE(%s): W:%d C:%d Uq:%d \n",
906 ( trie->widecharmap ? "UTF8" : "NATIVE" ), trie->wordcount,
5d7488b2 907 (int)trie->charcount, trie->uniquecharcount )
a3621e74
YO
908 );
909
910
911 /*
912 We now know what we are dealing with in terms of unique chars and
913 string sizes so we can calculate how much memory a naive
0111c4fd
RGS
914 representation using a flat table will take. If it's over a reasonable
915 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
a3621e74
YO
916 conservative but potentially much slower representation using an array
917 of lists.
918
919 At the end we convert both representations into the same compressed
920 form that will be used in regexec.c for matching with. The latter
921 is a form that cannot be used to construct with but has memory
922 properties similar to the list form and access properties similar
923 to the table form making it both suitable for fast searches and
924 small enough that its feasable to store for the duration of a program.
925
926 See the comment in the code where the compressed table is produced
927 inplace from the flat tabe representation for an explanation of how
928 the compression works.
929
930 */
931
932
933 if ( (IV)( ( trie->charcount + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
934 /*
935 Second Pass -- Array Of Lists Representation
936
937 Each state will be represented by a list of charid:state records
938 (reg_trie_trans_le) the first such element holds the CUR and LEN
939 points of the allocated array. (See defines above).
940
941 We build the initial structure using the lists, and then convert
942 it into the compressed table form which allows faster lookups
943 (but cant be modified once converted).
944
945
946 */
947
948
949 STRLEN transcount = 1;
950
a02a5408 951 Newxz( trie->states, trie->charcount + 2, reg_trie_state );
a3621e74
YO
952 TRIE_LIST_NEW(1);
953 next_alloc = 2;
954
955 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
956
c445ea15
AL
957 regnode * const noper = NEXTOPER( cur );
958 U8 *uc = (U8*)STRING( noper );
959 const U8 * const e = uc + STR_LEN( noper );
960 U32 state = 1; /* required init */
961 U16 charid = 0; /* sanity init */
962 U8 *scan = (U8*)NULL; /* sanity init */
963 STRLEN foldlen = 0; /* required init */
964 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
965
966 for ( ; uc < e ; uc += len ) {
967
968 TRIE_READ_CHAR;
969
970 if ( uvc < 256 ) {
971 charid = trie->charmap[ uvc ];
972 } else {
973 SV** const svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 0);
974 if ( !svpp ) {
975 charid = 0;
976 } else {
977 charid=(U16)SvIV( *svpp );
978 }
979 }
980 if ( charid ) {
a3621e74 981
c445ea15
AL
982 U16 check;
983 U32 newstate = 0;
a3621e74 984
c445ea15
AL
985 charid--;
986 if ( !trie->states[ state ].trans.list ) {
987 TRIE_LIST_NEW( state );
988 }
989 for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
990 if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
991 newstate = TRIE_LIST_ITEM( state, check ).newstate;
992 break;
993 }
994 }
995 if ( ! newstate ) {
996 newstate = next_alloc++;
997 TRIE_LIST_PUSH( state, charid, newstate );
998 transcount++;
999 }
1000 state = newstate;
1001 } else {
1002 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
a28509cc 1003 }
c445ea15
AL
1004 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1005 }
a3621e74 1006
c445ea15
AL
1007 if ( !trie->states[ state ].wordnum ) {
1008 /* we havent inserted this word into the structure yet. */
1009 trie->states[ state ].wordnum = ++curword;
a3621e74 1010
c445ea15
AL
1011 DEBUG_r({
1012 /* store the word for dumping */
1013 SV* tmp = newSVpvn( STRING( noper ), STR_LEN( noper ) );
1014 if ( UTF ) SvUTF8_on( tmp );
1015 av_push( trie->words, tmp );
1016 });
a3621e74 1017
c445ea15 1018 } else {
bb263b4e 1019 /*EMPTY*/; /* It's a dupe. So ignore it. */
c445ea15 1020 }
a3621e74
YO
1021
1022 } /* end second pass */
1023
1024 trie->laststate = next_alloc;
1025 Renew( trie->states, next_alloc, reg_trie_state );
1026
1027 DEBUG_TRIE_COMPILE_MORE_r({
1028 U32 state;
a3621e74 1029
a28509cc 1030 /* print out the table precompression. */
a3621e74
YO
1031
1032 PerlIO_printf( Perl_debug_log, "\nState :Word | Transition Data\n" );
1033 PerlIO_printf( Perl_debug_log, "------:-----+-----------------" );
1034
1035 for( state=1 ; state < next_alloc ; state ++ ) {
a28509cc 1036 U16 charid;
a3621e74 1037
e4584336 1038 PerlIO_printf( Perl_debug_log, "\n %04"UVXf" :", (UV)state );
a3621e74
YO
1039 if ( ! trie->states[ state ].wordnum ) {
1040 PerlIO_printf( Perl_debug_log, "%5s| ","");
1041 } else {
e4584336 1042 PerlIO_printf( Perl_debug_log, "W%04x| ",
a3621e74
YO
1043 trie->states[ state ].wordnum
1044 );
1045 }
1046 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
1047 SV **tmp = av_fetch( trie->revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
e4584336 1048 PerlIO_printf( Perl_debug_log, "%s:%3X=%04"UVXf" | ",
cfd0369c 1049 SvPV_nolen_const( *tmp ),
a3621e74 1050 TRIE_LIST_ITEM(state,charid).forid,
e4584336 1051 (UV)TRIE_LIST_ITEM(state,charid).newstate
a3621e74
YO
1052 );
1053 }
1054
1055 }
1056 PerlIO_printf( Perl_debug_log, "\n\n" );
1057 });
1058
a02a5408 1059 Newxz( trie->trans, transcount ,reg_trie_trans );
a3621e74
YO
1060 {
1061 U32 state;
a3621e74
YO
1062 U32 tp = 0;
1063 U32 zp = 0;
1064
1065
1066 for( state=1 ; state < next_alloc ; state ++ ) {
1067 U32 base=0;
1068
1069 /*
1070 DEBUG_TRIE_COMPILE_MORE_r(
1071 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1072 );
1073 */
1074
1075 if (trie->states[state].trans.list) {
1076 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1077 U16 maxid=minid;
a28509cc 1078 U16 idx;
a3621e74
YO
1079
1080 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
c445ea15
AL
1081 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1082 if ( forid < minid ) {
1083 minid=forid;
1084 } else if ( forid > maxid ) {
1085 maxid=forid;
1086 }
a3621e74
YO
1087 }
1088 if ( transcount < tp + maxid - minid + 1) {
1089 transcount *= 2;
1090 Renew( trie->trans, transcount, reg_trie_trans );
1091 Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1092 }
1093 base = trie->uniquecharcount + tp - minid;
1094 if ( maxid == minid ) {
1095 U32 set = 0;
1096 for ( ; zp < tp ; zp++ ) {
1097 if ( ! trie->trans[ zp ].next ) {
1098 base = trie->uniquecharcount + zp - minid;
1099 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1100 trie->trans[ zp ].check = state;
1101 set = 1;
1102 break;
1103 }
1104 }
1105 if ( !set ) {
1106 trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1107 trie->trans[ tp ].check = state;
1108 tp++;
1109 zp = tp;
1110 }
1111 } else {
1112 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
c445ea15 1113 const U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
a3621e74
YO
1114 trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1115 trie->trans[ tid ].check = state;
1116 }
1117 tp += ( maxid - minid + 1 );
1118 }
1119 Safefree(trie->states[ state ].trans.list);
1120 }
1121 /*
1122 DEBUG_TRIE_COMPILE_MORE_r(
1123 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1124 );
1125 */
1126 trie->states[ state ].trans.base=base;
1127 }
cc601c31 1128 trie->lasttrans = tp + 1;
a3621e74
YO
1129 }
1130 } else {
1131 /*
1132 Second Pass -- Flat Table Representation.
1133
1134 we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1135 We know that we will need Charcount+1 trans at most to store the data
1136 (one row per char at worst case) So we preallocate both structures
1137 assuming worst case.
1138
1139 We then construct the trie using only the .next slots of the entry
1140 structs.
1141
1142 We use the .check field of the first entry of the node temporarily to
1143 make compression both faster and easier by keeping track of how many non
1144 zero fields are in the node.
1145
1146 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1147 transition.
1148
1149 There are two terms at use here: state as a TRIE_NODEIDX() which is a
1150 number representing the first entry of the node, and state as a
1151 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1152 TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1153 are 2 entrys per node. eg:
1154
1155 A B A B
1156 1. 2 4 1. 3 7
1157 2. 0 3 3. 0 5
1158 3. 0 0 5. 0 0
1159 4. 0 0 7. 0 0
1160
1161 The table is internally in the right hand, idx form. However as we also
1162 have to deal with the states array which is indexed by nodenum we have to
1163 use TRIE_NODENUM() to convert.
1164
1165 */
1166
a02a5408 1167 Newxz( trie->trans, ( trie->charcount + 1 ) * trie->uniquecharcount + 1,
a3621e74 1168 reg_trie_trans );
a02a5408 1169 Newxz( trie->states, trie->charcount + 2, reg_trie_state );
a3621e74
YO
1170 next_alloc = trie->uniquecharcount + 1;
1171
1172 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1173
c445ea15 1174 regnode * const noper = NEXTOPER( cur );
a28509cc
AL
1175 const U8 *uc = (U8*)STRING( noper );
1176 const U8 * const e = uc + STR_LEN( noper );
a3621e74
YO
1177
1178 U32 state = 1; /* required init */
1179
1180 U16 charid = 0; /* sanity init */
1181 U32 accept_state = 0; /* sanity init */
1182 U8 *scan = (U8*)NULL; /* sanity init */
1183
1184 STRLEN foldlen = 0; /* required init */
1185 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1186
1187
1188 for ( ; uc < e ; uc += len ) {
1189
1190 TRIE_READ_CHAR;
1191
1192 if ( uvc < 256 ) {
1193 charid = trie->charmap[ uvc ];
1194 } else {
c445ea15
AL
1195 SV* const * const svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 0);
1196 charid = svpp ? (U16)SvIV(*svpp) : 0;
a3621e74
YO
1197 }
1198 if ( charid ) {
1199 charid--;
1200 if ( !trie->trans[ state + charid ].next ) {
1201 trie->trans[ state + charid ].next = next_alloc;
1202 trie->trans[ state ].check++;
1203 next_alloc += trie->uniquecharcount;
1204 }
1205 state = trie->trans[ state + charid ].next;
1206 } else {
e4584336 1207 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
a3621e74
YO
1208 }
1209 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1210 }
1211
1212 accept_state = TRIE_NODENUM( state );
1213 if ( !trie->states[ accept_state ].wordnum ) {
1214 /* we havent inserted this word into the structure yet. */
1215 trie->states[ accept_state ].wordnum = ++curword;
1216
1217 DEBUG_r({
1218 /* store the word for dumping */
1219 SV* tmp = newSVpvn( STRING( noper ), STR_LEN( noper ) );
1220 if ( UTF ) SvUTF8_on( tmp );
1221 av_push( trie->words, tmp );
1222 });
1223
1224 } else {
bb263b4e 1225 /*EMPTY*/; /* Its a dupe. So ignore it. */
a3621e74
YO
1226 }
1227
1228 } /* end second pass */
1229
1230 DEBUG_TRIE_COMPILE_MORE_r({
1231 /*
1232 print out the table precompression so that we can do a visual check
1233 that they are identical.
1234 */
1235 U32 state;
1236 U16 charid;
1237 PerlIO_printf( Perl_debug_log, "\nChar : " );
1238
1239 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1240 SV **tmp = av_fetch( trie->revcharmap, charid, 0);
1241 if ( tmp ) {
cfd0369c 1242 PerlIO_printf( Perl_debug_log, "%4.4s ", SvPV_nolen_const( *tmp ) );
a3621e74
YO
1243 }
1244 }
1245
1246 PerlIO_printf( Perl_debug_log, "\nState+-" );
1247
1248 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
1249 PerlIO_printf( Perl_debug_log, "%4s-", "----" );
1250 }
1251
1252 PerlIO_printf( Perl_debug_log, "\n" );
1253
1254 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1255
e4584336 1256 PerlIO_printf( Perl_debug_log, "%04"UVXf" : ", (UV)TRIE_NODENUM( state ) );
a3621e74
YO
1257
1258 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
e4584336
RB
1259 PerlIO_printf( Perl_debug_log, "%04"UVXf" ",
1260 (UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next ) );
a3621e74
YO
1261 }
1262 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
e4584336 1263 PerlIO_printf( Perl_debug_log, " (%04"UVXf")\n", (UV)trie->trans[ state ].check );
a3621e74 1264 } else {
e4584336 1265 PerlIO_printf( Perl_debug_log, " (%04"UVXf") W%04X\n", (UV)trie->trans[ state ].check,
a3621e74
YO
1266 trie->states[ TRIE_NODENUM( state ) ].wordnum );
1267 }
1268 }
1269 PerlIO_printf( Perl_debug_log, "\n\n" );
1270 });
1271 {
1272 /*
1273 * Inplace compress the table.*
1274
1275 For sparse data sets the table constructed by the trie algorithm will
1276 be mostly 0/FAIL transitions or to put it another way mostly empty.
1277 (Note that leaf nodes will not contain any transitions.)
1278
1279 This algorithm compresses the tables by eliminating most such
1280 transitions, at the cost of a modest bit of extra work during lookup:
1281
1282 - Each states[] entry contains a .base field which indicates the
1283 index in the state[] array wheres its transition data is stored.
1284
1285 - If .base is 0 there are no valid transitions from that node.
1286
1287 - If .base is nonzero then charid is added to it to find an entry in
1288 the trans array.
1289
1290 -If trans[states[state].base+charid].check!=state then the
1291 transition is taken to be a 0/Fail transition. Thus if there are fail
1292 transitions at the front of the node then the .base offset will point
1293 somewhere inside the previous nodes data (or maybe even into a node
1294 even earlier), but the .check field determines if the transition is
1295 valid.
1296
1297 The following process inplace converts the table to the compressed
1298 table: We first do not compress the root node 1,and mark its all its
1299 .check pointers as 1 and set its .base pointer as 1 as well. This
1300 allows to do a DFA construction from the compressed table later, and
1301 ensures that any .base pointers we calculate later are greater than
1302 0.
1303
1304 - We set 'pos' to indicate the first entry of the second node.
1305
1306 - We then iterate over the columns of the node, finding the first and
1307 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
1308 and set the .check pointers accordingly, and advance pos
1309 appropriately and repreat for the next node. Note that when we copy
1310 the next pointers we have to convert them from the original
1311 NODEIDX form to NODENUM form as the former is not valid post
1312 compression.
1313
1314 - If a node has no transitions used we mark its base as 0 and do not
1315 advance the pos pointer.
1316
1317 - If a node only has one transition we use a second pointer into the
1318 structure to fill in allocated fail transitions from other states.
1319 This pointer is independent of the main pointer and scans forward
1320 looking for null transitions that are allocated to a state. When it
1321 finds one it writes the single transition into the "hole". If the
1322 pointer doesnt find one the single transition is appeneded as normal.
1323
1324 - Once compressed we can Renew/realloc the structures to release the
1325 excess space.
1326
1327 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
1328 specifically Fig 3.47 and the associated pseudocode.
1329
1330 demq
1331 */
a3b680e6 1332 const U32 laststate = TRIE_NODENUM( next_alloc );
a28509cc 1333 U32 state, charid;
a3621e74
YO
1334 U32 pos = 0, zp=0;
1335 trie->laststate = laststate;
1336
1337 for ( state = 1 ; state < laststate ; state++ ) {
1338 U8 flag = 0;
a28509cc
AL
1339 const U32 stateidx = TRIE_NODEIDX( state );
1340 const U32 o_used = trie->trans[ stateidx ].check;
1341 U32 used = trie->trans[ stateidx ].check;
a3621e74
YO
1342 trie->trans[ stateidx ].check = 0;
1343
1344 for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
1345 if ( flag || trie->trans[ stateidx + charid ].next ) {
1346 if ( trie->trans[ stateidx + charid ].next ) {
1347 if (o_used == 1) {
1348 for ( ; zp < pos ; zp++ ) {
1349 if ( ! trie->trans[ zp ].next ) {
1350 break;
1351 }
1352 }
1353 trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
1354 trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1355 trie->trans[ zp ].check = state;
1356 if ( ++zp > pos ) pos = zp;
1357 break;
1358 }
1359 used--;
1360 }
1361 if ( !flag ) {
1362 flag = 1;
1363 trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
1364 }
1365 trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1366 trie->trans[ pos ].check = state;
1367 pos++;
1368 }
1369 }
1370 }
cc601c31 1371 trie->lasttrans = pos + 1;
a3621e74
YO
1372 Renew( trie->states, laststate + 1, reg_trie_state);
1373 DEBUG_TRIE_COMPILE_MORE_r(
e4584336
RB
1374 PerlIO_printf( Perl_debug_log,
1375 " Alloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
5d7488b2
AL
1376 (int)( ( trie->charcount + 1 ) * trie->uniquecharcount + 1 ),
1377 (IV)next_alloc,
1378 (IV)pos,
a3621e74
YO
1379 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
1380 );
1381
1382 } /* end table compress */
1383 }
cc601c31
YO
1384 /* resize the trans array to remove unused space */
1385 Renew( trie->trans, trie->lasttrans, reg_trie_trans);
a3621e74
YO
1386
1387 DEBUG_TRIE_COMPILE_r({
1388 U32 state;
1389 /*
1390 Now we print it out again, in a slightly different form as there is additional
1391 info we want to be able to see when its compressed. They are close enough for
1392 visual comparison though.
1393 */
1394 PerlIO_printf( Perl_debug_log, "\nChar : %-6s%-6s%-4s ","Match","Base","Ofs" );
1395
1396 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
1397 SV **tmp = av_fetch( trie->revcharmap, state, 0);
1398 if ( tmp ) {
cfd0369c 1399 PerlIO_printf( Perl_debug_log, "%4.4s ", SvPV_nolen_const( *tmp ) );
a3621e74
YO
1400 }
1401 }
1402 PerlIO_printf( Perl_debug_log, "\n-----:-----------------------");
cc601c31 1403
a3621e74
YO
1404 for( state = 0 ; state < trie->uniquecharcount ; state++ )
1405 PerlIO_printf( Perl_debug_log, "-----");
1406 PerlIO_printf( Perl_debug_log, "\n");
cc601c31 1407
a3621e74 1408 for( state = 1 ; state < trie->laststate ; state++ ) {
a28509cc 1409 const U32 base = trie->states[ state ].trans.base;
a3621e74 1410
e4584336 1411 PerlIO_printf( Perl_debug_log, "#%04"UVXf" ", (UV)state);
a3621e74
YO
1412
1413 if ( trie->states[ state ].wordnum ) {
1414 PerlIO_printf( Perl_debug_log, " W%04X", trie->states[ state ].wordnum );
1415 } else {
1416 PerlIO_printf( Perl_debug_log, "%6s", "" );
1417 }
1418
e4584336 1419 PerlIO_printf( Perl_debug_log, " @%04"UVXf" ", (UV)base );
a3621e74
YO
1420
1421 if ( base ) {
1422 U32 ofs = 0;
1423
cc601c31
YO
1424 while( ( base + ofs < trie->uniquecharcount ) ||
1425 ( base + ofs - trie->uniquecharcount < trie->lasttrans
1426 && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
a3621e74
YO
1427 ofs++;
1428
e4584336 1429 PerlIO_printf( Perl_debug_log, "+%02"UVXf"[ ", (UV)ofs);
a3621e74
YO
1430
1431 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
cc601c31
YO
1432 if ( ( base + ofs >= trie->uniquecharcount ) &&
1433 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
a3621e74
YO
1434 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1435 {
e4584336
RB
1436 PerlIO_printf( Perl_debug_log, "%04"UVXf" ",
1437 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
a3621e74
YO
1438 } else {
1439 PerlIO_printf( Perl_debug_log, "%4s "," 0" );
1440 }
1441 }
1442
e4584336 1443 PerlIO_printf( Perl_debug_log, "]");
a3621e74
YO
1444
1445 }
1446 PerlIO_printf( Perl_debug_log, "\n" );
1447 }
1448 });
1449
1450 {
1451 /* now finally we "stitch in" the new TRIE node
1452 This means we convert either the first branch or the first Exact,
1453 depending on whether the thing following (in 'last') is a branch
1454 or not and whther first is the startbranch (ie is it a sub part of
1455 the alternation or is it the whole thing.)
1456 Assuming its a sub part we conver the EXACT otherwise we convert
1457 the whole branch sequence, including the first.
1458 */
1459 regnode *convert;
1460
1461
1462
1463
1464 if ( first == startbranch && OP( last ) != BRANCH ) {
1465 convert = first;
1466 } else {
1467 convert = NEXTOPER( first );
1468 NEXT_OFF( first ) = (U16)(last - first);
1469 }
1470
1471 OP( convert ) = TRIE + (U8)( flags - EXACT );
1472 NEXT_OFF( convert ) = (U16)(tail - convert);
1473 ARG_SET( convert, data_slot );
1474
1475 /* tells us if we need to handle accept buffers specially */
1476 convert->flags = ( RExC_seen_evals ? 1 : 0 );
1477
1478
1479 /* needed for dumping*/
1480 DEBUG_r({
1481 regnode *optimize = convert + NODE_STEP_REGNODE + regarglen[ TRIE ];
1482 /* We now need to mark all of the space originally used by the
1483 branches as optimized away. This keeps the dumpuntil from
1484 throwing a wobbly as it doesnt use regnext() to traverse the
1485 opcodes.
1486 */
1487 while( optimize < last ) {
1488 OP( optimize ) = OPTIMIZED;
1489 optimize++;
1490 }
1491 });
1492 } /* end node insert */
1493 return 1;
1494}
1495
1496
1497
1498/*
5d1c421c
JH
1499 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
1500 * These need to be revisited when a newer toolchain becomes available.
1501 */
1502#if defined(__sparc64__) && defined(__GNUC__)
1503# if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
1504# undef SPARC64_GCC_WORKAROUND
1505# define SPARC64_GCC_WORKAROUND 1
1506# endif
1507#endif
1508
653099ff
GS
1509/* REx optimizer. Converts nodes into quickier variants "in place".
1510 Finds fixed substrings. */
1511
a0288114 1512/* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
c277df42
IZ
1513 to the position after last scanned or to NULL. */
1514
a3621e74 1515
76e3520e 1516STATIC I32
9a957fbc
AL
1517S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap,
1518 regnode *last, scan_data_t *data, U32 flags, U32 depth)
c277df42
IZ
1519 /* scanp: Start here (read-write). */
1520 /* deltap: Write maxlen-minlen here. */
1521 /* last: Stop before this one. */
1522{
97aff369 1523 dVAR;
c277df42
IZ
1524 I32 min = 0, pars = 0, code;
1525 regnode *scan = *scanp, *next;
1526 I32 delta = 0;
1527 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
aca2d497 1528 int is_inf_internal = 0; /* The studied chunk is infinite */
c277df42
IZ
1529 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
1530 scan_data_t data_fake;
653099ff 1531 struct regnode_charclass_class and_with; /* Valid if flags & SCF_DO_STCLASS_OR */
a3621e74
YO
1532 SV *re_trie_maxbuff = NULL;
1533
1534 GET_RE_DEBUG_FLAGS_DECL;
b81d288d 1535
c277df42
IZ
1536 while (scan && OP(scan) != END && scan < last) {
1537 /* Peephole optimizer: */
a3621e74 1538 DEBUG_OPTIMISE_r({
c445ea15 1539 SV * const mysv=sv_newmortal();
32fc9b6a 1540 regprop(RExC_rx, mysv, scan);
e4584336 1541 PerlIO_printf(Perl_debug_log, "%*speep: %s (0x%08"UVXf")\n",
cfd0369c 1542 (int)depth*2, "", SvPV_nolen_const(mysv), PTR2UV(scan));
a3621e74 1543 });
c277df42 1544
22c35a8c 1545 if (PL_regkind[(U8)OP(scan)] == EXACT) {
653099ff 1546 /* Merge several consecutive EXACTish nodes into one. */
c277df42
IZ
1547 regnode *n = regnext(scan);
1548 U32 stringok = 1;
1549#ifdef DEBUGGING
1550 regnode *stop = scan;
b81d288d 1551#endif
c277df42 1552
cd439c50 1553 next = scan + NODE_SZ_STR(scan);
c277df42
IZ
1554 /* Skip NOTHING, merge EXACT*. */
1555 while (n &&
b81d288d 1556 ( PL_regkind[(U8)OP(n)] == NOTHING ||
c277df42
IZ
1557 (stringok && (OP(n) == OP(scan))))
1558 && NEXT_OFF(n)
1559 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
1560 if (OP(n) == TAIL || n > next)
1561 stringok = 0;
22c35a8c 1562 if (PL_regkind[(U8)OP(n)] == NOTHING) {
c277df42
IZ
1563 NEXT_OFF(scan) += NEXT_OFF(n);
1564 next = n + NODE_STEP_REGNODE;
1565#ifdef DEBUGGING
1566 if (stringok)
1567 stop = n;
b81d288d 1568#endif
c277df42 1569 n = regnext(n);
a0ed51b3 1570 }
f49d4d0f 1571 else if (stringok) {
a3b680e6 1572 const int oldl = STR_LEN(scan);
c445ea15 1573 regnode * const nnext = regnext(n);
f49d4d0f 1574
b81d288d 1575 if (oldl + STR_LEN(n) > U8_MAX)
c277df42
IZ
1576 break;
1577 NEXT_OFF(scan) += NEXT_OFF(n);
cd439c50
IZ
1578 STR_LEN(scan) += STR_LEN(n);
1579 next = n + NODE_SZ_STR(n);
c277df42 1580 /* Now we can overwrite *n : */
f49d4d0f 1581 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
c277df42 1582#ifdef DEBUGGING
f49d4d0f 1583 stop = next - 1;
b81d288d 1584#endif
c277df42
IZ
1585 n = nnext;
1586 }
1587 }
61a36c01 1588
a3621e74 1589 if (UTF && ( OP(scan) == EXACTF ) && ( STR_LEN(scan) >= 6 ) ) {
61a36c01
JH
1590/*
1591 Two problematic code points in Unicode casefolding of EXACT nodes:
1592
1593 U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
1594 U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
1595
1596 which casefold to
1597
1598 Unicode UTF-8
1599
1600 U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
1601 U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
1602
1603 This means that in case-insensitive matching (or "loose matching",
1604 as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
1605 length of the above casefolded versions) can match a target string
1606 of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
1607 This would rather mess up the minimum length computation.
1608
1609 What we'll do is to look for the tail four bytes, and then peek
1610 at the preceding two bytes to see whether we need to decrease
1611 the minimum length by four (six minus two).
1612
1613 Thanks to the design of UTF-8, there cannot be false matches:
1614 A sequence of valid UTF-8 bytes cannot be a subsequence of
1615 another valid sequence of UTF-8 bytes.
1616
1617*/
c445ea15
AL
1618 char * const s0 = STRING(scan), *s, *t;
1619 char * const s1 = s0 + STR_LEN(scan) - 1;
1620 char * const s2 = s1 - 4;
d4c19fe8 1621 const char t0[] = "\xcc\x88\xcc\x81";
a28509cc 1622 const char * const t1 = t0 + 3;
2af232bd 1623
61a36c01
JH
1624 for (s = s0 + 2;
1625 s < s2 && (t = ninstr(s, s1, t0, t1));
1626 s = t + 4) {
1627 if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
1628 ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
1629 min -= 4;
1630 }
1631 }
1632
c277df42
IZ
1633#ifdef DEBUGGING
1634 /* Allow dumping */
cd439c50 1635 n = scan + NODE_SZ_STR(scan);
c277df42 1636 while (n <= stop) {
22c35a8c 1637 if (PL_regkind[(U8)OP(n)] != NOTHING || OP(n) == NOTHING) {
c277df42
IZ
1638 OP(n) = OPTIMIZED;
1639 NEXT_OFF(n) = 0;
1640 }
1641 n++;
1642 }
653099ff 1643#endif
c277df42 1644 }
a3621e74
YO
1645
1646
1647
653099ff
GS
1648 /* Follow the next-chain of the current node and optimize
1649 away all the NOTHINGs from it. */
c277df42 1650 if (OP(scan) != CURLYX) {
a3b680e6 1651 const int max = (reg_off_by_arg[OP(scan)]
048cfca1
GS
1652 ? I32_MAX
1653 /* I32 may be smaller than U16 on CRAYs! */
1654 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
c277df42
IZ
1655 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
1656 int noff;
1657 regnode *n = scan;
b81d288d 1658
c277df42
IZ
1659 /* Skip NOTHING and LONGJMP. */
1660 while ((n = regnext(n))
22c35a8c 1661 && ((PL_regkind[(U8)OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
c277df42
IZ
1662 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
1663 && off + noff < max)
1664 off += noff;
1665 if (reg_off_by_arg[OP(scan)])
1666 ARG(scan) = off;
b81d288d 1667 else
c277df42
IZ
1668 NEXT_OFF(scan) = off;
1669 }
a3621e74 1670
653099ff
GS
1671 /* The principal pseudo-switch. Cannot be a switch, since we
1672 look into several different things. */
b81d288d 1673 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
c277df42
IZ
1674 || OP(scan) == IFTHEN || OP(scan) == SUSPEND) {
1675 next = regnext(scan);
1676 code = OP(scan);
a3621e74 1677 /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
b81d288d
AB
1678
1679 if (OP(next) == code || code == IFTHEN || code == SUSPEND) {
c277df42 1680 I32 max1 = 0, min1 = I32_MAX, num = 0;
653099ff 1681 struct regnode_charclass_class accum;
d4c19fe8 1682 regnode * const startbranch=scan;
c277df42 1683
653099ff 1684 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
830247a4 1685 scan_commit(pRExC_state, data); /* Cannot merge strings after this. */
653099ff 1686 if (flags & SCF_DO_STCLASS)
830247a4 1687 cl_init_zero(pRExC_state, &accum);
a3621e74 1688
c277df42 1689 while (OP(scan) == code) {
830247a4 1690 I32 deltanext, minnext, f = 0, fake;
653099ff 1691 struct regnode_charclass_class this_class;
c277df42
IZ
1692
1693 num++;
1694 data_fake.flags = 0;
b81d288d 1695 if (data) {
2c2d71f5 1696 data_fake.whilem_c = data->whilem_c;
cb434fcc
IZ
1697 data_fake.last_closep = data->last_closep;
1698 }
1699 else
1700 data_fake.last_closep = &fake;
c277df42
IZ
1701 next = regnext(scan);
1702 scan = NEXTOPER(scan);
1703 if (code != BRANCH)
1704 scan = NEXTOPER(scan);
653099ff 1705 if (flags & SCF_DO_STCLASS) {
830247a4 1706 cl_init(pRExC_state, &this_class);
653099ff
GS
1707 data_fake.start_class = &this_class;
1708 f = SCF_DO_STCLASS_AND;
b81d288d 1709 }
e1901655
IZ
1710 if (flags & SCF_WHILEM_VISITED_POS)
1711 f |= SCF_WHILEM_VISITED_POS;
a3621e74 1712
653099ff 1713 /* we suppose the run is continuous, last=next...*/
830247a4 1714 minnext = study_chunk(pRExC_state, &scan, &deltanext,
a3621e74 1715 next, &data_fake, f,depth+1);
b81d288d 1716 if (min1 > minnext)
c277df42
IZ
1717 min1 = minnext;
1718 if (max1 < minnext + deltanext)
1719 max1 = minnext + deltanext;
1720 if (deltanext == I32_MAX)
aca2d497 1721 is_inf = is_inf_internal = 1;
c277df42
IZ
1722 scan = next;
1723 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
1724 pars++;
405ff068 1725 if (data && (data_fake.flags & SF_HAS_EVAL))
c277df42 1726 data->flags |= SF_HAS_EVAL;
2c2d71f5
JH
1727 if (data)
1728 data->whilem_c = data_fake.whilem_c;
653099ff 1729 if (flags & SCF_DO_STCLASS)
830247a4 1730 cl_or(pRExC_state, &accum, &this_class);
b81d288d 1731 if (code == SUSPEND)
c277df42
IZ
1732 break;
1733 }
1734 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
1735 min1 = 0;
1736 if (flags & SCF_DO_SUBSTR) {
1737 data->pos_min += min1;
1738 data->pos_delta += max1 - min1;
1739 if (max1 != min1 || is_inf)
1740 data->longest = &(data->longest_float);
1741 }
1742 min += min1;
1743 delta += max1 - min1;
653099ff 1744 if (flags & SCF_DO_STCLASS_OR) {
830247a4 1745 cl_or(pRExC_state, data->start_class, &accum);
653099ff
GS
1746 if (min1) {
1747 cl_and(data->start_class, &and_with);
1748 flags &= ~SCF_DO_STCLASS;
1749 }
1750 }
1751 else if (flags & SCF_DO_STCLASS_AND) {
de0c8cb8
GS
1752 if (min1) {
1753 cl_and(data->start_class, &accum);
653099ff 1754 flags &= ~SCF_DO_STCLASS;
de0c8cb8
GS
1755 }
1756 else {
b81d288d 1757 /* Switch to OR mode: cache the old value of
de0c8cb8
GS
1758 * data->start_class */
1759 StructCopy(data->start_class, &and_with,
1760 struct regnode_charclass_class);
1761 flags &= ~SCF_DO_STCLASS_AND;
1762 StructCopy(&accum, data->start_class,
1763 struct regnode_charclass_class);
1764 flags |= SCF_DO_STCLASS_OR;
1765 data->start_class->flags |= ANYOF_EOS;
1766 }
653099ff 1767 }
a3621e74
YO
1768
1769 /* demq.
1770
1771 Assuming this was/is a branch we are dealing with: 'scan' now
1772 points at the item that follows the branch sequence, whatever
1773 it is. We now start at the beginning of the sequence and look
1774 for subsequences of
1775
1776 BRANCH->EXACT=>X
1777 BRANCH->EXACT=>X
1778
1779 which would be constructed from a pattern like /A|LIST|OF|WORDS/
1780
1781 If we can find such a subseqence we need to turn the first
1782 element into a trie and then add the subsequent branch exact
1783 strings to the trie.
1784
1785 We have two cases
1786
1787 1. patterns where the whole set of branch can be converted to a trie,
1788
1789 2. patterns where only a subset of the alternations can be
1790 converted to a trie.
1791
1792 In case 1 we can replace the whole set with a single regop
1793 for the trie. In case 2 we need to keep the start and end
1794 branchs so
1795
1796 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
1797 becomes BRANCH TRIE; BRANCH X;
1798
1799 Hypthetically when we know the regex isnt anchored we can
1800 turn a case 1 into a DFA and let it rip... Every time it finds a match
1801 it would just call its tail, no WHILEM/CURLY needed.
1802
1803 */
0111c4fd
RGS
1804 if (DO_TRIE) {
1805 if (!re_trie_maxbuff) {
1806 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
1807 if (!SvIOK(re_trie_maxbuff))
1808 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
1809 }
a3621e74
YO
1810 if ( SvIV(re_trie_maxbuff)>=0 && OP( startbranch )==BRANCH ) {
1811 regnode *cur;
1812 regnode *first = (regnode *)NULL;
1813 regnode *last = (regnode *)NULL;
1814 regnode *tail = scan;
1815 U8 optype = 0;
1816 U32 count=0;
1817
1818#ifdef DEBUGGING
c445ea15 1819 SV * const mysv = sv_newmortal(); /* for dumping */
a3621e74
YO
1820#endif
1821 /* var tail is used because there may be a TAIL
1822 regop in the way. Ie, the exacts will point to the
1823 thing following the TAIL, but the last branch will
1824 point at the TAIL. So we advance tail. If we
1825 have nested (?:) we may have to move through several
1826 tails.
1827 */
1828
1829 while ( OP( tail ) == TAIL ) {
1830 /* this is the TAIL generated by (?:) */
1831 tail = regnext( tail );
1832 }
1833
1834 DEBUG_OPTIMISE_r({
32fc9b6a 1835 regprop(RExC_rx, mysv, tail );
a3621e74 1836 PerlIO_printf( Perl_debug_log, "%*s%s%s%s\n",
cfd0369c 1837 (int)depth * 2 + 2, "", "Tail node is:", SvPV_nolen_const( mysv ),
a3621e74
YO
1838 (RExC_seen_evals) ? "[EVAL]" : ""
1839 );
1840 });
1841 /*
1842
1843 step through the branches, cur represents each
1844 branch, noper is the first thing to be matched
1845 as part of that branch and noper_next is the
1846 regnext() of that node. if noper is an EXACT
1847 and noper_next is the same as scan (our current
1848 position in the regex) then the EXACT branch is
1849 a possible optimization target. Once we have
1850 two or more consequetive such branches we can
1851 create a trie of the EXACT's contents and stich
1852 it in place. If the sequence represents all of
1853 the branches we eliminate the whole thing and
1854 replace it with a single TRIE. If it is a
1855 subsequence then we need to stitch it in. This
1856 means the first branch has to remain, and needs
1857 to be repointed at the item on the branch chain
1858 following the last branch optimized. This could
1859 be either a BRANCH, in which case the
1860 subsequence is internal, or it could be the
1861 item following the branch sequence in which
1862 case the subsequence is at the end.
1863
1864 */
1865
1866 /* dont use tail as the end marker for this traverse */
1867 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
aec46f14
AL
1868 regnode * const noper = NEXTOPER( cur );
1869 regnode * const noper_next = regnext( noper );
a3621e74 1870
a3621e74 1871 DEBUG_OPTIMISE_r({
32fc9b6a 1872 regprop(RExC_rx, mysv, cur);
a3621e74 1873 PerlIO_printf( Perl_debug_log, "%*s%s",
cfd0369c 1874 (int)depth * 2 + 2," ", SvPV_nolen_const( mysv ) );
a3621e74 1875
32fc9b6a 1876 regprop(RExC_rx, mysv, noper);
a3621e74 1877 PerlIO_printf( Perl_debug_log, " -> %s",
cfd0369c 1878 SvPV_nolen_const(mysv));
a3621e74
YO
1879
1880 if ( noper_next ) {
32fc9b6a 1881 regprop(RExC_rx, mysv, noper_next );
a3621e74 1882 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
cfd0369c 1883 SvPV_nolen_const(mysv));
a3621e74
YO
1884 }
1885 PerlIO_printf( Perl_debug_log, "0x%p,0x%p,0x%p)\n",
1886 first, last, cur );
1887 });
1888 if ( ( first ? OP( noper ) == optype
1889 : PL_regkind[ (U8)OP( noper ) ] == EXACT )
1890 && noper_next == tail && count<U16_MAX)
1891 {
1892 count++;
1893 if ( !first ) {
1894 first = cur;
1895 optype = OP( noper );
1896 } else {
1897 DEBUG_OPTIMISE_r(
1898 if (!last ) {
32fc9b6a 1899 regprop(RExC_rx, mysv, first);
a3621e74 1900 PerlIO_printf( Perl_debug_log, "%*s%s",
cfd0369c 1901 (int)depth * 2 + 2, "F:", SvPV_nolen_const( mysv ) );
32fc9b6a 1902 regprop(RExC_rx, mysv, NEXTOPER(first) );
a3621e74 1903 PerlIO_printf( Perl_debug_log, " -> %s\n",
cfd0369c 1904 SvPV_nolen_const( mysv ) );
a3621e74
YO
1905 }
1906 );
1907 last = cur;
1908 DEBUG_OPTIMISE_r({
32fc9b6a 1909 regprop(RExC_rx, mysv, cur);
a3621e74 1910 PerlIO_printf( Perl_debug_log, "%*s%s",
cfd0369c 1911 (int)depth * 2 + 2, "N:", SvPV_nolen_const( mysv ) );
32fc9b6a 1912 regprop(RExC_rx, mysv, noper );
a3621e74 1913 PerlIO_printf( Perl_debug_log, " -> %s\n",
cfd0369c 1914 SvPV_nolen_const( mysv ) );
a3621e74
YO
1915 });
1916 }
1917 } else {
1918 if ( last ) {
1919 DEBUG_OPTIMISE_r(
1920 PerlIO_printf( Perl_debug_log, "%*s%s\n",
e4584336 1921 (int)depth * 2 + 2, "E:", "**END**" );
a3621e74
YO
1922 );
1923 make_trie( pRExC_state, startbranch, first, cur, tail, optype );
1924 }
1925 if ( PL_regkind[ (U8)OP( noper ) ] == EXACT
1926 && noper_next == tail )
1927 {
1928 count = 1;
1929 first = cur;
1930 optype = OP( noper );
1931 } else {
1932 count = 0;
1933 first = NULL;
1934 optype = 0;
1935 }
1936 last = NULL;
1937 }
1938 }
1939 DEBUG_OPTIMISE_r({
32fc9b6a 1940 regprop(RExC_rx, mysv, cur);
a3621e74 1941 PerlIO_printf( Perl_debug_log,
e4584336 1942 "%*s%s\t(0x%p,0x%p,0x%p)\n", (int)depth * 2 + 2,
cfd0369c 1943 " ", SvPV_nolen_const( mysv ), first, last, cur);
a3621e74
YO
1944
1945 });
1946 if ( last ) {
1947 DEBUG_OPTIMISE_r(
1948 PerlIO_printf( Perl_debug_log, "%*s%s\n",
e4584336 1949 (int)depth * 2 + 2, "E:", "==END==" );
a3621e74
YO
1950 );
1951 make_trie( pRExC_state, startbranch, first, scan, tail, optype );
1952 }
1953 }
1954 }
a0ed51b3 1955 }
a3621e74 1956 else if ( code == BRANCHJ ) { /* single branch is optimized. */
c277df42 1957 scan = NEXTOPER(NEXTOPER(scan));
a3621e74 1958 } else /* single branch is optimized. */
c277df42
IZ
1959 scan = NEXTOPER(scan);
1960 continue;
a0ed51b3
LW
1961 }
1962 else if (OP(scan) == EXACT) {
cd439c50 1963 I32 l = STR_LEN(scan);
c445ea15 1964 UV uc;
a0ed51b3 1965 if (UTF) {
a3b680e6 1966 const U8 * const s = (U8*)STRING(scan);
1aa99e6b 1967 l = utf8_length(s, s + l);
9041c2e3 1968 uc = utf8_to_uvchr(s, NULL);
c445ea15
AL
1969 } else {
1970 uc = *((U8*)STRING(scan));
a0ed51b3
LW
1971 }
1972 min += l;
c277df42 1973 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
c277df42
IZ
1974 /* The code below prefers earlier match for fixed
1975 offset, later match for variable offset. */
1976 if (data->last_end == -1) { /* Update the start info. */
1977 data->last_start_min = data->pos_min;
1978 data->last_start_max = is_inf
b81d288d 1979 ? I32_MAX : data->pos_min + data->pos_delta;
c277df42 1980 }
cd439c50 1981 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
45f47268
NC
1982 if (UTF)
1983 SvUTF8_on(data->last_found);
0eda9292 1984 {
9a957fbc 1985 SV * const sv = data->last_found;
a28509cc 1986 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
0eda9292
JH
1987 mg_find(sv, PERL_MAGIC_utf8) : NULL;
1988 if (mg && mg->mg_len >= 0)
5e43f467
JH
1989 mg->mg_len += utf8_length((U8*)STRING(scan),
1990 (U8*)STRING(scan)+STR_LEN(scan));
0eda9292 1991 }
c277df42
IZ
1992 data->last_end = data->pos_min + l;
1993 data->pos_min += l; /* As in the first entry. */
1994 data->flags &= ~SF_BEFORE_EOL;
1995 }
653099ff
GS
1996 if (flags & SCF_DO_STCLASS_AND) {
1997 /* Check whether it is compatible with what we know already! */
1998 int compat = 1;
1999
1aa99e6b 2000 if (uc >= 0x100 ||
516a5887 2001 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
1aa99e6b 2002 && !ANYOF_BITMAP_TEST(data->start_class, uc)
653099ff 2003 && (!(data->start_class->flags & ANYOF_FOLD)
1aa99e6b 2004 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
516a5887 2005 )
653099ff
GS
2006 compat = 0;
2007 ANYOF_CLASS_ZERO(data->start_class);
2008 ANYOF_BITMAP_ZERO(data->start_class);
2009 if (compat)
1aa99e6b 2010 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff 2011 data->start_class->flags &= ~ANYOF_EOS;
9b877dbb
IH
2012 if (uc < 0x100)
2013 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
653099ff
GS
2014 }
2015 else if (flags & SCF_DO_STCLASS_OR) {
2016 /* false positive possible if the class is case-folded */
1aa99e6b 2017 if (uc < 0x100)
9b877dbb
IH
2018 ANYOF_BITMAP_SET(data->start_class, uc);
2019 else
2020 data->start_class->flags |= ANYOF_UNICODE_ALL;
653099ff
GS
2021 data->start_class->flags &= ~ANYOF_EOS;
2022 cl_and(data->start_class, &and_with);
2023 }
2024 flags &= ~SCF_DO_STCLASS;
a0ed51b3 2025 }
653099ff 2026 else if (PL_regkind[(U8)OP(scan)] == EXACT) { /* But OP != EXACT! */
cd439c50 2027 I32 l = STR_LEN(scan);
1aa99e6b 2028 UV uc = *((U8*)STRING(scan));
653099ff
GS
2029
2030 /* Search for fixed substrings supports EXACT only. */
b81d288d 2031 if (flags & SCF_DO_SUBSTR)
830247a4 2032 scan_commit(pRExC_state, data);
a0ed51b3 2033 if (UTF) {
6136c704 2034 const U8 * const s = (U8 *)STRING(scan);
1aa99e6b 2035 l = utf8_length(s, s + l);
9041c2e3 2036 uc = utf8_to_uvchr(s, NULL);
a0ed51b3
LW
2037 }
2038 min += l;
c277df42 2039 if (data && (flags & SCF_DO_SUBSTR))
a0ed51b3 2040 data->pos_min += l;
653099ff
GS
2041 if (flags & SCF_DO_STCLASS_AND) {
2042 /* Check whether it is compatible with what we know already! */
2043 int compat = 1;
2044
1aa99e6b 2045 if (uc >= 0x100 ||
516a5887 2046 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
1aa99e6b 2047 && !ANYOF_BITMAP_TEST(data->start_class, uc)
516a5887 2048 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
653099ff
GS
2049 compat = 0;
2050 ANYOF_CLASS_ZERO(data->start_class);
2051 ANYOF_BITMAP_ZERO(data->start_class);
2052 if (compat) {
1aa99e6b 2053 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff
GS
2054 data->start_class->flags &= ~ANYOF_EOS;
2055 data->start_class->flags |= ANYOF_FOLD;
2056 if (OP(scan) == EXACTFL)
2057 data->start_class->flags |= ANYOF_LOCALE;
2058 }
2059 }
2060 else if (flags & SCF_DO_STCLASS_OR) {
2061 if (data->start_class->flags & ANYOF_FOLD) {
2062 /* false positive possible if the class is case-folded.
2063 Assume that the locale settings are the same... */
1aa99e6b
IH
2064 if (uc < 0x100)
2065 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff
GS
2066 data->start_class->flags &= ~ANYOF_EOS;
2067 }
2068 cl_and(data->start_class, &and_with);
2069 }
2070 flags &= ~SCF_DO_STCLASS;
a0ed51b3 2071 }
bfed75c6 2072 else if (strchr((const char*)PL_varies,OP(scan))) {
9c5ffd7c 2073 I32 mincount, maxcount, minnext, deltanext, fl = 0;
aa7a4b56 2074 I32 f = flags, pos_before = 0;
d4c19fe8 2075 regnode * const oscan = scan;
653099ff
GS
2076 struct regnode_charclass_class this_class;
2077 struct regnode_charclass_class *oclass = NULL;
727f22e3 2078 I32 next_is_eval = 0;
653099ff 2079
22c35a8c 2080 switch (PL_regkind[(U8)OP(scan)]) {
653099ff 2081 case WHILEM: /* End of (?:...)* . */
c277df42
IZ
2082 scan = NEXTOPER(scan);
2083 goto finish;
2084 case PLUS:
653099ff 2085 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
c277df42 2086 next = NEXTOPER(scan);
653099ff 2087 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
b81d288d
AB
2088 mincount = 1;
2089 maxcount = REG_INFTY;
c277df42
IZ
2090 next = regnext(scan);
2091 scan = NEXTOPER(scan);
2092 goto do_curly;
2093 }
2094 }
2095 if (flags & SCF_DO_SUBSTR)
2096 data->pos_min++;
2097 min++;
2098 /* Fall through. */
2099 case STAR:
653099ff
GS
2100 if (flags & SCF_DO_STCLASS) {
2101 mincount = 0;
b81d288d 2102 maxcount = REG_INFTY;
653099ff
GS
2103 next = regnext(scan);
2104 scan = NEXTOPER(scan);
2105 goto do_curly;
2106 }
b81d288d 2107 is_inf = is_inf_internal = 1;
c277df42
IZ
2108 scan = regnext(scan);
2109 if (flags & SCF_DO_SUBSTR) {
830247a4 2110 scan_commit(pRExC_state, data); /* Cannot extend fixed substrings */
c277df42
IZ
2111 data->longest = &(data->longest_float);
2112 }
2113 goto optimize_curly_tail;
2114 case CURLY:
b81d288d 2115 mincount = ARG1(scan);
c277df42
IZ
2116 maxcount = ARG2(scan);
2117 next = regnext(scan);
cb434fcc
IZ
2118 if (OP(scan) == CURLYX) {
2119 I32 lp = (data ? *(data->last_closep) : 0);
a3621e74 2120 scan->flags = ((lp <= U8_MAX) ? (U8)lp : U8_MAX);
cb434fcc 2121 }
c277df42 2122 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
727f22e3 2123 next_is_eval = (OP(scan) == EVAL);
c277df42
IZ
2124 do_curly:
2125 if (flags & SCF_DO_SUBSTR) {
830247a4 2126 if (mincount == 0) scan_commit(pRExC_state,data); /* Cannot extend fixed substrings */
c277df42
IZ
2127 pos_before = data->pos_min;
2128 }
2129 if (data) {
2130 fl = data->flags;
2131 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
2132 if (is_inf)
2133 data->flags |= SF_IS_INF;
2134 }
653099ff 2135 if (flags & SCF_DO_STCLASS) {
830247a4 2136 cl_init(pRExC_state, &this_class);
653099ff
GS
2137 oclass = data->start_class;
2138 data->start_class = &this_class;
2139 f |= SCF_DO_STCLASS_AND;
2140 f &= ~SCF_DO_STCLASS_OR;
2141 }
e1901655
IZ
2142 /* These are the cases when once a subexpression
2143 fails at a particular position, it cannot succeed
2144 even after backtracking at the enclosing scope.
b81d288d 2145
e1901655
IZ
2146 XXXX what if minimal match and we are at the
2147 initial run of {n,m}? */
2148 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
2149 f &= ~SCF_WHILEM_VISITED_POS;
653099ff 2150
c277df42 2151 /* This will finish on WHILEM, setting scan, or on NULL: */
b81d288d 2152 minnext = study_chunk(pRExC_state, &scan, &deltanext, last, data,
a3621e74
YO
2153 (mincount == 0
2154 ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
653099ff
GS
2155
2156 if (flags & SCF_DO_STCLASS)
2157 data->start_class = oclass;
2158 if (mincount == 0 || minnext == 0) {
2159 if (flags & SCF_DO_STCLASS_OR) {
830247a4 2160 cl_or(pRExC_state, data->start_class, &this_class);
653099ff
GS
2161 }
2162 else if (flags & SCF_DO_STCLASS_AND) {
b81d288d 2163 /* Switch to OR mode: cache the old value of
653099ff
GS
2164 * data->start_class */
2165 StructCopy(data->start_class, &and_with,
2166 struct regnode_charclass_class);
2167 flags &= ~SCF_DO_STCLASS_AND;
2168 StructCopy(&this_class, data->start_class,
2169 struct regnode_charclass_class);
2170 flags |= SCF_DO_STCLASS_OR;
2171 data->start_class->flags |= ANYOF_EOS;
2172 }
2173 } else { /* Non-zero len */
2174 if (flags & SCF_DO_STCLASS_OR) {
830247a4 2175 cl_or(pRExC_state, data->start_class, &this_class);
653099ff
GS
2176 cl_and(data->start_class, &and_with);
2177 }
2178 else if (flags & SCF_DO_STCLASS_AND)
2179 cl_and(data->start_class, &this_class);
2180 flags &= ~SCF_DO_STCLASS;
2181 }
c277df42
IZ
2182 if (!scan) /* It was not CURLYX, but CURLY. */
2183 scan = next;
041457d9
DM
2184 if ( /* ? quantifier ok, except for (?{ ... }) */
2185 (next_is_eval || !(mincount == 0 && maxcount == 1))
84037bb0 2186 && (minnext == 0) && (deltanext == 0)
99799961 2187 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
041457d9
DM
2188 && maxcount <= REG_INFTY/3 /* Complement check for big count */
2189 && ckWARN(WARN_REGEXP))
b45f050a 2190 {
830247a4 2191 vWARN(RExC_parse,
b45f050a
JF
2192 "Quantifier unexpected on zero-length expression");
2193 }
2194
c277df42 2195 min += minnext * mincount;
b81d288d 2196 is_inf_internal |= ((maxcount == REG_INFTY
155aba94
GS
2197 && (minnext + deltanext) > 0)
2198 || deltanext == I32_MAX);
aca2d497 2199 is_inf |= is_inf_internal;
c277df42
IZ
2200 delta += (minnext + deltanext) * maxcount - minnext * mincount;
2201
2202 /* Try powerful optimization CURLYX => CURLYN. */
b81d288d 2203 if ( OP(oscan) == CURLYX && data
c277df42
IZ
2204 && data->flags & SF_IN_PAR
2205 && !(data->flags & SF_HAS_EVAL)
2206 && !deltanext && minnext == 1 ) {
2207 /* Try to optimize to CURLYN. */
2208 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
d4c19fe8 2209 regnode * const nxt1 = nxt;
497b47a8
JH
2210#ifdef DEBUGGING
2211 regnode *nxt2;
2212#endif
c277df42
IZ
2213
2214 /* Skip open. */
2215 nxt = regnext(nxt);
bfed75c6 2216 if (!strchr((const char*)PL_simple,OP(nxt))
22c35a8c 2217 && !(PL_regkind[(U8)OP(nxt)] == EXACT
b81d288d 2218 && STR_LEN(nxt) == 1))
c277df42 2219 goto nogo;
497b47a8 2220#ifdef DEBUGGING
c277df42 2221 nxt2 = nxt;
497b47a8 2222#endif
c277df42 2223 nxt = regnext(nxt);
b81d288d 2224 if (OP(nxt) != CLOSE)
c277df42
IZ
2225 goto nogo;
2226 /* Now we know that nxt2 is the only contents: */
eb160463 2227 oscan->flags = (U8)ARG(nxt);
c277df42
IZ
2228 OP(oscan) = CURLYN;
2229 OP(nxt1) = NOTHING; /* was OPEN. */
2230#ifdef DEBUGGING
2231 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
2232 NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
2233 NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
2234 OP(nxt) = OPTIMIZED; /* was CLOSE. */
2235 OP(nxt + 1) = OPTIMIZED; /* was count. */
2236 NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
b81d288d 2237#endif
c277df42 2238 }
c277df42
IZ
2239 nogo:
2240
2241 /* Try optimization CURLYX => CURLYM. */
b81d288d 2242 if ( OP(oscan) == CURLYX && data
c277df42 2243 && !(data->flags & SF_HAS_PAR)
c277df42 2244 && !(data->flags & SF_HAS_EVAL)
0e788c72
HS
2245 && !deltanext /* atom is fixed width */
2246 && minnext != 0 /* CURLYM can't handle zero width */
2247 ) {
c277df42
IZ
2248 /* XXXX How to optimize if data == 0? */
2249 /* Optimize to a simpler form. */
2250 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
2251 regnode *nxt2;
2252
2253 OP(oscan) = CURLYM;
2254 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
b81d288d 2255 && (OP(nxt2) != WHILEM))
c277df42
IZ
2256 nxt = nxt2;
2257 OP(nxt2) = SUCCEED; /* Whas WHILEM */
c277df42
IZ
2258 /* Need to optimize away parenths. */
2259 if (data->flags & SF_IN_PAR) {
2260 /* Set the parenth number. */
2261 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
2262
b81d288d 2263 if (OP(nxt) != CLOSE)
b45f050a 2264 FAIL("Panic opt close");
eb160463 2265 oscan->flags = (U8)ARG(nxt);
c277df42
IZ
2266 OP(nxt1) = OPTIMIZED; /* was OPEN. */
2267 OP(nxt) = OPTIMIZED; /* was CLOSE. */
2268#ifdef DEBUGGING
2269 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
2270 OP(nxt + 1) = OPTIMIZED; /* was count. */
2271 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
2272 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
b81d288d 2273#endif
c277df42
IZ
2274#if 0
2275 while ( nxt1 && (OP(nxt1) != WHILEM)) {
2276 regnode *nnxt = regnext(nxt1);
b81d288d 2277
c277df42
IZ
2278 if (nnxt == nxt) {
2279 if (reg_off_by_arg[OP(nxt1)])
2280 ARG_SET(nxt1, nxt2 - nxt1);
2281 else if (nxt2 - nxt1 < U16_MAX)
2282 NEXT_OFF(nxt1) = nxt2 - nxt1;
2283 else
2284 OP(nxt) = NOTHING; /* Cannot beautify */
2285 }
2286 nxt1 = nnxt;
2287 }
2288#endif
2289 /* Optimize again: */
b81d288d 2290 study_chunk(pRExC_state, &nxt1, &deltanext, nxt,
a3621e74 2291 NULL, 0,depth+1);
a0ed51b3
LW
2292 }
2293 else
c277df42 2294 oscan->flags = 0;
c277df42 2295 }
e1901655
IZ
2296 else if ((OP(oscan) == CURLYX)
2297 && (flags & SCF_WHILEM_VISITED_POS)
2298 /* See the comment on a similar expression above.
2299 However, this time it not a subexpression
2300 we care about, but the expression itself. */
2301 && (maxcount == REG_INFTY)
2302 && data && ++data->whilem_c < 16) {
2303 /* This stays as CURLYX, we can put the count/of pair. */
2c2d71f5
JH
2304 /* Find WHILEM (as in regexec.c) */
2305 regnode *nxt = oscan + NEXT_OFF(oscan);
2306
2307 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
2308 nxt += ARG(nxt);
eb160463
GS
2309 PREVOPER(nxt)->flags = (U8)(data->whilem_c
2310 | (RExC_whilem_seen << 4)); /* On WHILEM */
2c2d71f5 2311 }
b81d288d 2312 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
c277df42
IZ
2313 pars++;
2314 if (flags & SCF_DO_SUBSTR) {
c445ea15 2315 SV *last_str = NULL;
c277df42
IZ
2316 int counted = mincount != 0;
2317
2318 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
5d1c421c
JH
2319#if defined(SPARC64_GCC_WORKAROUND)
2320 I32 b = 0;
2321 STRLEN l = 0;
cfd0369c 2322 const char *s = NULL;
5d1c421c
JH
2323 I32 old = 0;
2324
2325 if (pos_before >= data->last_start_min)
2326 b = pos_before;
2327 else
2328 b = data->last_start_min;
2329
2330 l = 0;
cfd0369c 2331 s = SvPV_const(data->last_found, l);
5d1c421c
JH
2332 old = b - data->last_start_min;
2333
2334#else
b81d288d 2335 I32 b = pos_before >= data->last_start_min
c277df42
IZ
2336 ? pos_before : data->last_start_min;
2337 STRLEN l;
d4c19fe8 2338 const char * const s = SvPV_const(data->last_found, l);
a0ed51b3 2339 I32 old = b - data->last_start_min;
5d1c421c 2340#endif
a0ed51b3
LW
2341
2342 if (UTF)
2343 old = utf8_hop((U8*)s, old) - (U8*)s;
c277df42 2344
a0ed51b3 2345 l -= old;
c277df42 2346 /* Get the added string: */
79cb57f6 2347 last_str = newSVpvn(s + old, l);
0e933229
IH
2348 if (UTF)
2349 SvUTF8_on(last_str);
c277df42
IZ
2350 if (deltanext == 0 && pos_before == b) {
2351 /* What was added is a constant string */
2352 if (mincount > 1) {
2353 SvGROW(last_str, (mincount * l) + 1);
b81d288d 2354 repeatcpy(SvPVX(last_str) + l,
3f7c398e 2355 SvPVX_const(last_str), l, mincount - 1);
b162af07 2356 SvCUR_set(last_str, SvCUR(last_str) * mincount);
c277df42 2357 /* Add additional parts. */
b81d288d 2358 SvCUR_set(data->last_found,
c277df42
IZ
2359 SvCUR(data->last_found) - l);
2360 sv_catsv(data->last_found, last_str);
0eda9292
JH
2361 {
2362 SV * sv = data->last_found;
2363 MAGIC *mg =
2364 SvUTF8(sv) && SvMAGICAL(sv) ?
2365 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2366 if (mg && mg->mg_len >= 0)
2367 mg->mg_len += CHR_SVLEN(last_str);
2368 }
c277df42
IZ
2369 data->last_end += l * (mincount - 1);
2370 }
2a8d9689
HS
2371 } else {
2372 /* start offset must point into the last copy */
2373 data->last_start_min += minnext * (mincount - 1);
c152dc43
HS
2374 data->last_start_max += is_inf ? I32_MAX
2375 : (maxcount - 1) * (minnext + data->pos_delta);
c277df42
IZ
2376 }
2377 }
2378 /* It is counted once already... */
2379 data->pos_min += minnext * (mincount - counted);
2380 data->pos_delta += - counted * deltanext +
2381 (minnext + deltanext) * maxcount - minnext * mincount;
2382 if (mincount != maxcount) {
653099ff
GS
2383 /* Cannot extend fixed substrings found inside
2384 the group. */
830247a4 2385 scan_commit(pRExC_state,data);
c277df42 2386 if (mincount && last_str) {
d4c19fe8
AL
2387 SV * const sv = data->last_found;
2388 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
45f47268
NC
2389 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2390
2391 if (mg)
2392 mg->mg_len = -1;
2393 sv_setsv(sv, last_str);
c277df42 2394 data->last_end = data->pos_min;
b81d288d 2395 data->last_start_min =
a0ed51b3 2396 data->pos_min - CHR_SVLEN(last_str);
b81d288d
AB
2397 data->last_start_max = is_inf
2398 ? I32_MAX
c277df42 2399 : data->pos_min + data->pos_delta
a0ed51b3 2400 - CHR_SVLEN(last_str);
c277df42
IZ
2401 }
2402 data->longest = &(data->longest_float);
2403 }
aca2d497 2404 SvREFCNT_dec(last_str);
c277df42 2405 }
405ff068 2406 if (data && (fl & SF_HAS_EVAL))
c277df42
IZ
2407 data->flags |= SF_HAS_EVAL;
2408 optimize_curly_tail:
c277df42 2409 if (OP(oscan) != CURLYX) {
22c35a8c 2410 while (PL_regkind[(U8)OP(next = regnext(oscan))] == NOTHING
c277df42
IZ
2411 && NEXT_OFF(next))
2412 NEXT_OFF(oscan) += NEXT_OFF(next);
2413 }
c277df42 2414 continue;
653099ff 2415 default: /* REF and CLUMP only? */
c277df42 2416 if (flags & SCF_DO_SUBSTR) {
830247a4 2417 scan_commit(pRExC_state,data); /* Cannot expect anything... */
c277df42
IZ
2418 data->longest = &(data->longest_float);
2419 }
aca2d497 2420 is_inf = is_inf_internal = 1;
653099ff 2421 if (flags & SCF_DO_STCLASS_OR)
830247a4 2422 cl_anything(pRExC_state, data->start_class);
653099ff 2423 flags &= ~SCF_DO_STCLASS;
c277df42
IZ
2424 break;
2425 }
a0ed51b3 2426 }
bfed75c6 2427 else if (strchr((const char*)PL_simple,OP(scan))) {
9c5ffd7c 2428 int value = 0;
653099ff 2429
c277df42 2430 if (flags & SCF_DO_SUBSTR) {
830247a4 2431 scan_commit(pRExC_state,data);
c277df42
IZ
2432 data->pos_min++;
2433 }
2434 min++;
653099ff
GS
2435 if (flags & SCF_DO_STCLASS) {
2436 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
2437
2438 /* Some of the logic below assumes that switching
2439 locale on will only add false positives. */
2440 switch (PL_regkind[(U8)OP(scan)]) {
653099ff 2441 case SANY:
653099ff
GS
2442 default:
2443 do_default:
2444 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
2445 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
830247a4 2446 cl_anything(pRExC_state, data->start_class);
653099ff
GS
2447 break;
2448 case REG_ANY:
2449 if (OP(scan) == SANY)
2450 goto do_default;
2451 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
2452 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
2453 || (data->start_class->flags & ANYOF_CLASS));
830247a4 2454 cl_anything(pRExC_state, data->start_class);
653099ff
GS
2455 }
2456 if (flags & SCF_DO_STCLASS_AND || !value)
2457 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
2458 break;
2459 case ANYOF:
2460 if (flags & SCF_DO_STCLASS_AND)
2461 cl_and(data->start_class,
2462 (struct regnode_charclass_class*)scan);
2463 else
830247a4 2464 cl_or(pRExC_state, data->start_class,
653099ff
GS
2465 (struct regnode_charclass_class*)scan);
2466 break;
2467 case ALNUM:
2468 if (flags & SCF_DO_STCLASS_AND) {
2469 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2470 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
2471 for (value = 0; value < 256; value++)
2472 if (!isALNUM(value))
2473 ANYOF_BITMAP_CLEAR(data->start_class, value);
2474 }
2475 }
2476 else {
2477 if (data->start_class->flags & ANYOF_LOCALE)
2478 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
2479 else {
2480 for (value = 0; value < 256; value++)
2481 if (isALNUM(value))
b81d288d 2482 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
2483 }
2484 }
2485 break;
2486 case ALNUML:
2487 if (flags & SCF_DO_STCLASS_AND) {
2488 if (data->start_class->flags & ANYOF_LOCALE)
2489 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
2490 }
2491 else {
2492 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
2493 data->start_class->flags |= ANYOF_LOCALE;
2494 }
2495 break;
2496 case NALNUM:
2497 if (flags & SCF_DO_STCLASS_AND) {
2498 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2499 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
2500 for (value = 0; value < 256; value++)
2501 if (isALNUM(value))
2502 ANYOF_BITMAP_CLEAR(data->start_class, value);
2503 }
2504 }
2505 else {
2506 if (data->start_class->flags & ANYOF_LOCALE)
2507 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
2508 else {
2509 for (value = 0; value < 256; value++)
2510 if (!isALNUM(value))
b81d288d 2511 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
2512 }
2513 }
2514 break;
2515 case NALNUML:
2516 if (flags & SCF_DO_STCLASS_AND) {
2517 if (data->start_class->flags & ANYOF_LOCALE)
2518 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
2519 }
2520 else {
2521 data->start_class->flags |= ANYOF_LOCALE;
2522 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
2523 }
2524 break;
2525 case SPACE:
2526 if (flags & SCF_DO_STCLASS_AND) {
2527 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2528 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
2529 for (value = 0; value < 256; value++)
2530 if (!isSPACE(value))
2531 ANYOF_BITMAP_CLEAR(data->start_class, value);
2532 }
2533 }
2534 else {
2535 if (data->start_class->flags & ANYOF_LOCALE)
2536 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
2537 else {
2538 for (value = 0; value < 256; value++)
2539 if (isSPACE(value))
b81d288d 2540 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
2541 }
2542 }
2543 break;
2544 case SPACEL:
2545 if (flags & SCF_DO_STCLASS_AND) {
2546 if (data->start_class->flags & ANYOF_LOCALE)
2547 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
2548 }
2549 else {
2550 data->start_class->flags |= ANYOF_LOCALE;
2551 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
2552 }
2553 break;
2554 case NSPACE:
2555 if (flags & SCF_DO_STCLASS_AND) {
2556 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2557 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
2558 for (value = 0; value < 256; value++)
2559 if (isSPACE(value))
2560 ANYOF_BITMAP_CLEAR(data->start_class, value);
2561 }
2562 }
2563 else {
2564 if (data->start_class->flags & ANYOF_LOCALE)
2565 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
2566 else {
2567 for (value = 0; value < 256; value++)
2568 if (!isSPACE(value))
b81d288d 2569 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
2570 }
2571 }
2572 break;
2573 case NSPACEL:
2574 if (flags & SCF_DO_STCLASS_AND) {
2575 if (data->start_class->flags & ANYOF_LOCALE) {
2576 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
2577 for (value = 0; value < 256; value++)
2578 if (!isSPACE(value))
2579 ANYOF_BITMAP_CLEAR(data->start_class, value);
2580 }
2581 }
2582 else {
2583 data->start_class->flags |= ANYOF_LOCALE;
2584 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
2585 }
2586 break;
2587 case DIGIT:
2588 if (flags & SCF_DO_STCLASS_AND) {
2589 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
2590 for (value = 0; value < 256; value++)
2591 if (!isDIGIT(value))
2592 ANYOF_BITMAP_CLEAR(data->start_class, value);
2593 }
2594 else {
2595 if (data->start_class->flags & ANYOF_LOCALE)
2596 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
2597 else {
2598 for (value = 0; value < 256; value++)
2599 if (isDIGIT(value))
b81d288d 2600 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
2601 }
2602 }
2603 break;
2604 case NDIGIT:
2605 if (flags & SCF_DO_STCLASS_AND) {
2606 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
2607 for (value = 0; value < 256; value++)
2608 if (isDIGIT(value))
2609 ANYOF_BITMAP_CLEAR(data->start_class, value);
2610 }
2611 else {
2612 if (data->start_class->flags & ANYOF_LOCALE)
2613 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
2614 else {
2615 for (value = 0; value < 256; value++)
2616 if (!isDIGIT(value))
b81d288d 2617 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
2618 }
2619 }
2620 break;
2621 }
2622 if (flags & SCF_DO_STCLASS_OR)
2623 cl_and(data->start_class, &and_with);
2624 flags &= ~SCF_DO_STCLASS;
2625 }
a0ed51b3 2626 }
22c35a8c 2627 else if (PL_regkind[(U8)OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
c277df42
IZ
2628 data->flags |= (OP(scan) == MEOL
2629 ? SF_BEFORE_MEOL
2630 : SF_BEFORE_SEOL);
a0ed51b3 2631 }
653099ff
GS
2632 else if ( PL_regkind[(U8)OP(scan)] == BRANCHJ
2633 /* Lookbehind, or need to calculate parens/evals/stclass: */
2634 && (scan->flags || data || (flags & SCF_DO_STCLASS))
c277df42 2635 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
653099ff 2636 /* Lookahead/lookbehind */
cb434fcc 2637 I32 deltanext, minnext, fake = 0;
c277df42 2638 regnode *nscan;
653099ff
GS
2639 struct regnode_charclass_class intrnl;
2640 int f = 0;
c277df42
IZ
2641
2642 data_fake.flags = 0;
b81d288d 2643 if (data) {
2c2d71f5 2644 data_fake.whilem_c = data->whilem_c;
cb434fcc
IZ
2645 data_fake.last_closep = data->last_closep;
2646 }
2647 else
2648 data_fake.last_closep = &fake;
653099ff
GS
2649 if ( flags & SCF_DO_STCLASS && !scan->flags
2650 && OP(scan) == IFMATCH ) { /* Lookahead */
830247a4 2651 cl_init(pRExC_state, &intrnl);
653099ff 2652 data_fake.start_class = &intrnl;
e1901655 2653 f |= SCF_DO_STCLASS_AND;
653099ff 2654 }
e1901655
IZ
2655 if (flags & SCF_WHILEM_VISITED_POS)
2656 f |= SCF_WHILEM_VISITED_POS;
c277df42
IZ
2657 next = regnext(scan);
2658 nscan = NEXTOPER(NEXTOPER(scan));
a3621e74 2659 minnext = study_chunk(pRExC_state, &nscan, &deltanext, last, &data_fake, f,depth+1);
c277df42
IZ
2660 if (scan->flags) {
2661 if (deltanext) {
9baa0206 2662 vFAIL("Variable length lookbehind not implemented");
a0ed51b3
LW
2663 }
2664 else if (minnext > U8_MAX) {
9baa0206 2665 vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
c277df42 2666 }
eb160463 2667 scan->flags = (U8)minnext;
c277df42
IZ
2668 }
2669 if (data && data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
2670 pars++;
405ff068 2671 if (data && (data_fake.flags & SF_HAS_EVAL))
c277df42 2672 data->flags |= SF_HAS_EVAL;
2c2d71f5
JH
2673 if (data)
2674 data->whilem_c = data_fake.whilem_c;
e1901655 2675 if (f & SCF_DO_STCLASS_AND) {
a28509cc 2676 const int was = (data->start_class->flags & ANYOF_EOS);
653099ff
GS
2677
2678 cl_and(data->start_class, &intrnl);
2679 if (was)
2680 data->start_class->flags |= ANYOF_EOS;
2681 }
a0ed51b3
LW
2682 }
2683 else if (OP(scan) == OPEN) {
c277df42 2684 pars++;
a0ed51b3 2685 }
cb434fcc 2686 else if (OP(scan) == CLOSE) {
eb160463 2687 if ((I32)ARG(scan) == is_par) {
cb434fcc 2688 next = regnext(scan);
c277df42 2689
cb434fcc
IZ
2690 if ( next && (OP(next) != WHILEM) && next < last)
2691 is_par = 0; /* Disable optimization */
2692 }
2693 if (data)
2694 *(data->last_closep) = ARG(scan);
a0ed51b3
LW
2695 }
2696 else if (OP(scan) == EVAL) {
c277df42
IZ
2697 if (data)
2698 data->flags |= SF_HAS_EVAL;
2699 }
96776eda 2700 else if (OP(scan) == LOGICAL && scan->flags == 2) { /* Embedded follows */
0f5d15d6 2701 if (flags & SCF_DO_SUBSTR) {
830247a4 2702 scan_commit(pRExC_state,data);
0f5d15d6
IZ
2703 data->longest = &(data->longest_float);
2704 }
2705 is_inf = is_inf_internal = 1;
653099ff 2706 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
830247a4 2707 cl_anything(pRExC_state, data->start_class);
96776eda 2708 flags &= ~SCF_DO_STCLASS;
0f5d15d6 2709 }
c277df42
IZ
2710 /* Else: zero-length, ignore. */
2711 scan = regnext(scan);
2712 }
2713
2714 finish:
2715 *scanp = scan;
aca2d497 2716 *deltap = is_inf_internal ? I32_MAX : delta;
b81d288d 2717 if (flags & SCF_DO_SUBSTR && is_inf)
c277df42
IZ
2718 data->pos_delta = I32_MAX - data->pos_min;
2719 if (is_par > U8_MAX)
2720 is_par = 0;
2721 if (is_par && pars==1 && data) {
2722 data->flags |= SF_IN_PAR;
2723 data->flags &= ~SF_HAS_PAR;
a0ed51b3
LW
2724 }
2725 else if (pars && data) {
c277df42
IZ
2726 data->flags |= SF_HAS_PAR;
2727 data->flags &= ~SF_IN_PAR;
2728 }
653099ff
GS
2729 if (flags & SCF_DO_STCLASS_OR)
2730 cl_and(data->start_class, &and_with);
c277df42
IZ
2731 return min;
2732}
2733
76e3520e 2734STATIC I32
5f66b61c 2735S_add_data(RExC_state_t *pRExC_state, I32 n, const char *s)
c277df42 2736{
830247a4 2737 if (RExC_rx->data) {
b81d288d
AB
2738 Renewc(RExC_rx->data,
2739 sizeof(*RExC_rx->data) + sizeof(void*) * (RExC_rx->data->count + n - 1),
c277df42 2740 char, struct reg_data);
830247a4
IZ
2741 Renew(RExC_rx->data->what, RExC_rx->data->count + n, U8);
2742 RExC_rx->data->count += n;
a0ed51b3
LW
2743 }
2744 else {
a02a5408 2745 Newxc(RExC_rx->data, sizeof(*RExC_rx->data) + sizeof(void*) * (n - 1),
c277df42 2746 char, struct reg_data);
a02a5408 2747 Newx(RExC_rx->data->what, n, U8);
830247a4 2748 RExC_rx->data->count = n;
c277df42 2749 }
830247a4
IZ
2750 Copy(s, RExC_rx->data->what + RExC_rx->data->count - n, n, U8);
2751 return RExC_rx->data->count - n;
c277df42
IZ
2752}
2753
d88dccdf 2754void
864dbfa3 2755Perl_reginitcolors(pTHX)
d88dccdf 2756{
97aff369 2757 dVAR;
1df70142 2758 const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
d88dccdf 2759 if (s) {
1df70142
AL
2760 char *t = savepv(s);
2761 int i = 0;
2762 PL_colors[0] = t;
d88dccdf 2763 while (++i < 6) {
1df70142
AL
2764 t = strchr(t, '\t');
2765 if (t) {
2766 *t = '\0';
2767 PL_colors[i] = ++t;
d88dccdf
IZ
2768 }
2769 else
1df70142 2770 PL_colors[i] = t = (char *)"";
d88dccdf
IZ
2771 }
2772 } else {
1df70142 2773 int i = 0;
b81d288d 2774 while (i < 6)
06b5626a 2775 PL_colors[i++] = (char *)"";
d88dccdf
IZ
2776 }
2777 PL_colorset = 1;
2778}
2779
8615cb43 2780
a687059c 2781/*
e50aee73 2782 - pregcomp - compile a regular expression into internal code
a687059c
LW
2783 *
2784 * We can't allocate space until we know how big the compiled form will be,
2785 * but we can't compile it (and thus know how big it is) until we've got a
2786 * place to put the code. So we cheat: we compile it twice, once with code
2787 * generation turned off and size counting turned on, and once "for real".
2788 * This also means that we don't allocate space until we are sure that the
2789 * thing really will compile successfully, and we never have to move the
2790 * code and thus invalidate pointers into it. (Note that it has to be in
2791 * one piece because free() must be able to free it all.) [NB: not true in perl]
2792 *
2793 * Beware that the optimization-preparation code in here knows about some
2794 * of the structure of the compiled regexp. [I'll say.]
2795 */
2796regexp *
864dbfa3 2797Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
a687059c 2798{
97aff369 2799 dVAR;
a0d0e21e 2800 register regexp *r;
c277df42 2801 regnode *scan;
c277df42 2802 regnode *first;
a0d0e21e 2803 I32 flags;
a0d0e21e
LW
2804 I32 minlen = 0;
2805 I32 sawplus = 0;
2806 I32 sawopen = 0;
2c2d71f5 2807 scan_data_t data;
830247a4
IZ
2808 RExC_state_t RExC_state;
2809 RExC_state_t *pRExC_state = &RExC_state;
a0d0e21e 2810
a3621e74
YO
2811 GET_RE_DEBUG_FLAGS_DECL;
2812
a0d0e21e 2813 if (exp == NULL)
c277df42 2814 FAIL("NULL regexp argument");
a0d0e21e 2815
a5961de5 2816 RExC_utf8 = pm->op_pmdynflags & PMdf_CMP_UTF8;
a0ed51b3 2817
5cfc7842 2818 RExC_precomp = exp;
a3621e74
YO
2819 DEBUG_r(if (!PL_colorset) reginitcolors());
2820 DEBUG_COMPILE_r({
2821 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s \"%s%*s%s\"\n",
a5961de5
JH
2822 PL_colors[4],PL_colors[5],PL_colors[0],
2823 (int)(xend - exp), RExC_precomp, PL_colors[1]);
2824 });
e2509266 2825 RExC_flags = pm->op_pmflags;
830247a4 2826 RExC_sawback = 0;
bbce6d69 2827
830247a4
IZ
2828 RExC_seen = 0;
2829 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
2830 RExC_seen_evals = 0;
2831 RExC_extralen = 0;
c277df42 2832
bbce6d69 2833 /* First pass: determine size, legality. */
830247a4 2834 RExC_parse = exp;
fac92740 2835 RExC_start = exp;
830247a4
IZ
2836 RExC_end = xend;
2837 RExC_naughty = 0;
2838 RExC_npar = 1;
2839 RExC_size = 0L;
2840 RExC_emit = &PL_regdummy;
2841 RExC_whilem_seen = 0;
85ddcde9
JH
2842#if 0 /* REGC() is (currently) a NOP at the first pass.
2843 * Clever compilers notice this and complain. --jhi */
830247a4 2844 REGC((U8)REG_MAGIC, (char*)RExC_emit);
85ddcde9 2845#endif
830247a4 2846 if (reg(pRExC_state, 0, &flags) == NULL) {
c445ea15 2847 RExC_precomp = NULL;
a0d0e21e
LW
2848 return(NULL);
2849 }
a3621e74 2850 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "size %"IVdf" ", (IV)RExC_size));
c277df42 2851
c277df42
IZ
2852 /* Small enough for pointer-storage convention?
2853 If extralen==0, this means that we will not need long jumps. */
830247a4
IZ
2854 if (RExC_size >= 0x10000L && RExC_extralen)
2855 RExC_size += RExC_extralen;
c277df42 2856 else
830247a4
IZ
2857 RExC_extralen = 0;
2858 if (RExC_whilem_seen > 15)
2859 RExC_whilem_seen = 15;
a0d0e21e 2860
bbce6d69 2861 /* Allocate space and initialize. */
a02a5408 2862 Newxc(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode),
c277df42 2863 char, regexp);
a0d0e21e 2864 if (r == NULL)
b45f050a
JF
2865 FAIL("Regexp out of space");
2866
0f79a09d
GS
2867#ifdef DEBUGGING
2868 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
830247a4 2869 Zero(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode), char);
0f79a09d 2870#endif
c277df42 2871 r->refcnt = 1;
bbce6d69 2872 r->prelen = xend - exp;
5cfc7842 2873 r->precomp = savepvn(RExC_precomp, r->prelen);
cf93c79d 2874 r->subbeg = NULL;
f8c7b90f 2875#ifdef PERL_OLD_COPY_ON_WRITE
c445ea15 2876 r->saved_copy = NULL;
ed252734 2877#endif
cf93c79d 2878 r->reganch = pm->op_pmflags & PMf_COMPILETIME;
830247a4 2879 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
38d1b06f 2880 r->lastparen = 0; /* mg.c reads this. */
4327152a
IZ
2881
2882 r->substrs = 0; /* Useful during FAIL. */
2883 r->startp = 0; /* Useful during FAIL. */
2884 r->endp = 0; /* Useful during FAIL. */
2885
a02a5408 2886 Newxz(r->offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
fac92740 2887 if (r->offsets) {
2af232bd 2888 r->offsets[0] = RExC_size;
fac92740 2889 }
a3621e74 2890 DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
2af232bd
SS
2891 "%s %"UVuf" bytes for offset annotations.\n",
2892 r->offsets ? "Got" : "Couldn't get",
392fbf5d 2893 (UV)((2*RExC_size+1) * sizeof(U32))));
fac92740 2894
830247a4 2895 RExC_rx = r;
bbce6d69 2896
2897 /* Second pass: emit code. */
e2509266 2898 RExC_flags = pm->op_pmflags; /* don't let top level (?i) bleed */
830247a4
IZ
2899 RExC_parse = exp;
2900 RExC_end = xend;
2901 RExC_naughty = 0;
2902 RExC_npar = 1;
fac92740 2903 RExC_emit_start = r->program;
830247a4 2904 RExC_emit = r->program;
2cd61cdb 2905 /* Store the count of eval-groups for security checks: */
eb160463 2906 RExC_emit->next_off = (U16)((RExC_seen_evals > U16_MAX) ? U16_MAX : RExC_seen_evals);
830247a4 2907 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
c277df42 2908 r->data = 0;
830247a4 2909 if (reg(pRExC_state, 0, &flags) == NULL)
a0d0e21e
LW
2910 return(NULL);
2911
a3621e74 2912
a0d0e21e 2913 /* Dig out information for optimizations. */
cf93c79d 2914 r->reganch = pm->op_pmflags & PMf_COMPILETIME; /* Again? */
e2509266 2915 pm->op_pmflags = RExC_flags;
a0ed51b3 2916 if (UTF)
5ff6fc6d 2917 r->reganch |= ROPT_UTF8; /* Unicode in it? */
c277df42 2918 r->regstclass = NULL;
830247a4 2919 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
a0ed51b3 2920 r->reganch |= ROPT_NAUGHTY;
c277df42 2921 scan = r->program + 1; /* First BRANCH. */
2779dcf1
IZ
2922
2923 /* XXXX To minimize changes to RE engine we always allocate
2924 3-units-long substrs field. */
a02a5408 2925 Newxz(r->substrs, 1, struct reg_substr_data);
2779dcf1 2926
2c2d71f5 2927 StructCopy(&zero_scan_data, &data, scan_data_t);
653099ff 2928 /* XXXX Should not we check for something else? Usually it is OPEN1... */
c277df42 2929 if (OP(scan) != BRANCH) { /* Only one top-level choice. */
c277df42 2930 I32 fake;
c5254dd6 2931 STRLEN longest_float_length, longest_fixed_length;
653099ff
GS
2932 struct regnode_charclass_class ch_class;
2933 int stclass_flag;
cb434fcc 2934 I32 last_close = 0;
a0d0e21e
LW
2935
2936 first = scan;
c277df42 2937 /* Skip introductions and multiplicators >= 1. */
a0d0e21e 2938 while ((OP(first) == OPEN && (sawopen = 1)) ||
653099ff 2939 /* An OR of *one* alternative - should not happen now. */
a0d0e21e
LW
2940 (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) ||
2941 (OP(first) == PLUS) ||
2942 (OP(first) == MINMOD) ||
653099ff 2943 /* An {n,m} with n>0 */
22c35a8c 2944 (PL_regkind[(U8)OP(first)] == CURLY && ARG1(first) > 0) ) {
a0d0e21e
LW
2945 if (OP(first) == PLUS)
2946 sawplus = 1;
2947 else
2948 first += regarglen[(U8)OP(first)];
2949 first = NEXTOPER(first);
a687059c
LW
2950 }
2951
a0d0e21e
LW
2952 /* Starting-point info. */
2953 again:
653099ff 2954 if (PL_regkind[(U8)OP(first)] == EXACT) {
1aa99e6b 2955 if (OP(first) == EXACT)
bb263b4e 2956 /*EMPTY*/; /* Empty, get anchored substr later. */
1aa99e6b 2957 else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
b3c9acc1
IZ
2958 r->regstclass = first;
2959 }
bfed75c6 2960 else if (strchr((const char*)PL_simple,OP(first)))
a0d0e21e 2961 r->regstclass = first;
22c35a8c
GS
2962 else if (PL_regkind[(U8)OP(first)] == BOUND ||
2963 PL_regkind[(U8)OP(first)] == NBOUND)
a0d0e21e 2964 r->regstclass = first;
22c35a8c 2965 else if (PL_regkind[(U8)OP(first)] == BOL) {
cad2e5aa
JH
2966 r->reganch |= (OP(first) == MBOL
2967 ? ROPT_ANCH_MBOL
2968 : (OP(first) == SBOL
2969 ? ROPT_ANCH_SBOL
2970 : ROPT_ANCH_BOL));
a0d0e21e 2971 first = NEXTOPER(first);
774d564b 2972 goto again;
2973 }
2974 else if (OP(first) == GPOS) {
2975 r->reganch |= ROPT_ANCH_GPOS;
2976 first = NEXTOPER(first);
2977 goto again;
a0d0e21e 2978 }
e09294f4 2979 else if (!sawopen && (OP(first) == STAR &&
22c35a8c 2980 PL_regkind[(U8)OP(NEXTOPER(first))] == REG_ANY) &&
a0d0e21e
LW
2981 !(r->reganch & ROPT_ANCH) )
2982 {
2983 /* turn .* into ^.* with an implied $*=1 */
1df70142
AL
2984 const int type =
2985 (OP(NEXTOPER(first)) == REG_ANY)
2986 ? ROPT_ANCH_MBOL
2987 : ROPT_ANCH_SBOL;
cad2e5aa 2988 r->reganch |= type | ROPT_IMPLICIT;
a0d0e21e 2989 first = NEXTOPER(first);
774d564b 2990 goto again;
a0d0e21e 2991 }
b81d288d 2992 if (sawplus && (!sawopen || !RExC_sawback)
830247a4 2993 && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
cad2e5aa
JH
2994 /* x+ must match at the 1st pos of run of x's */
2995 r->reganch |= ROPT_SKIP;
a0d0e21e 2996
c277df42 2997 /* Scan is after the zeroth branch, first is atomic matcher. */
a3621e74 2998 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
b900a521 2999 (IV)(first - scan + 1)));
a0d0e21e
LW
3000 /*
3001 * If there's something expensive in the r.e., find the
3002 * longest literal string that must appear and make it the
3003 * regmust. Resolve ties in favor of later strings, since
3004 * the regstart check works with the beginning of the r.e.
3005 * and avoiding duplication strengthens checking. Not a
3006 * strong reason, but sufficient in the absence of others.
3007 * [Now we resolve ties in favor of the earlier string if
c277df42 3008 * it happens that c_offset_min has been invalidated, since the
a0d0e21e
LW
3009 * earlier string may buy us something the later one won't.]
3010 */
a0d0e21e 3011 minlen = 0;
a687059c 3012
396482e1
GA
3013 data.longest_fixed = newSVpvs("");
3014 data.longest_float = newSVpvs("");
3015 data.last_found = newSVpvs("");
c277df42
IZ
3016 data.longest = &(data.longest_fixed);
3017 first = scan;
653099ff 3018 if (!r->regstclass) {
830247a4 3019 cl_init(pRExC_state, &ch_class);
653099ff
GS
3020 data.start_class = &ch_class;
3021 stclass_flag = SCF_DO_STCLASS_AND;
3022 } else /* XXXX Check for BOUND? */
3023 stclass_flag = 0;
cb434fcc 3024 data.last_closep = &last_close;
653099ff 3025
830247a4 3026 minlen = study_chunk(pRExC_state, &first, &fake, scan + RExC_size, /* Up to end */
a3621e74 3027 &data, SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
830247a4 3028 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
b81d288d 3029 && data.last_start_min == 0 && data.last_end > 0
830247a4
IZ
3030 && !RExC_seen_zerolen
3031 && (!(RExC_seen & REG_SEEN_GPOS) || (r->reganch & ROPT_ANCH_GPOS)))
c277df42 3032 r->reganch |= ROPT_CHECK_ALL;
830247a4 3033 scan_commit(pRExC_state, &data);
c277df42
IZ
3034 SvREFCNT_dec(data.last_found);
3035
a0ed51b3 3036 longest_float_length = CHR_SVLEN(data.longest_float);
c5254dd6 3037 if (longest_float_length
c277df42
IZ
3038 || (data.flags & SF_FL_BEFORE_EOL
3039 && (!(data.flags & SF_FL_BEFORE_MEOL)
e2509266 3040 || (RExC_flags & PMf_MULTILINE)))) {
cf93c79d
IZ
3041 int t;
3042
a0ed51b3 3043 if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
aca2d497
IZ
3044 && data.offset_fixed == data.offset_float_min
3045 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
3046 goto remove_float; /* As in (a)+. */
3047
33b8afdf
JH
3048 if (SvUTF8(data.longest_float)) {
3049 r->float_utf8 = data.longest_float;
c445ea15 3050 r->float_substr = NULL;
33b8afdf
JH
3051 } else {
3052 r->float_substr = data.longest_float;
c445ea15 3053 r->float_utf8 = NULL;
33b8afdf 3054 }
c277df42
IZ
3055 r->float_min_offset = data.offset_float_min;
3056 r->float_max_offset = data.offset_float_max;
cf93c79d
IZ
3057 t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
3058 && (!(data.flags & SF_FL_BEFORE_MEOL)
e2509266 3059 || (RExC_flags & PMf_MULTILINE)));
33b8afdf 3060 fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
a0ed51b3
LW
3061 }
3062 else {
aca2d497 3063 remove_float:
c445ea15 3064 r->float_substr = r->float_utf8 = NULL;
c277df42 3065 SvREFCNT_dec(data.longest_float);
c5254dd6 3066 longest_float_length = 0;
a0d0e21e 3067 }
c277df42 3068
a0ed51b3 3069 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
c5254dd6 3070 if (longest_fixed_length
c277df42
IZ
3071 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
3072 && (!(data.flags & SF_FIX_BEFORE_MEOL)
e2509266 3073 || (RExC_flags & PMf_MULTILINE)))) {
cf93c79d
IZ
3074 int t;
3075
33b8afdf
JH
3076 if (SvUTF8(data.longest_fixed)) {
3077 r->anchored_utf8 = data.longest_fixed;
c445ea15 3078 r->anchored_substr = NULL;
33b8afdf
JH
3079 } else {
3080 r->anchored_substr = data.longest_fixed;
c445ea15 3081 r->anchored_utf8 = NULL;
33b8afdf 3082 }
c277df42 3083 r->anchored_offset = data.offset_fixed;
cf93c79d
IZ
3084 t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
3085 && (!(data.flags & SF_FIX_BEFORE_MEOL)
e2509266 3086 || (RExC_flags & PMf_MULTILINE)));
33b8afdf 3087 fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
a0ed51b3
LW
3088 }
3089 else {
c445ea15 3090 r->anchored_substr = r->anchored_utf8 = NULL;
c277df42 3091 SvREFCNT_dec(data.longest_fixed);
c5254dd6 3092 longest_fixed_length = 0;
a0d0e21e 3093 }
b81d288d 3094 if (r->regstclass
ffc61ed2 3095 && (OP(r->regstclass) == REG_ANY || OP(r->regstclass) == SANY))
653099ff 3096 r->regstclass = NULL;
33b8afdf
JH
3097 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
3098 && stclass_flag
653099ff 3099 && !(data.start_class->flags & ANYOF_EOS)
eb160463
GS
3100 && !cl_is_anything(data.start_class))
3101 {
1df70142 3102 const I32 n = add_data(pRExC_state, 1, "f");
653099ff 3103
a02a5408 3104 Newx(RExC_rx->data->data[n], 1,
653099ff
GS
3105 struct regnode_charclass_class);
3106 StructCopy(data.start_class,
830247a4 3107 (struct regnode_charclass_class*)RExC_rx->data->data[n],
653099ff 3108 struct regnode_charclass_class);
830247a4 3109 r->regstclass = (regnode*)RExC_rx->data->data[n];
653099ff 3110 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
a3621e74 3111 DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
32fc9b6a 3112 regprop(r, sv, (regnode*)data.start_class);
9c5ffd7c 3113 PerlIO_printf(Perl_debug_log,
a0288114 3114 "synthetic stclass \"%s\".\n",
3f7c398e 3115 SvPVX_const(sv));});
653099ff 3116 }
c277df42
IZ
3117
3118 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
c5254dd6 3119 if (longest_fixed_length > longest_float_length) {
c277df42 3120 r->check_substr = r->anchored_substr;
33b8afdf 3121 r->check_utf8 = r->anchored_utf8;
c277df42
IZ
3122 r->check_offset_min = r->check_offset_max = r->anchored_offset;
3123 if (r->reganch & ROPT_ANCH_SINGLE)
3124 r->reganch |= ROPT_NOSCAN;
a0ed51b3
LW
3125 }
3126 else {
c277df42 3127 r->check_substr = r->float_substr;
33b8afdf 3128 r->check_utf8 = r->float_utf8;
c277df42
IZ
3129 r->check_offset_min = data.offset_float_min;
3130 r->check_offset_max = data.offset_float_max;
a0d0e21e 3131 }
30382c73
IZ
3132 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
3133 This should be changed ASAP! */
33b8afdf 3134 if ((r->check_substr || r->check_utf8) && !(r->reganch & ROPT_ANCH_GPOS)) {
cad2e5aa 3135 r->reganch |= RE_USE_INTUIT;
33b8afdf 3136 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
cad2e5aa
JH
3137 r->reganch |= RE_INTUIT_TAIL;
3138 }
a0ed51b3
LW
3139 }
3140 else {
c277df42
IZ
3141 /* Several toplevels. Best we can is to set minlen. */
3142 I32 fake;
653099ff 3143 struct regnode_charclass_class ch_class;
cb434fcc 3144 I32 last_close = 0;
c277df42 3145
a3621e74 3146 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "\n"));
c277df42 3147 scan = r->program + 1;
830247a4 3148 cl_init(pRExC_state, &ch_class);
653099ff 3149 data.start_class = &ch_class;
cb434fcc 3150 data.last_closep = &last_close;
a3621e74 3151 minlen = study_chunk(pRExC_state, &scan, &fake, scan + RExC_size, &data, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
33b8afdf 3152 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
c445ea15 3153 = r->float_substr = r->float_utf8 = NULL;
653099ff 3154 if (!(data.start_class->flags & ANYOF_EOS)
eb160463
GS
3155 && !cl_is_anything(data.start_class))
3156 {
1df70142 3157 const I32 n = add_data(pRExC_state, 1, "f");
653099ff 3158
a02a5408 3159 Newx(RExC_rx->data->data[n], 1,
653099ff
GS
3160 struct regnode_charclass_class);
3161 StructCopy(data.start_class,
830247a4 3162 (struct regnode_charclass_class*)RExC_rx->data->data[n],
653099ff 3163 struct regnode_charclass_class);
830247a4 3164 r->regstclass = (regnode*)RExC_rx->data->data[n];
653099ff 3165 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
a3621e74 3166 DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
32fc9b6a 3167 regprop(r, sv, (regnode*)data.start_class);
9c5ffd7c 3168 PerlIO_printf(Perl_debug_log,
a0288114 3169 "synthetic stclass \"%s\".\n",
3f7c398e 3170 SvPVX_const(sv));});
653099ff 3171 }
a0d0e21e
LW
3172 }
3173
a0d0e21e 3174 r->minlen = minlen;
b81d288d 3175 if (RExC_seen & REG_SEEN_GPOS)
c277df42 3176 r->reganch |= ROPT_GPOS_SEEN;
830247a4 3177 if (RExC_seen & REG_SEEN_LOOKBEHIND)
c277df42 3178 r->reganch |= ROPT_LOOKBEHIND_SEEN;
830247a4 3179 if (RExC_seen & REG_SEEN_EVAL)
ce862d02 3180 r->reganch |= ROPT_EVAL_SEEN;
f33976b4
DB
3181 if (RExC_seen & REG_SEEN_CANY)
3182 r->reganch |= ROPT_CANY_SEEN;
a02a5408
JC
3183 Newxz(r->startp, RExC_npar, I32);
3184 Newxz(r->endp, RExC_npar, I32);
a3621e74 3185 DEBUG_COMPILE_r(regdump(r));
a0d0e21e 3186 return(r);
a687059c
LW
3187}
3188
3189/*
3190 - reg - regular expression, i.e. main body or parenthesized thing
3191 *
3192 * Caller must absorb opening parenthesis.
3193 *
3194 * Combining parenthesis handling with the base level of regular expression
3195 * is a trifle forced, but the need to tie the tails of the branches to what
3196 * follows makes it hard to avoid.
3197 */
76e3520e 3198STATIC regnode *
830247a4 3199S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp)
c277df42 3200 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
a687059c 3201{
27da23d5 3202 dVAR;
c277df42
IZ
3203 register regnode *ret; /* Will be the head of the group. */
3204 register regnode *br;
3205 register regnode *lastbr;
cbbf8932 3206 register regnode *ender = NULL;
a0d0e21e 3207 register I32 parno = 0;
cbbf8932
AL
3208 I32 flags;
3209 const I32 oregflags = RExC_flags;
6136c704
AL
3210 bool have_branch = 0;
3211 bool is_open = 0;
9d1d55b5
JP
3212
3213 /* for (?g), (?gc), and (?o) warnings; warning
3214 about (?c) will warn about (?g) -- japhy */
3215
6136c704
AL
3216#define WASTED_O 0x01
3217#define WASTED_G 0x02
3218#define WASTED_C 0x04
3219#define WASTED_GC (0x02|0x04)
cbbf8932 3220 I32 wastedflags = 0x00;
9d1d55b5 3221
fac92740 3222 char * parse_start = RExC_parse; /* MJD */
a28509cc 3223 char * const oregcomp_parse = RExC_parse;
a0d0e21e 3224
821b33a5 3225 *flagp = 0; /* Tentatively. */
a0d0e21e 3226
9d1d55b5 3227
a0d0e21e
LW
3228 /* Make an OPEN node, if parenthesized. */
3229 if (paren) {
fac92740 3230 if (*RExC_parse == '?') { /* (?...) */
2b36a5a0
JH
3231 U32 posflags = 0, negflags = 0;
3232 U32 *flagsp = &posflags;
6136c704 3233 bool is_logical = 0;
a28509cc 3234 const char * const seqstart = RExC_parse;
ca9dfc88 3235
830247a4
IZ
3236 RExC_parse++;
3237 paren = *RExC_parse++;
c277df42 3238 ret = NULL; /* For look-ahead/behind. */
a0d0e21e 3239 switch (paren) {
fac92740 3240 case '<': /* (?<...) */
830247a4 3241 RExC_seen |= REG_SEEN_LOOKBEHIND;
b81d288d 3242 if (*RExC_parse == '!')
c277df42 3243 paren = ',';
b81d288d 3244 if (*RExC_parse != '=' && *RExC_parse != '!')
c277df42 3245 goto unknown;
830247a4 3246 RExC_parse++;
fac92740
MJD
3247 case '=': /* (?=...) */
3248 case '!': /* (?!...) */
830247a4 3249 RExC_seen_zerolen++;
fac92740
MJD
3250 case ':': /* (?:...) */
3251 case '>': /* (?>...) */
a0d0e21e 3252 break;
fac92740
MJD
3253 case '$': /* (?$...) */
3254 case '@': /* (?@...) */
8615cb43 3255 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
a0d0e21e 3256 break;
fac92740 3257 case '#': /* (?#...) */
830247a4
IZ
3258 while (*RExC_parse && *RExC_parse != ')')
3259 RExC_parse++;
3260 if (*RExC_parse != ')')
c277df42 3261 FAIL("Sequence (?#... not terminated");
830247a4 3262 nextchar(pRExC_state);
a0d0e21e
LW
3263 *flagp = TRYAGAIN;
3264 return NULL;
fac92740 3265 case 'p': /* (?p...) */
9014280d 3266 if (SIZE_ONLY && ckWARN2(WARN_DEPRECATED, WARN_REGEXP))
f9373011 3267 vWARNdep(RExC_parse, "(?p{}) is deprecated - use (??{})");
8c8ad484 3268 /* FALL THROUGH*/
fac92740 3269 case '?': /* (??...) */
6136c704 3270 is_logical = 1;
438a3801
YST
3271 if (*RExC_parse != '{')
3272 goto unknown;
830247a4 3273 paren = *RExC_parse++;
0f5d15d6 3274 /* FALL THROUGH */
fac92740 3275 case '{': /* (?{...}) */
c277df42 3276 {
c277df42
IZ
3277 I32 count = 1, n = 0;
3278 char c;
830247a4 3279 char *s = RExC_parse;
c277df42 3280
830247a4
IZ
3281 RExC_seen_zerolen++;
3282 RExC_seen |= REG_SEEN_EVAL;
3283 while (count && (c = *RExC_parse)) {
6136c704
AL
3284 if (c == '\\') {
3285 if (RExC_parse[1])
3286 RExC_parse++;
3287 }
b81d288d 3288 else if (c == '{')
c277df42 3289 count++;
b81d288d 3290 else if (c == '}')
c277df42 3291 count--;
830247a4 3292 RExC_parse++;
c277df42 3293 }
6136c704 3294 if (*RExC_parse != ')') {
b81d288d 3295 RExC_parse = s;
b45f050a
JF
3296 vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
3297 }
c277df42 3298 if (!SIZE_ONLY) {
f3548bdc 3299 PAD *pad;
6136c704
AL
3300 OP_4tree *sop, *rop;
3301 SV * const sv = newSVpvn(s, RExC_parse - 1 - s);
c277df42 3302
569233ed
SB
3303 ENTER;
3304 Perl_save_re_context(aTHX);
f3548bdc 3305 rop = sv_compile_2op(sv, &sop, "re", &pad);
9b978d73
DM
3306 sop->op_private |= OPpREFCOUNTED;
3307 /* re_dup will OpREFCNT_inc */
3308 OpREFCNT_set(sop, 1);
569233ed 3309 LEAVE;
c277df42 3310
830247a4
IZ
3311 n = add_data(pRExC_state, 3, "nop");
3312 RExC_rx->data->data[n] = (void*)rop;
3313 RExC_rx->data->data[n+1] = (void*)sop;
f3548bdc 3314 RExC_rx->data->data[n+2] = (void*)pad;
c277df42 3315 SvREFCNT_dec(sv);
a0ed51b3 3316 }
e24b16f9 3317 else { /* First pass */
830247a4 3318 if (PL_reginterp_cnt < ++RExC_seen_evals
923e4eb5 3319 && IN_PERL_RUNTIME)
2cd61cdb
IZ
3320 /* No compiled RE interpolated, has runtime
3321 components ===> unsafe. */
3322 FAIL("Eval-group not allowed at runtime, use re 'eval'");
5b61d3f7 3323 if (PL_tainting && PL_tainted)
cc6b7395 3324 FAIL("Eval-group in insecure regular expression");
923e4eb5 3325 if (IN_PERL_COMPILETIME)
b5c19bd7 3326 PL_cv_has_eval = 1;
c277df42 3327 }
b5c19bd7 3328
830247a4 3329 nextchar(pRExC_state);
6136c704 3330 if (is_logical) {
830247a4 3331 ret = reg_node(pRExC_state, LOGICAL);
0f5d15d6
IZ
3332 if (!SIZE_ONLY)
3333 ret->flags = 2;
830247a4 3334 regtail(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
fac92740 3335 /* deal with the length of this later - MJD */
0f5d15d6
IZ
3336 return ret;
3337 }
ccb2c380
MP
3338 ret = reganode(pRExC_state, EVAL, n);
3339 Set_Node_Length(ret, RExC_parse - parse_start + 1);
3340 Set_Node_Offset(ret, parse_start);
3341 return ret;
c277df42 3342 }
fac92740 3343 case '(': /* (?(?{...})...) and (?(?=...)...) */
c277df42 3344 {
fac92740 3345 if (RExC_parse[0] == '?') { /* (?(?...)) */
b81d288d
AB
3346 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
3347 || RExC_parse[1] == '<'
830247a4 3348 || RExC_parse[1] == '{') { /* Lookahead or eval. */
c277df42
IZ
3349 I32 flag;
3350
830247a4 3351 ret = reg_node(pRExC_state, LOGICAL);
0f5d15d6
IZ
3352 if (!SIZE_ONLY)
3353 ret->flags = 1;
830247a4 3354 regtail(pRExC_state, ret, reg(pRExC_state, 1, &flag));
c277df42 3355 goto insert_if;
b81d288d 3356 }
a0ed51b3 3357 }
830247a4 3358 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
fac92740 3359 /* (?(1)...) */
6136c704 3360 char c;
830247a4 3361 parno = atoi(RExC_parse++);
c277df42 3362
830247a4
IZ
3363 while (isDIGIT(*RExC_parse))
3364 RExC_parse++;
fac92740 3365 ret = reganode(pRExC_state, GROUPP, parno);
2af232bd 3366
830247a4 3367 if ((c = *nextchar(pRExC_state)) != ')')
b45f050a 3368 vFAIL("Switch condition not recognized");
c277df42 3369 insert_if:
830247a4
IZ
3370 regtail(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
3371 br = regbranch(pRExC_state, &flags, 1);
c277df42 3372 if (br == NULL)
830247a4 3373 br = reganode(pRExC_state, LONGJMP, 0);
c277df42 3374 else
830247a4
IZ
3375 regtail(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
3376 c = *nextchar(pRExC_state);
d1b80229
IZ
3377 if (flags&HASWIDTH)
3378 *flagp |= HASWIDTH;
c277df42 3379 if (c == '|') {
830247a4
IZ
3380 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
3381 regbranch(pRExC_state, &flags, 1);
3382 regtail(pRExC_state, ret, lastbr);
d1b80229
IZ
3383 if (flags&HASWIDTH)
3384 *flagp |= HASWIDTH;
830247a4 3385 c = *nextchar(pRExC_state);
a0ed51b3
LW
3386 }
3387 else
c277df42
IZ
3388 lastbr = NULL;
3389 if (c != ')')
8615cb43 3390 vFAIL("Switch (?(condition)... contains too many branches");
830247a4
IZ
3391 ender = reg_node(pRExC_state, TAIL);
3392 regtail(pRExC_state, br, ender);
c277df42 3393 if (lastbr) {
830247a4
IZ
3394 regtail(pRExC_state, lastbr, ender);
3395 regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
a0ed51b3
LW
3396 }
3397 else
830247a4 3398 regtail(pRExC_state, ret, ender);
c277df42 3399 return ret;
a0ed51b3
LW
3400 }
3401 else {
830247a4 3402 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
c277df42
IZ
3403 }
3404 }
1b1626e4 3405 case 0:
830247a4 3406 RExC_parse--; /* for vFAIL to print correctly */
8615cb43 3407 vFAIL("Sequence (? incomplete");
1b1626e4 3408 break;
a0d0e21e 3409 default:
830247a4 3410 --RExC_parse;
fac92740 3411 parse_flags: /* (?i) */
830247a4 3412 while (*RExC_parse && strchr("iogcmsx", *RExC_parse)) {
9d1d55b5
JP
3413 /* (?g), (?gc) and (?o) are useless here
3414 and must be globally applied -- japhy */
3415
3416 if (*RExC_parse == 'o' || *RExC_parse == 'g') {
3417 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6136c704 3418 const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G;
9d1d55b5
JP
3419 if (! (wastedflags & wflagbit) ) {
3420 wastedflags |= wflagbit;
3421 vWARN5(
3422 RExC_parse + 1,
3423 "Useless (%s%c) - %suse /%c modifier",
3424 flagsp == &negflags ? "?-" : "?",
3425 *RExC_parse,
3426 flagsp == &negflags ? "don't " : "",
3427 *RExC_parse
3428 );
3429 }
3430 }
3431 }
3432 else if (*RExC_parse == 'c') {
3433 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6136c704
AL
3434 if (! (wastedflags & WASTED_C) ) {
3435 wastedflags |= WASTED_GC;
9d1d55b5
JP
3436 vWARN3(
3437 RExC_parse + 1,
3438 "Useless (%sc) - %suse /gc modifier",
3439 flagsp == &negflags ? "?-" : "?",
3440 flagsp == &negflags ? "don't " : ""
3441 );
3442 }
3443 }
3444 }
3445 else { pmflag(flagsp, *RExC_parse); }
3446
830247a4 3447 ++RExC_parse;
ca9dfc88 3448 }
830247a4 3449 if (*RExC_parse == '-') {
ca9dfc88 3450 flagsp = &negflags;
9d1d55b5 3451 wastedflags = 0; /* reset so (?g-c) warns twice */
830247a4 3452 ++RExC_parse;
ca9dfc88 3453 goto parse_flags;
48c036b1 3454 }
e2509266
JH
3455 RExC_flags |= posflags;
3456 RExC_flags &= ~negflags;
830247a4
IZ
3457 if (*RExC_parse == ':') {
3458 RExC_parse++;
ca9dfc88
IZ
3459 paren = ':';
3460 break;
3461 }
c277df42 3462 unknown:
830247a4
IZ
3463 if (*RExC_parse != ')') {
3464 RExC_parse++;
3465 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
b45f050a 3466 }
830247a4 3467 nextchar(pRExC_state);
a0d0e21e
LW
3468 *flagp = TRYAGAIN;
3469 return NULL;
3470 }
3471 }
fac92740 3472 else { /* (...) */
830247a4
IZ
3473 parno = RExC_npar;
3474 RExC_npar++;
3475 ret = reganode(pRExC_state, OPEN, parno);
fac92740
MJD
3476 Set_Node_Length(ret, 1); /* MJD */
3477 Set_Node_Offset(ret, RExC_parse); /* MJD */
6136c704 3478 is_open = 1;
a0d0e21e 3479 }
a0ed51b3 3480 }
fac92740 3481 else /* ! paren */
a0d0e21e
LW
3482 ret = NULL;
3483
3484 /* Pick up the branches, linking them together. */
fac92740 3485 parse_start = RExC_parse; /* MJD */
830247a4 3486 br = regbranch(pRExC_state, &flags, 1);
fac92740 3487 /* branch_len = (paren != 0); */
2af232bd 3488
a0d0e21e
LW
3489 if (br == NULL)
3490 return(NULL);
830247a4
IZ
3491 if (*RExC_parse == '|') {
3492 if (!SIZE_ONLY && RExC_extralen) {
3493 reginsert(pRExC_state, BRANCHJ, br);
a0ed51b3 3494 }
fac92740 3495 else { /* MJD */
830247a4 3496 reginsert(pRExC_state, BRANCH, br);
fac92740
MJD
3497 Set_Node_Length(br, paren != 0);
3498 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
3499 }
c277df42
IZ
3500 have_branch = 1;
3501 if (SIZE_ONLY)
830247a4 3502 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
a0ed51b3
LW
3503 }
3504 else if (paren == ':') {
c277df42
IZ
3505 *flagp |= flags&SIMPLE;
3506 }
6136c704 3507 if (is_open) { /* Starts with OPEN. */
830247a4 3508 regtail(pRExC_state, ret, br); /* OPEN -> first. */
a0ed51b3
LW
3509 }
3510 else if (paren != '?') /* Not Conditional */
a0d0e21e 3511 ret = br;
32a0ca98 3512 *flagp |= flags & (SPSTART | HASWIDTH);
c277df42 3513 lastbr = br;
830247a4
IZ
3514 while (*RExC_parse == '|') {
3515 if (!SIZE_ONLY && RExC_extralen) {
3516 ender = reganode(pRExC_state, LONGJMP,0);
3517 regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
c277df42
IZ
3518 }
3519 if (SIZE_ONLY)
830247a4
IZ
3520 RExC_extralen += 2; /* Account for LONGJMP. */
3521 nextchar(pRExC_state);
3522 br = regbranch(pRExC_state, &flags, 0);
2af232bd 3523
a687059c 3524 if (br == NULL)
a0d0e21e 3525 return(NULL);
830247a4 3526 regtail(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
c277df42 3527 lastbr = br;
821b33a5
IZ
3528 if (flags&HASWIDTH)
3529 *flagp |= HASWIDTH;
a687059c 3530 *flagp |= flags&SPSTART;
a0d0e21e
LW
3531 }
3532
c277df42
IZ
3533 if (have_branch || paren != ':') {
3534 /* Make a closing node, and hook it on the end. */
3535 switch (paren) {
3536 case ':':
830247a4 3537 ender = reg_node(pRExC_state, TAIL);
c277df42
IZ
3538 break;
3539 case 1:
830247a4 3540 ender = reganode(pRExC_state, CLOSE, parno);
fac92740
MJD
3541 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
3542 Set_Node_Length(ender,1); /* MJD */
c277df42
IZ
3543 break;
3544 case '<':
c277df42
IZ
3545 case ',':
3546 case '=':
3547 case '!':
c277df42 3548 *flagp &= ~HASWIDTH;
821b33a5
IZ
3549 /* FALL THROUGH */
3550 case '>':
830247a4 3551 ender = reg_node(pRExC_state, SUCCEED);
c277df42
IZ
3552 break;
3553 case 0:
830247a4 3554 ender = reg_node(pRExC_state, END);
c277df42
IZ
3555 break;
3556 }
830247a4 3557 regtail(pRExC_state, lastbr, ender);
a0d0e21e 3558
c277df42
IZ
3559 if (have_branch) {
3560 /* Hook the tails of the branches to the closing node. */
3561 for (br = ret; br != NULL; br = regnext(br)) {
830247a4 3562 regoptail(pRExC_state, br, ender);
c277df42
IZ
3563 }
3564 }
a0d0e21e 3565 }
c277df42
IZ
3566
3567 {
e1ec3a88
AL
3568 const char *p;
3569 static const char parens[] = "=!<,>";
c277df42
IZ
3570
3571 if (paren && (p = strchr(parens, paren))) {
eb160463 3572 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
c277df42
IZ
3573 int flag = (p - parens) > 1;
3574
3575 if (paren == '>')
3576 node = SUSPEND, flag = 0;
830247a4 3577 reginsert(pRExC_state, node,ret);
45948336
EP
3578 Set_Node_Cur_Length(ret);
3579 Set_Node_Offset(ret, parse_start + 1);
c277df42 3580 ret->flags = flag;
830247a4 3581 regtail(pRExC_state, ret, reg_node(pRExC_state, TAIL));
c277df42 3582 }
a0d0e21e
LW
3583 }
3584
3585 /* Check for proper termination. */
ce3e6498 3586 if (paren) {
e2509266 3587 RExC_flags = oregflags;
830247a4
IZ
3588 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
3589 RExC_parse = oregcomp_parse;
380a0633 3590 vFAIL("Unmatched (");
ce3e6498 3591 }
a0ed51b3 3592 }
830247a4
IZ
3593 else if (!paren && RExC_parse < RExC_end) {
3594 if (*RExC_parse == ')') {
3595 RExC_parse++;
380a0633 3596 vFAIL("Unmatched )");
a0ed51b3
LW
3597 }
3598 else
b45f050a 3599 FAIL("Junk on end of regexp"); /* "Can't happen". */
a0d0e21e
LW
3600 /* NOTREACHED */
3601 }
a687059c 3602
a0d0e21e 3603 return(ret);
a687059c
LW
3604}
3605
3606/*
3607 - regbranch - one alternative of an | operator
3608 *
3609 * Implements the concatenation operator.
3610 */
76e3520e 3611STATIC regnode *
830247a4 3612S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first)
a687059c 3613{
97aff369 3614 dVAR;
c277df42
IZ
3615 register regnode *ret;
3616 register regnode *chain = NULL;
3617 register regnode *latest;
3618 I32 flags = 0, c = 0;
a0d0e21e 3619
b81d288d 3620 if (first)
c277df42
IZ
3621 ret = NULL;
3622 else {
b81d288d 3623 if (!SIZE_ONLY && RExC_extralen)
830247a4 3624 ret = reganode(pRExC_state, BRANCHJ,0);
fac92740 3625 else {
830247a4 3626 ret = reg_node(pRExC_state, BRANCH);
fac92740
MJD
3627 Set_Node_Length(ret, 1);
3628 }
c277df42
IZ
3629 }
3630
b81d288d 3631 if (!first && SIZE_ONLY)
830247a4 3632 RExC_extralen += 1; /* BRANCHJ */
b81d288d 3633
c277df42 3634 *flagp = WORST; /* Tentatively. */
a0d0e21e 3635
830247a4
IZ
3636 RExC_parse--;
3637 nextchar(pRExC_state);
3638 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
a0d0e21e 3639 flags &= ~TRYAGAIN;
830247a4 3640 latest = regpiece(pRExC_state, &flags);
a0d0e21e
LW
3641 if (latest == NULL) {
3642 if (flags & TRYAGAIN)
3643 continue;
3644 return(NULL);
a0ed51b3
LW
3645 }
3646 else if (ret == NULL)
c277df42 3647 ret = latest;
a0d0e21e 3648 *flagp |= flags&HASWIDTH;
c277df42 3649 if (chain == NULL) /* First piece. */
a0d0e21e
LW
3650 *flagp |= flags&SPSTART;
3651 else {
830247a4
IZ
3652 RExC_naughty++;
3653 regtail(pRExC_state, chain, latest);
a687059c 3654 }
a0d0e21e 3655 chain = latest;
c277df42
IZ
3656 c++;
3657 }
3658 if (chain == NULL) { /* Loop ran zero times. */
830247a4 3659 chain = reg_node(pRExC_state, NOTHING);
c277df42
IZ
3660 if (ret == NULL)
3661 ret = chain;
3662 }
3663 if (c == 1) {
3664 *flagp |= flags&SIMPLE;
a0d0e21e 3665 }
a687059c 3666
d4c19fe8 3667 return ret;
a687059c
LW
3668}
3669
3670/*
3671 - regpiece - something followed by possible [*+?]
3672 *
3673 * Note that the branching code sequences used for ? and the general cases
3674 * of * and + are somewhat optimized: they use the same NOTHING node as
3675 * both the endmarker for their branch list and the body of the last branch.
3676 * It might seem that this node could be dispensed with entirely, but the
3677 * endmarker role is not redundant.
3678 */
76e3520e 3679STATIC regnode *
830247a4 3680S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
a687059c 3681{
97aff369 3682 dVAR;
c277df42 3683 register regnode *ret;
a0d0e21e
LW
3684 register char op;
3685 register char *next;
3686 I32 flags;
1df70142 3687 const char * const origparse = RExC_parse;
a0d0e21e
LW
3688 char *maxpos;
3689 I32 min;
c277df42 3690 I32 max = REG_INFTY;
fac92740 3691 char *parse_start;
a0d0e21e 3692
830247a4 3693 ret = regatom(pRExC_state, &flags);
a0d0e21e
LW
3694 if (ret == NULL) {
3695 if (flags & TRYAGAIN)
3696 *flagp |= TRYAGAIN;
3697 return(NULL);
3698 }
3699
830247a4 3700 op = *RExC_parse;
a0d0e21e 3701
830247a4 3702 if (op == '{' && regcurly(RExC_parse)) {
fac92740 3703 parse_start = RExC_parse; /* MJD */
830247a4 3704 next = RExC_parse + 1;
c445ea15 3705 maxpos = NULL;
a0d0e21e
LW
3706 while (isDIGIT(*next) || *next == ',') {
3707 if (*next == ',') {
3708 if (maxpos)
3709 break;
3710 else
3711 maxpos = next;
a687059c 3712 }
a0d0e21e
LW
3713 next++;
3714 }
3715 if (*next == '}') { /* got one */
3716 if (!maxpos)
3717 maxpos = next;
830247a4
IZ
3718 RExC_parse++;
3719 min = atoi(RExC_parse);
a0d0e21e
LW
3720 if (*maxpos == ',')
3721 maxpos++;
3722 else
830247a4 3723 maxpos = RExC_parse;
a0d0e21e
LW
3724 max = atoi(maxpos);
3725 if (!max && *maxpos != '0')
c277df42
IZ
3726 max = REG_INFTY; /* meaning "infinity" */
3727 else if (max >= REG_INFTY)
8615cb43 3728 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
830247a4
IZ
3729 RExC_parse = next;
3730 nextchar(pRExC_state);
a0d0e21e
LW
3731
3732 do_curly:
3733 if ((flags&SIMPLE)) {
830247a4
IZ
3734 RExC_naughty += 2 + RExC_naughty / 2;
3735 reginsert(pRExC_state, CURLY, ret);
fac92740
MJD
3736 Set_Node_Offset(ret, parse_start+1); /* MJD */
3737 Set_Node_Cur_Length(ret);
a0d0e21e
LW
3738 }
3739 else {
830247a4 3740 regnode *w = reg_node(pRExC_state, WHILEM);
2c2d71f5
JH
3741
3742 w->flags = 0;
830247a4
IZ
3743 regtail(pRExC_state, ret, w);
3744 if (!SIZE_ONLY && RExC_extralen) {
3745 reginsert(pRExC_state, LONGJMP,ret);
3746 reginsert(pRExC_state, NOTHING,ret);
c277df42
IZ
3747 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
3748 }
830247a4 3749 reginsert(pRExC_state, CURLYX,ret);
fac92740
MJD
3750 /* MJD hk */
3751 Set_Node_Offset(ret, parse_start+1);
2af232bd 3752 Set_Node_Length(ret,
fac92740 3753 op == '{' ? (RExC_parse - parse_start) : 1);
2af232bd 3754
830247a4 3755 if (!SIZE_ONLY && RExC_extralen)
c277df42 3756 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
830247a4 3757 regtail(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
c277df42 3758 if (SIZE_ONLY)
830247a4
IZ
3759 RExC_whilem_seen++, RExC_extralen += 3;
3760 RExC_naughty += 4 + RExC_naughty; /* compound interest */
a0d0e21e 3761 }
c277df42 3762 ret->flags = 0;
a0d0e21e
LW
3763
3764 if (min > 0)
821b33a5
IZ
3765 *flagp = WORST;
3766 if (max > 0)
3767 *flagp |= HASWIDTH;
a0d0e21e 3768 if (max && max < min)
8615cb43 3769 vFAIL("Can't do {n,m} with n > m");
c277df42 3770 if (!SIZE_ONLY) {
eb160463
GS
3771 ARG1_SET(ret, (U16)min);
3772 ARG2_SET(ret, (U16)max);
a687059c 3773 }
a687059c 3774
a0d0e21e 3775 goto nest_check;
a687059c 3776 }
a0d0e21e 3777 }
a687059c 3778
a0d0e21e
LW
3779 if (!ISMULT1(op)) {
3780 *flagp = flags;
a687059c 3781 return(ret);
a0d0e21e 3782 }
bb20fd44 3783
c277df42 3784#if 0 /* Now runtime fix should be reliable. */
b45f050a
JF
3785
3786 /* if this is reinstated, don't forget to put this back into perldiag:
3787
3788 =item Regexp *+ operand could be empty at {#} in regex m/%s/
3789
3790 (F) The part of the regexp subject to either the * or + quantifier
3791 could match an empty string. The {#} shows in the regular
3792 expression about where the problem was discovered.
3793
3794 */
3795
bb20fd44 3796 if (!(flags&HASWIDTH) && op != '?')
b45f050a 3797 vFAIL("Regexp *+ operand could be empty");
b81d288d 3798#endif
bb20fd44 3799
fac92740 3800 parse_start = RExC_parse;
830247a4 3801 nextchar(pRExC_state);
a0d0e21e 3802
821b33a5 3803 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
a0d0e21e
LW
3804
3805 if (op == '*' && (flags&SIMPLE)) {
830247a4 3806 reginsert(pRExC_state, STAR, ret);
c277df42 3807 ret->flags = 0;
830247a4 3808 RExC_naughty += 4;
a0d0e21e
LW
3809 }
3810 else if (op == '*') {
3811 min = 0;
3812 goto do_curly;
a0ed51b3
LW
3813 }
3814 else if (op == '+' && (flags&SIMPLE)) {
830247a4 3815 reginsert(pRExC_state, PLUS, ret);
c277df42 3816 ret->flags = 0;
830247a4 3817 RExC_naughty += 3;
a0d0e21e
LW
3818 }
3819 else if (op == '+') {
3820 min = 1;
3821 goto do_curly;
a0ed51b3
LW
3822 }
3823 else if (op == '?') {
a0d0e21e
LW
3824 min = 0; max = 1;
3825 goto do_curly;
3826 }
3827 nest_check:
041457d9 3828 if (!SIZE_ONLY && !(flags&HASWIDTH) && max > REG_INFTY/3 && ckWARN(WARN_REGEXP)) {
830247a4 3829 vWARN3(RExC_parse,
b45f050a 3830 "%.*s matches null string many times",
afd78fd5 3831 (int)(RExC_parse >= origparse ? RExC_parse - origparse : 0),
b45f050a 3832 origparse);
a0d0e21e
LW
3833 }
3834
830247a4
IZ
3835 if (*RExC_parse == '?') {
3836 nextchar(pRExC_state);
3837 reginsert(pRExC_state, MINMOD, ret);
3838 regtail(pRExC_state, ret, ret + NODE_STEP_REGNODE);
a0d0e21e 3839 }
830247a4
IZ
3840 if (ISMULT2(RExC_parse)) {
3841 RExC_parse++;
b45f050a
JF
3842 vFAIL("Nested quantifiers");
3843 }
a0d0e21e
LW
3844
3845 return(ret);
a687059c
LW
3846}
3847
3848/*
3849 - regatom - the lowest level
3850 *
3851 * Optimization: gobbles an entire sequence of ordinary characters so that
3852 * it can turn them into a single node, which is smaller to store and
3853 * faster to run. Backslashed characters are exceptions, each becoming a
3854 * separate node; the code is simpler that way and it's not worth fixing.
3855 *
b45f050a 3856 * [Yes, it is worth fixing, some scripts can run twice the speed.] */
76e3520e 3857STATIC regnode *
830247a4 3858S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
a687059c 3859{
97aff369 3860 dVAR;
cbbf8932 3861 register regnode *ret = NULL;
a0d0e21e 3862 I32 flags;
45948336 3863 char *parse_start = RExC_parse;
a0d0e21e
LW
3864
3865 *flagp = WORST; /* Tentatively. */
3866
3867tryagain:
830247a4 3868 switch (*RExC_parse) {
a0d0e21e 3869 case '^':
830247a4
IZ
3870 RExC_seen_zerolen++;
3871 nextchar(pRExC_state);
e2509266 3872 if (RExC_flags & PMf_MULTILINE)
830247a4 3873 ret = reg_node(pRExC_state, MBOL);
e2509266 3874 else if (RExC_flags & PMf_SINGLELINE)
830247a4 3875 ret = reg_node(pRExC_state, SBOL);
a0d0e21e 3876 else
830247a4 3877 ret = reg_node(pRExC_state, BOL);
fac92740 3878 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
3879 break;
3880 case '$':
830247a4 3881 nextchar(pRExC_state);
b81d288d 3882 if (*RExC_parse)
830247a4 3883 RExC_seen_zerolen++;
e2509266 3884 if (RExC_flags & PMf_MULTILINE)
830247a4 3885 ret = reg_node(pRExC_state, MEOL);
e2509266 3886 else if (RExC_flags & PMf_SINGLELINE)
830247a4 3887 ret = reg_node(pRExC_state, SEOL);
a0d0e21e 3888 else
830247a4 3889 ret = reg_node(pRExC_state, EOL);
fac92740 3890 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
3891 break;
3892 case '.':
830247a4 3893 nextchar(pRExC_state);
e2509266 3894 if (RExC_flags & PMf_SINGLELINE)
ffc61ed2
JH
3895 ret = reg_node(pRExC_state, SANY);
3896 else
3897 ret = reg_node(pRExC_state, REG_ANY);
3898 *flagp |= HASWIDTH|SIMPLE;
830247a4 3899 RExC_naughty++;
fac92740 3900 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
3901 break;
3902 case '[':
b45f050a 3903 {
830247a4 3904 char *oregcomp_parse = ++RExC_parse;
ffc61ed2 3905 ret = regclass(pRExC_state);
830247a4
IZ
3906 if (*RExC_parse != ']') {
3907 RExC_parse = oregcomp_parse;
b45f050a
JF
3908 vFAIL("Unmatched [");
3909 }
830247a4 3910 nextchar(pRExC_state);
a0d0e21e 3911 *flagp |= HASWIDTH|SIMPLE;
fac92740 3912 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
a0d0e21e 3913 break;
b45f050a 3914 }
a0d0e21e 3915 case '(':
830247a4
IZ
3916 nextchar(pRExC_state);
3917 ret = reg(pRExC_state, 1, &flags);
a0d0e21e 3918 if (ret == NULL) {
bf93d4cc 3919 if (flags & TRYAGAIN) {
830247a4 3920 if (RExC_parse == RExC_end) {
bf93d4cc
GS
3921 /* Make parent create an empty node if needed. */
3922 *flagp |= TRYAGAIN;
3923 return(NULL);
3924 }
a0d0e21e 3925 goto tryagain;
bf93d4cc 3926 }
a0d0e21e
LW
3927 return(NULL);
3928 }
c277df42 3929 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE);
a0d0e21e
LW
3930 break;
3931 case '|':
3932 case ')':
3933 if (flags & TRYAGAIN) {
3934 *flagp |= TRYAGAIN;
3935 return NULL;
3936 }
b45f050a 3937 vFAIL("Internal urp");
a0d0e21e
LW
3938 /* Supposed to be caught earlier. */
3939 break;
85afd4ae 3940 case '{':
830247a4
IZ
3941 if (!regcurly(RExC_parse)) {
3942 RExC_parse++;
85afd4ae
CS
3943 goto defchar;
3944 }
3945 /* FALL THROUGH */
a0d0e21e
LW
3946 case '?':
3947 case '+':
3948 case '*':
830247a4 3949 RExC_parse++;
b45f050a 3950 vFAIL("Quantifier follows nothing");
a0d0e21e
LW
3951 break;
3952 case '\\':
830247a4 3953 switch (*++RExC_parse) {
a0d0e21e 3954 case 'A':
830247a4
IZ
3955 RExC_seen_zerolen++;
3956 ret = reg_node(pRExC_state, SBOL);
a0d0e21e 3957 *flagp |= SIMPLE;
830247a4 3958 nextchar(pRExC_state);
fac92740 3959 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
3960 break;
3961 case 'G':
830247a4
IZ
3962 ret = reg_node(pRExC_state, GPOS);
3963 RExC_seen |= REG_SEEN_GPOS;
a0d0e21e 3964 *flagp |= SIMPLE;
830247a4 3965 nextchar(pRExC_state);
fac92740 3966 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
3967 break;
3968 case 'Z':
830247a4 3969 ret = reg_node(pRExC_state, SEOL);
a0d0e21e 3970 *flagp |= SIMPLE;
a1917ab9 3971 RExC_seen_zerolen++; /* Do not optimize RE away */
830247a4 3972 nextchar(pRExC_state);
a0d0e21e 3973 break;
b85d18e9 3974 case 'z':
830247a4 3975 ret = reg_node(pRExC_state, EOS);
b85d18e9 3976 *flagp |= SIMPLE;
830247a4
IZ
3977 RExC_seen_zerolen++; /* Do not optimize RE away */
3978 nextchar(pRExC_state);
fac92740 3979 Set_Node_Length(ret, 2); /* MJD */
b85d18e9 3980 break;
4a2d328f 3981 case 'C':
f33976b4
DB
3982 ret = reg_node(pRExC_state, CANY);
3983 RExC_seen |= REG_SEEN_CANY;
a0ed51b3 3984 *flagp |= HASWIDTH|SIMPLE;
830247a4 3985 nextchar(pRExC_state);
fac92740 3986 Set_Node_Length(ret, 2); /* MJD */
a0ed51b3
LW
3987 break;
3988 case 'X':
830247a4 3989 ret = reg_node(pRExC_state, CLUMP);
a0ed51b3 3990 *flagp |= HASWIDTH;
830247a4 3991 nextchar(pRExC_state);
fac92740 3992 Set_Node_Length(ret, 2); /* MJD */
a0ed51b3 3993 break;
a0d0e21e 3994 case 'w':
eb160463 3995 ret = reg_node(pRExC_state, (U8)(LOC ? ALNUML : ALNUM));
a0d0e21e 3996 *flagp |= HASWIDTH|SIMPLE;
830247a4 3997 nextchar(pRExC_state);
fac92740 3998 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
3999 break;
4000 case 'W':
eb160463 4001 ret = reg_node(pRExC_state, (U8)(LOC ? NALNUML : NALNUM));
a0d0e21e 4002 *flagp |= HASWIDTH|SIMPLE;
830247a4 4003 nextchar(pRExC_state);
fac92740 4004 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
4005 break;
4006 case 'b':
830247a4
IZ
4007 RExC_seen_zerolen++;
4008 RExC_seen |= REG_SEEN_LOOKBEHIND;
eb160463 4009 ret = reg_node(pRExC_state, (U8)(LOC ? BOUNDL : BOUND));
a0d0e21e 4010 *flagp |= SIMPLE;
830247a4 4011 nextchar(pRExC_state);
fac92740 4012 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
4013 break;
4014 case 'B':
830247a4
IZ
4015 RExC_seen_zerolen++;
4016 RExC_seen |= REG_SEEN_LOOKBEHIND;
eb160463 4017 ret = reg_node(pRExC_state, (U8)(LOC ? NBOUNDL : NBOUND));
a0d0e21e 4018 *flagp |= SIMPLE;
830247a4 4019 nextchar(pRExC_state);
fac92740 4020 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
4021 break;
4022 case 's':
eb160463 4023 ret = reg_node(pRExC_state, (U8)(LOC ? SPACEL : SPACE));
a0d0e21e 4024 *flagp |= HASWIDTH|SIMPLE;
830247a4 4025 nextchar(pRExC_state);
fac92740 4026 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
4027 break;
4028 case 'S':
eb160463 4029 ret = reg_node(pRExC_state, (U8)(LOC ? NSPACEL : NSPACE));
a0d0e21e 4030 *flagp |= HASWIDTH|SIMPLE;
830247a4 4031 nextchar(pRExC_state);
fac92740 4032 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
4033 break;
4034 case 'd':
ffc61ed2 4035 ret = reg_node(pRExC_state, DIGIT);
a0d0e21e 4036 *flagp |= HASWIDTH|SIMPLE;
830247a4 4037 nextchar(pRExC_state);
fac92740 4038 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
4039 break;
4040 case 'D':
ffc61ed2 4041 ret = reg_node(pRExC_state, NDIGIT);
a0d0e21e 4042 *flagp |= HASWIDTH|SIMPLE;
830247a4 4043 nextchar(pRExC_state);
fac92740 4044 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e 4045 break;
a14b48bc
LW
4046 case 'p':
4047 case 'P':
3568d838 4048 {
830247a4 4049 char* oldregxend = RExC_end;
ccb2c380 4050 char* parse_start = RExC_parse - 2;
a14b48bc 4051
830247a4 4052 if (RExC_parse[1] == '{') {
3568d838 4053 /* a lovely hack--pretend we saw [\pX] instead */
830247a4
IZ
4054 RExC_end = strchr(RExC_parse, '}');
4055 if (!RExC_end) {
0da60cf5 4056 U8 c = (U8)*RExC_parse;
830247a4
IZ
4057 RExC_parse += 2;
4058 RExC_end = oldregxend;
0da60cf5 4059 vFAIL2("Missing right brace on \\%c{}", c);
b45f050a 4060 }
830247a4 4061 RExC_end++;
a14b48bc 4062 }
af6f566e 4063 else {
830247a4 4064 RExC_end = RExC_parse + 2;
af6f566e
HS
4065 if (RExC_end > oldregxend)
4066 RExC_end = oldregxend;
4067 }
830247a4 4068 RExC_parse--;
a14b48bc 4069
ffc61ed2 4070 ret = regclass(pRExC_state);
a14b48bc 4071
830247a4
IZ
4072 RExC_end = oldregxend;
4073 RExC_parse--;
ccb2c380
MP
4074
4075 Set_Node_Offset(ret, parse_start + 2);
4076 Set_Node_Cur_Length(ret);
830247a4 4077 nextchar(pRExC_state);
a14b48bc
LW
4078 *flagp |= HASWIDTH|SIMPLE;
4079 }
4080 break;
a0d0e21e
LW
4081 case 'n':
4082 case 'r':
4083 case 't':
4084 case 'f':
4085 case 'e':
4086 case 'a':
4087 case 'x':
4088 case 'c':
4089 case '0':
4090 goto defchar;
4091 case '1': case '2': case '3': case '4':
4092 case '5': case '6': case '7': case '8': case '9':
4093 {
1df70142 4094 const I32 num = atoi(RExC_parse);
a0d0e21e 4095
830247a4 4096 if (num > 9 && num >= RExC_npar)
a0d0e21e
LW
4097 goto defchar;
4098 else {
fac92740 4099 char * parse_start = RExC_parse - 1; /* MJD */
830247a4
IZ
4100 while (isDIGIT(*RExC_parse))
4101 RExC_parse++;
b45f050a 4102
eb160463 4103 if (!SIZE_ONLY && num > (I32)RExC_rx->nparens)
9baa0206 4104 vFAIL("Reference to nonexistent group");
830247a4 4105 RExC_sawback = 1;
eb160463
GS
4106 ret = reganode(pRExC_state,
4107 (U8)(FOLD ? (LOC ? REFFL : REFF) : REF),
4108 num);
a0d0e21e 4109 *flagp |= HASWIDTH;
2af232bd 4110
fac92740 4111 /* override incorrect value set in reganode MJD */
2af232bd 4112 Set_Node_Offset(ret, parse_start+1);
fac92740 4113 Set_Node_Cur_Length(ret); /* MJD */
830247a4
IZ
4114 RExC_parse--;
4115 nextchar(pRExC_state);
a0d0e21e
LW
4116 }
4117 }
4118 break;
4119 case '\0':
830247a4 4120 if (RExC_parse >= RExC_end)
b45f050a 4121 FAIL("Trailing \\");
a0d0e21e
LW
4122 /* FALL THROUGH */
4123 default:
a0288114 4124 /* Do not generate "unrecognized" warnings here, we fall
c9f97d15 4125 back into the quick-grab loop below */
45948336 4126 parse_start--;
a0d0e21e
LW
4127 goto defchar;
4128 }
4129 break;
4633a7c4
LW
4130
4131 case '#':
e2509266 4132 if (RExC_flags & PMf_EXTENDED) {
830247a4
IZ
4133 while (RExC_parse < RExC_end && *RExC_parse != '\n') RExC_parse++;
4134 if (RExC_parse < RExC_end)
4633a7c4
LW
4135 goto tryagain;
4136 }
4137 /* FALL THROUGH */
4138
a0d0e21e 4139 default: {
ba210ebe 4140 register STRLEN len;
58ae7d3f 4141 register UV ender;
a0d0e21e 4142 register char *p;
c277df42 4143 char *oldp, *s;
80aecb99 4144 STRLEN foldlen;
89ebb4a3 4145 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
f06dbbb7
JH
4146
4147 parse_start = RExC_parse - 1;
a0d0e21e 4148
830247a4 4149 RExC_parse++;
a0d0e21e
LW
4150
4151 defchar:
58ae7d3f 4152 ender = 0;
eb160463
GS
4153 ret = reg_node(pRExC_state,
4154 (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
cd439c50 4155 s = STRING(ret);
830247a4
IZ
4156 for (len = 0, p = RExC_parse - 1;
4157 len < 127 && p < RExC_end;
a0d0e21e
LW
4158 len++)
4159 {
4160 oldp = p;
5b5a24f7 4161
e2509266 4162 if (RExC_flags & PMf_EXTENDED)
830247a4 4163 p = regwhite(p, RExC_end);
a0d0e21e
LW
4164 switch (*p) {
4165 case '^':
4166 case '$':
4167 case '.':
4168 case '[':
4169 case '(':
4170 case ')':
4171 case '|':
4172 goto loopdone;
4173 case '\\':
4174 switch (*++p) {
4175 case 'A':
1ed8eac0
JF
4176 case 'C':
4177 case 'X':
a0d0e21e
LW
4178 case 'G':
4179 case 'Z':
b85d18e9 4180 case 'z':
a0d0e21e
LW
4181 case 'w':
4182 case 'W':
4183 case 'b':
4184 case 'B':
4185 case 's':
4186 case 'S':
4187 case 'd':
4188 case 'D':
a14b48bc
LW
4189 case 'p':
4190 case 'P':
a0d0e21e
LW
4191 --p;
4192 goto loopdone;
4193 case 'n':
4194 ender = '\n';
4195 p++;
a687059c 4196 break;
a0d0e21e
LW
4197 case 'r':
4198 ender = '\r';
4199 p++;
a687059c 4200 break;
a0d0e21e
LW
4201 case 't':
4202 ender = '\t';
4203 p++;
a687059c 4204 break;
a0d0e21e
LW
4205 case 'f':
4206 ender = '\f';
4207 p++;
a687059c 4208 break;
a0d0e21e 4209 case 'e':
c7f1f016 4210 ender = ASCII_TO_NATIVE('\033');
a0d0e21e 4211 p++;
a687059c 4212 break;
a0d0e21e 4213 case 'a':
c7f1f016 4214 ender = ASCII_TO_NATIVE('\007');
a0d0e21e 4215 p++;
a687059c 4216 break;
a0d0e21e 4217 case 'x':
a0ed51b3 4218 if (*++p == '{') {
1df70142 4219 char* const e = strchr(p, '}');
b81d288d 4220
b45f050a 4221 if (!e) {
830247a4 4222 RExC_parse = p + 1;
b45f050a
JF
4223 vFAIL("Missing right brace on \\x{}");
4224 }
de5f0749 4225 else {
a4c04bdc
NC
4226 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
4227 | PERL_SCAN_DISALLOW_PREFIX;
1df70142 4228 STRLEN numlen = e - p - 1;
53305cf1 4229 ender = grok_hex(p + 1, &numlen, &flags, NULL);
aaa80028
JH
4230 if (ender > 0xff)
4231 RExC_utf8 = 1;
a0ed51b3
LW
4232 p = e + 1;
4233 }
a0ed51b3
LW
4234 }
4235 else {
a4c04bdc 4236 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
1df70142 4237 STRLEN numlen = 2;
53305cf1 4238 ender = grok_hex(p, &numlen, &flags, NULL);
a0ed51b3
LW
4239 p += numlen;
4240 }
a687059c 4241 break;
a0d0e21e
LW
4242 case 'c':
4243 p++;
bbce6d69 4244 ender = UCHARAT(p++);
4245 ender = toCTRL(ender);
a687059c 4246 break;
a0d0e21e
LW
4247 case '0': case '1': case '2': case '3':case '4':
4248 case '5': case '6': case '7': case '8':case '9':
4249 if (*p == '0' ||
830247a4 4250 (isDIGIT(p[1]) && atoi(p) >= RExC_npar) ) {
53305cf1 4251 I32 flags = 0;
1df70142 4252 STRLEN numlen = 3;
53305cf1 4253 ender = grok_oct(p, &numlen, &flags, NULL);
a0d0e21e
LW
4254 p += numlen;
4255 }
4256 else {
4257 --p;
4258 goto loopdone;
a687059c
LW
4259 }
4260 break;
a0d0e21e 4261 case '\0':
830247a4 4262 if (p >= RExC_end)
b45f050a 4263 FAIL("Trailing \\");
a687059c 4264 /* FALL THROUGH */
a0d0e21e 4265 default:
041457d9 4266 if (!SIZE_ONLY&& isALPHA(*p) && ckWARN(WARN_REGEXP))
4193bef7 4267 vWARN2(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
a0ed51b3 4268 goto normal_default;
a0d0e21e
LW
4269 }
4270 break;
a687059c 4271 default:
a0ed51b3 4272 normal_default:
fd400ab9 4273 if (UTF8_IS_START(*p) && UTF) {
1df70142 4274 STRLEN numlen;
5e12f4fb 4275 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
9f7f3913 4276 &numlen, UTF8_ALLOW_DEFAULT);
a0ed51b3
LW
4277 p += numlen;
4278 }
4279 else
4280 ender = *p++;
a0d0e21e 4281 break;
a687059c 4282 }
e2509266 4283 if (RExC_flags & PMf_EXTENDED)
830247a4 4284 p = regwhite(p, RExC_end);
60a8b682
JH
4285 if (UTF && FOLD) {
4286 /* Prime the casefolded buffer. */
ac7e0132 4287 ender = toFOLD_uni(ender, tmpbuf, &foldlen);
60a8b682 4288 }
a0d0e21e
LW
4289 if (ISMULT2(p)) { /* Back off on ?+*. */
4290 if (len)
4291 p = oldp;
16ea2a2e 4292 else if (UTF) {
0ebc6274
JH
4293 STRLEN unilen;
4294
80aecb99 4295 if (FOLD) {
60a8b682 4296 /* Emit all the Unicode characters. */
1df70142 4297 STRLEN numlen;
80aecb99
JH
4298 for (foldbuf = tmpbuf;
4299 foldlen;
4300 foldlen -= numlen) {
4301 ender = utf8_to_uvchr(foldbuf, &numlen);
9dc45d57 4302 if (numlen > 0) {
0ebc6274
JH
4303 reguni(pRExC_state, ender, s, &unilen);
4304 s += unilen;
4305 len += unilen;
4306 /* In EBCDIC the numlen
4307 * and unilen can differ. */
9dc45d57 4308 foldbuf += numlen;
47654450
JH
4309 if (numlen >= foldlen)
4310 break;
9dc45d57
JH
4311 }
4312 else
4313 break; /* "Can't happen." */
80aecb99
JH
4314 }
4315 }
4316 else {
0ebc6274 4317 reguni(pRExC_state, ender, s, &unilen);
9ede7db1 4318 if (unilen > 0) {
0ebc6274
JH
4319 s += unilen;
4320 len += unilen;
9dc45d57 4321 }
80aecb99 4322 }
a0ed51b3 4323 }
a0d0e21e
LW
4324 else {
4325 len++;
eb160463 4326 REGC((char)ender, s++);
a0d0e21e
LW
4327 }
4328 break;
a687059c 4329 }
16ea2a2e 4330 if (UTF) {
0ebc6274
JH
4331 STRLEN unilen;
4332
80aecb99 4333 if (FOLD) {
60a8b682 4334 /* Emit all the Unicode characters. */
1df70142 4335 STRLEN numlen;
80aecb99
JH
4336 for (foldbuf = tmpbuf;
4337 foldlen;
4338 foldlen -= numlen) {
4339 ender = utf8_to_uvchr(foldbuf, &numlen);
9dc45d57 4340 if (numlen > 0) {
0ebc6274
JH
4341 reguni(pRExC_state, ender, s, &unilen);
4342 len += unilen;
4343 s += unilen;
4344 /* In EBCDIC the numlen
4345 * and unilen can differ. */
9dc45d57 4346 foldbuf += numlen;
47654450
JH
4347 if (numlen >= foldlen)
4348 break;
9dc45d57
JH
4349 }
4350 else
4351 break;
80aecb99
JH
4352 }
4353 }
4354 else {
0ebc6274 4355 reguni(pRExC_state, ender, s, &unilen);
9ede7db1 4356 if (unilen > 0) {
0ebc6274
JH
4357 s += unilen;
4358 len += unilen;
9dc45d57 4359 }
80aecb99
JH
4360 }
4361 len--;
a0ed51b3
LW
4362 }
4363 else
eb160463 4364 REGC((char)ender, s++);
a0d0e21e
LW
4365 }
4366 loopdone:
830247a4 4367 RExC_parse = p - 1;
fac92740 4368 Set_Node_Cur_Length(ret); /* MJD */
830247a4 4369 nextchar(pRExC_state);
793db0cb
JH
4370 {
4371 /* len is STRLEN which is unsigned, need to copy to signed */
4372 IV iv = len;
4373 if (iv < 0)
4374 vFAIL("Internal disaster");
4375 }
a0d0e21e
LW
4376 if (len > 0)
4377 *flagp |= HASWIDTH;
090f7165 4378 if (len == 1 && UNI_IS_INVARIANT(ender))
a0d0e21e 4379 *flagp |= SIMPLE;
c277df42 4380 if (!SIZE_ONLY)
cd439c50
IZ
4381 STR_LEN(ret) = len;
4382 if (SIZE_ONLY)
830247a4 4383 RExC_size += STR_SZ(len);
cd439c50 4384 else
830247a4 4385 RExC_emit += STR_SZ(len);
a687059c 4386 }
a0d0e21e
LW
4387 break;
4388 }
a687059c 4389
60a8b682
JH
4390 /* If the encoding pragma is in effect recode the text of
4391 * any EXACT-kind nodes. */
22c54be3 4392 if (PL_encoding && PL_regkind[(U8)OP(ret)] == EXACT) {
d0063567
DK
4393 STRLEN oldlen = STR_LEN(ret);
4394 SV *sv = sv_2mortal(newSVpvn(STRING(ret), oldlen));
4395
4396 if (RExC_utf8)
4397 SvUTF8_on(sv);
4398 if (sv_utf8_downgrade(sv, TRUE)) {
1df70142
AL
4399 const char * const s = sv_recode_to_utf8(sv, PL_encoding);
4400 const STRLEN newlen = SvCUR(sv);
d0063567
DK
4401
4402 if (SvUTF8(sv))
4403 RExC_utf8 = 1;
4404 if (!SIZE_ONLY) {
a3621e74
YO
4405 GET_RE_DEBUG_FLAGS_DECL;
4406 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "recode %*s to %*s\n",
d0063567
DK
4407 (int)oldlen, STRING(ret),
4408 (int)newlen, s));
4409 Copy(s, STRING(ret), newlen, char);
4410 STR_LEN(ret) += newlen - oldlen;
4411 RExC_emit += STR_SZ(newlen) - STR_SZ(oldlen);
4412 } else
4413 RExC_size += STR_SZ(newlen) - STR_SZ(oldlen);
4414 }
a72c7584
JH
4415 }
4416
a0d0e21e 4417 return(ret);
a687059c
LW
4418}
4419
873ef191 4420STATIC char *
5f66b61c 4421S_regwhite(char *p, const char *e)
5b5a24f7
CS
4422{
4423 while (p < e) {
4424 if (isSPACE(*p))
4425 ++p;
4426 else if (*p == '#') {
4427 do {
4428 p++;
4429 } while (p < e && *p != '\n');
4430 }
4431 else
4432 break;
4433 }
4434 return p;
4435}
4436
b8c5462f
JH
4437/* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
4438 Character classes ([:foo:]) can also be negated ([:^foo:]).
4439 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
4440 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
beeb77fc 4441 but trigger failures because they are currently unimplemented. */
9a86a77b
JH
4442
4443#define POSIXCC_DONE(c) ((c) == ':')
4444#define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
4445#define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
4446
b8c5462f 4447STATIC I32
830247a4 4448S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
620e46c5 4449{
97aff369 4450 dVAR;
936ed897 4451 I32 namedclass = OOB_NAMEDCLASS;
620e46c5 4452
830247a4 4453 if (value == '[' && RExC_parse + 1 < RExC_end &&
620e46c5 4454 /* I smell either [: or [= or [. -- POSIX has been here, right? */
9a86a77b 4455 POSIXCC(UCHARAT(RExC_parse))) {
1df70142 4456 const char c = UCHARAT(RExC_parse);
097eb12c 4457 char* const s = RExC_parse++;
b81d288d 4458
9a86a77b 4459 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
830247a4
IZ
4460 RExC_parse++;
4461 if (RExC_parse == RExC_end)
620e46c5 4462 /* Grandfather lone [:, [=, [. */
830247a4 4463 RExC_parse = s;
620e46c5 4464 else {
1df70142 4465 const char* t = RExC_parse++; /* skip over the c */
a28509cc 4466 const char *posixcc;
b8c5462f 4467
80916619
NC
4468 assert(*t == c);
4469
9a86a77b 4470 if (UCHARAT(RExC_parse) == ']') {
830247a4 4471 RExC_parse++; /* skip over the ending ] */
b8c5462f
JH
4472 posixcc = s + 1;
4473 if (*s == ':') {
1df70142
AL
4474 const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
4475 const I32 skip = t - posixcc;
80916619
NC
4476
4477 /* Initially switch on the length of the name. */
4478 switch (skip) {
4479 case 4:
4480 if (memEQ(posixcc, "word", 4)) {
4481 /* this is not POSIX, this is the Perl \w */;
4482 namedclass
4483 = complement ? ANYOF_NALNUM : ANYOF_ALNUM;
4484 }
cc4319de 4485 break;
80916619
NC
4486 case 5:
4487 /* Names all of length 5. */
4488 /* alnum alpha ascii blank cntrl digit graph lower
4489 print punct space upper */
4490 /* Offset 4 gives the best switch position. */
4491 switch (posixcc[4]) {
4492 case 'a':
4493 if (memEQ(posixcc, "alph", 4)) {
4494 /* a */
4495 namedclass
4496 = complement ? ANYOF_NALPHA : ANYOF_ALPHA;
4497 }
4498 break;
4499 case 'e':
4500 if (memEQ(posixcc, "spac", 4)) {
4501 /* e */
4502 namedclass
4503 = complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
4504 }
4505 break;
4506 case 'h':
4507 if (memEQ(posixcc, "grap", 4)) {
4508 /* h */
4509 namedclass
4510 = complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
4511 }
4512 break;
4513 case 'i':
4514 if (memEQ(posixcc, "asci", 4)) {
4515 /* i */
4516 namedclass
4517 = complement ? ANYOF_NASCII : ANYOF_ASCII;
4518 }
4519 break;
4520 case 'k':
4521 if (memEQ(posixcc, "blan", 4)) {
4522 /* k */
4523 namedclass
4524 = complement ? ANYOF_NBLANK : ANYOF_BLANK;
4525 }
4526 break;
4527 case 'l':
4528 if (memEQ(posixcc, "cntr", 4)) {
4529 /* l */
4530 namedclass
4531 = complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
4532 }
4533 break;
4534 case 'm':
4535 if (memEQ(posixcc, "alnu", 4)) {
4536 /* m */
4537 namedclass
4538 = complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
4539 }
4540 break;
4541 case 'r':
4542 if (memEQ(posixcc, "lowe", 4)) {
4543 /* r */
4544 namedclass
4545 = complement ? ANYOF_NLOWER : ANYOF_LOWER;
4546 }
4547 if (memEQ(posixcc, "uppe", 4)) {
8fdec511 4548 /* r */
80916619
NC
4549 namedclass
4550 = complement ? ANYOF_NUPPER : ANYOF_UPPER;
4551 }
4552 break;
4553 case 't':
4554 if (memEQ(posixcc, "digi", 4)) {
4555 /* t */
4556 namedclass
4557 = complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
4558 }
4559 if (memEQ(posixcc, "prin", 4)) {
8fdec511 4560 /* t */
80916619
NC
4561 namedclass
4562 = complement ? ANYOF_NPRINT : ANYOF_PRINT;
4563 }
4564 if (memEQ(posixcc, "punc", 4)) {
4565 /* t */
4566 namedclass
4567 = complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
4568 }
4569 break;
b8c5462f
JH
4570 }
4571 break;
80916619
NC
4572 case 6:
4573 if (memEQ(posixcc, "xdigit", 6)) {
4574 namedclass
4575 = complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
b8c5462f
JH
4576 }
4577 break;
4578 }
80916619
NC
4579
4580 if (namedclass == OOB_NAMEDCLASS)
b45f050a
JF
4581 {
4582 Simple_vFAIL3("POSIX class [:%.*s:] unknown",
4583 t - s - 1, s + 1);
4584 }
80916619
NC
4585 assert (posixcc[skip] == ':');
4586 assert (posixcc[skip+1] == ']');
b45f050a 4587 } else if (!SIZE_ONLY) {
b8c5462f 4588 /* [[=foo=]] and [[.foo.]] are still future. */
b45f050a 4589
830247a4 4590 /* adjust RExC_parse so the warning shows after
b45f050a 4591 the class closes */
9a86a77b 4592 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
830247a4 4593 RExC_parse++;
b45f050a
JF
4594 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
4595 }
b8c5462f
JH
4596 } else {
4597 /* Maternal grandfather:
4598 * "[:" ending in ":" but not in ":]" */
830247a4 4599 RExC_parse = s;
767d463e 4600 }
620e46c5
JH
4601 }
4602 }
4603
b8c5462f
JH
4604 return namedclass;
4605}
4606
4607STATIC void
830247a4 4608S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
b8c5462f 4609{
97aff369 4610 dVAR;
b938889d 4611 if (!SIZE_ONLY && POSIXCC(UCHARAT(RExC_parse))) {
1df70142
AL
4612 const char *s = RExC_parse;
4613 const char c = *s++;
b8c5462f
JH
4614
4615 while(*s && isALNUM(*s))
4616 s++;
4617 if (*s && c == *s && s[1] == ']') {
cd84f5b2
RGS
4618 if (ckWARN(WARN_REGEXP))
4619 vWARN3(s+2,
4620 "POSIX syntax [%c %c] belongs inside character classes",
4621 c, c);
b45f050a
JF
4622
4623 /* [[=foo=]] and [[.foo.]] are still future. */
9a86a77b 4624 if (POSIXCC_NOTYET(c)) {
830247a4 4625 /* adjust RExC_parse so the error shows after
b45f050a 4626 the class closes */
9a86a77b 4627 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
b45f050a
JF
4628 ;
4629 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
4630 }
b8c5462f
JH
4631 }
4632 }
620e46c5
JH
4633}
4634
76e3520e 4635STATIC regnode *
830247a4 4636S_regclass(pTHX_ RExC_state_t *pRExC_state)
a687059c 4637{
97aff369 4638 dVAR;
ffc61ed2 4639 register UV value;
9a86a77b 4640 register UV nextvalue;
3568d838 4641 register IV prevvalue = OOB_UNICODE;
ffc61ed2 4642 register IV range = 0;
c277df42 4643 register regnode *ret;
ba210ebe 4644 STRLEN numlen;
ffc61ed2 4645 IV namedclass;
cbbf8932 4646 char *rangebegin = NULL;
936ed897 4647 bool need_class = 0;
c445ea15 4648 SV *listsv = NULL;
ffc61ed2
JH
4649 register char *e;
4650 UV n;
9e55ce06 4651 bool optimize_invert = TRUE;
cbbf8932 4652 AV* unicode_alternate = NULL;
1b2d223b
JH
4653#ifdef EBCDIC
4654 UV literal_endpoint = 0;
4655#endif
ffc61ed2
JH
4656
4657 ret = reganode(pRExC_state, ANYOF, 0);
4658
4659 if (!SIZE_ONLY)
4660 ANYOF_FLAGS(ret) = 0;
4661
9a86a77b 4662 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
ffc61ed2
JH
4663 RExC_naughty++;
4664 RExC_parse++;
4665 if (!SIZE_ONLY)
4666 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
4667 }
a0d0e21e 4668
73060fc4 4669 if (SIZE_ONLY) {
830247a4 4670 RExC_size += ANYOF_SKIP;
73060fc4
JH
4671 listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
4672 }
936ed897 4673 else {
830247a4 4674 RExC_emit += ANYOF_SKIP;
936ed897
IZ
4675 if (FOLD)
4676 ANYOF_FLAGS(ret) |= ANYOF_FOLD;
4677 if (LOC)
4678 ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
ffc61ed2 4679 ANYOF_BITMAP_ZERO(ret);
396482e1 4680 listsv = newSVpvs("# comment\n");
a0d0e21e 4681 }
b8c5462f 4682
9a86a77b
JH
4683 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
4684
b938889d 4685 if (!SIZE_ONLY && POSIXCC(nextvalue))
830247a4 4686 checkposixcc(pRExC_state);
b8c5462f 4687
f064b6ad
HS
4688 /* allow 1st char to be ] (allowing it to be - is dealt with later) */
4689 if (UCHARAT(RExC_parse) == ']')
4690 goto charclassloop;
ffc61ed2 4691
9a86a77b 4692 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
ffc61ed2
JH
4693
4694 charclassloop:
4695
4696 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
4697
73b437c8 4698 if (!range)
830247a4 4699 rangebegin = RExC_parse;
ffc61ed2 4700 if (UTF) {
5e12f4fb 4701 value = utf8n_to_uvchr((U8*)RExC_parse,
3568d838 4702 RExC_end - RExC_parse,
9f7f3913 4703 &numlen, UTF8_ALLOW_DEFAULT);
ffc61ed2
JH
4704 RExC_parse += numlen;
4705 }
4706 else
4707 value = UCHARAT(RExC_parse++);
9a86a77b
JH
4708 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
4709 if (value == '[' && POSIXCC(nextvalue))
830247a4 4710 namedclass = regpposixcc(pRExC_state, value);
620e46c5 4711 else if (value == '\\') {
ffc61ed2 4712 if (UTF) {
5e12f4fb 4713 value = utf8n_to_uvchr((U8*)RExC_parse,
ffc61ed2 4714 RExC_end - RExC_parse,
9f7f3913 4715 &numlen, UTF8_ALLOW_DEFAULT);
ffc61ed2
JH
4716 RExC_parse += numlen;
4717 }
4718 else
4719 value = UCHARAT(RExC_parse++);
470c3474 4720 /* Some compilers cannot handle switching on 64-bit integer
ffc61ed2 4721 * values, therefore value cannot be an UV. Yes, this will
e2962f66
JH
4722 * be a problem later if we want switch on Unicode.
4723 * A similar issue a little bit later when switching on
4724 * namedclass. --jhi */
ffc61ed2 4725 switch ((I32)value) {
b8c5462f
JH
4726 case 'w': namedclass = ANYOF_ALNUM; break;
4727 case 'W': namedclass = ANYOF_NALNUM; break;
4728 case 's': namedclass = ANYOF_SPACE; break;
4729 case 'S': namedclass = ANYOF_NSPACE; break;
4730 case 'd': namedclass = ANYOF_DIGIT; break;
4731 case 'D': namedclass = ANYOF_NDIGIT; break;
ffc61ed2
JH
4732 case 'p':
4733 case 'P':
af6f566e 4734 if (RExC_parse >= RExC_end)
2a4859cd 4735 vFAIL2("Empty \\%c{}", (U8)value);
ffc61ed2 4736 if (*RExC_parse == '{') {
1df70142 4737 const U8 c = (U8)value;
ffc61ed2
JH
4738 e = strchr(RExC_parse++, '}');
4739 if (!e)
0da60cf5 4740 vFAIL2("Missing right brace on \\%c{}", c);
ab13f0c7
JH
4741 while (isSPACE(UCHARAT(RExC_parse)))
4742 RExC_parse++;
4743 if (e == RExC_parse)
0da60cf5 4744 vFAIL2("Empty \\%c{}", c);
ffc61ed2 4745 n = e - RExC_parse;
ab13f0c7
JH
4746 while (isSPACE(UCHARAT(RExC_parse + n - 1)))
4747 n--;
ffc61ed2
JH
4748 }
4749 else {
4750 e = RExC_parse;
4751 n = 1;
4752 }
4753 if (!SIZE_ONLY) {
ab13f0c7
JH
4754 if (UCHARAT(RExC_parse) == '^') {
4755 RExC_parse++;
4756 n--;
4757 value = value == 'p' ? 'P' : 'p'; /* toggle */
4758 while (isSPACE(UCHARAT(RExC_parse))) {
4759 RExC_parse++;
4760 n--;
4761 }
4762 }
097eb12c
AL
4763 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%.*s\n",
4764 (value=='p' ? '+' : '!'), (int)n, RExC_parse);
ffc61ed2
JH
4765 }
4766 RExC_parse = e + 1;
4767 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
f81125e2
JP
4768 namedclass = ANYOF_MAX; /* no official name, but it's named */
4769 break;
b8c5462f
JH
4770 case 'n': value = '\n'; break;
4771 case 'r': value = '\r'; break;
4772 case 't': value = '\t'; break;
4773 case 'f': value = '\f'; break;
4774 case 'b': value = '\b'; break;
c7f1f016
NIS
4775 case 'e': value = ASCII_TO_NATIVE('\033');break;
4776 case 'a': value = ASCII_TO_NATIVE('\007');break;
b8c5462f 4777 case 'x':
ffc61ed2 4778 if (*RExC_parse == '{') {
a4c04bdc
NC
4779 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
4780 | PERL_SCAN_DISALLOW_PREFIX;
ffc61ed2 4781 e = strchr(RExC_parse++, '}');
b81d288d 4782 if (!e)
ffc61ed2 4783 vFAIL("Missing right brace on \\x{}");
53305cf1
NC
4784
4785 numlen = e - RExC_parse;
4786 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
ffc61ed2
JH
4787 RExC_parse = e + 1;
4788 }
4789 else {
a4c04bdc 4790 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
53305cf1
NC
4791 numlen = 2;
4792 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
ffc61ed2
JH
4793 RExC_parse += numlen;
4794 }
b8c5462f
JH
4795 break;
4796 case 'c':
830247a4 4797 value = UCHARAT(RExC_parse++);
b8c5462f
JH
4798 value = toCTRL(value);
4799 break;
4800 case '0': case '1': case '2': case '3': case '4':
4801 case '5': case '6': case '7': case '8': case '9':
53305cf1
NC
4802 {
4803 I32 flags = 0;
4804 numlen = 3;
4805 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
830247a4 4806 RExC_parse += numlen;
b8c5462f 4807 break;
53305cf1 4808 }
1028017a 4809 default:
041457d9 4810 if (!SIZE_ONLY && isALPHA(value) && ckWARN(WARN_REGEXP))
ffc61ed2
JH
4811 vWARN2(RExC_parse,
4812 "Unrecognized escape \\%c in character class passed through",
4813 (int)value);
1028017a 4814 break;
b8c5462f 4815 }
ffc61ed2 4816 } /* end of \blah */
1b2d223b
JH
4817#ifdef EBCDIC
4818 else
4819 literal_endpoint++;
4820#endif
ffc61ed2
JH
4821
4822 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
4823
4824 if (!SIZE_ONLY && !need_class)
936ed897 4825 ANYOF_CLASS_ZERO(ret);
ffc61ed2 4826
936ed897 4827 need_class = 1;
ffc61ed2
JH
4828
4829 /* a bad range like a-\d, a-[:digit:] ? */
4830 if (range) {
73b437c8 4831 if (!SIZE_ONLY) {
afd78fd5 4832 if (ckWARN(WARN_REGEXP)) {
097eb12c 4833 const int w =
afd78fd5
JH
4834 RExC_parse >= rangebegin ?
4835 RExC_parse - rangebegin : 0;
830247a4 4836 vWARN4(RExC_parse,
b45f050a 4837 "False [] range \"%*.*s\"",
097eb12c 4838 w, w, rangebegin);
afd78fd5 4839 }
3568d838
JH
4840 if (prevvalue < 256) {
4841 ANYOF_BITMAP_SET(ret, prevvalue);
ffc61ed2
JH
4842 ANYOF_BITMAP_SET(ret, '-');
4843 }
4844 else {
4845 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
4846 Perl_sv_catpvf(aTHX_ listsv,
3568d838 4847 "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
ffc61ed2 4848 }
b8c5462f 4849 }
ffc61ed2
JH
4850
4851 range = 0; /* this was not a true range */
73b437c8 4852 }
ffc61ed2 4853
73b437c8 4854 if (!SIZE_ONLY) {
c49a72a9
NC
4855 const char *what = NULL;
4856 char yesno = 0;
4857
3568d838
JH
4858 if (namedclass > OOB_NAMEDCLASS)
4859 optimize_invert = FALSE;
e2962f66
JH
4860 /* Possible truncation here but in some 64-bit environments
4861 * the compiler gets heartburn about switch on 64-bit values.
4862 * A similar issue a little earlier when switching on value.
98f323fa 4863 * --jhi */
e2962f66 4864 switch ((I32)namedclass) {
73b437c8
JH
4865 case ANYOF_ALNUM:
4866 if (LOC)
936ed897 4867 ANYOF_CLASS_SET(ret, ANYOF_ALNUM);
73b437c8
JH
4868 else {
4869 for (value = 0; value < 256; value++)
4870 if (isALNUM(value))
936ed897 4871 ANYOF_BITMAP_SET(ret, value);
73b437c8 4872 }
c49a72a9
NC
4873 yesno = '+';
4874 what = "Word";
73b437c8
JH
4875 break;
4876 case ANYOF_NALNUM:
4877 if (LOC)
936ed897 4878 ANYOF_CLASS_SET(ret, ANYOF_NALNUM);
73b437c8
JH
4879 else {
4880 for (value = 0; value < 256; value++)
4881 if (!isALNUM(value))
936ed897 4882 ANYOF_BITMAP_SET(ret, value);
73b437c8 4883 }
c49a72a9
NC
4884 yesno = '!';
4885 what = "Word";
73b437c8 4886 break;
ffc61ed2 4887 case ANYOF_ALNUMC:
73b437c8 4888 if (LOC)
ffc61ed2 4889 ANYOF_CLASS_SET(ret, ANYOF_ALNUMC);
73b437c8
JH
4890 else {
4891 for (value = 0; value < 256; value++)
ffc61ed2 4892 if (isALNUMC(value))
936ed897 4893 ANYOF_BITMAP_SET(ret, value);
73b437c8 4894 }
c49a72a9
NC
4895 yesno = '+';
4896 what = "Alnum";
73b437c8
JH
4897 break;
4898 case ANYOF_NALNUMC:
4899 if (LOC)
936ed897 4900 ANYOF_CLASS_SET(ret, ANYOF_NALNUMC);
73b437c8
JH
4901 else {
4902 for (value = 0; value < 256; value++)
4903 if (!isALNUMC(value))
936ed897 4904 ANYOF_BITMAP_SET(ret, value);
73b437c8 4905 }
c49a72a9
NC
4906 yesno = '!';
4907 what = "Alnum";
73b437c8
JH
4908 break;
4909 case ANYOF_ALPHA:
4910 if (LOC)
936ed897 4911 ANYOF_CLASS_SET(ret, ANYOF_ALPHA);
73b437c8
JH
4912 else {
4913 for (value = 0; value < 256; value++)
4914 if (isALPHA(value))
936ed897 4915 ANYOF_BITMAP_SET(ret, value);
73b437c8 4916 }
c49a72a9
NC
4917 yesno = '+';
4918 what = "Alpha";
73b437c8
JH
4919 break;
4920 case ANYOF_NALPHA:
4921 if (LOC)
936ed897 4922 ANYOF_CLASS_SET(ret, ANYOF_NALPHA);
73b437c8
JH
4923 else {
4924 for (value = 0; value < 256; value++)
4925 if (!isALPHA(value))
936ed897 4926 ANYOF_BITMAP_SET(ret, value);
73b437c8 4927 }
c49a72a9
NC
4928 yesno = '!';
4929 what = "Alpha";
73b437c8
JH
4930 break;
4931 case ANYOF_ASCII:
4932 if (LOC)
936ed897 4933 ANYOF_CLASS_SET(ret, ANYOF_ASCII);
73b437c8 4934 else {
c7f1f016 4935#ifndef EBCDIC
1ba5c669
JH
4936 for (value = 0; value < 128; value++)
4937 ANYOF_BITMAP_SET(ret, value);
4938#else /* EBCDIC */
ffbc6a93 4939 for (value = 0; value < 256; value++) {
3a3c4447
JH
4940 if (isASCII(value))
4941 ANYOF_BITMAP_SET(ret, value);
ffbc6a93 4942 }
1ba5c669 4943#endif /* EBCDIC */
73b437c8 4944 }
c49a72a9
NC
4945 yesno = '+';
4946 what = "ASCII";
73b437c8
JH
4947 break;
4948 case ANYOF_NASCII:
4949 if (LOC)
936ed897 4950 ANYOF_CLASS_SET(ret, ANYOF_NASCII);
73b437c8 4951 else {
c7f1f016 4952#ifndef EBCDIC
1ba5c669
JH
4953 for (value = 128; value < 256; value++)
4954 ANYOF_BITMAP_SET(ret, value);
4955#else /* EBCDIC */
ffbc6a93 4956 for (value = 0; value < 256; value++) {
3a3c4447
JH
4957 if (!isASCII(value))
4958 ANYOF_BITMAP_SET(ret, value);
ffbc6a93 4959 }
1ba5c669 4960#endif /* EBCDIC */
73b437c8 4961 }
c49a72a9
NC
4962 yesno = '!';
4963 what = "ASCII";
73b437c8 4964 break;
aaa51d5e
JF
4965 case ANYOF_BLANK:
4966 if (LOC)
4967 ANYOF_CLASS_SET(ret, ANYOF_BLANK);
4968 else {
4969 for (value = 0; value < 256; value++)
4970 if (isBLANK(value))
4971 ANYOF_BITMAP_SET(ret, value);
4972 }
c49a72a9
NC
4973 yesno = '+';
4974 what = "Blank";
aaa51d5e
JF
4975 break;
4976 case ANYOF_NBLANK:
4977 if (LOC)
4978 ANYOF_CLASS_SET(ret, ANYOF_NBLANK);
4979 else {
4980 for (value = 0; value < 256; value++)
4981 if (!isBLANK(value))
4982 ANYOF_BITMAP_SET(ret, value);
4983 }
c49a72a9
NC
4984 yesno = '!';
4985 what = "Blank";
aaa51d5e 4986 break;
73b437c8
JH
4987 case ANYOF_CNTRL:
4988 if (LOC)
936ed897 4989 ANYOF_CLASS_SET(ret, ANYOF_CNTRL);
73b437c8
JH
4990 else {
4991 for (value = 0; value < 256; value++)
4992 if (isCNTRL(value))
936ed897 4993 ANYOF_BITMAP_SET(ret, value);
73b437c8 4994 }
c49a72a9
NC
4995 yesno = '+';
4996 what = "Cntrl";
73b437c8
JH
4997 break;
4998 case ANYOF_NCNTRL:
4999 if (LOC)
936ed897 5000 ANYOF_CLASS_SET(ret, ANYOF_NCNTRL);
73b437c8
JH
5001 else {
5002 for (value = 0; value < 256; value++)
5003 if (!isCNTRL(value))
936ed897 5004 ANYOF_BITMAP_SET(ret, value);
73b437c8 5005 }
c49a72a9
NC
5006 yesno = '!';
5007 what = "Cntrl";
ffc61ed2
JH
5008 break;
5009 case ANYOF_DIGIT:
5010 if (LOC)
5011 ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
5012 else {
5013 /* consecutive digits assumed */
5014 for (value = '0'; value <= '9'; value++)
5015 ANYOF_BITMAP_SET(ret, value);
5016 }
c49a72a9
NC
5017 yesno = '+';
5018 what = "Digit";
ffc61ed2
JH
5019 break;
5020 case ANYOF_NDIGIT:
5021 if (LOC)
5022 ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
5023 else {
5024 /* consecutive digits assumed */
5025 for (value = 0; value < '0'; value++)
5026 ANYOF_BITMAP_SET(ret, value);
5027 for (value = '9' + 1; value < 256; value++)
5028 ANYOF_BITMAP_SET(ret, value);
5029 }
c49a72a9
NC
5030 yesno = '!';
5031 what = "Digit";
73b437c8
JH
5032 break;
5033 case ANYOF_GRAPH:
5034 if (LOC)
936ed897 5035 ANYOF_CLASS_SET(ret, ANYOF_GRAPH);
73b437c8
JH
5036 else {
5037 for (value = 0; value < 256; value++)
5038 if (isGRAPH(value))
936ed897 5039 ANYOF_BITMAP_SET(ret, value);
73b437c8 5040 }
c49a72a9
NC
5041 yesno = '+';
5042 what = "Graph";
73b437c8
JH
5043 break;
5044 case ANYOF_NGRAPH:
5045 if (LOC)
936ed897 5046 ANYOF_CLASS_SET(ret, ANYOF_NGRAPH);
73b437c8
JH
5047 else {
5048 for (value = 0; value < 256; value++)
5049 if (!isGRAPH(value))
936ed897 5050 ANYOF_BITMAP_SET(ret, value);
73b437c8 5051 }
c49a72a9
NC
5052 yesno = '!';
5053 what = "Graph";
73b437c8
JH
5054 break;
5055 case ANYOF_LOWER:
5056 if (LOC)
936ed897 5057 ANYOF_CLASS_SET(ret, ANYOF_LOWER);
73b437c8
JH
5058 else {
5059 for (value = 0; value < 256; value++)
5060 if (isLOWER(value))
936ed897 5061 ANYOF_BITMAP_SET(ret, value);
73b437c8 5062 }
c49a72a9
NC
5063 yesno = '+';
5064 what = "Lower";
73b437c8
JH
5065 break;
5066 case ANYOF_NLOWER:
5067 if (LOC)
936ed897 5068 ANYOF_CLASS_SET(ret, ANYOF_NLOWER);
73b437c8
JH
5069 else {
5070 for (value = 0; value < 256; value++)
5071 if (!isLOWER(value))
936ed897 5072 ANYOF_BITMAP_SET(ret, value);
73b437c8 5073 }
c49a72a9
NC
5074 yesno = '!';
5075 what = "Lower";
73b437c8
JH
5076 break;
5077 case ANYOF_PRINT:
5078 if (LOC)
936ed897 5079 ANYOF_CLASS_SET(ret, ANYOF_PRINT);
73b437c8
JH
5080 else {
5081 for (value = 0; value < 256; value++)
5082 if (isPRINT(value))
936ed897 5083 ANYOF_BITMAP_SET(ret, value);
73b437c8 5084 }
c49a72a9
NC
5085 yesno = '+';
5086 what = "Print";
73b437c8
JH
5087 break;
5088 case ANYOF_NPRINT:
5089 if (LOC)
936ed897 5090 ANYOF_CLASS_SET(ret, ANYOF_NPRINT);
73b437c8
JH
5091 else {
5092 for (value = 0; value < 256; value++)
5093 if (!isPRINT(value))
936ed897 5094 ANYOF_BITMAP_SET(ret, value);
73b437c8 5095 }
c49a72a9
NC
5096 yesno = '!';
5097 what = "Print";
73b437c8 5098 break;
aaa51d5e
JF
5099 case ANYOF_PSXSPC:
5100 if (LOC)
5101 ANYOF_CLASS_SET(ret, ANYOF_PSXSPC);
5102 else {
5103 for (value = 0; value < 256; value++)
5104 if (isPSXSPC(value))
5105 ANYOF_BITMAP_SET(ret, value);
5106 }
c49a72a9
NC
5107 yesno = '+';
5108 what = "Space";
aaa51d5e
JF
5109 break;
5110 case ANYOF_NPSXSPC:
5111 if (LOC)
5112 ANYOF_CLASS_SET(ret, ANYOF_NPSXSPC);
5113 else {
5114 for (value = 0; value < 256; value++)
5115 if (!isPSXSPC(value))
5116 ANYOF_BITMAP_SET(ret, value);
5117 }
c49a72a9
NC
5118 yesno = '!';
5119 what = "Space";
aaa51d5e 5120 break;
73b437c8
JH
5121 case ANYOF_PUNCT:
5122 if (LOC)
936ed897 5123 ANYOF_CLASS_SET(ret, ANYOF_PUNCT);
73b437c8
JH
5124 else {
5125 for (value = 0; value < 256; value++)
5126 if (isPUNCT(value))
936ed897 5127 ANYOF_BITMAP_SET(ret, value);
73b437c8 5128 }
c49a72a9
NC
5129 yesno = '+';
5130 what = "Punct";
73b437c8
JH
5131 break;
5132 case ANYOF_NPUNCT:
5133 if (LOC)
936ed897 5134 ANYOF_CLASS_SET(ret, ANYOF_NPUNCT);
73b437c8
JH
5135 else {
5136 for (value = 0; value < 256; value++)
5137 if (!isPUNCT(value))
936ed897 5138 ANYOF_BITMAP_SET(ret, value);
73b437c8 5139 }
c49a72a9
NC
5140 yesno = '!';
5141 what = "Punct";
ffc61ed2
JH
5142 break;
5143 case ANYOF_SPACE:
5144 if (LOC)
5145 ANYOF_CLASS_SET(ret, ANYOF_SPACE);
5146 else {
5147 for (value = 0; value < 256; value++)
5148 if (isSPACE(value))
5149 ANYOF_BITMAP_SET(ret, value);
5150 }
c49a72a9
NC
5151 yesno = '+';
5152 what = "SpacePerl";
ffc61ed2
JH
5153 break;
5154 case ANYOF_NSPACE:
5155 if (LOC)
5156 ANYOF_CLASS_SET(ret, ANYOF_NSPACE);
5157 else {
5158 for (value = 0; value < 256; value++)
5159 if (!isSPACE(value))
5160 ANYOF_BITMAP_SET(ret, value);
5161 }
c49a72a9
NC
5162 yesno = '!';
5163 what = "SpacePerl";
73b437c8
JH
5164 break;
5165 case ANYOF_UPPER:
5166 if (LOC)
936ed897 5167 ANYOF_CLASS_SET(ret, ANYOF_UPPER);
73b437c8
JH
5168 else {
5169 for (value = 0; value < 256; value++)
5170 if (isUPPER(value))
936ed897 5171 ANYOF_BITMAP_SET(ret, value);
73b437c8 5172 }
c49a72a9
NC
5173 yesno = '+';
5174 what = "Upper";
73b437c8
JH
5175 break;
5176 case ANYOF_NUPPER:
5177 if (LOC)
936ed897 5178 ANYOF_CLASS_SET(ret, ANYOF_NUPPER);
73b437c8
JH
5179 else {
5180 for (value = 0; value < 256; value++)
5181 if (!isUPPER(value))
936ed897 5182 ANYOF_BITMAP_SET(ret, value);
73b437c8 5183 }
c49a72a9
NC
5184 yesno = '!';
5185 what = "Upper";
73b437c8
JH
5186 break;
5187 case ANYOF_XDIGIT:
5188 if (LOC)
936ed897 5189 ANYOF_CLASS_SET(ret, ANYOF_XDIGIT);
73b437c8
JH
5190 else {
5191 for (value = 0; value < 256; value++)
5192 if (isXDIGIT(value))
936ed897 5193 ANYOF_BITMAP_SET(ret, value);
73b437c8 5194 }
c49a72a9
NC
5195 yesno = '+';
5196 what = "XDigit";
73b437c8
JH
5197 break;
5198 case ANYOF_NXDIGIT:
5199 if (LOC)
936ed897 5200 ANYOF_CLASS_SET(ret, ANYOF_NXDIGIT);
73b437c8
JH
5201 else {
5202 for (value = 0; value < 256; value++)
5203 if (!isXDIGIT(value))
936ed897 5204 ANYOF_BITMAP_SET(ret, value);
73b437c8 5205 }
c49a72a9
NC
5206 yesno = '!';
5207 what = "XDigit";
73b437c8 5208 break;
f81125e2
JP
5209 case ANYOF_MAX:
5210 /* this is to handle \p and \P */
5211 break;
73b437c8 5212 default:
b45f050a 5213 vFAIL("Invalid [::] class");
73b437c8 5214 break;
b8c5462f 5215 }
c49a72a9
NC
5216 if (what) {
5217 /* Strings such as "+utf8::isWord\n" */
5218 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::Is%s\n", yesno, what);
5219 }
b8c5462f 5220 if (LOC)
936ed897 5221 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
73b437c8 5222 continue;
a0d0e21e 5223 }
ffc61ed2
JH
5224 } /* end of namedclass \blah */
5225
a0d0e21e 5226 if (range) {
eb160463 5227 if (prevvalue > (IV)value) /* b-a */ {
d4c19fe8
AL
5228 const int w = RExC_parse - rangebegin;
5229 Simple_vFAIL4("Invalid [] range \"%*.*s\"", w, w, rangebegin);
3568d838 5230 range = 0; /* not a valid range */
73b437c8 5231 }
a0d0e21e
LW
5232 }
5233 else {
3568d838 5234 prevvalue = value; /* save the beginning of the range */
830247a4
IZ
5235 if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
5236 RExC_parse[1] != ']') {
5237 RExC_parse++;
ffc61ed2
JH
5238
5239 /* a bad range like \w-, [:word:]- ? */
5240 if (namedclass > OOB_NAMEDCLASS) {
afd78fd5 5241 if (ckWARN(WARN_REGEXP)) {
d4c19fe8 5242 const int w =
afd78fd5
JH
5243 RExC_parse >= rangebegin ?
5244 RExC_parse - rangebegin : 0;
830247a4 5245 vWARN4(RExC_parse,
b45f050a 5246 "False [] range \"%*.*s\"",
097eb12c 5247 w, w, rangebegin);
afd78fd5 5248 }
73b437c8 5249 if (!SIZE_ONLY)
936ed897 5250 ANYOF_BITMAP_SET(ret, '-');
73b437c8 5251 } else
ffc61ed2
JH
5252 range = 1; /* yeah, it's a range! */
5253 continue; /* but do it the next time */
a0d0e21e 5254 }
a687059c 5255 }
ffc61ed2 5256
93733859 5257 /* now is the next time */
ae5c130c 5258 if (!SIZE_ONLY) {
3568d838
JH
5259 IV i;
5260
5261 if (prevvalue < 256) {
1df70142 5262 const IV ceilvalue = value < 256 ? value : 255;
3568d838
JH
5263
5264#ifdef EBCDIC
1b2d223b
JH
5265 /* In EBCDIC [\x89-\x91] should include
5266 * the \x8e but [i-j] should not. */
5267 if (literal_endpoint == 2 &&
5268 ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
5269 (isUPPER(prevvalue) && isUPPER(ceilvalue))))
ffc61ed2 5270 {
3568d838
JH
5271 if (isLOWER(prevvalue)) {
5272 for (i = prevvalue; i <= ceilvalue; i++)
ffc61ed2
JH
5273 if (isLOWER(i))
5274 ANYOF_BITMAP_SET(ret, i);
5275 } else {
3568d838 5276 for (i = prevvalue; i <= ceilvalue; i++)
ffc61ed2
JH
5277 if (isUPPER(i))
5278 ANYOF_BITMAP_SET(ret, i);
5279 }
8ada0baa 5280 }
ffc61ed2 5281 else
8ada0baa 5282#endif
a5961de5
JH
5283 for (i = prevvalue; i <= ceilvalue; i++)
5284 ANYOF_BITMAP_SET(ret, i);
3568d838 5285 }
a5961de5 5286 if (value > 255 || UTF) {
1df70142
AL
5287 const UV prevnatvalue = NATIVE_TO_UNI(prevvalue);
5288 const UV natvalue = NATIVE_TO_UNI(value);
b08decb7 5289
ffc61ed2 5290 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
b08decb7 5291 if (prevnatvalue < natvalue) { /* what about > ? */
ffc61ed2 5292 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
b08decb7
JH
5293 prevnatvalue, natvalue);
5294 }
5295 else if (prevnatvalue == natvalue) {
5296 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue);
09091399 5297 if (FOLD) {
89ebb4a3 5298 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
254ba52a 5299 STRLEN foldlen;
1df70142 5300 const UV f = to_uni_fold(natvalue, foldbuf, &foldlen);
254ba52a 5301
c840d2a2
JH
5302 /* If folding and foldable and a single
5303 * character, insert also the folded version
5304 * to the charclass. */
9e55ce06 5305 if (f != value) {
eb160463 5306 if (foldlen == (STRLEN)UNISKIP(f))
9e55ce06
JH
5307 Perl_sv_catpvf(aTHX_ listsv,
5308 "%04"UVxf"\n", f);
5309 else {
5310 /* Any multicharacter foldings
5311 * require the following transform:
5312 * [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst)
5313 * where E folds into "pq" and F folds
5314 * into "rst", all other characters
5315 * fold to single characters. We save
5316 * away these multicharacter foldings,
5317 * to be later saved as part of the
5318 * additional "s" data. */
5319 SV *sv;
5320
5321 if (!unicode_alternate)
5322 unicode_alternate = newAV();
5323 sv = newSVpvn((char*)foldbuf, foldlen);
5324 SvUTF8_on(sv);
5325 av_push(unicode_alternate, sv);
5326 }
5327 }
254ba52a 5328
60a8b682
JH
5329 /* If folding and the value is one of the Greek
5330 * sigmas insert a few more sigmas to make the
5331 * folding rules of the sigmas to work right.
5332 * Note that not all the possible combinations
5333 * are handled here: some of them are handled
9e55ce06
JH
5334 * by the standard folding rules, and some of
5335 * them (literal or EXACTF cases) are handled
5336 * during runtime in regexec.c:S_find_byclass(). */
09091399
JH
5337 if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) {
5338 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 5339 (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA);
09091399 5340 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 5341 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
09091399
JH
5342 }
5343 else if (value == UNICODE_GREEK_CAPITAL_LETTER_SIGMA)
5344 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 5345 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
09091399
JH
5346 }
5347 }
ffc61ed2 5348 }
1b2d223b
JH
5349#ifdef EBCDIC
5350 literal_endpoint = 0;
5351#endif
8ada0baa 5352 }
ffc61ed2
JH
5353
5354 range = 0; /* this range (if it was one) is done now */
a0d0e21e 5355 }
ffc61ed2 5356
936ed897 5357 if (need_class) {
4f66b38d 5358 ANYOF_FLAGS(ret) |= ANYOF_LARGE;
936ed897 5359 if (SIZE_ONLY)
830247a4 5360 RExC_size += ANYOF_CLASS_ADD_SKIP;
936ed897 5361 else
830247a4 5362 RExC_emit += ANYOF_CLASS_ADD_SKIP;
936ed897 5363 }
ffc61ed2 5364
ae5c130c 5365 /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
b8c5462f 5366 if (!SIZE_ONLY &&
ffc61ed2 5367 /* If the only flag is folding (plus possibly inversion). */
516a5887
JH
5368 ((ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD)
5369 ) {
a0ed51b3 5370 for (value = 0; value < 256; ++value) {
936ed897 5371 if (ANYOF_BITMAP_TEST(ret, value)) {
eb160463 5372 UV fold = PL_fold[value];
ffc61ed2
JH
5373
5374 if (fold != value)
5375 ANYOF_BITMAP_SET(ret, fold);
ae5c130c
GS
5376 }
5377 }
936ed897 5378 ANYOF_FLAGS(ret) &= ~ANYOF_FOLD;
ae5c130c 5379 }
ffc61ed2 5380
ae5c130c 5381 /* optimize inverted simple patterns (e.g. [^a-z]) */
3568d838 5382 if (!SIZE_ONLY && optimize_invert &&
ffc61ed2
JH
5383 /* If the only flag is inversion. */
5384 (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
b8c5462f 5385 for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
936ed897 5386 ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL;
1aa99e6b 5387 ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
ae5c130c 5388 }
a0d0e21e 5389
b81d288d 5390 if (!SIZE_ONLY) {
097eb12c 5391 AV * const av = newAV();
ffc61ed2
JH
5392 SV *rv;
5393
9e55ce06 5394 /* The 0th element stores the character class description
6a0407ee 5395 * in its textual form: used later (regexec.c:Perl_regclass_swash())
9e55ce06
JH
5396 * to initialize the appropriate swash (which gets stored in
5397 * the 1st element), and also useful for dumping the regnode.
5398 * The 2nd element stores the multicharacter foldings,
6a0407ee 5399 * used later (regexec.c:S_reginclass()). */
ffc61ed2
JH
5400 av_store(av, 0, listsv);
5401 av_store(av, 1, NULL);
9e55ce06 5402 av_store(av, 2, (SV*)unicode_alternate);
ffc61ed2 5403 rv = newRV_noinc((SV*)av);
19860706 5404 n = add_data(pRExC_state, 1, "s");
830247a4 5405 RExC_rx->data->data[n] = (void*)rv;
ffc61ed2 5406 ARG_SET(ret, n);
a0ed51b3
LW
5407 }
5408
5409 return ret;
5410}
5411
76e3520e 5412STATIC char*
830247a4 5413S_nextchar(pTHX_ RExC_state_t *pRExC_state)
a0d0e21e 5414{
097eb12c 5415 char* const retval = RExC_parse++;
a0d0e21e 5416
4633a7c4 5417 for (;;) {
830247a4
IZ
5418 if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
5419 RExC_parse[2] == '#') {
e994fd66
AE
5420 while (*RExC_parse != ')') {
5421 if (RExC_parse == RExC_end)
5422 FAIL("Sequence (?#... not terminated");
830247a4 5423 RExC_parse++;
e994fd66 5424 }
830247a4 5425 RExC_parse++;
4633a7c4
LW
5426 continue;
5427 }
e2509266 5428 if (RExC_flags & PMf_EXTENDED) {
830247a4
IZ
5429 if (isSPACE(*RExC_parse)) {
5430 RExC_parse++;
748a9306
LW
5431 continue;
5432 }
830247a4 5433 else if (*RExC_parse == '#') {
e994fd66
AE
5434 while (RExC_parse < RExC_end)
5435 if (*RExC_parse++ == '\n') break;
748a9306
LW
5436 continue;
5437 }
748a9306 5438 }
4633a7c4 5439 return retval;
a0d0e21e 5440 }
a687059c
LW
5441}
5442
5443/*
c277df42 5444- reg_node - emit a node
a0d0e21e 5445*/
76e3520e 5446STATIC regnode * /* Location. */
830247a4 5447S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
a687059c 5448{
97aff369 5449 dVAR;
c277df42 5450 register regnode *ptr;
504618e9 5451 regnode * const ret = RExC_emit;
a687059c 5452
c277df42 5453 if (SIZE_ONLY) {
830247a4
IZ
5454 SIZE_ALIGN(RExC_size);
5455 RExC_size += 1;
a0d0e21e
LW
5456 return(ret);
5457 }
a687059c 5458
c277df42 5459 NODE_ALIGN_FILL(ret);
a0d0e21e 5460 ptr = ret;
c277df42 5461 FILL_ADVANCE_NODE(ptr, op);
fac92740 5462 if (RExC_offsets) { /* MJD */
ccb2c380 5463 MJD_OFFSET_DEBUG(("%s:%u: (op %s) %s %u <- %u (len %u) (max %u).\n",
fac92740
MJD
5464 "reg_node", __LINE__,
5465 reg_name[op],
5466 RExC_emit - RExC_emit_start > RExC_offsets[0]
5467 ? "Overwriting end of array!\n" : "OK",
5468 RExC_emit - RExC_emit_start,
5469 RExC_parse - RExC_start,
5470 RExC_offsets[0]));
ccb2c380 5471 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
fac92740
MJD
5472 }
5473
830247a4 5474 RExC_emit = ptr;
a687059c 5475
a0d0e21e 5476 return(ret);
a687059c
LW
5477}
5478
5479/*
a0d0e21e
LW
5480- reganode - emit a node with an argument
5481*/
76e3520e 5482STATIC regnode * /* Location. */
830247a4 5483S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
fe14fcc3 5484{
97aff369 5485 dVAR;
c277df42 5486 register regnode *ptr;
504618e9 5487 regnode * const ret = RExC_emit;
fe14fcc3 5488
c277df42 5489 if (SIZE_ONLY) {
830247a4
IZ
5490 SIZE_ALIGN(RExC_size);
5491 RExC_size += 2;
a0d0e21e
LW
5492 return(ret);
5493 }
fe14fcc3 5494
c277df42 5495 NODE_ALIGN_FILL(ret);
a0d0e21e 5496 ptr = ret;
c277df42 5497 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
fac92740 5498 if (RExC_offsets) { /* MJD */
ccb2c380 5499 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %u <- %u (max %u).\n",
fac92740 5500 "reganode",
ccb2c380
MP
5501 __LINE__,
5502 reg_name[op],
fac92740
MJD
5503 RExC_emit - RExC_emit_start > RExC_offsets[0] ?
5504 "Overwriting end of array!\n" : "OK",
5505 RExC_emit - RExC_emit_start,
5506 RExC_parse - RExC_start,
5507 RExC_offsets[0]));
ccb2c380 5508 Set_Cur_Node_Offset;
fac92740
MJD
5509 }
5510
830247a4 5511 RExC_emit = ptr;
fe14fcc3 5512
a0d0e21e 5513 return(ret);
fe14fcc3
LW
5514}
5515
5516/*
cd439c50 5517- reguni - emit (if appropriate) a Unicode character
a0ed51b3
LW
5518*/
5519STATIC void
a28509cc 5520S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s, STRLEN* lenp)
a0ed51b3 5521{
97aff369 5522 dVAR;
5e12f4fb 5523 *lenp = SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
a0ed51b3
LW
5524}
5525
5526/*
a0d0e21e
LW
5527- reginsert - insert an operator in front of already-emitted operand
5528*
5529* Means relocating the operand.
5530*/
76e3520e 5531STATIC void
830247a4 5532S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd)
a687059c 5533{
97aff369 5534 dVAR;
c277df42
IZ
5535 register regnode *src;
5536 register regnode *dst;
5537 register regnode *place;
504618e9 5538 const int offset = regarglen[(U8)op];
b81d288d 5539
22c35a8c 5540/* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
c277df42
IZ
5541
5542 if (SIZE_ONLY) {
830247a4 5543 RExC_size += NODE_STEP_REGNODE + offset;
a0d0e21e
LW
5544 return;
5545 }
a687059c 5546
830247a4
IZ
5547 src = RExC_emit;
5548 RExC_emit += NODE_STEP_REGNODE + offset;
5549 dst = RExC_emit;
fac92740 5550 while (src > opnd) {
c277df42 5551 StructCopy(--src, --dst, regnode);
fac92740 5552 if (RExC_offsets) { /* MJD 20010112 */
ccb2c380 5553 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %u -> %u (max %u).\n",
fac92740 5554 "reg_insert",
ccb2c380
MP
5555 __LINE__,
5556 reg_name[op],
fac92740
MJD
5557 dst - RExC_emit_start > RExC_offsets[0]
5558 ? "Overwriting end of array!\n" : "OK",
5559 src - RExC_emit_start,
5560 dst - RExC_emit_start,
5561 RExC_offsets[0]));
ccb2c380
MP
5562 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
5563 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
fac92740
MJD
5564 }
5565 }
5566
a0d0e21e
LW
5567
5568 place = opnd; /* Op node, where operand used to be. */
fac92740 5569 if (RExC_offsets) { /* MJD */
ccb2c380 5570 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %u <- %u (max %u).\n",
fac92740 5571 "reginsert",
ccb2c380
MP
5572 __LINE__,
5573 reg_name[op],
fac92740
MJD
5574 place - RExC_emit_start > RExC_offsets[0]
5575 ? "Overwriting end of array!\n" : "OK",
5576 place - RExC_emit_start,
5577 RExC_parse - RExC_start,
5578 RExC_offsets[0]));
ccb2c380 5579 Set_Node_Offset(place, RExC_parse);
45948336 5580 Set_Node_Length(place, 1);
fac92740 5581 }
c277df42
IZ
5582 src = NEXTOPER(place);
5583 FILL_ADVANCE_NODE(place, op);
5584 Zero(src, offset, regnode);
a687059c
LW
5585}
5586
5587/*
c277df42 5588- regtail - set the next-pointer at the end of a node chain of p to val.
a0d0e21e 5589*/
097eb12c 5590/* TODO: All three parms should be const */
76e3520e 5591STATIC void
097eb12c 5592S_regtail(pTHX_ const RExC_state_t *pRExC_state, regnode *p, const regnode *val)
a687059c 5593{
97aff369 5594 dVAR;
c277df42 5595 register regnode *scan;
a0d0e21e 5596
c277df42 5597 if (SIZE_ONLY)
a0d0e21e
LW
5598 return;
5599
5600 /* Find last node. */
5601 scan = p;
5602 for (;;) {
504618e9 5603 regnode * const temp = regnext(scan);
a0d0e21e
LW
5604 if (temp == NULL)
5605 break;
5606 scan = temp;
5607 }
a687059c 5608
c277df42
IZ
5609 if (reg_off_by_arg[OP(scan)]) {
5610 ARG_SET(scan, val - scan);
a0ed51b3
LW
5611 }
5612 else {
c277df42
IZ
5613 NEXT_OFF(scan) = val - scan;
5614 }
a687059c
LW
5615}
5616
5617/*
a0d0e21e
LW
5618- regoptail - regtail on operand of first argument; nop if operandless
5619*/
097eb12c 5620/* TODO: All three parms should be const */
76e3520e 5621STATIC void
097eb12c 5622S_regoptail(pTHX_ const RExC_state_t *pRExC_state, regnode *p, const regnode *val)
a687059c 5623{
97aff369 5624 dVAR;
a0d0e21e 5625 /* "Operandless" and "op != BRANCH" are synonymous in practice. */
c277df42
IZ
5626 if (p == NULL || SIZE_ONLY)
5627 return;
22c35a8c 5628 if (PL_regkind[(U8)OP(p)] == BRANCH) {
830247a4 5629 regtail(pRExC_state, NEXTOPER(p), val);
a0ed51b3 5630 }
22c35a8c 5631 else if ( PL_regkind[(U8)OP(p)] == BRANCHJ) {
830247a4 5632 regtail(pRExC_state, NEXTOPER(NEXTOPER(p)), val);
a0ed51b3
LW
5633 }
5634 else
a0d0e21e 5635 return;
a687059c
LW
5636}
5637
5638/*
5639 - regcurly - a little FSA that accepts {\d+,?\d*}
5640 */
79072805 5641STATIC I32
5f66b61c 5642S_regcurly(register const char *s)
a687059c
LW
5643{
5644 if (*s++ != '{')
5645 return FALSE;
f0fcb552 5646 if (!isDIGIT(*s))
a687059c 5647 return FALSE;
f0fcb552 5648 while (isDIGIT(*s))
a687059c
LW
5649 s++;
5650 if (*s == ',')
5651 s++;
f0fcb552 5652 while (isDIGIT(*s))
a687059c
LW
5653 s++;
5654 if (*s != '}')
5655 return FALSE;
5656 return TRUE;
5657}
5658
a687059c
LW
5659
5660/*
fd181c75 5661 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
a687059c
LW
5662 */
5663void
097eb12c 5664Perl_regdump(pTHX_ const regexp *r)
a687059c 5665{
35ff7856 5666#ifdef DEBUGGING
97aff369 5667 dVAR;
c445ea15 5668 SV * const sv = sv_newmortal();
a687059c 5669
4f639d21 5670 (void)dumpuntil(r, r->program, r->program + 1, NULL, sv, 0);
a0d0e21e
LW
5671
5672 /* Header fields of interest. */
c277df42 5673 if (r->anchored_substr)
7b0972df 5674 PerlIO_printf(Perl_debug_log,
a0288114 5675 "anchored \"%s%.*s%s\"%s at %"IVdf" ",
3280af22 5676 PL_colors[0],
7b0972df 5677 (int)(SvCUR(r->anchored_substr) - (SvTAIL(r->anchored_substr)!=0)),
3f7c398e 5678 SvPVX_const(r->anchored_substr),
3280af22 5679 PL_colors[1],
c277df42 5680 SvTAIL(r->anchored_substr) ? "$" : "",
7b0972df 5681 (IV)r->anchored_offset);
33b8afdf
JH
5682 else if (r->anchored_utf8)
5683 PerlIO_printf(Perl_debug_log,
a0288114 5684 "anchored utf8 \"%s%.*s%s\"%s at %"IVdf" ",
33b8afdf
JH
5685 PL_colors[0],
5686 (int)(SvCUR(r->anchored_utf8) - (SvTAIL(r->anchored_utf8)!=0)),
3f7c398e 5687 SvPVX_const(r->anchored_utf8),
33b8afdf
JH
5688 PL_colors[1],
5689 SvTAIL(r->anchored_utf8) ? "$" : "",
5690 (IV)r->anchored_offset);
c277df42 5691 if (r->float_substr)
7b0972df 5692 PerlIO_printf(Perl_debug_log,
a0288114 5693 "floating \"%s%.*s%s\"%s at %"IVdf"..%"UVuf" ",
3280af22 5694 PL_colors[0],
b81d288d 5695 (int)(SvCUR(r->float_substr) - (SvTAIL(r->float_substr)!=0)),
3f7c398e 5696 SvPVX_const(r->float_substr),
3280af22 5697 PL_colors[1],
c277df42 5698 SvTAIL(r->float_substr) ? "$" : "",
7b0972df 5699 (IV)r->float_min_offset, (UV)r->float_max_offset);
33b8afdf
JH
5700 else if (r->float_utf8)
5701 PerlIO_printf(Perl_debug_log,
a0288114 5702 "floating utf8 \"%s%.*s%s\"%s at %"IVdf"..%"UVuf" ",
33b8afdf
JH
5703 PL_colors[0],
5704 (int)(SvCUR(r->float_utf8) - (SvTAIL(r->float_utf8)!=0)),
3f7c398e 5705 SvPVX_const(r->float_utf8),
33b8afdf
JH
5706 PL_colors[1],
5707 SvTAIL(r->float_utf8) ? "$" : "",
5708 (IV)r->float_min_offset, (UV)r->float_max_offset);
5709 if (r->check_substr || r->check_utf8)
b81d288d
AB
5710 PerlIO_printf(Perl_debug_log,
5711 r->check_substr == r->float_substr
33b8afdf 5712 && r->check_utf8 == r->float_utf8
c277df42
IZ
5713 ? "(checking floating" : "(checking anchored");
5714 if (r->reganch & ROPT_NOSCAN)
5715 PerlIO_printf(Perl_debug_log, " noscan");
5716 if (r->reganch & ROPT_CHECK_ALL)
5717 PerlIO_printf(Perl_debug_log, " isall");
33b8afdf 5718 if (r->check_substr || r->check_utf8)
c277df42
IZ
5719 PerlIO_printf(Perl_debug_log, ") ");
5720
46fc3d4c 5721 if (r->regstclass) {
32fc9b6a 5722 regprop(r, sv, r->regstclass);
3f7c398e 5723 PerlIO_printf(Perl_debug_log, "stclass \"%s\" ", SvPVX_const(sv));
46fc3d4c 5724 }
774d564b 5725 if (r->reganch & ROPT_ANCH) {
5726 PerlIO_printf(Perl_debug_log, "anchored");
5727 if (r->reganch & ROPT_ANCH_BOL)
5728 PerlIO_printf(Perl_debug_log, "(BOL)");
c277df42
IZ
5729 if (r->reganch & ROPT_ANCH_MBOL)
5730 PerlIO_printf(Perl_debug_log, "(MBOL)");
cad2e5aa
JH
5731 if (r->reganch & ROPT_ANCH_SBOL)
5732 PerlIO_printf(Perl_debug_log, "(SBOL)");
774d564b 5733 if (r->reganch & ROPT_ANCH_GPOS)
5734 PerlIO_printf(Perl_debug_log, "(GPOS)");
5735 PerlIO_putc(Perl_debug_log, ' ');
5736 }
c277df42
IZ
5737 if (r->reganch & ROPT_GPOS_SEEN)
5738 PerlIO_printf(Perl_debug_log, "GPOS ");
a0d0e21e 5739 if (r->reganch & ROPT_SKIP)
760ac839 5740 PerlIO_printf(Perl_debug_log, "plus ");
a0d0e21e 5741 if (r->reganch & ROPT_IMPLICIT)
760ac839 5742 PerlIO_printf(Perl_debug_log, "implicit ");
760ac839 5743 PerlIO_printf(Perl_debug_log, "minlen %ld ", (long) r->minlen);
ce862d02
IZ
5744 if (r->reganch & ROPT_EVAL_SEEN)
5745 PerlIO_printf(Perl_debug_log, "with eval ");
760ac839 5746 PerlIO_printf(Perl_debug_log, "\n");
fac92740 5747 if (r->offsets) {
e4584336 5748 const U32 len = r->offsets[0];
a3621e74
YO
5749 GET_RE_DEBUG_FLAGS_DECL;
5750 DEBUG_OFFSETS_r({
1df70142 5751 U32 i;
e4584336
RB
5752 PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)r->offsets[0]);
5753 for (i = 1; i <= len; i++)
5754 PerlIO_printf(Perl_debug_log, "%"UVuf"[%"UVuf"] ",
5755 (UV)r->offsets[i*2-1], (UV)r->offsets[i*2]);
5756 PerlIO_printf(Perl_debug_log, "\n");
a3621e74 5757 });
fac92740 5758 }
65e66c80 5759#else
96a5add6 5760 PERL_UNUSED_CONTEXT;
65e66c80 5761 PERL_UNUSED_ARG(r);
17c3b450 5762#endif /* DEBUGGING */
a687059c
LW
5763}
5764
5765/*
a0d0e21e
LW
5766- regprop - printable representation of opcode
5767*/
46fc3d4c 5768void
32fc9b6a 5769Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
a687059c 5770{
35ff7856 5771#ifdef DEBUGGING
97aff369 5772 dVAR;
9b155405 5773 register int k;
a0d0e21e 5774
54dc92de 5775 sv_setpvn(sv, "", 0);
9b155405 5776 if (OP(o) >= reg_num) /* regnode.type is unsigned */
830247a4
IZ
5777 /* It would be nice to FAIL() here, but this may be called from
5778 regexec.c, and it would be hard to supply pRExC_state. */
5779 Perl_croak(aTHX_ "Corrupted regexp opcode");
bfed75c6 5780 sv_catpv(sv, reg_name[OP(o)]); /* Take off const! */
9b155405
IZ
5781
5782 k = PL_regkind[(U8)OP(o)];
5783
2a782b5b 5784 if (k == EXACT) {
396482e1 5785 SV * const dsv = sv_2mortal(newSVpvs(""));
c728cb41
JH
5786 /* Using is_utf8_string() is a crude hack but it may
5787 * be the best for now since we have no flag "this EXACTish
5788 * node was UTF-8" --jhi */
1df70142 5789 const bool do_utf8 = is_utf8_string((U8*)STRING(o), STR_LEN(o));
2d03de9c 5790 const char * const s = do_utf8 ?
c728cb41
JH
5791 pv_uni_display(dsv, (U8*)STRING(o), STR_LEN(o), 60,
5792 UNI_DISPLAY_REGEX) :
2a782b5b 5793 STRING(o);
e1ec3a88 5794 const int len = do_utf8 ?
2a782b5b
JH
5795 strlen(s) :
5796 STR_LEN(o);
5797 Perl_sv_catpvf(aTHX_ sv, " <%s%.*s%s>",
5798 PL_colors[0],
5799 len, s,
5800 PL_colors[1]);
bb263b4e
AL
5801 } else if (k == TRIE) {
5802 /*EMPTY*/;
4f639d21
DM
5803 /* print the details od the trie in dumpuntil instead, as
5804 * prog->data isn't available here */
a3621e74 5805 } else if (k == CURLY) {
cb434fcc 5806 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
cea2e8a9
GS
5807 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
5808 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
a0d0e21e 5809 }
2c2d71f5
JH
5810 else if (k == WHILEM && o->flags) /* Ordinal/of */
5811 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
9b155405 5812 else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP )
894356b3 5813 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
9b155405 5814 else if (k == LOGICAL)
04ebc1ab 5815 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
653099ff
GS
5816 else if (k == ANYOF) {
5817 int i, rangestart = -1;
2d03de9c 5818 const U8 flags = ANYOF_FLAGS(o);
0bd48802
AL
5819
5820 /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
5821 static const char * const anyofs[] = {
653099ff
GS
5822 "\\w",
5823 "\\W",
5824 "\\s",
5825 "\\S",
5826 "\\d",
5827 "\\D",
5828 "[:alnum:]",
5829 "[:^alnum:]",
5830 "[:alpha:]",
5831 "[:^alpha:]",
5832 "[:ascii:]",
5833 "[:^ascii:]",
5834 "[:ctrl:]",
5835 "[:^ctrl:]",
5836 "[:graph:]",
5837 "[:^graph:]",
5838 "[:lower:]",
5839 "[:^lower:]",
5840 "[:print:]",
5841 "[:^print:]",
5842 "[:punct:]",
5843 "[:^punct:]",
5844 "[:upper:]",
aaa51d5e 5845 "[:^upper:]",
653099ff 5846 "[:xdigit:]",
aaa51d5e
JF
5847 "[:^xdigit:]",
5848 "[:space:]",
5849 "[:^space:]",
5850 "[:blank:]",
5851 "[:^blank:]"
653099ff
GS
5852 };
5853
19860706 5854 if (flags & ANYOF_LOCALE)
396482e1 5855 sv_catpvs(sv, "{loc}");
19860706 5856 if (flags & ANYOF_FOLD)
396482e1 5857 sv_catpvs(sv, "{i}");
653099ff 5858 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
19860706 5859 if (flags & ANYOF_INVERT)
396482e1 5860 sv_catpvs(sv, "^");
ffc61ed2
JH
5861 for (i = 0; i <= 256; i++) {
5862 if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
5863 if (rangestart == -1)
5864 rangestart = i;
5865 } else if (rangestart != -1) {
5866 if (i <= rangestart + 3)
5867 for (; rangestart < i; rangestart++)
653099ff 5868 put_byte(sv, rangestart);
ffc61ed2
JH
5869 else {
5870 put_byte(sv, rangestart);
396482e1 5871 sv_catpvs(sv, "-");
ffc61ed2 5872 put_byte(sv, i - 1);
653099ff 5873 }
ffc61ed2 5874 rangestart = -1;
653099ff 5875 }
847a199f 5876 }
ffc61ed2
JH
5877
5878 if (o->flags & ANYOF_CLASS)
5879 for (i = 0; i < sizeof(anyofs)/sizeof(char*); i++)
5880 if (ANYOF_CLASS_TEST(o,i))
5881 sv_catpv(sv, anyofs[i]);
5882
5883 if (flags & ANYOF_UNICODE)
396482e1 5884 sv_catpvs(sv, "{unicode}");
1aa99e6b 5885 else if (flags & ANYOF_UNICODE_ALL)
396482e1 5886 sv_catpvs(sv, "{unicode_all}");
ffc61ed2
JH
5887
5888 {
5889 SV *lv;
32fc9b6a 5890 SV * const sw = regclass_swash(prog, o, FALSE, &lv, 0);
b81d288d 5891
ffc61ed2
JH
5892 if (lv) {
5893 if (sw) {
89ebb4a3 5894 U8 s[UTF8_MAXBYTES_CASE+1];
b81d288d 5895
ffc61ed2 5896 for (i = 0; i <= 256; i++) { /* just the first 256 */
1df70142 5897 uvchr_to_utf8(s, i);
ffc61ed2 5898
3568d838 5899 if (i < 256 && swash_fetch(sw, s, TRUE)) {
ffc61ed2
JH
5900 if (rangestart == -1)
5901 rangestart = i;
5902 } else if (rangestart != -1) {
ffc61ed2
JH
5903 if (i <= rangestart + 3)
5904 for (; rangestart < i; rangestart++) {
2d03de9c
AL
5905 const U8 * const e = uvchr_to_utf8(s,rangestart);
5906 U8 *p;
5907 for(p = s; p < e; p++)
ffc61ed2
JH
5908 put_byte(sv, *p);
5909 }
5910 else {
2d03de9c
AL
5911 const U8 *e = uvchr_to_utf8(s,rangestart);
5912 U8 *p;
5913 for (p = s; p < e; p++)
ffc61ed2 5914 put_byte(sv, *p);
396482e1 5915 sv_catpvs(sv, "-");
2d03de9c
AL
5916 e = uvchr_to_utf8(s, i-1);
5917 for (p = s; p < e; p++)
1df70142 5918 put_byte(sv, *p);
ffc61ed2
JH
5919 }
5920 rangestart = -1;
5921 }
19860706 5922 }
ffc61ed2 5923
396482e1 5924 sv_catpvs(sv, "..."); /* et cetera */
19860706 5925 }
fde631ed 5926
ffc61ed2 5927 {
2e0de35c 5928 char *s = savesvpv(lv);
c445ea15 5929 char * const origs = s;
b81d288d 5930
ffc61ed2 5931 while(*s && *s != '\n') s++;
b81d288d 5932
ffc61ed2 5933 if (*s == '\n') {
2d03de9c 5934 const char * const t = ++s;
ffc61ed2
JH
5935
5936 while (*s) {
5937 if (*s == '\n')
5938 *s = ' ';
5939 s++;
5940 }
5941 if (s[-1] == ' ')
5942 s[-1] = 0;
5943
5944 sv_catpv(sv, t);
fde631ed 5945 }
b81d288d 5946
ffc61ed2 5947 Safefree(origs);
fde631ed
JH
5948 }
5949 }
653099ff 5950 }
ffc61ed2 5951
653099ff
GS
5952 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
5953 }
9b155405 5954 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
cea2e8a9 5955 Perl_sv_catpvf(aTHX_ sv, "[-%d]", o->flags);
65e66c80 5956#else
96a5add6 5957 PERL_UNUSED_CONTEXT;
65e66c80
SP
5958 PERL_UNUSED_ARG(sv);
5959 PERL_UNUSED_ARG(o);
17c3b450 5960#endif /* DEBUGGING */
35ff7856 5961}
a687059c 5962
cad2e5aa
JH
5963SV *
5964Perl_re_intuit_string(pTHX_ regexp *prog)
5965{ /* Assume that RE_INTUIT is set */
97aff369 5966 dVAR;
a3621e74 5967 GET_RE_DEBUG_FLAGS_DECL;
96a5add6
AL
5968 PERL_UNUSED_CONTEXT;
5969
a3621e74 5970 DEBUG_COMPILE_r(
cfd0369c 5971 {
2d03de9c 5972 const char * const s = SvPV_nolen_const(prog->check_substr
cfd0369c 5973 ? prog->check_substr : prog->check_utf8);
cad2e5aa
JH
5974
5975 if (!PL_colorset) reginitcolors();
5976 PerlIO_printf(Perl_debug_log,
a0288114 5977 "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
33b8afdf
JH
5978 PL_colors[4],
5979 prog->check_substr ? "" : "utf8 ",
5980 PL_colors[5],PL_colors[0],
cad2e5aa
JH
5981 s,
5982 PL_colors[1],
5983 (strlen(s) > 60 ? "..." : ""));
5984 } );
5985
33b8afdf 5986 return prog->check_substr ? prog->check_substr : prog->check_utf8;
cad2e5aa
JH
5987}
5988
2b69d0c2 5989void
864dbfa3 5990Perl_pregfree(pTHX_ struct regexp *r)
a687059c 5991{
27da23d5 5992 dVAR;
9e55ce06 5993#ifdef DEBUGGING
c445ea15
AL
5994 SV * const dsv = PERL_DEBUG_PAD_ZERO(0);
5995 SV * const re_debug_flags=get_sv(RE_DEBUG_FLAGS,0);
9e55ce06 5996#endif
7821416a 5997
a3621e74 5998
7821416a
IZ
5999 if (!r || (--r->refcnt > 0))
6000 return;
a3621e74 6001 DEBUG_r(if (re_debug_flags && (SvIV(re_debug_flags) & RE_DEBUG_COMPILE)) {
c445ea15 6002 const char * const s = (r->reganch & ROPT_UTF8)
e1ec3a88 6003 ? pv_uni_display(dsv, (U8*)r->precomp, r->prelen, 60, UNI_DISPLAY_REGEX)
9f369894 6004 : pv_display(dsv, r->precomp, r->prelen, 0, 60);
e1ec3a88 6005 const int len = SvCUR(dsv);
9e55ce06
JH
6006 if (!PL_colorset)
6007 reginitcolors();
6008 PerlIO_printf(Perl_debug_log,
a3621e74 6009 "%sFreeing REx:%s %s%*.*s%s%s\n",
9e55ce06
JH
6010 PL_colors[4],PL_colors[5],PL_colors[0],
6011 len, len, s,
6012 PL_colors[1],
6013 len > 60 ? "..." : "");
6014 });
cad2e5aa 6015
43c5f42d
NC
6016 /* gcov results gave these as non-null 100% of the time, so there's no
6017 optimisation in checking them before calling Safefree */
6018 Safefree(r->precomp);
6019 Safefree(r->offsets); /* 20010421 MJD */
ed252734 6020 RX_MATCH_COPY_FREE(r);
f8c7b90f 6021#ifdef PERL_OLD_COPY_ON_WRITE
ed252734
NC
6022 if (r->saved_copy)
6023 SvREFCNT_dec(r->saved_copy);
6024#endif
a193d654
GS
6025 if (r->substrs) {
6026 if (r->anchored_substr)
6027 SvREFCNT_dec(r->anchored_substr);
33b8afdf
JH
6028 if (r->anchored_utf8)
6029 SvREFCNT_dec(r->anchored_utf8);
a193d654
GS
6030 if (r->float_substr)
6031 SvREFCNT_dec(r->float_substr);
33b8afdf
JH
6032 if (r->float_utf8)
6033 SvREFCNT_dec(r->float_utf8);
2779dcf1 6034 Safefree(r->substrs);
a193d654 6035 }
c277df42
IZ
6036 if (r->data) {
6037 int n = r->data->count;
f3548bdc
DM
6038 PAD* new_comppad = NULL;
6039 PAD* old_comppad;
4026c95a 6040 PADOFFSET refcnt;
dfad63ad 6041
c277df42 6042 while (--n >= 0) {
261faec3 6043 /* If you add a ->what type here, update the comment in regcomp.h */
c277df42
IZ
6044 switch (r->data->what[n]) {
6045 case 's':
6046 SvREFCNT_dec((SV*)r->data->data[n]);
6047 break;
653099ff
GS
6048 case 'f':
6049 Safefree(r->data->data[n]);
6050 break;
dfad63ad
HS
6051 case 'p':
6052 new_comppad = (AV*)r->data->data[n];
6053 break;
c277df42 6054 case 'o':
dfad63ad 6055 if (new_comppad == NULL)
cea2e8a9 6056 Perl_croak(aTHX_ "panic: pregfree comppad");
f3548bdc
DM
6057 PAD_SAVE_LOCAL(old_comppad,
6058 /* Watch out for global destruction's random ordering. */
c445ea15 6059 (SvTYPE(new_comppad) == SVt_PVAV) ? new_comppad : NULL
f3548bdc 6060 );
b34c0dd4 6061 OP_REFCNT_LOCK;
4026c95a
SH
6062 refcnt = OpREFCNT_dec((OP_4tree*)r->data->data[n]);
6063 OP_REFCNT_UNLOCK;
6064 if (!refcnt)
9b978d73 6065 op_free((OP_4tree*)r->data->data[n]);
9b978d73 6066
f3548bdc 6067 PAD_RESTORE_LOCAL(old_comppad);
dfad63ad
HS
6068 SvREFCNT_dec((SV*)new_comppad);
6069 new_comppad = NULL;
c277df42
IZ
6070 break;
6071 case 'n':
9e55ce06 6072 break;
a3621e74
YO
6073 case 't':
6074 {
c445ea15 6075 reg_trie_data * const trie=(reg_trie_data*)r->data->data[n];
a3621e74
YO
6076 U32 refcount;
6077 OP_REFCNT_LOCK;
e27afef8 6078 refcount = --trie->refcount;
a3621e74
YO
6079 OP_REFCNT_UNLOCK;
6080 if ( !refcount ) {
43c5f42d 6081 Safefree(trie->charmap);
a3621e74
YO
6082 if (trie->widecharmap)
6083 SvREFCNT_dec((SV*)trie->widecharmap);
43c5f42d
NC
6084 Safefree(trie->states);
6085 Safefree(trie->trans);
a3621e74
YO
6086#ifdef DEBUGGING
6087 if (trie->words)
6088 SvREFCNT_dec((SV*)trie->words);
6089 if (trie->revcharmap)
6090 SvREFCNT_dec((SV*)trie->revcharmap);
6091#endif
6092 Safefree(r->data->data[n]); /* do this last!!!! */
6093 }
6094 break;
6095 }
c277df42 6096 default:
830247a4 6097 Perl_croak(aTHX_ "panic: regfree data code '%c'", r->data->what[n]);
c277df42
IZ
6098 }
6099 }
6100 Safefree(r->data->what);
6101 Safefree(r->data);
a0d0e21e
LW
6102 }
6103 Safefree(r->startp);
6104 Safefree(r->endp);
6105 Safefree(r);
a687059c 6106}
c277df42
IZ
6107
6108/*
6109 - regnext - dig the "next" pointer out of a node
c277df42
IZ
6110 */
6111regnode *
864dbfa3 6112Perl_regnext(pTHX_ register regnode *p)
c277df42 6113{
97aff369 6114 dVAR;
c277df42
IZ
6115 register I32 offset;
6116
3280af22 6117 if (p == &PL_regdummy)
c277df42
IZ
6118 return(NULL);
6119
6120 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
6121 if (offset == 0)
6122 return(NULL);
6123
c277df42 6124 return(p+offset);
c277df42
IZ
6125}
6126
01f988be 6127STATIC void
cea2e8a9 6128S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
c277df42
IZ
6129{
6130 va_list args;
6131 STRLEN l1 = strlen(pat1);
6132 STRLEN l2 = strlen(pat2);
6133 char buf[512];
06bf62c7 6134 SV *msv;
73d840c0 6135 const char *message;
c277df42
IZ
6136
6137 if (l1 > 510)
6138 l1 = 510;
6139 if (l1 + l2 > 510)
6140 l2 = 510 - l1;
6141 Copy(pat1, buf, l1 , char);
6142 Copy(pat2, buf + l1, l2 , char);
3b818b81
GS
6143 buf[l1 + l2] = '\n';
6144 buf[l1 + l2 + 1] = '\0';
8736538c
AS
6145#ifdef I_STDARG
6146 /* ANSI variant takes additional second argument */
c277df42 6147 va_start(args, pat2);
8736538c
AS
6148#else
6149 va_start(args);
6150#endif
5a844595 6151 msv = vmess(buf, &args);
c277df42 6152 va_end(args);
cfd0369c 6153 message = SvPV_const(msv,l1);
c277df42
IZ
6154 if (l1 > 512)
6155 l1 = 512;
6156 Copy(message, buf, l1 , char);
197cf9b9 6157 buf[l1-1] = '\0'; /* Overwrite \n */
cea2e8a9 6158 Perl_croak(aTHX_ "%s", buf);
c277df42 6159}
a0ed51b3
LW
6160
6161/* XXX Here's a total kludge. But we need to re-enter for swash routines. */
6162
6163void
864dbfa3 6164Perl_save_re_context(pTHX)
b81d288d 6165{
97aff369 6166 dVAR;
1ade1aa1
NC
6167
6168 struct re_save_state *state;
6169
6170 SAVEVPTR(PL_curcop);
6171 SSGROW(SAVESTACK_ALLOC_FOR_RE_SAVE_STATE + 1);
6172
6173 state = (struct re_save_state *)(PL_savestack + PL_savestack_ix);
6174 PL_savestack_ix += SAVESTACK_ALLOC_FOR_RE_SAVE_STATE;
6175 SSPUSHINT(SAVEt_RE_STATE);
6176
46ab3289 6177 Copy(&PL_reg_state, state, 1, struct re_save_state);
1ade1aa1 6178
a0ed51b3 6179 PL_reg_start_tmp = 0;
a0ed51b3 6180 PL_reg_start_tmpl = 0;
c445ea15 6181 PL_reg_oldsaved = NULL;
a5db57d6 6182 PL_reg_oldsavedlen = 0;
a5db57d6 6183 PL_reg_maxiter = 0;
a5db57d6 6184 PL_reg_leftiter = 0;
c445ea15 6185 PL_reg_poscache = NULL;
a5db57d6 6186 PL_reg_poscache_size = 0;
1ade1aa1
NC
6187#ifdef PERL_OLD_COPY_ON_WRITE
6188 PL_nrs = NULL;
6189#endif
ada6e8a9 6190
c445ea15
AL
6191 /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
6192 if (PL_curpm) {
6193 const REGEXP * const rx = PM_GETRE(PL_curpm);
6194 if (rx) {
1df70142 6195 U32 i;
ada6e8a9 6196 for (i = 1; i <= rx->nparens; i++) {
1df70142 6197 char digits[TYPE_CHARS(long)];
e5105eda 6198 const STRLEN len = my_sprintf(digits, "%lu", (long)i);
49f27e4b
NC
6199 GV *const *const gvp
6200 = (GV**)hv_fetch(PL_defstash, digits, len, 0);
6201
b37c2d43
AL
6202 if (gvp) {
6203 GV * const gv = *gvp;
6204 if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
6205 save_scalar(gv);
49f27e4b 6206 }
ada6e8a9
AMS
6207 }
6208 }
6209 }
a0ed51b3 6210}
51371543 6211
51371543 6212static void
acfe0abc 6213clear_re(pTHX_ void *r)
51371543 6214{
97aff369 6215 dVAR;
51371543
GS
6216 ReREFCNT_dec((regexp *)r);
6217}
ffbc6a93 6218
a28509cc
AL
6219#ifdef DEBUGGING
6220
6221STATIC void
6222S_put_byte(pTHX_ SV *sv, int c)
6223{
6224 if (isCNTRL(c) || c == 255 || !isPRINT(c))
6225 Perl_sv_catpvf(aTHX_ sv, "\\%o", c);
6226 else if (c == '-' || c == ']' || c == '\\' || c == '^')
6227 Perl_sv_catpvf(aTHX_ sv, "\\%c", c);
6228 else
6229 Perl_sv_catpvf(aTHX_ sv, "%c", c);
6230}
6231
6232
b5a2f8d8
NC
6233STATIC const regnode *
6234S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
6235 const regnode *last, SV* sv, I32 l)
a28509cc 6236{
97aff369 6237 dVAR;
a28509cc 6238 register U8 op = EXACT; /* Arbitrary non-END op. */
b5a2f8d8 6239 register const regnode *next;
a28509cc
AL
6240
6241 while (op != END && (!last || node < last)) {
6242 /* While that wasn't END last time... */
6243
6244 NODE_ALIGN(node);
6245 op = OP(node);
6246 if (op == CLOSE)
6247 l--;
b5a2f8d8 6248 next = regnext((regnode *)node);
a28509cc
AL
6249 /* Where, what. */
6250 if (OP(node) == OPTIMIZED)
6251 goto after_print;
32fc9b6a 6252 regprop(r, sv, node);
a28509cc
AL
6253 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
6254 (int)(2*l + 1), "", SvPVX_const(sv));
6255 if (next == NULL) /* Next ptr. */
6256 PerlIO_printf(Perl_debug_log, "(0)");
6257 else
6258 PerlIO_printf(Perl_debug_log, "(%"IVdf")", (IV)(next - start));
6259 (void)PerlIO_putc(Perl_debug_log, '\n');
6260 after_print:
6261 if (PL_regkind[(U8)op] == BRANCHJ) {
b5a2f8d8
NC
6262 register const regnode *nnode = (OP(next) == LONGJMP
6263 ? regnext((regnode *)next)
6264 : next);
a28509cc
AL
6265 if (last && nnode > last)
6266 nnode = last;
4f639d21 6267 node = dumpuntil(r, start, NEXTOPER(NEXTOPER(node)), nnode, sv, l + 1);
a28509cc
AL
6268 }
6269 else if (PL_regkind[(U8)op] == BRANCH) {
4f639d21 6270 node = dumpuntil(r, start, NEXTOPER(node), next, sv, l + 1);
a28509cc
AL
6271 }
6272 else if ( PL_regkind[(U8)op] == TRIE ) {
6273 const I32 n = ARG(node);
4f639d21 6274 const reg_trie_data * const trie = (reg_trie_data*)r->data->data[n];
a28509cc
AL
6275 const I32 arry_len = av_len(trie->words)+1;
6276 I32 word_idx;
6277 PerlIO_printf(Perl_debug_log,
6278 "%*s[Words:%d Chars Stored:%d Unique Chars:%d States:%"IVdf"%s]\n",
6279 (int)(2*(l+3)),
6280 "",
6281 trie->wordcount,
6282 (int)trie->charcount,
6283 trie->uniquecharcount,
6284 (IV)trie->laststate-1,
6285 node->flags ? " EVAL mode" : "");
6286
6287 for (word_idx=0; word_idx < arry_len; word_idx++) {
097eb12c 6288 SV ** const elem_ptr = av_fetch(trie->words,word_idx,0);
a28509cc
AL
6289 if (elem_ptr) {
6290 PerlIO_printf(Perl_debug_log, "%*s<%s%s%s>\n",
6291 (int)(2*(l+4)), "",
6292 PL_colors[0],
cfd0369c 6293 SvPV_nolen_const(*elem_ptr),
a28509cc
AL
6294 PL_colors[1]
6295 );
6296 /*
6297 if (next == NULL)
6298 PerlIO_printf(Perl_debug_log, "(0)\n");
6299 else
6300 PerlIO_printf(Perl_debug_log, "(%"IVdf")\n", (IV)(next - start));
6301 */
6302 }
6303
6304 }
6305
6306 node = NEXTOPER(node);
6307 node += regarglen[(U8)op];
6308
6309 }
6310 else if ( op == CURLY) { /* "next" might be very big: optimizer */
4f639d21 6311 node = dumpuntil(r, start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
a28509cc
AL
6312 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1, sv, l + 1);
6313 }
6314 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
4f639d21 6315 node = dumpuntil(r, start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
a28509cc
AL
6316 next, sv, l + 1);
6317 }
6318 else if ( op == PLUS || op == STAR) {
4f639d21 6319 node = dumpuntil(r, start, NEXTOPER(node), NEXTOPER(node) + 1, sv, l + 1);
a28509cc
AL
6320 }
6321 else if (op == ANYOF) {
6322 /* arglen 1 + class block */
6323 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_LARGE)
6324 ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
6325 node = NEXTOPER(node);
6326 }
6327 else if (PL_regkind[(U8)op] == EXACT) {
6328 /* Literal string, where present. */
6329 node += NODE_SZ_STR(node) - 1;
6330 node = NEXTOPER(node);
6331 }
6332 else {
6333 node = NEXTOPER(node);
6334 node += regarglen[(U8)op];
6335 }
6336 if (op == CURLYX || op == OPEN)
6337 l++;
6338 else if (op == WHILEM)
6339 l--;
6340 }
6341 return node;
6342}
6343
6344#endif /* DEBUGGING */
6345
241d1a3b
NC
6346/*
6347 * Local variables:
6348 * c-indentation-style: bsd
6349 * c-basic-offset: 4
6350 * indent-tabs-mode: t
6351 * End:
6352 *
37442d52
RGS
6353 * ex: set ts=8 sts=4 sw=4 noet:
6354 */