This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
[perl #34976] substr uses utf8 length cache incorrectly
[perl5.git] / regcomp.c
CommitLineData
a0d0e21e
LW
1/* regcomp.c
2 */
3
4/*
5 * "A fair jaw-cracker dwarf-language must be." --Samwise Gamgee
6 */
7
61296642
DM
8/* This file contains functions for compiling a regular expression. See
9 * also regexec.c which funnily enough, contains functions for executing
166f8a29 10 * a regular expression.
e4a054ea
DM
11 *
12 * This file is also copied at build time to ext/re/re_comp.c, where
13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14 * This causes the main functions to be compiled under new names and with
15 * debugging support added, which makes "use re 'debug'" work.
166f8a29
DM
16 */
17
a687059c
LW
18/* NOTE: this is derived from Henry Spencer's regexp code, and should not
19 * confused with the original package (see point 3 below). Thanks, Henry!
20 */
21
22/* Additional note: this code is very heavily munged from Henry's version
23 * in places. In some spots I've traded clarity for efficiency, so don't
24 * blame Henry for some of the lack of readability.
25 */
26
e50aee73
AD
27/* The names of the functions have been changed from regcomp and
28 * regexec to pregcomp and pregexec in order to avoid conflicts
29 * with the POSIX routines of the same names.
30*/
31
b9d5759e
AD
32#ifdef PERL_EXT_RE_BUILD
33/* need to replace pregcomp et al, so enable that */
34# ifndef PERL_IN_XSUB_RE
35# define PERL_IN_XSUB_RE
36# endif
37/* need access to debugger hooks */
cad2e5aa 38# if defined(PERL_EXT_RE_DEBUG) && !defined(DEBUGGING)
b9d5759e
AD
39# define DEBUGGING
40# endif
41#endif
42
43#ifdef PERL_IN_XSUB_RE
d06ea78c 44/* We *really* need to overwrite these symbols: */
56953603
IZ
45# define Perl_pregcomp my_regcomp
46# define Perl_regdump my_regdump
47# define Perl_regprop my_regprop
d06ea78c 48# define Perl_pregfree my_regfree
cad2e5aa
JH
49# define Perl_re_intuit_string my_re_intuit_string
50/* *These* symbols are masked to allow static link. */
d06ea78c 51# define Perl_regnext my_regnext
f0b8d043 52# define Perl_save_re_context my_save_re_context
b81d288d 53# define Perl_reginitcolors my_reginitcolors
c5be433b
GS
54
55# define PERL_NO_GET_CONTEXT
b81d288d 56#endif
56953603 57
f0fcb552 58/*SUPPRESS 112*/
a687059c 59/*
e50aee73 60 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
61 *
62 * Copyright (c) 1986 by University of Toronto.
63 * Written by Henry Spencer. Not derived from licensed software.
64 *
65 * Permission is granted to anyone to use this software for any
66 * purpose on any computer system, and to redistribute it freely,
67 * subject to the following restrictions:
68 *
69 * 1. The author is not responsible for the consequences of use of
70 * this software, no matter how awful, even if they arise
71 * from defects in it.
72 *
73 * 2. The origin of this software must not be misrepresented, either
74 * by explicit claim or by omission.
75 *
76 * 3. Altered versions must be plainly marked as such, and must not
77 * be misrepresented as being the original software.
78 *
79 *
80 **** Alterations to Henry's code are...
81 ****
4bb101f2 82 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
241d1a3b 83 **** 2000, 2001, 2002, 2003, 2004, 2005, by Larry Wall and others
a687059c 84 ****
9ef589d8
LW
85 **** You may distribute under the terms of either the GNU General Public
86 **** License or the Artistic License, as specified in the README file.
87
a687059c
LW
88 *
89 * Beware that some of this code is subtly aware of the way operator
90 * precedence is structured in regular expressions. Serious changes in
91 * regular-expression syntax might require a total rethink.
92 */
93#include "EXTERN.h"
864dbfa3 94#define PERL_IN_REGCOMP_C
a687059c 95#include "perl.h"
d06ea78c 96
acfe0abc 97#ifndef PERL_IN_XSUB_RE
d06ea78c
GS
98# include "INTERN.h"
99#endif
c277df42
IZ
100
101#define REG_COMP_C
a687059c
LW
102#include "regcomp.h"
103
d4cce5f1 104#ifdef op
11343788 105#undef op
d4cce5f1 106#endif /* op */
11343788 107
fe14fcc3 108#ifdef MSDOS
7e4e8c89 109# if defined(BUGGY_MSC6)
fe14fcc3 110 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
7e4e8c89 111# pragma optimize("a",off)
fe14fcc3 112 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
7e4e8c89
NC
113# pragma optimize("w",on )
114# endif /* BUGGY_MSC6 */
fe14fcc3
LW
115#endif /* MSDOS */
116
a687059c
LW
117#ifndef STATIC
118#define STATIC static
119#endif
120
830247a4 121typedef struct RExC_state_t {
e2509266 122 U32 flags; /* are we folding, multilining? */
830247a4
IZ
123 char *precomp; /* uncompiled string. */
124 regexp *rx;
fac92740 125 char *start; /* Start of input for compile */
830247a4
IZ
126 char *end; /* End of input for compile */
127 char *parse; /* Input-scan pointer. */
128 I32 whilem_seen; /* number of WHILEM in this expr */
fac92740 129 regnode *emit_start; /* Start of emitted-code area */
ffc61ed2 130 regnode *emit; /* Code-emit pointer; &regdummy = don't = compiling */
830247a4
IZ
131 I32 naughty; /* How bad is this pattern? */
132 I32 sawback; /* Did we see \1, ...? */
133 U32 seen;
134 I32 size; /* Code size. */
135 I32 npar; /* () count. */
136 I32 extralen;
137 I32 seen_zerolen;
138 I32 seen_evals;
1aa99e6b 139 I32 utf8;
830247a4
IZ
140#if ADD_TO_REGEXEC
141 char *starttry; /* -Dr: where regtry was called. */
142#define RExC_starttry (pRExC_state->starttry)
143#endif
144} RExC_state_t;
145
e2509266 146#define RExC_flags (pRExC_state->flags)
830247a4
IZ
147#define RExC_precomp (pRExC_state->precomp)
148#define RExC_rx (pRExC_state->rx)
fac92740 149#define RExC_start (pRExC_state->start)
830247a4
IZ
150#define RExC_end (pRExC_state->end)
151#define RExC_parse (pRExC_state->parse)
152#define RExC_whilem_seen (pRExC_state->whilem_seen)
fac92740 153#define RExC_offsets (pRExC_state->rx->offsets) /* I am not like the others */
830247a4 154#define RExC_emit (pRExC_state->emit)
fac92740 155#define RExC_emit_start (pRExC_state->emit_start)
830247a4
IZ
156#define RExC_naughty (pRExC_state->naughty)
157#define RExC_sawback (pRExC_state->sawback)
158#define RExC_seen (pRExC_state->seen)
159#define RExC_size (pRExC_state->size)
160#define RExC_npar (pRExC_state->npar)
161#define RExC_extralen (pRExC_state->extralen)
162#define RExC_seen_zerolen (pRExC_state->seen_zerolen)
163#define RExC_seen_evals (pRExC_state->seen_evals)
1aa99e6b 164#define RExC_utf8 (pRExC_state->utf8)
830247a4 165
a687059c
LW
166#define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
167#define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
168 ((*s) == '{' && regcurly(s)))
a687059c 169
35c8bce7
LW
170#ifdef SPSTART
171#undef SPSTART /* dratted cpp namespace... */
172#endif
a687059c
LW
173/*
174 * Flags to be passed up and down.
175 */
a687059c 176#define WORST 0 /* Worst case. */
821b33a5 177#define HASWIDTH 0x1 /* Known to match non-null strings. */
a0d0e21e
LW
178#define SIMPLE 0x2 /* Simple enough to be STAR/PLUS operand. */
179#define SPSTART 0x4 /* Starts with * or +. */
180#define TRYAGAIN 0x8 /* Weeded out a declaration. */
a687059c 181
2c2d71f5
JH
182/* Length of a variant. */
183
184typedef struct scan_data_t {
185 I32 len_min;
186 I32 len_delta;
187 I32 pos_min;
188 I32 pos_delta;
189 SV *last_found;
190 I32 last_end; /* min value, <0 unless valid. */
191 I32 last_start_min;
192 I32 last_start_max;
193 SV **longest; /* Either &l_fixed, or &l_float. */
194 SV *longest_fixed;
195 I32 offset_fixed;
196 SV *longest_float;
197 I32 offset_float_min;
198 I32 offset_float_max;
199 I32 flags;
200 I32 whilem_c;
cb434fcc 201 I32 *last_closep;
653099ff 202 struct regnode_charclass_class *start_class;
2c2d71f5
JH
203} scan_data_t;
204
a687059c 205/*
e50aee73 206 * Forward declarations for pregcomp()'s friends.
a687059c 207 */
a0d0e21e 208
b81d288d 209static scan_data_t zero_scan_data = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
cb434fcc 210 0, 0, 0, 0, 0, 0};
c277df42
IZ
211
212#define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
213#define SF_BEFORE_SEOL 0x1
214#define SF_BEFORE_MEOL 0x2
215#define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
216#define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
217
09b7f37c
CB
218#ifdef NO_UNARY_PLUS
219# define SF_FIX_SHIFT_EOL (0+2)
220# define SF_FL_SHIFT_EOL (0+4)
221#else
222# define SF_FIX_SHIFT_EOL (+2)
223# define SF_FL_SHIFT_EOL (+4)
224#endif
c277df42
IZ
225
226#define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
227#define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
228
229#define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
230#define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
231#define SF_IS_INF 0x40
232#define SF_HAS_PAR 0x80
233#define SF_IN_PAR 0x100
234#define SF_HAS_EVAL 0x200
4bfe0158 235#define SCF_DO_SUBSTR 0x400
653099ff
GS
236#define SCF_DO_STCLASS_AND 0x0800
237#define SCF_DO_STCLASS_OR 0x1000
238#define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
e1901655 239#define SCF_WHILEM_VISITED_POS 0x2000
c277df42 240
eb160463 241#define UTF (RExC_utf8 != 0)
e2509266
JH
242#define LOC ((RExC_flags & PMf_LOCALE) != 0)
243#define FOLD ((RExC_flags & PMf_FOLD) != 0)
a0ed51b3 244
ffc61ed2 245#define OOB_UNICODE 12345678
93733859 246#define OOB_NAMEDCLASS -1
b8c5462f 247
a0ed51b3
LW
248#define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
249#define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
250
8615cb43 251
b45f050a
JF
252/* length of regex to show in messages that don't mark a position within */
253#define RegexLengthToShowInErrorMessages 127
254
255/*
256 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
257 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
258 * op/pragma/warn/regcomp.
259 */
7253e4e3
RK
260#define MARKER1 "<-- HERE" /* marker as it appears in the description */
261#define MARKER2 " <-- HERE " /* marker as it appears within the regex */
b81d288d 262
7253e4e3 263#define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
b45f050a
JF
264
265/*
266 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
267 * arg. Show regex, up to a maximum length. If it's too long, chop and add
268 * "...".
269 */
ccb2c380 270#define FAIL(msg) STMT_START { \
bfed75c6 271 const char *ellipses = ""; \
ccb2c380
MP
272 IV len = RExC_end - RExC_precomp; \
273 \
274 if (!SIZE_ONLY) \
275 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
276 if (len > RegexLengthToShowInErrorMessages) { \
277 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
278 len = RegexLengthToShowInErrorMessages - 10; \
279 ellipses = "..."; \
280 } \
281 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
282 msg, (int)len, RExC_precomp, ellipses); \
283} STMT_END
8615cb43 284
b45f050a
JF
285/*
286 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
287 * args. Show regex, up to a maximum length. If it's too long, chop and add
288 * "...".
289 */
ccb2c380 290#define FAIL2(pat,msg) STMT_START { \
bfed75c6 291 const char *ellipses = ""; \
ccb2c380
MP
292 IV len = RExC_end - RExC_precomp; \
293 \
294 if (!SIZE_ONLY) \
295 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
296 if (len > RegexLengthToShowInErrorMessages) { \
297 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
298 len = RegexLengthToShowInErrorMessages - 10; \
299 ellipses = "..."; \
300 } \
301 S_re_croak2(aTHX_ pat, " in regex m/%.*s%s/", \
302 msg, (int)len, RExC_precomp, ellipses); \
303} STMT_END
b45f050a
JF
304
305
306/*
307 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
308 */
ccb2c380
MP
309#define Simple_vFAIL(m) STMT_START { \
310 IV offset = RExC_parse - RExC_precomp; \
311 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
312 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
313} STMT_END
b45f050a
JF
314
315/*
316 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
317 */
ccb2c380
MP
318#define vFAIL(m) STMT_START { \
319 if (!SIZE_ONLY) \
320 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
321 Simple_vFAIL(m); \
322} STMT_END
b45f050a
JF
323
324/*
325 * Like Simple_vFAIL(), but accepts two arguments.
326 */
ccb2c380
MP
327#define Simple_vFAIL2(m,a1) STMT_START { \
328 IV offset = RExC_parse - RExC_precomp; \
329 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
330 (int)offset, RExC_precomp, RExC_precomp + offset); \
331} STMT_END
b45f050a
JF
332
333/*
334 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
335 */
ccb2c380
MP
336#define vFAIL2(m,a1) STMT_START { \
337 if (!SIZE_ONLY) \
338 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
339 Simple_vFAIL2(m, a1); \
340} STMT_END
b45f050a
JF
341
342
343/*
344 * Like Simple_vFAIL(), but accepts three arguments.
345 */
ccb2c380
MP
346#define Simple_vFAIL3(m, a1, a2) STMT_START { \
347 IV offset = RExC_parse - RExC_precomp; \
348 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
349 (int)offset, RExC_precomp, RExC_precomp + offset); \
350} STMT_END
b45f050a
JF
351
352/*
353 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
354 */
ccb2c380
MP
355#define vFAIL3(m,a1,a2) STMT_START { \
356 if (!SIZE_ONLY) \
357 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
358 Simple_vFAIL3(m, a1, a2); \
359} STMT_END
b45f050a
JF
360
361/*
362 * Like Simple_vFAIL(), but accepts four arguments.
363 */
ccb2c380
MP
364#define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
365 IV offset = RExC_parse - RExC_precomp; \
366 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
367 (int)offset, RExC_precomp, RExC_precomp + offset); \
368} STMT_END
b45f050a
JF
369
370/*
371 * Like Simple_vFAIL(), but accepts five arguments.
372 */
ccb2c380
MP
373#define Simple_vFAIL5(m, a1, a2, a3, a4) STMT_START { \
374 IV offset = RExC_parse - RExC_precomp; \
375 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, a4, \
376 (int)offset, RExC_precomp, RExC_precomp + offset); \
377} STMT_END
378
379
380#define vWARN(loc,m) STMT_START { \
381 IV offset = loc - RExC_precomp; \
382 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \
383 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
384} STMT_END
385
386#define vWARNdep(loc,m) STMT_START { \
387 IV offset = loc - RExC_precomp; \
388 Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
389 "%s" REPORT_LOCATION, \
390 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
391} STMT_END
392
393
394#define vWARN2(loc, m, a1) STMT_START { \
395 IV offset = loc - RExC_precomp; \
396 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
397 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
398} STMT_END
399
400#define vWARN3(loc, m, a1, a2) STMT_START { \
401 IV offset = loc - RExC_precomp; \
402 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
403 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
404} STMT_END
405
406#define vWARN4(loc, m, a1, a2, a3) STMT_START { \
407 IV offset = loc - RExC_precomp; \
408 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
409 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
410} STMT_END
411
412#define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
413 IV offset = loc - RExC_precomp; \
414 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
415 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
416} STMT_END
9d1d55b5 417
8615cb43 418
cd439c50 419/* Allow for side effects in s */
ccb2c380
MP
420#define REGC(c,s) STMT_START { \
421 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
422} STMT_END
cd439c50 423
fac92740
MJD
424/* Macros for recording node offsets. 20001227 mjd@plover.com
425 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
426 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
427 * Element 0 holds the number n.
428 */
429
430#define MJD_OFFSET_DEBUG(x)
a3621e74 431/* #define MJD_OFFSET_DEBUG(x) DEBUG_r(Perl_warn_nocontext x) */
ccb2c380
MP
432
433
434#define Set_Node_Offset_To_R(node,byte) STMT_START { \
435 if (! SIZE_ONLY) { \
436 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
437 __LINE__, (node), (byte))); \
438 if((node) < 0) { \
439 Perl_croak(aTHX_ "value of node is %d in Offset macro", node); \
440 } else { \
441 RExC_offsets[2*(node)-1] = (byte); \
442 } \
443 } \
444} STMT_END
445
446#define Set_Node_Offset(node,byte) \
447 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
448#define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
449
450#define Set_Node_Length_To_R(node,len) STMT_START { \
451 if (! SIZE_ONLY) { \
452 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
453 __LINE__, (node), (len))); \
454 if((node) < 0) { \
455 Perl_croak(aTHX_ "value of node is %d in Length macro", node); \
456 } else { \
457 RExC_offsets[2*(node)] = (len); \
458 } \
459 } \
460} STMT_END
461
462#define Set_Node_Length(node,len) \
463 Set_Node_Length_To_R((node)-RExC_emit_start, len)
464#define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
465#define Set_Node_Cur_Length(node) \
466 Set_Node_Length(node, RExC_parse - parse_start)
fac92740
MJD
467
468/* Get offsets and lengths */
469#define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
470#define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
471
acfe0abc 472static void clear_re(pTHX_ void *r);
4327152a 473
653099ff
GS
474/* Mark that we cannot extend a found fixed substring at this point.
475 Updata the longest found anchored substring and the longest found
476 floating substrings if needed. */
477
4327152a 478STATIC void
830247a4 479S_scan_commit(pTHX_ RExC_state_t *pRExC_state, scan_data_t *data)
c277df42 480{
e1ec3a88
AL
481 const STRLEN l = CHR_SVLEN(data->last_found);
482 const STRLEN old_l = CHR_SVLEN(*data->longest);
b81d288d 483
c277df42 484 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
6b43b216 485 SvSetMagicSV(*data->longest, data->last_found);
c277df42
IZ
486 if (*data->longest == data->longest_fixed) {
487 data->offset_fixed = l ? data->last_start_min : data->pos_min;
488 if (data->flags & SF_BEFORE_EOL)
b81d288d 489 data->flags
c277df42
IZ
490 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
491 else
492 data->flags &= ~SF_FIX_BEFORE_EOL;
a0ed51b3
LW
493 }
494 else {
c277df42 495 data->offset_float_min = l ? data->last_start_min : data->pos_min;
b81d288d
AB
496 data->offset_float_max = (l
497 ? data->last_start_max
c277df42 498 : data->pos_min + data->pos_delta);
9051bda5
HS
499 if ((U32)data->offset_float_max > (U32)I32_MAX)
500 data->offset_float_max = I32_MAX;
c277df42 501 if (data->flags & SF_BEFORE_EOL)
b81d288d 502 data->flags
c277df42
IZ
503 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
504 else
505 data->flags &= ~SF_FL_BEFORE_EOL;
506 }
507 }
508 SvCUR_set(data->last_found, 0);
0eda9292
JH
509 {
510 SV * sv = data->last_found;
511 MAGIC *mg =
512 SvUTF8(sv) && SvMAGICAL(sv) ? mg_find(sv, PERL_MAGIC_utf8) : NULL;
513 if (mg && mg->mg_len > 0)
514 mg->mg_len = 0;
515 }
c277df42
IZ
516 data->last_end = -1;
517 data->flags &= ~SF_BEFORE_EOL;
518}
519
653099ff
GS
520/* Can match anything (initialization) */
521STATIC void
830247a4 522S_cl_anything(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 523{
653099ff 524 ANYOF_CLASS_ZERO(cl);
f8bef550 525 ANYOF_BITMAP_SETALL(cl);
1aa99e6b 526 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
653099ff
GS
527 if (LOC)
528 cl->flags |= ANYOF_LOCALE;
529}
530
531/* Can match anything (initialization) */
532STATIC int
533S_cl_is_anything(pTHX_ struct regnode_charclass_class *cl)
534{
535 int value;
536
aaa51d5e 537 for (value = 0; value <= ANYOF_MAX; value += 2)
653099ff
GS
538 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
539 return 1;
1aa99e6b
IH
540 if (!(cl->flags & ANYOF_UNICODE_ALL))
541 return 0;
f8bef550
NC
542 if (!ANYOF_BITMAP_TESTALLSET(cl))
543 return 0;
653099ff
GS
544 return 1;
545}
546
547/* Can match anything (initialization) */
548STATIC void
830247a4 549S_cl_init(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 550{
8ecf7187 551 Zero(cl, 1, struct regnode_charclass_class);
653099ff 552 cl->type = ANYOF;
830247a4 553 cl_anything(pRExC_state, cl);
653099ff
GS
554}
555
556STATIC void
830247a4 557S_cl_init_zero(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 558{
8ecf7187 559 Zero(cl, 1, struct regnode_charclass_class);
653099ff 560 cl->type = ANYOF;
830247a4 561 cl_anything(pRExC_state, cl);
653099ff
GS
562 if (LOC)
563 cl->flags |= ANYOF_LOCALE;
564}
565
566/* 'And' a given class with another one. Can create false positives */
567/* We assume that cl is not inverted */
568STATIC void
569S_cl_and(pTHX_ struct regnode_charclass_class *cl,
570 struct regnode_charclass_class *and_with)
571{
653099ff
GS
572 if (!(and_with->flags & ANYOF_CLASS)
573 && !(cl->flags & ANYOF_CLASS)
574 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
575 && !(and_with->flags & ANYOF_FOLD)
576 && !(cl->flags & ANYOF_FOLD)) {
577 int i;
578
579 if (and_with->flags & ANYOF_INVERT)
580 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
581 cl->bitmap[i] &= ~and_with->bitmap[i];
582 else
583 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
584 cl->bitmap[i] &= and_with->bitmap[i];
585 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
586 if (!(and_with->flags & ANYOF_EOS))
587 cl->flags &= ~ANYOF_EOS;
1aa99e6b 588
14ebb1a2
JH
589 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE &&
590 !(and_with->flags & ANYOF_INVERT)) {
1aa99e6b
IH
591 cl->flags &= ~ANYOF_UNICODE_ALL;
592 cl->flags |= ANYOF_UNICODE;
593 ARG_SET(cl, ARG(and_with));
594 }
14ebb1a2
JH
595 if (!(and_with->flags & ANYOF_UNICODE_ALL) &&
596 !(and_with->flags & ANYOF_INVERT))
1aa99e6b 597 cl->flags &= ~ANYOF_UNICODE_ALL;
14ebb1a2
JH
598 if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)) &&
599 !(and_with->flags & ANYOF_INVERT))
1aa99e6b 600 cl->flags &= ~ANYOF_UNICODE;
653099ff
GS
601}
602
603/* 'OR' a given class with another one. Can create false positives */
604/* We assume that cl is not inverted */
605STATIC void
830247a4 606S_cl_or(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, struct regnode_charclass_class *or_with)
653099ff 607{
653099ff
GS
608 if (or_with->flags & ANYOF_INVERT) {
609 /* We do not use
610 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
611 * <= (B1 | !B2) | (CL1 | !CL2)
612 * which is wasteful if CL2 is small, but we ignore CL2:
613 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
614 * XXXX Can we handle case-fold? Unclear:
615 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
616 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
617 */
618 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
619 && !(or_with->flags & ANYOF_FOLD)
620 && !(cl->flags & ANYOF_FOLD) ) {
621 int i;
622
623 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
624 cl->bitmap[i] |= ~or_with->bitmap[i];
625 } /* XXXX: logic is complicated otherwise */
626 else {
830247a4 627 cl_anything(pRExC_state, cl);
653099ff
GS
628 }
629 } else {
630 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
631 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
b81d288d 632 && (!(or_with->flags & ANYOF_FOLD)
653099ff
GS
633 || (cl->flags & ANYOF_FOLD)) ) {
634 int i;
635
636 /* OR char bitmap and class bitmap separately */
637 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
638 cl->bitmap[i] |= or_with->bitmap[i];
639 if (or_with->flags & ANYOF_CLASS) {
640 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
641 cl->classflags[i] |= or_with->classflags[i];
642 cl->flags |= ANYOF_CLASS;
643 }
644 }
645 else { /* XXXX: logic is complicated, leave it along for a moment. */
830247a4 646 cl_anything(pRExC_state, cl);
653099ff
GS
647 }
648 }
649 if (or_with->flags & ANYOF_EOS)
650 cl->flags |= ANYOF_EOS;
1aa99e6b
IH
651
652 if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
653 ARG(cl) != ARG(or_with)) {
654 cl->flags |= ANYOF_UNICODE_ALL;
655 cl->flags &= ~ANYOF_UNICODE;
656 }
657 if (or_with->flags & ANYOF_UNICODE_ALL) {
658 cl->flags |= ANYOF_UNICODE_ALL;
659 cl->flags &= ~ANYOF_UNICODE;
660 }
653099ff
GS
661}
662
5d1c421c 663/*
a3621e74
YO
664
665 make_trie(startbranch,first,last,tail,flags)
666 startbranch: the first branch in the whole branch sequence
667 first : start branch of sequence of branch-exact nodes.
668 May be the same as startbranch
669 last : Thing following the last branch.
670 May be the same as tail.
671 tail : item following the branch sequence
672 flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/
673
674Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
675
676A trie is an N'ary tree where the branches are determined by digital
677decomposition of the key. IE, at the root node you look up the 1st character and
678follow that branch repeat until you find the end of the branches. Nodes can be
679marked as "accepting" meaning they represent a complete word. Eg:
680
681 /he|she|his|hers/
682
683would convert into the following structure. Numbers represent states, letters
684following numbers represent valid transitions on the letter from that state, if
685the number is in square brackets it represents an accepting state, otherwise it
686will be in parenthesis.
687
688 +-h->+-e->[3]-+-r->(8)-+-s->[9]
689 | |
690 | (2)
691 | |
692 (1) +-i->(6)-+-s->[7]
693 |
694 +-s->(3)-+-h->(4)-+-e->[5]
695
696 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
697
698This shows that when matching against the string 'hers' we will begin at state 1
699read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
700then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
701is also accepting. Thus we know that we can match both 'he' and 'hers' with a
702single traverse. We store a mapping from accepting to state to which word was
703matched, and then when we have multiple possibilities we try to complete the
704rest of the regex in the order in which they occured in the alternation.
705
706The only prior NFA like behaviour that would be changed by the TRIE support is
707the silent ignoring of duplicate alternations which are of the form:
708
709 / (DUPE|DUPE) X? (?{ ... }) Y /x
710
711Thus EVAL blocks follwing a trie may be called a different number of times with
712and without the optimisation. With the optimisations dupes will be silently
713ignored. This inconsistant behaviour of EVAL type nodes is well established as
714the following demonstrates:
715
716 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
717
718which prints out 'word' three times, but
719
720 'words'=~/(word|word|word)(?{ print $1 })S/
721
722which doesnt print it out at all. This is due to other optimisations kicking in.
723
724Example of what happens on a structural level:
725
726The regexp /(ac|ad|ab)+/ will produce the folowing debug output:
727
728 1: CURLYM[1] {1,32767}(18)
729 5: BRANCH(8)
730 6: EXACT <ac>(16)
731 8: BRANCH(11)
732 9: EXACT <ad>(16)
733 11: BRANCH(14)
734 12: EXACT <ab>(16)
735 16: SUCCEED(0)
736 17: NOTHING(18)
737 18: END(0)
738
739This would be optimizable with startbranch=5, first=5, last=16, tail=16
740and should turn into:
741
742 1: CURLYM[1] {1,32767}(18)
743 5: TRIE(16)
744 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
745 <ac>
746 <ad>
747 <ab>
748 16: SUCCEED(0)
749 17: NOTHING(18)
750 18: END(0)
751
752Cases where tail != last would be like /(?foo|bar)baz/:
753
754 1: BRANCH(4)
755 2: EXACT <foo>(8)
756 4: BRANCH(7)
757 5: EXACT <bar>(8)
758 7: TAIL(8)
759 8: EXACT <baz>(10)
760 10: END(0)
761
762which would be optimizable with startbranch=1, first=1, last=7, tail=8
763and would end up looking like:
764
765 1: TRIE(8)
766 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
767 <foo>
768 <bar>
769 7: TAIL(8)
770 8: EXACT <baz>(10)
771 10: END(0)
772
773*/
774
775#define TRIE_DEBUG_CHAR \
776 DEBUG_TRIE_COMPILE_r({ \
777 SV *tmp; \
778 if ( UTF ) { \
779 tmp = newSVpv( "", 0 ); \
780 pv_uni_display( tmp, uc, len, 60, UNI_DISPLAY_REGEX ); \
781 } else { \
e4584336 782 tmp = Perl_newSVpvf_nocontext( "%c", (int)uvc ); \
a3621e74
YO
783 } \
784 av_push( trie->revcharmap, tmp ); \
785 })
786
787#define TRIE_READ_CHAR STMT_START { \
788 if ( UTF ) { \
789 if ( folder ) { \
790 if ( foldlen > 0 ) { \
791 uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags ); \
792 foldlen -= len; \
793 scan += len; \
794 len = 0; \
795 } else { \
e1ec3a88 796 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
a3621e74
YO
797 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
798 foldlen -= UNISKIP( uvc ); \
799 scan = foldbuf + UNISKIP( uvc ); \
800 } \
801 } else { \
e1ec3a88 802 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
a3621e74
YO
803 } \
804 } else { \
805 uvc = (U32)*uc; \
806 len = 1; \
807 } \
808} STMT_END
809
810
811#define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
812#define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
813#define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
814#define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
815
816#define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
817 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
818 TRIE_LIST_LEN( state ) *= 2; \
819 Renew( trie->states[ state ].trans.list, \
820 TRIE_LIST_LEN( state ), reg_trie_trans_le ); \
821 } \
822 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
823 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
824 TRIE_LIST_CUR( state )++; \
825} STMT_END
826
827#define TRIE_LIST_NEW(state) STMT_START { \
828 Newz( 1023, trie->states[ state ].trans.list, \
829 4, reg_trie_trans_le ); \
830 TRIE_LIST_CUR( state ) = 1; \
831 TRIE_LIST_LEN( state ) = 4; \
832} STMT_END
833
834STATIC I32
835S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 flags)
836{
837 /* first pass, loop through and scan words */
838 reg_trie_data *trie;
839 regnode *cur;
e1ec3a88 840 const U32 uniflags = ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY;
a3621e74
YO
841 STRLEN len = 0;
842 UV uvc = 0;
843 U16 curword = 0;
844 U32 next_alloc = 0;
845 /* we just use folder as a flag in utf8 */
e1ec3a88 846 const U8 * const folder = ( flags == EXACTF
a3621e74
YO
847 ? PL_fold
848 : ( flags == EXACTFL
849 ? PL_fold_locale
850 : NULL
851 )
852 );
853
e1ec3a88 854 const U32 data_slot = add_data( pRExC_state, 1, "t" );
a3621e74
YO
855 SV *re_trie_maxbuff;
856
857 GET_RE_DEBUG_FLAGS_DECL;
858
859 Newz( 848200, trie, 1, reg_trie_data );
860 trie->refcount = 1;
861 RExC_rx->data->data[ data_slot ] = (void*)trie;
862 Newz( 848201, trie->charmap, 256, U16 );
863 DEBUG_r({
864 trie->words = newAV();
865 trie->revcharmap = newAV();
866 });
867
868
0111c4fd 869 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
a3621e74 870 if (!SvIOK(re_trie_maxbuff)) {
0111c4fd 871 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
a3621e74
YO
872 }
873
874 /* -- First loop and Setup --
875
876 We first traverse the branches and scan each word to determine if it
877 contains widechars, and how many unique chars there are, this is
878 important as we have to build a table with at least as many columns as we
879 have unique chars.
880
881 We use an array of integers to represent the character codes 0..255
882 (trie->charmap) and we use a an HV* to store unicode characters. We use the
883 native representation of the character value as the key and IV's for the
884 coded index.
885
886 *TODO* If we keep track of how many times each character is used we can
887 remap the columns so that the table compression later on is more
888 efficient in terms of memory by ensuring most common value is in the
889 middle and the least common are on the outside. IMO this would be better
890 than a most to least common mapping as theres a decent chance the most
891 common letter will share a node with the least common, meaning the node
892 will not be compressable. With a middle is most common approach the worst
893 case is when we have the least common nodes twice.
894
895 */
896
897
898 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
899 regnode *noper = NEXTOPER( cur );
e1ec3a88
AL
900 const U8 *uc = (U8*)STRING( noper );
901 const U8 *e = uc + STR_LEN( noper );
a3621e74
YO
902 STRLEN foldlen = 0;
903 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
e1ec3a88 904 const U8 *scan;
a3621e74
YO
905
906 for ( ; uc < e ; uc += len ) {
907 trie->charcount++;
908 TRIE_READ_CHAR;
909 if ( uvc < 256 ) {
910 if ( !trie->charmap[ uvc ] ) {
911 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
912 if ( folder )
913 trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
914 TRIE_DEBUG_CHAR;
915 }
916 } else {
917 SV** svpp;
918 if ( !trie->widecharmap )
919 trie->widecharmap = newHV();
920
921 svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 1 );
922
923 if ( !svpp )
e4584336 924 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
a3621e74
YO
925
926 if ( !SvTRUE( *svpp ) ) {
927 sv_setiv( *svpp, ++trie->uniquecharcount );
928 TRIE_DEBUG_CHAR;
929 }
930 }
931 }
932 trie->wordcount++;
933 } /* end first pass */
934 DEBUG_TRIE_COMPILE_r(
935 PerlIO_printf( Perl_debug_log, "TRIE(%s): W:%d C:%d Uq:%d \n",
936 ( trie->widecharmap ? "UTF8" : "NATIVE" ), trie->wordcount,
937 trie->charcount, trie->uniquecharcount )
938 );
939
940
941 /*
942 We now know what we are dealing with in terms of unique chars and
943 string sizes so we can calculate how much memory a naive
0111c4fd
RGS
944 representation using a flat table will take. If it's over a reasonable
945 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
a3621e74
YO
946 conservative but potentially much slower representation using an array
947 of lists.
948
949 At the end we convert both representations into the same compressed
950 form that will be used in regexec.c for matching with. The latter
951 is a form that cannot be used to construct with but has memory
952 properties similar to the list form and access properties similar
953 to the table form making it both suitable for fast searches and
954 small enough that its feasable to store for the duration of a program.
955
956 See the comment in the code where the compressed table is produced
957 inplace from the flat tabe representation for an explanation of how
958 the compression works.
959
960 */
961
962
963 if ( (IV)( ( trie->charcount + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
964 /*
965 Second Pass -- Array Of Lists Representation
966
967 Each state will be represented by a list of charid:state records
968 (reg_trie_trans_le) the first such element holds the CUR and LEN
969 points of the allocated array. (See defines above).
970
971 We build the initial structure using the lists, and then convert
972 it into the compressed table form which allows faster lookups
973 (but cant be modified once converted).
974
975
976 */
977
978
979 STRLEN transcount = 1;
980
981 Newz( 848204, trie->states, trie->charcount + 2, reg_trie_state );
982 TRIE_LIST_NEW(1);
983 next_alloc = 2;
984
985 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
986
987 regnode *noper = NEXTOPER( cur );
988 U8 *uc = (U8*)STRING( noper );
989 U8 *e = uc + STR_LEN( noper );
990 U32 state = 1; /* required init */
991 U16 charid = 0; /* sanity init */
992 U8 *scan = (U8*)NULL; /* sanity init */
993 STRLEN foldlen = 0; /* required init */
994 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
995
996
997 for ( ; uc < e ; uc += len ) {
998
999 TRIE_READ_CHAR;
1000
1001 if ( uvc < 256 ) {
1002 charid = trie->charmap[ uvc ];
1003 } else {
1004 SV** svpp=(SV**)NULL;
1005 svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 0);
1006 if ( !svpp ) {
1007 charid = 0;
1008 } else {
1009 charid=(U16)SvIV( *svpp );
1010 }
1011 }
1012 if ( charid ) {
1013
1014 U16 check;
1015 U32 newstate = 0;
1016
1017 charid--;
1018 if ( !trie->states[ state ].trans.list ) {
1019 TRIE_LIST_NEW( state );
1020 }
1021 for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1022 if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1023 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1024 break;
1025 }
1026 }
1027 if ( ! newstate ) {
1028 newstate = next_alloc++;
1029 TRIE_LIST_PUSH( state, charid, newstate );
1030 transcount++;
1031 }
1032 state = newstate;
1033
1034 } else {
e4584336 1035 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
a3621e74
YO
1036 }
1037 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1038 }
1039
1040 if ( !trie->states[ state ].wordnum ) {
1041 /* we havent inserted this word into the structure yet. */
1042 trie->states[ state ].wordnum = ++curword;
1043
1044 DEBUG_r({
1045 /* store the word for dumping */
1046 SV* tmp = newSVpvn( STRING( noper ), STR_LEN( noper ) );
1047 if ( UTF ) SvUTF8_on( tmp );
1048 av_push( trie->words, tmp );
1049 });
1050
1051 } else {
1052 /* Its a dupe. So ignore it. */
1053 }
1054
1055 } /* end second pass */
1056
1057 trie->laststate = next_alloc;
1058 Renew( trie->states, next_alloc, reg_trie_state );
1059
1060 DEBUG_TRIE_COMPILE_MORE_r({
1061 U32 state;
1062 U16 charid;
1063
1064 /*
1065 print out the table precompression.
1066 */
1067
1068 PerlIO_printf( Perl_debug_log, "\nState :Word | Transition Data\n" );
1069 PerlIO_printf( Perl_debug_log, "------:-----+-----------------" );
1070
1071 for( state=1 ; state < next_alloc ; state ++ ) {
1072
e4584336 1073 PerlIO_printf( Perl_debug_log, "\n %04"UVXf" :", (UV)state );
a3621e74
YO
1074 if ( ! trie->states[ state ].wordnum ) {
1075 PerlIO_printf( Perl_debug_log, "%5s| ","");
1076 } else {
e4584336 1077 PerlIO_printf( Perl_debug_log, "W%04x| ",
a3621e74
YO
1078 trie->states[ state ].wordnum
1079 );
1080 }
1081 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
1082 SV **tmp = av_fetch( trie->revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
e4584336 1083 PerlIO_printf( Perl_debug_log, "%s:%3X=%04"UVXf" | ",
a3621e74
YO
1084 SvPV_nolen( *tmp ),
1085 TRIE_LIST_ITEM(state,charid).forid,
e4584336 1086 (UV)TRIE_LIST_ITEM(state,charid).newstate
a3621e74
YO
1087 );
1088 }
1089
1090 }
1091 PerlIO_printf( Perl_debug_log, "\n\n" );
1092 });
1093
1094 Newz( 848203, trie->trans, transcount ,reg_trie_trans );
1095 {
1096 U32 state;
1097 U16 idx;
1098 U32 tp = 0;
1099 U32 zp = 0;
1100
1101
1102 for( state=1 ; state < next_alloc ; state ++ ) {
1103 U32 base=0;
1104
1105 /*
1106 DEBUG_TRIE_COMPILE_MORE_r(
1107 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1108 );
1109 */
1110
1111 if (trie->states[state].trans.list) {
1112 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1113 U16 maxid=minid;
1114
1115
1116 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1117 if ( TRIE_LIST_ITEM( state, idx).forid < minid ) {
1118 minid=TRIE_LIST_ITEM( state, idx).forid;
1119 } else if ( TRIE_LIST_ITEM( state, idx).forid > maxid ) {
1120 maxid=TRIE_LIST_ITEM( state, idx).forid;
1121 }
1122 }
1123 if ( transcount < tp + maxid - minid + 1) {
1124 transcount *= 2;
1125 Renew( trie->trans, transcount, reg_trie_trans );
1126 Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1127 }
1128 base = trie->uniquecharcount + tp - minid;
1129 if ( maxid == minid ) {
1130 U32 set = 0;
1131 for ( ; zp < tp ; zp++ ) {
1132 if ( ! trie->trans[ zp ].next ) {
1133 base = trie->uniquecharcount + zp - minid;
1134 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1135 trie->trans[ zp ].check = state;
1136 set = 1;
1137 break;
1138 }
1139 }
1140 if ( !set ) {
1141 trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1142 trie->trans[ tp ].check = state;
1143 tp++;
1144 zp = tp;
1145 }
1146 } else {
1147 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1148 U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
1149 trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1150 trie->trans[ tid ].check = state;
1151 }
1152 tp += ( maxid - minid + 1 );
1153 }
1154 Safefree(trie->states[ state ].trans.list);
1155 }
1156 /*
1157 DEBUG_TRIE_COMPILE_MORE_r(
1158 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1159 );
1160 */
1161 trie->states[ state ].trans.base=base;
1162 }
cc601c31 1163 trie->lasttrans = tp + 1;
a3621e74
YO
1164 }
1165 } else {
1166 /*
1167 Second Pass -- Flat Table Representation.
1168
1169 we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1170 We know that we will need Charcount+1 trans at most to store the data
1171 (one row per char at worst case) So we preallocate both structures
1172 assuming worst case.
1173
1174 We then construct the trie using only the .next slots of the entry
1175 structs.
1176
1177 We use the .check field of the first entry of the node temporarily to
1178 make compression both faster and easier by keeping track of how many non
1179 zero fields are in the node.
1180
1181 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1182 transition.
1183
1184 There are two terms at use here: state as a TRIE_NODEIDX() which is a
1185 number representing the first entry of the node, and state as a
1186 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1187 TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1188 are 2 entrys per node. eg:
1189
1190 A B A B
1191 1. 2 4 1. 3 7
1192 2. 0 3 3. 0 5
1193 3. 0 0 5. 0 0
1194 4. 0 0 7. 0 0
1195
1196 The table is internally in the right hand, idx form. However as we also
1197 have to deal with the states array which is indexed by nodenum we have to
1198 use TRIE_NODENUM() to convert.
1199
1200 */
1201
1202 Newz( 848203, trie->trans, ( trie->charcount + 1 ) * trie->uniquecharcount + 1,
1203 reg_trie_trans );
1204 Newz( 848204, trie->states, trie->charcount + 2, reg_trie_state );
1205 next_alloc = trie->uniquecharcount + 1;
1206
1207 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1208
1209 regnode *noper = NEXTOPER( cur );
1210 U8 *uc = (U8*)STRING( noper );
1211 U8 *e = uc + STR_LEN( noper );
1212
1213 U32 state = 1; /* required init */
1214
1215 U16 charid = 0; /* sanity init */
1216 U32 accept_state = 0; /* sanity init */
1217 U8 *scan = (U8*)NULL; /* sanity init */
1218
1219 STRLEN foldlen = 0; /* required init */
1220 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1221
1222
1223 for ( ; uc < e ; uc += len ) {
1224
1225 TRIE_READ_CHAR;
1226
1227 if ( uvc < 256 ) {
1228 charid = trie->charmap[ uvc ];
1229 } else {
1230 SV** svpp=(SV**)NULL;
1231 svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 0);
1232 if ( !svpp ) {
1233 charid = 0;
1234 } else {
1235 charid=(U16)SvIV( *svpp );
1236 }
1237 }
1238 if ( charid ) {
1239 charid--;
1240 if ( !trie->trans[ state + charid ].next ) {
1241 trie->trans[ state + charid ].next = next_alloc;
1242 trie->trans[ state ].check++;
1243 next_alloc += trie->uniquecharcount;
1244 }
1245 state = trie->trans[ state + charid ].next;
1246 } else {
e4584336 1247 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
a3621e74
YO
1248 }
1249 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1250 }
1251
1252 accept_state = TRIE_NODENUM( state );
1253 if ( !trie->states[ accept_state ].wordnum ) {
1254 /* we havent inserted this word into the structure yet. */
1255 trie->states[ accept_state ].wordnum = ++curword;
1256
1257 DEBUG_r({
1258 /* store the word for dumping */
1259 SV* tmp = newSVpvn( STRING( noper ), STR_LEN( noper ) );
1260 if ( UTF ) SvUTF8_on( tmp );
1261 av_push( trie->words, tmp );
1262 });
1263
1264 } else {
1265 /* Its a dupe. So ignore it. */
1266 }
1267
1268 } /* end second pass */
1269
1270 DEBUG_TRIE_COMPILE_MORE_r({
1271 /*
1272 print out the table precompression so that we can do a visual check
1273 that they are identical.
1274 */
1275 U32 state;
1276 U16 charid;
1277 PerlIO_printf( Perl_debug_log, "\nChar : " );
1278
1279 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1280 SV **tmp = av_fetch( trie->revcharmap, charid, 0);
1281 if ( tmp ) {
1282 PerlIO_printf( Perl_debug_log, "%4.4s ", SvPV_nolen( *tmp ) );
1283 }
1284 }
1285
1286 PerlIO_printf( Perl_debug_log, "\nState+-" );
1287
1288 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
1289 PerlIO_printf( Perl_debug_log, "%4s-", "----" );
1290 }
1291
1292 PerlIO_printf( Perl_debug_log, "\n" );
1293
1294 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1295
e4584336 1296 PerlIO_printf( Perl_debug_log, "%04"UVXf" : ", (UV)TRIE_NODENUM( state ) );
a3621e74
YO
1297
1298 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
e4584336
RB
1299 PerlIO_printf( Perl_debug_log, "%04"UVXf" ",
1300 (UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next ) );
a3621e74
YO
1301 }
1302 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
e4584336 1303 PerlIO_printf( Perl_debug_log, " (%04"UVXf")\n", (UV)trie->trans[ state ].check );
a3621e74 1304 } else {
e4584336 1305 PerlIO_printf( Perl_debug_log, " (%04"UVXf") W%04X\n", (UV)trie->trans[ state ].check,
a3621e74
YO
1306 trie->states[ TRIE_NODENUM( state ) ].wordnum );
1307 }
1308 }
1309 PerlIO_printf( Perl_debug_log, "\n\n" );
1310 });
1311 {
1312 /*
1313 * Inplace compress the table.*
1314
1315 For sparse data sets the table constructed by the trie algorithm will
1316 be mostly 0/FAIL transitions or to put it another way mostly empty.
1317 (Note that leaf nodes will not contain any transitions.)
1318
1319 This algorithm compresses the tables by eliminating most such
1320 transitions, at the cost of a modest bit of extra work during lookup:
1321
1322 - Each states[] entry contains a .base field which indicates the
1323 index in the state[] array wheres its transition data is stored.
1324
1325 - If .base is 0 there are no valid transitions from that node.
1326
1327 - If .base is nonzero then charid is added to it to find an entry in
1328 the trans array.
1329
1330 -If trans[states[state].base+charid].check!=state then the
1331 transition is taken to be a 0/Fail transition. Thus if there are fail
1332 transitions at the front of the node then the .base offset will point
1333 somewhere inside the previous nodes data (or maybe even into a node
1334 even earlier), but the .check field determines if the transition is
1335 valid.
1336
1337 The following process inplace converts the table to the compressed
1338 table: We first do not compress the root node 1,and mark its all its
1339 .check pointers as 1 and set its .base pointer as 1 as well. This
1340 allows to do a DFA construction from the compressed table later, and
1341 ensures that any .base pointers we calculate later are greater than
1342 0.
1343
1344 - We set 'pos' to indicate the first entry of the second node.
1345
1346 - We then iterate over the columns of the node, finding the first and
1347 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
1348 and set the .check pointers accordingly, and advance pos
1349 appropriately and repreat for the next node. Note that when we copy
1350 the next pointers we have to convert them from the original
1351 NODEIDX form to NODENUM form as the former is not valid post
1352 compression.
1353
1354 - If a node has no transitions used we mark its base as 0 and do not
1355 advance the pos pointer.
1356
1357 - If a node only has one transition we use a second pointer into the
1358 structure to fill in allocated fail transitions from other states.
1359 This pointer is independent of the main pointer and scans forward
1360 looking for null transitions that are allocated to a state. When it
1361 finds one it writes the single transition into the "hole". If the
1362 pointer doesnt find one the single transition is appeneded as normal.
1363
1364 - Once compressed we can Renew/realloc the structures to release the
1365 excess space.
1366
1367 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
1368 specifically Fig 3.47 and the associated pseudocode.
1369
1370 demq
1371 */
1372 U32 laststate = TRIE_NODENUM( next_alloc );
1373 U32 used , state, charid;
1374 U32 pos = 0, zp=0;
1375 trie->laststate = laststate;
1376
1377 for ( state = 1 ; state < laststate ; state++ ) {
1378 U8 flag = 0;
1379 U32 stateidx = TRIE_NODEIDX( state );
1380 U32 o_used=trie->trans[ stateidx ].check;
1381 used = trie->trans[ stateidx ].check;
1382 trie->trans[ stateidx ].check = 0;
1383
1384 for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
1385 if ( flag || trie->trans[ stateidx + charid ].next ) {
1386 if ( trie->trans[ stateidx + charid ].next ) {
1387 if (o_used == 1) {
1388 for ( ; zp < pos ; zp++ ) {
1389 if ( ! trie->trans[ zp ].next ) {
1390 break;
1391 }
1392 }
1393 trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
1394 trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1395 trie->trans[ zp ].check = state;
1396 if ( ++zp > pos ) pos = zp;
1397 break;
1398 }
1399 used--;
1400 }
1401 if ( !flag ) {
1402 flag = 1;
1403 trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
1404 }
1405 trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1406 trie->trans[ pos ].check = state;
1407 pos++;
1408 }
1409 }
1410 }
cc601c31 1411 trie->lasttrans = pos + 1;
a3621e74
YO
1412 Renew( trie->states, laststate + 1, reg_trie_state);
1413 DEBUG_TRIE_COMPILE_MORE_r(
e4584336
RB
1414 PerlIO_printf( Perl_debug_log,
1415 " Alloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
1416 ( ( trie->charcount + 1 ) * trie->uniquecharcount + 1 ), (IV)next_alloc, (IV)pos,
a3621e74
YO
1417 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
1418 );
1419
1420 } /* end table compress */
1421 }
cc601c31
YO
1422 /* resize the trans array to remove unused space */
1423 Renew( trie->trans, trie->lasttrans, reg_trie_trans);
a3621e74
YO
1424
1425 DEBUG_TRIE_COMPILE_r({
1426 U32 state;
1427 /*
1428 Now we print it out again, in a slightly different form as there is additional
1429 info we want to be able to see when its compressed. They are close enough for
1430 visual comparison though.
1431 */
1432 PerlIO_printf( Perl_debug_log, "\nChar : %-6s%-6s%-4s ","Match","Base","Ofs" );
1433
1434 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
1435 SV **tmp = av_fetch( trie->revcharmap, state, 0);
1436 if ( tmp ) {
1437 PerlIO_printf( Perl_debug_log, "%4.4s ", SvPV_nolen( *tmp ) );
1438 }
1439 }
1440 PerlIO_printf( Perl_debug_log, "\n-----:-----------------------");
cc601c31 1441
a3621e74
YO
1442 for( state = 0 ; state < trie->uniquecharcount ; state++ )
1443 PerlIO_printf( Perl_debug_log, "-----");
1444 PerlIO_printf( Perl_debug_log, "\n");
cc601c31 1445
a3621e74
YO
1446 for( state = 1 ; state < trie->laststate ; state++ ) {
1447 U32 base = trie->states[ state ].trans.base;
1448
e4584336 1449 PerlIO_printf( Perl_debug_log, "#%04"UVXf" ", (UV)state);
a3621e74
YO
1450
1451 if ( trie->states[ state ].wordnum ) {
1452 PerlIO_printf( Perl_debug_log, " W%04X", trie->states[ state ].wordnum );
1453 } else {
1454 PerlIO_printf( Perl_debug_log, "%6s", "" );
1455 }
1456
e4584336 1457 PerlIO_printf( Perl_debug_log, " @%04"UVXf" ", (UV)base );
a3621e74
YO
1458
1459 if ( base ) {
1460 U32 ofs = 0;
1461
cc601c31
YO
1462 while( ( base + ofs < trie->uniquecharcount ) ||
1463 ( base + ofs - trie->uniquecharcount < trie->lasttrans
1464 && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
a3621e74
YO
1465 ofs++;
1466
e4584336 1467 PerlIO_printf( Perl_debug_log, "+%02"UVXf"[ ", (UV)ofs);
a3621e74
YO
1468
1469 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
cc601c31
YO
1470 if ( ( base + ofs >= trie->uniquecharcount ) &&
1471 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
a3621e74
YO
1472 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1473 {
e4584336
RB
1474 PerlIO_printf( Perl_debug_log, "%04"UVXf" ",
1475 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
a3621e74
YO
1476 } else {
1477 PerlIO_printf( Perl_debug_log, "%4s "," 0" );
1478 }
1479 }
1480
e4584336 1481 PerlIO_printf( Perl_debug_log, "]");
a3621e74
YO
1482
1483 }
1484 PerlIO_printf( Perl_debug_log, "\n" );
1485 }
1486 });
1487
1488 {
1489 /* now finally we "stitch in" the new TRIE node
1490 This means we convert either the first branch or the first Exact,
1491 depending on whether the thing following (in 'last') is a branch
1492 or not and whther first is the startbranch (ie is it a sub part of
1493 the alternation or is it the whole thing.)
1494 Assuming its a sub part we conver the EXACT otherwise we convert
1495 the whole branch sequence, including the first.
1496 */
1497 regnode *convert;
1498
1499
1500
1501
1502 if ( first == startbranch && OP( last ) != BRANCH ) {
1503 convert = first;
1504 } else {
1505 convert = NEXTOPER( first );
1506 NEXT_OFF( first ) = (U16)(last - first);
1507 }
1508
1509 OP( convert ) = TRIE + (U8)( flags - EXACT );
1510 NEXT_OFF( convert ) = (U16)(tail - convert);
1511 ARG_SET( convert, data_slot );
1512
1513 /* tells us if we need to handle accept buffers specially */
1514 convert->flags = ( RExC_seen_evals ? 1 : 0 );
1515
1516
1517 /* needed for dumping*/
1518 DEBUG_r({
1519 regnode *optimize = convert + NODE_STEP_REGNODE + regarglen[ TRIE ];
1520 /* We now need to mark all of the space originally used by the
1521 branches as optimized away. This keeps the dumpuntil from
1522 throwing a wobbly as it doesnt use regnext() to traverse the
1523 opcodes.
1524 */
1525 while( optimize < last ) {
1526 OP( optimize ) = OPTIMIZED;
1527 optimize++;
1528 }
1529 });
1530 } /* end node insert */
1531 return 1;
1532}
1533
1534
1535
1536/*
5d1c421c
JH
1537 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
1538 * These need to be revisited when a newer toolchain becomes available.
1539 */
1540#if defined(__sparc64__) && defined(__GNUC__)
1541# if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
1542# undef SPARC64_GCC_WORKAROUND
1543# define SPARC64_GCC_WORKAROUND 1
1544# endif
1545#endif
1546
653099ff
GS
1547/* REx optimizer. Converts nodes into quickier variants "in place".
1548 Finds fixed substrings. */
1549
c277df42
IZ
1550/* Stops at toplevel WHILEM as well as at `last'. At end *scanp is set
1551 to the position after last scanned or to NULL. */
1552
a3621e74 1553
76e3520e 1554STATIC I32
a3621e74 1555S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, regnode *last, scan_data_t *data, U32 flags, U32 depth)
c277df42
IZ
1556 /* scanp: Start here (read-write). */
1557 /* deltap: Write maxlen-minlen here. */
1558 /* last: Stop before this one. */
1559{
1560 I32 min = 0, pars = 0, code;
1561 regnode *scan = *scanp, *next;
1562 I32 delta = 0;
1563 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
aca2d497 1564 int is_inf_internal = 0; /* The studied chunk is infinite */
c277df42
IZ
1565 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
1566 scan_data_t data_fake;
653099ff 1567 struct regnode_charclass_class and_with; /* Valid if flags & SCF_DO_STCLASS_OR */
a3621e74
YO
1568 SV *re_trie_maxbuff = NULL;
1569
1570 GET_RE_DEBUG_FLAGS_DECL;
b81d288d 1571
c277df42
IZ
1572 while (scan && OP(scan) != END && scan < last) {
1573 /* Peephole optimizer: */
a3621e74
YO
1574 DEBUG_OPTIMISE_r({
1575 SV *mysv=sv_newmortal();
1576 regprop( mysv, scan);
e4584336
RB
1577 PerlIO_printf(Perl_debug_log, "%*speep: %s (0x%08"UVXf")\n",
1578 (int)depth*2, "", SvPV_nolen(mysv), PTR2UV(scan));
a3621e74 1579 });
c277df42 1580
22c35a8c 1581 if (PL_regkind[(U8)OP(scan)] == EXACT) {
653099ff 1582 /* Merge several consecutive EXACTish nodes into one. */
c277df42
IZ
1583 regnode *n = regnext(scan);
1584 U32 stringok = 1;
1585#ifdef DEBUGGING
1586 regnode *stop = scan;
b81d288d 1587#endif
c277df42 1588
cd439c50 1589 next = scan + NODE_SZ_STR(scan);
c277df42
IZ
1590 /* Skip NOTHING, merge EXACT*. */
1591 while (n &&
b81d288d 1592 ( PL_regkind[(U8)OP(n)] == NOTHING ||
c277df42
IZ
1593 (stringok && (OP(n) == OP(scan))))
1594 && NEXT_OFF(n)
1595 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
1596 if (OP(n) == TAIL || n > next)
1597 stringok = 0;
22c35a8c 1598 if (PL_regkind[(U8)OP(n)] == NOTHING) {
c277df42
IZ
1599 NEXT_OFF(scan) += NEXT_OFF(n);
1600 next = n + NODE_STEP_REGNODE;
1601#ifdef DEBUGGING
1602 if (stringok)
1603 stop = n;
b81d288d 1604#endif
c277df42 1605 n = regnext(n);
a0ed51b3 1606 }
f49d4d0f 1607 else if (stringok) {
cd439c50 1608 int oldl = STR_LEN(scan);
c277df42 1609 regnode *nnext = regnext(n);
f49d4d0f 1610
b81d288d 1611 if (oldl + STR_LEN(n) > U8_MAX)
c277df42
IZ
1612 break;
1613 NEXT_OFF(scan) += NEXT_OFF(n);
cd439c50
IZ
1614 STR_LEN(scan) += STR_LEN(n);
1615 next = n + NODE_SZ_STR(n);
c277df42 1616 /* Now we can overwrite *n : */
f49d4d0f 1617 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
c277df42 1618#ifdef DEBUGGING
f49d4d0f 1619 stop = next - 1;
b81d288d 1620#endif
c277df42
IZ
1621 n = nnext;
1622 }
1623 }
61a36c01 1624
a3621e74 1625 if (UTF && ( OP(scan) == EXACTF ) && ( STR_LEN(scan) >= 6 ) ) {
61a36c01
JH
1626/*
1627 Two problematic code points in Unicode casefolding of EXACT nodes:
1628
1629 U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
1630 U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
1631
1632 which casefold to
1633
1634 Unicode UTF-8
1635
1636 U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
1637 U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
1638
1639 This means that in case-insensitive matching (or "loose matching",
1640 as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
1641 length of the above casefolded versions) can match a target string
1642 of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
1643 This would rather mess up the minimum length computation.
1644
1645 What we'll do is to look for the tail four bytes, and then peek
1646 at the preceding two bytes to see whether we need to decrease
1647 the minimum length by four (six minus two).
1648
1649 Thanks to the design of UTF-8, there cannot be false matches:
1650 A sequence of valid UTF-8 bytes cannot be a subsequence of
1651 another valid sequence of UTF-8 bytes.
1652
1653*/
1654 char *s0 = STRING(scan), *s, *t;
1655 char *s1 = s0 + STR_LEN(scan) - 1, *s2 = s1 - 4;
bfed75c6
AL
1656 const char *t0 = "\xcc\x88\xcc\x81";
1657 const char *t1 = t0 + 3;
61a36c01
JH
1658
1659 for (s = s0 + 2;
1660 s < s2 && (t = ninstr(s, s1, t0, t1));
1661 s = t + 4) {
1662 if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
1663 ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
1664 min -= 4;
1665 }
1666 }
1667
c277df42
IZ
1668#ifdef DEBUGGING
1669 /* Allow dumping */
cd439c50 1670 n = scan + NODE_SZ_STR(scan);
c277df42 1671 while (n <= stop) {
22c35a8c 1672 if (PL_regkind[(U8)OP(n)] != NOTHING || OP(n) == NOTHING) {
c277df42
IZ
1673 OP(n) = OPTIMIZED;
1674 NEXT_OFF(n) = 0;
1675 }
1676 n++;
1677 }
653099ff 1678#endif
c277df42 1679 }
a3621e74
YO
1680
1681
1682
653099ff
GS
1683 /* Follow the next-chain of the current node and optimize
1684 away all the NOTHINGs from it. */
c277df42 1685 if (OP(scan) != CURLYX) {
048cfca1
GS
1686 int max = (reg_off_by_arg[OP(scan)]
1687 ? I32_MAX
1688 /* I32 may be smaller than U16 on CRAYs! */
1689 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
c277df42
IZ
1690 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
1691 int noff;
1692 regnode *n = scan;
b81d288d 1693
c277df42
IZ
1694 /* Skip NOTHING and LONGJMP. */
1695 while ((n = regnext(n))
22c35a8c 1696 && ((PL_regkind[(U8)OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
c277df42
IZ
1697 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
1698 && off + noff < max)
1699 off += noff;
1700 if (reg_off_by_arg[OP(scan)])
1701 ARG(scan) = off;
b81d288d 1702 else
c277df42
IZ
1703 NEXT_OFF(scan) = off;
1704 }
a3621e74 1705
653099ff
GS
1706 /* The principal pseudo-switch. Cannot be a switch, since we
1707 look into several different things. */
b81d288d 1708 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
c277df42
IZ
1709 || OP(scan) == IFTHEN || OP(scan) == SUSPEND) {
1710 next = regnext(scan);
1711 code = OP(scan);
a3621e74 1712 /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
b81d288d
AB
1713
1714 if (OP(next) == code || code == IFTHEN || code == SUSPEND) {
c277df42 1715 I32 max1 = 0, min1 = I32_MAX, num = 0;
653099ff 1716 struct regnode_charclass_class accum;
a3621e74 1717 regnode *startbranch=scan;
c277df42 1718
653099ff 1719 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
830247a4 1720 scan_commit(pRExC_state, data); /* Cannot merge strings after this. */
653099ff 1721 if (flags & SCF_DO_STCLASS)
830247a4 1722 cl_init_zero(pRExC_state, &accum);
a3621e74 1723
c277df42 1724 while (OP(scan) == code) {
830247a4 1725 I32 deltanext, minnext, f = 0, fake;
653099ff 1726 struct regnode_charclass_class this_class;
c277df42
IZ
1727
1728 num++;
1729 data_fake.flags = 0;
b81d288d 1730 if (data) {
2c2d71f5 1731 data_fake.whilem_c = data->whilem_c;
cb434fcc
IZ
1732 data_fake.last_closep = data->last_closep;
1733 }
1734 else
1735 data_fake.last_closep = &fake;
c277df42
IZ
1736 next = regnext(scan);
1737 scan = NEXTOPER(scan);
1738 if (code != BRANCH)
1739 scan = NEXTOPER(scan);
653099ff 1740 if (flags & SCF_DO_STCLASS) {
830247a4 1741 cl_init(pRExC_state, &this_class);
653099ff
GS
1742 data_fake.start_class = &this_class;
1743 f = SCF_DO_STCLASS_AND;
b81d288d 1744 }
e1901655
IZ
1745 if (flags & SCF_WHILEM_VISITED_POS)
1746 f |= SCF_WHILEM_VISITED_POS;
a3621e74 1747
653099ff 1748 /* we suppose the run is continuous, last=next...*/
830247a4 1749 minnext = study_chunk(pRExC_state, &scan, &deltanext,
a3621e74 1750 next, &data_fake, f,depth+1);
b81d288d 1751 if (min1 > minnext)
c277df42
IZ
1752 min1 = minnext;
1753 if (max1 < minnext + deltanext)
1754 max1 = minnext + deltanext;
1755 if (deltanext == I32_MAX)
aca2d497 1756 is_inf = is_inf_internal = 1;
c277df42
IZ
1757 scan = next;
1758 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
1759 pars++;
405ff068 1760 if (data && (data_fake.flags & SF_HAS_EVAL))
c277df42 1761 data->flags |= SF_HAS_EVAL;
2c2d71f5
JH
1762 if (data)
1763 data->whilem_c = data_fake.whilem_c;
653099ff 1764 if (flags & SCF_DO_STCLASS)
830247a4 1765 cl_or(pRExC_state, &accum, &this_class);
b81d288d 1766 if (code == SUSPEND)
c277df42
IZ
1767 break;
1768 }
1769 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
1770 min1 = 0;
1771 if (flags & SCF_DO_SUBSTR) {
1772 data->pos_min += min1;
1773 data->pos_delta += max1 - min1;
1774 if (max1 != min1 || is_inf)
1775 data->longest = &(data->longest_float);
1776 }
1777 min += min1;
1778 delta += max1 - min1;
653099ff 1779 if (flags & SCF_DO_STCLASS_OR) {
830247a4 1780 cl_or(pRExC_state, data->start_class, &accum);
653099ff
GS
1781 if (min1) {
1782 cl_and(data->start_class, &and_with);
1783 flags &= ~SCF_DO_STCLASS;
1784 }
1785 }
1786 else if (flags & SCF_DO_STCLASS_AND) {
de0c8cb8
GS
1787 if (min1) {
1788 cl_and(data->start_class, &accum);
653099ff 1789 flags &= ~SCF_DO_STCLASS;
de0c8cb8
GS
1790 }
1791 else {
b81d288d 1792 /* Switch to OR mode: cache the old value of
de0c8cb8
GS
1793 * data->start_class */
1794 StructCopy(data->start_class, &and_with,
1795 struct regnode_charclass_class);
1796 flags &= ~SCF_DO_STCLASS_AND;
1797 StructCopy(&accum, data->start_class,
1798 struct regnode_charclass_class);
1799 flags |= SCF_DO_STCLASS_OR;
1800 data->start_class->flags |= ANYOF_EOS;
1801 }
653099ff 1802 }
a3621e74
YO
1803
1804 /* demq.
1805
1806 Assuming this was/is a branch we are dealing with: 'scan' now
1807 points at the item that follows the branch sequence, whatever
1808 it is. We now start at the beginning of the sequence and look
1809 for subsequences of
1810
1811 BRANCH->EXACT=>X
1812 BRANCH->EXACT=>X
1813
1814 which would be constructed from a pattern like /A|LIST|OF|WORDS/
1815
1816 If we can find such a subseqence we need to turn the first
1817 element into a trie and then add the subsequent branch exact
1818 strings to the trie.
1819
1820 We have two cases
1821
1822 1. patterns where the whole set of branch can be converted to a trie,
1823
1824 2. patterns where only a subset of the alternations can be
1825 converted to a trie.
1826
1827 In case 1 we can replace the whole set with a single regop
1828 for the trie. In case 2 we need to keep the start and end
1829 branchs so
1830
1831 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
1832 becomes BRANCH TRIE; BRANCH X;
1833
1834 Hypthetically when we know the regex isnt anchored we can
1835 turn a case 1 into a DFA and let it rip... Every time it finds a match
1836 it would just call its tail, no WHILEM/CURLY needed.
1837
1838 */
0111c4fd
RGS
1839 if (DO_TRIE) {
1840 if (!re_trie_maxbuff) {
1841 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
1842 if (!SvIOK(re_trie_maxbuff))
1843 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
1844 }
a3621e74
YO
1845 if ( SvIV(re_trie_maxbuff)>=0 && OP( startbranch )==BRANCH ) {
1846 regnode *cur;
1847 regnode *first = (regnode *)NULL;
1848 regnode *last = (regnode *)NULL;
1849 regnode *tail = scan;
1850 U8 optype = 0;
1851 U32 count=0;
1852
1853#ifdef DEBUGGING
1854 SV *mysv = sv_newmortal(); /* for dumping */
1855#endif
1856 /* var tail is used because there may be a TAIL
1857 regop in the way. Ie, the exacts will point to the
1858 thing following the TAIL, but the last branch will
1859 point at the TAIL. So we advance tail. If we
1860 have nested (?:) we may have to move through several
1861 tails.
1862 */
1863
1864 while ( OP( tail ) == TAIL ) {
1865 /* this is the TAIL generated by (?:) */
1866 tail = regnext( tail );
1867 }
1868
1869 DEBUG_OPTIMISE_r({
1870 regprop( mysv, tail );
1871 PerlIO_printf( Perl_debug_log, "%*s%s%s%s\n",
e4584336 1872 (int)depth * 2 + 2, "", "Tail node is:", SvPV_nolen( mysv ),
a3621e74
YO
1873 (RExC_seen_evals) ? "[EVAL]" : ""
1874 );
1875 });
1876 /*
1877
1878 step through the branches, cur represents each
1879 branch, noper is the first thing to be matched
1880 as part of that branch and noper_next is the
1881 regnext() of that node. if noper is an EXACT
1882 and noper_next is the same as scan (our current
1883 position in the regex) then the EXACT branch is
1884 a possible optimization target. Once we have
1885 two or more consequetive such branches we can
1886 create a trie of the EXACT's contents and stich
1887 it in place. If the sequence represents all of
1888 the branches we eliminate the whole thing and
1889 replace it with a single TRIE. If it is a
1890 subsequence then we need to stitch it in. This
1891 means the first branch has to remain, and needs
1892 to be repointed at the item on the branch chain
1893 following the last branch optimized. This could
1894 be either a BRANCH, in which case the
1895 subsequence is internal, or it could be the
1896 item following the branch sequence in which
1897 case the subsequence is at the end.
1898
1899 */
1900
1901 /* dont use tail as the end marker for this traverse */
1902 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
1903 regnode *noper = NEXTOPER( cur );
1904 regnode *noper_next = regnext( noper );
1905
a3621e74
YO
1906 DEBUG_OPTIMISE_r({
1907 regprop( mysv, cur);
1908 PerlIO_printf( Perl_debug_log, "%*s%s",
e4584336 1909 (int)depth * 2 + 2," ", SvPV_nolen( mysv ) );
a3621e74
YO
1910
1911 regprop( mysv, noper);
1912 PerlIO_printf( Perl_debug_log, " -> %s",
1913 SvPV_nolen(mysv));
1914
1915 if ( noper_next ) {
1916 regprop( mysv, noper_next );
1917 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
1918 SvPV_nolen(mysv));
1919 }
1920 PerlIO_printf( Perl_debug_log, "0x%p,0x%p,0x%p)\n",
1921 first, last, cur );
1922 });
1923 if ( ( first ? OP( noper ) == optype
1924 : PL_regkind[ (U8)OP( noper ) ] == EXACT )
1925 && noper_next == tail && count<U16_MAX)
1926 {
1927 count++;
1928 if ( !first ) {
1929 first = cur;
1930 optype = OP( noper );
1931 } else {
1932 DEBUG_OPTIMISE_r(
1933 if (!last ) {
1934 regprop( mysv, first);
1935 PerlIO_printf( Perl_debug_log, "%*s%s",
e4584336 1936 (int)depth * 2 + 2, "F:", SvPV_nolen( mysv ) );
a3621e74
YO
1937 regprop( mysv, NEXTOPER(first) );
1938 PerlIO_printf( Perl_debug_log, " -> %s\n",
1939 SvPV_nolen( mysv ) );
1940 }
1941 );
1942 last = cur;
1943 DEBUG_OPTIMISE_r({
1944 regprop( mysv, cur);
1945 PerlIO_printf( Perl_debug_log, "%*s%s",
e4584336 1946 (int)depth * 2 + 2, "N:", SvPV_nolen( mysv ) );
a3621e74
YO
1947 regprop( mysv, noper );
1948 PerlIO_printf( Perl_debug_log, " -> %s\n",
1949 SvPV_nolen( mysv ) );
1950 });
1951 }
1952 } else {
1953 if ( last ) {
1954 DEBUG_OPTIMISE_r(
1955 PerlIO_printf( Perl_debug_log, "%*s%s\n",
e4584336 1956 (int)depth * 2 + 2, "E:", "**END**" );
a3621e74
YO
1957 );
1958 make_trie( pRExC_state, startbranch, first, cur, tail, optype );
1959 }
1960 if ( PL_regkind[ (U8)OP( noper ) ] == EXACT
1961 && noper_next == tail )
1962 {
1963 count = 1;
1964 first = cur;
1965 optype = OP( noper );
1966 } else {
1967 count = 0;
1968 first = NULL;
1969 optype = 0;
1970 }
1971 last = NULL;
1972 }
1973 }
1974 DEBUG_OPTIMISE_r({
1975 regprop( mysv, cur);
1976 PerlIO_printf( Perl_debug_log,
e4584336 1977 "%*s%s\t(0x%p,0x%p,0x%p)\n", (int)depth * 2 + 2,
a3621e74
YO
1978 " ", SvPV_nolen( mysv ), first, last, cur);
1979
1980 });
1981 if ( last ) {
1982 DEBUG_OPTIMISE_r(
1983 PerlIO_printf( Perl_debug_log, "%*s%s\n",
e4584336 1984 (int)depth * 2 + 2, "E:", "==END==" );
a3621e74
YO
1985 );
1986 make_trie( pRExC_state, startbranch, first, scan, tail, optype );
1987 }
1988 }
1989 }
a0ed51b3 1990 }
a3621e74 1991 else if ( code == BRANCHJ ) { /* single branch is optimized. */
c277df42 1992 scan = NEXTOPER(NEXTOPER(scan));
a3621e74 1993 } else /* single branch is optimized. */
c277df42
IZ
1994 scan = NEXTOPER(scan);
1995 continue;
a0ed51b3
LW
1996 }
1997 else if (OP(scan) == EXACT) {
cd439c50 1998 I32 l = STR_LEN(scan);
1aa99e6b 1999 UV uc = *((U8*)STRING(scan));
a0ed51b3 2000 if (UTF) {
1aa99e6b
IH
2001 U8 *s = (U8*)STRING(scan);
2002 l = utf8_length(s, s + l);
9041c2e3 2003 uc = utf8_to_uvchr(s, NULL);
a0ed51b3
LW
2004 }
2005 min += l;
c277df42 2006 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
c277df42
IZ
2007 /* The code below prefers earlier match for fixed
2008 offset, later match for variable offset. */
2009 if (data->last_end == -1) { /* Update the start info. */
2010 data->last_start_min = data->pos_min;
2011 data->last_start_max = is_inf
b81d288d 2012 ? I32_MAX : data->pos_min + data->pos_delta;
c277df42 2013 }
cd439c50 2014 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
0eda9292
JH
2015 {
2016 SV * sv = data->last_found;
2017 MAGIC *mg = SvUTF8(sv) && SvMAGICAL(sv) ?
2018 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2019 if (mg && mg->mg_len >= 0)
5e43f467
JH
2020 mg->mg_len += utf8_length((U8*)STRING(scan),
2021 (U8*)STRING(scan)+STR_LEN(scan));
0eda9292 2022 }
33b8afdf
JH
2023 if (UTF)
2024 SvUTF8_on(data->last_found);
c277df42
IZ
2025 data->last_end = data->pos_min + l;
2026 data->pos_min += l; /* As in the first entry. */
2027 data->flags &= ~SF_BEFORE_EOL;
2028 }
653099ff
GS
2029 if (flags & SCF_DO_STCLASS_AND) {
2030 /* Check whether it is compatible with what we know already! */
2031 int compat = 1;
2032
1aa99e6b 2033 if (uc >= 0x100 ||
516a5887 2034 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
1aa99e6b 2035 && !ANYOF_BITMAP_TEST(data->start_class, uc)
653099ff 2036 && (!(data->start_class->flags & ANYOF_FOLD)
1aa99e6b 2037 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
516a5887 2038 )
653099ff
GS
2039 compat = 0;
2040 ANYOF_CLASS_ZERO(data->start_class);
2041 ANYOF_BITMAP_ZERO(data->start_class);
2042 if (compat)
1aa99e6b 2043 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff 2044 data->start_class->flags &= ~ANYOF_EOS;
9b877dbb
IH
2045 if (uc < 0x100)
2046 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
653099ff
GS
2047 }
2048 else if (flags & SCF_DO_STCLASS_OR) {
2049 /* false positive possible if the class is case-folded */
1aa99e6b 2050 if (uc < 0x100)
9b877dbb
IH
2051 ANYOF_BITMAP_SET(data->start_class, uc);
2052 else
2053 data->start_class->flags |= ANYOF_UNICODE_ALL;
653099ff
GS
2054 data->start_class->flags &= ~ANYOF_EOS;
2055 cl_and(data->start_class, &and_with);
2056 }
2057 flags &= ~SCF_DO_STCLASS;
a0ed51b3 2058 }
653099ff 2059 else if (PL_regkind[(U8)OP(scan)] == EXACT) { /* But OP != EXACT! */
cd439c50 2060 I32 l = STR_LEN(scan);
1aa99e6b 2061 UV uc = *((U8*)STRING(scan));
653099ff
GS
2062
2063 /* Search for fixed substrings supports EXACT only. */
b81d288d 2064 if (flags & SCF_DO_SUBSTR)
830247a4 2065 scan_commit(pRExC_state, data);
a0ed51b3 2066 if (UTF) {
1aa99e6b
IH
2067 U8 *s = (U8 *)STRING(scan);
2068 l = utf8_length(s, s + l);
9041c2e3 2069 uc = utf8_to_uvchr(s, NULL);
a0ed51b3
LW
2070 }
2071 min += l;
c277df42 2072 if (data && (flags & SCF_DO_SUBSTR))
a0ed51b3 2073 data->pos_min += l;
653099ff
GS
2074 if (flags & SCF_DO_STCLASS_AND) {
2075 /* Check whether it is compatible with what we know already! */
2076 int compat = 1;
2077
1aa99e6b 2078 if (uc >= 0x100 ||
516a5887 2079 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
1aa99e6b 2080 && !ANYOF_BITMAP_TEST(data->start_class, uc)
516a5887 2081 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
653099ff
GS
2082 compat = 0;
2083 ANYOF_CLASS_ZERO(data->start_class);
2084 ANYOF_BITMAP_ZERO(data->start_class);
2085 if (compat) {
1aa99e6b 2086 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff
GS
2087 data->start_class->flags &= ~ANYOF_EOS;
2088 data->start_class->flags |= ANYOF_FOLD;
2089 if (OP(scan) == EXACTFL)
2090 data->start_class->flags |= ANYOF_LOCALE;
2091 }
2092 }
2093 else if (flags & SCF_DO_STCLASS_OR) {
2094 if (data->start_class->flags & ANYOF_FOLD) {
2095 /* false positive possible if the class is case-folded.
2096 Assume that the locale settings are the same... */
1aa99e6b
IH
2097 if (uc < 0x100)
2098 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff
GS
2099 data->start_class->flags &= ~ANYOF_EOS;
2100 }
2101 cl_and(data->start_class, &and_with);
2102 }
2103 flags &= ~SCF_DO_STCLASS;
a0ed51b3 2104 }
bfed75c6 2105 else if (strchr((const char*)PL_varies,OP(scan))) {
9c5ffd7c 2106 I32 mincount, maxcount, minnext, deltanext, fl = 0;
aa7a4b56 2107 I32 f = flags, pos_before = 0;
c277df42 2108 regnode *oscan = scan;
653099ff
GS
2109 struct regnode_charclass_class this_class;
2110 struct regnode_charclass_class *oclass = NULL;
727f22e3 2111 I32 next_is_eval = 0;
653099ff 2112
22c35a8c 2113 switch (PL_regkind[(U8)OP(scan)]) {
653099ff 2114 case WHILEM: /* End of (?:...)* . */
c277df42
IZ
2115 scan = NEXTOPER(scan);
2116 goto finish;
2117 case PLUS:
653099ff 2118 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
c277df42 2119 next = NEXTOPER(scan);
653099ff 2120 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
b81d288d
AB
2121 mincount = 1;
2122 maxcount = REG_INFTY;
c277df42
IZ
2123 next = regnext(scan);
2124 scan = NEXTOPER(scan);
2125 goto do_curly;
2126 }
2127 }
2128 if (flags & SCF_DO_SUBSTR)
2129 data->pos_min++;
2130 min++;
2131 /* Fall through. */
2132 case STAR:
653099ff
GS
2133 if (flags & SCF_DO_STCLASS) {
2134 mincount = 0;
b81d288d 2135 maxcount = REG_INFTY;
653099ff
GS
2136 next = regnext(scan);
2137 scan = NEXTOPER(scan);
2138 goto do_curly;
2139 }
b81d288d 2140 is_inf = is_inf_internal = 1;
c277df42
IZ
2141 scan = regnext(scan);
2142 if (flags & SCF_DO_SUBSTR) {
830247a4 2143 scan_commit(pRExC_state, data); /* Cannot extend fixed substrings */
c277df42
IZ
2144 data->longest = &(data->longest_float);
2145 }
2146 goto optimize_curly_tail;
2147 case CURLY:
b81d288d 2148 mincount = ARG1(scan);
c277df42
IZ
2149 maxcount = ARG2(scan);
2150 next = regnext(scan);
cb434fcc
IZ
2151 if (OP(scan) == CURLYX) {
2152 I32 lp = (data ? *(data->last_closep) : 0);
a3621e74 2153 scan->flags = ((lp <= U8_MAX) ? (U8)lp : U8_MAX);
cb434fcc 2154 }
c277df42 2155 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
727f22e3 2156 next_is_eval = (OP(scan) == EVAL);
c277df42
IZ
2157 do_curly:
2158 if (flags & SCF_DO_SUBSTR) {
830247a4 2159 if (mincount == 0) scan_commit(pRExC_state,data); /* Cannot extend fixed substrings */
c277df42
IZ
2160 pos_before = data->pos_min;
2161 }
2162 if (data) {
2163 fl = data->flags;
2164 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
2165 if (is_inf)
2166 data->flags |= SF_IS_INF;
2167 }
653099ff 2168 if (flags & SCF_DO_STCLASS) {
830247a4 2169 cl_init(pRExC_state, &this_class);
653099ff
GS
2170 oclass = data->start_class;
2171 data->start_class = &this_class;
2172 f |= SCF_DO_STCLASS_AND;
2173 f &= ~SCF_DO_STCLASS_OR;
2174 }
e1901655
IZ
2175 /* These are the cases when once a subexpression
2176 fails at a particular position, it cannot succeed
2177 even after backtracking at the enclosing scope.
b81d288d 2178
e1901655
IZ
2179 XXXX what if minimal match and we are at the
2180 initial run of {n,m}? */
2181 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
2182 f &= ~SCF_WHILEM_VISITED_POS;
653099ff 2183
c277df42 2184 /* This will finish on WHILEM, setting scan, or on NULL: */
b81d288d 2185 minnext = study_chunk(pRExC_state, &scan, &deltanext, last, data,
a3621e74
YO
2186 (mincount == 0
2187 ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
653099ff
GS
2188
2189 if (flags & SCF_DO_STCLASS)
2190 data->start_class = oclass;
2191 if (mincount == 0 || minnext == 0) {
2192 if (flags & SCF_DO_STCLASS_OR) {
830247a4 2193 cl_or(pRExC_state, data->start_class, &this_class);
653099ff
GS
2194 }
2195 else if (flags & SCF_DO_STCLASS_AND) {
b81d288d 2196 /* Switch to OR mode: cache the old value of
653099ff
GS
2197 * data->start_class */
2198 StructCopy(data->start_class, &and_with,
2199 struct regnode_charclass_class);
2200 flags &= ~SCF_DO_STCLASS_AND;
2201 StructCopy(&this_class, data->start_class,
2202 struct regnode_charclass_class);
2203 flags |= SCF_DO_STCLASS_OR;
2204 data->start_class->flags |= ANYOF_EOS;
2205 }
2206 } else { /* Non-zero len */
2207 if (flags & SCF_DO_STCLASS_OR) {
830247a4 2208 cl_or(pRExC_state, data->start_class, &this_class);
653099ff
GS
2209 cl_and(data->start_class, &and_with);
2210 }
2211 else if (flags & SCF_DO_STCLASS_AND)
2212 cl_and(data->start_class, &this_class);
2213 flags &= ~SCF_DO_STCLASS;
2214 }
c277df42
IZ
2215 if (!scan) /* It was not CURLYX, but CURLY. */
2216 scan = next;
84037bb0 2217 if (ckWARN(WARN_REGEXP)
727f22e3
JP
2218 /* ? quantifier ok, except for (?{ ... }) */
2219 && (next_is_eval || !(mincount == 0 && maxcount == 1))
84037bb0 2220 && (minnext == 0) && (deltanext == 0)
99799961 2221 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
17feb5d5 2222 && maxcount <= REG_INFTY/3) /* Complement check for big count */
b45f050a 2223 {
830247a4 2224 vWARN(RExC_parse,
b45f050a
JF
2225 "Quantifier unexpected on zero-length expression");
2226 }
2227
c277df42 2228 min += minnext * mincount;
b81d288d 2229 is_inf_internal |= ((maxcount == REG_INFTY
155aba94
GS
2230 && (minnext + deltanext) > 0)
2231 || deltanext == I32_MAX);
aca2d497 2232 is_inf |= is_inf_internal;
c277df42
IZ
2233 delta += (minnext + deltanext) * maxcount - minnext * mincount;
2234
2235 /* Try powerful optimization CURLYX => CURLYN. */
b81d288d 2236 if ( OP(oscan) == CURLYX && data
c277df42
IZ
2237 && data->flags & SF_IN_PAR
2238 && !(data->flags & SF_HAS_EVAL)
2239 && !deltanext && minnext == 1 ) {
2240 /* Try to optimize to CURLYN. */
2241 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
497b47a8
JH
2242 regnode *nxt1 = nxt;
2243#ifdef DEBUGGING
2244 regnode *nxt2;
2245#endif
c277df42
IZ
2246
2247 /* Skip open. */
2248 nxt = regnext(nxt);
bfed75c6 2249 if (!strchr((const char*)PL_simple,OP(nxt))
22c35a8c 2250 && !(PL_regkind[(U8)OP(nxt)] == EXACT
b81d288d 2251 && STR_LEN(nxt) == 1))
c277df42 2252 goto nogo;
497b47a8 2253#ifdef DEBUGGING
c277df42 2254 nxt2 = nxt;
497b47a8 2255#endif
c277df42 2256 nxt = regnext(nxt);
b81d288d 2257 if (OP(nxt) != CLOSE)
c277df42
IZ
2258 goto nogo;
2259 /* Now we know that nxt2 is the only contents: */
eb160463 2260 oscan->flags = (U8)ARG(nxt);
c277df42
IZ
2261 OP(oscan) = CURLYN;
2262 OP(nxt1) = NOTHING; /* was OPEN. */
2263#ifdef DEBUGGING
2264 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
2265 NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
2266 NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
2267 OP(nxt) = OPTIMIZED; /* was CLOSE. */
2268 OP(nxt + 1) = OPTIMIZED; /* was count. */
2269 NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
b81d288d 2270#endif
c277df42 2271 }
c277df42
IZ
2272 nogo:
2273
2274 /* Try optimization CURLYX => CURLYM. */
b81d288d 2275 if ( OP(oscan) == CURLYX && data
c277df42 2276 && !(data->flags & SF_HAS_PAR)
c277df42 2277 && !(data->flags & SF_HAS_EVAL)
0e788c72
HS
2278 && !deltanext /* atom is fixed width */
2279 && minnext != 0 /* CURLYM can't handle zero width */
2280 ) {
c277df42
IZ
2281 /* XXXX How to optimize if data == 0? */
2282 /* Optimize to a simpler form. */
2283 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
2284 regnode *nxt2;
2285
2286 OP(oscan) = CURLYM;
2287 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
b81d288d 2288 && (OP(nxt2) != WHILEM))
c277df42
IZ
2289 nxt = nxt2;
2290 OP(nxt2) = SUCCEED; /* Whas WHILEM */
c277df42
IZ
2291 /* Need to optimize away parenths. */
2292 if (data->flags & SF_IN_PAR) {
2293 /* Set the parenth number. */
2294 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
2295
b81d288d 2296 if (OP(nxt) != CLOSE)
b45f050a 2297 FAIL("Panic opt close");
eb160463 2298 oscan->flags = (U8)ARG(nxt);
c277df42
IZ
2299 OP(nxt1) = OPTIMIZED; /* was OPEN. */
2300 OP(nxt) = OPTIMIZED; /* was CLOSE. */
2301#ifdef DEBUGGING
2302 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
2303 OP(nxt + 1) = OPTIMIZED; /* was count. */
2304 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
2305 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
b81d288d 2306#endif
c277df42
IZ
2307#if 0
2308 while ( nxt1 && (OP(nxt1) != WHILEM)) {
2309 regnode *nnxt = regnext(nxt1);
b81d288d 2310
c277df42
IZ
2311 if (nnxt == nxt) {
2312 if (reg_off_by_arg[OP(nxt1)])
2313 ARG_SET(nxt1, nxt2 - nxt1);
2314 else if (nxt2 - nxt1 < U16_MAX)
2315 NEXT_OFF(nxt1) = nxt2 - nxt1;
2316 else
2317 OP(nxt) = NOTHING; /* Cannot beautify */
2318 }
2319 nxt1 = nnxt;
2320 }
2321#endif
2322 /* Optimize again: */
b81d288d 2323 study_chunk(pRExC_state, &nxt1, &deltanext, nxt,
a3621e74 2324 NULL, 0,depth+1);
a0ed51b3
LW
2325 }
2326 else
c277df42 2327 oscan->flags = 0;
c277df42 2328 }
e1901655
IZ
2329 else if ((OP(oscan) == CURLYX)
2330 && (flags & SCF_WHILEM_VISITED_POS)
2331 /* See the comment on a similar expression above.
2332 However, this time it not a subexpression
2333 we care about, but the expression itself. */
2334 && (maxcount == REG_INFTY)
2335 && data && ++data->whilem_c < 16) {
2336 /* This stays as CURLYX, we can put the count/of pair. */
2c2d71f5
JH
2337 /* Find WHILEM (as in regexec.c) */
2338 regnode *nxt = oscan + NEXT_OFF(oscan);
2339
2340 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
2341 nxt += ARG(nxt);
eb160463
GS
2342 PREVOPER(nxt)->flags = (U8)(data->whilem_c
2343 | (RExC_whilem_seen << 4)); /* On WHILEM */
2c2d71f5 2344 }
b81d288d 2345 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
c277df42
IZ
2346 pars++;
2347 if (flags & SCF_DO_SUBSTR) {
2348 SV *last_str = Nullsv;
2349 int counted = mincount != 0;
2350
2351 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
5d1c421c
JH
2352#if defined(SPARC64_GCC_WORKAROUND)
2353 I32 b = 0;
2354 STRLEN l = 0;
2355 char *s = NULL;
2356 I32 old = 0;
2357
2358 if (pos_before >= data->last_start_min)
2359 b = pos_before;
2360 else
2361 b = data->last_start_min;
2362
2363 l = 0;
2364 s = SvPV(data->last_found, l);
2365 old = b - data->last_start_min;
2366
2367#else
b81d288d 2368 I32 b = pos_before >= data->last_start_min
c277df42
IZ
2369 ? pos_before : data->last_start_min;
2370 STRLEN l;
2371 char *s = SvPV(data->last_found, l);
a0ed51b3 2372 I32 old = b - data->last_start_min;
5d1c421c 2373#endif
a0ed51b3
LW
2374
2375 if (UTF)
2376 old = utf8_hop((U8*)s, old) - (U8*)s;
c277df42 2377
a0ed51b3 2378 l -= old;
c277df42 2379 /* Get the added string: */
79cb57f6 2380 last_str = newSVpvn(s + old, l);
0e933229
IH
2381 if (UTF)
2382 SvUTF8_on(last_str);
c277df42
IZ
2383 if (deltanext == 0 && pos_before == b) {
2384 /* What was added is a constant string */
2385 if (mincount > 1) {
2386 SvGROW(last_str, (mincount * l) + 1);
b81d288d 2387 repeatcpy(SvPVX(last_str) + l,
c277df42 2388 SvPVX(last_str), l, mincount - 1);
b162af07 2389 SvCUR_set(last_str, SvCUR(last_str) * mincount);
c277df42 2390 /* Add additional parts. */
b81d288d 2391 SvCUR_set(data->last_found,
c277df42
IZ
2392 SvCUR(data->last_found) - l);
2393 sv_catsv(data->last_found, last_str);
0eda9292
JH
2394 {
2395 SV * sv = data->last_found;
2396 MAGIC *mg =
2397 SvUTF8(sv) && SvMAGICAL(sv) ?
2398 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2399 if (mg && mg->mg_len >= 0)
2400 mg->mg_len += CHR_SVLEN(last_str);
2401 }
c277df42
IZ
2402 data->last_end += l * (mincount - 1);
2403 }
2a8d9689
HS
2404 } else {
2405 /* start offset must point into the last copy */
2406 data->last_start_min += minnext * (mincount - 1);
c152dc43
HS
2407 data->last_start_max += is_inf ? I32_MAX
2408 : (maxcount - 1) * (minnext + data->pos_delta);
c277df42
IZ
2409 }
2410 }
2411 /* It is counted once already... */
2412 data->pos_min += minnext * (mincount - counted);
2413 data->pos_delta += - counted * deltanext +
2414 (minnext + deltanext) * maxcount - minnext * mincount;
2415 if (mincount != maxcount) {
653099ff
GS
2416 /* Cannot extend fixed substrings found inside
2417 the group. */
830247a4 2418 scan_commit(pRExC_state,data);
c277df42
IZ
2419 if (mincount && last_str) {
2420 sv_setsv(data->last_found, last_str);
2421 data->last_end = data->pos_min;
b81d288d 2422 data->last_start_min =
a0ed51b3 2423 data->pos_min - CHR_SVLEN(last_str);
b81d288d
AB
2424 data->last_start_max = is_inf
2425 ? I32_MAX
c277df42 2426 : data->pos_min + data->pos_delta
a0ed51b3 2427 - CHR_SVLEN(last_str);
c277df42
IZ
2428 }
2429 data->longest = &(data->longest_float);
2430 }
aca2d497 2431 SvREFCNT_dec(last_str);
c277df42 2432 }
405ff068 2433 if (data && (fl & SF_HAS_EVAL))
c277df42
IZ
2434 data->flags |= SF_HAS_EVAL;
2435 optimize_curly_tail:
c277df42 2436 if (OP(oscan) != CURLYX) {
22c35a8c 2437 while (PL_regkind[(U8)OP(next = regnext(oscan))] == NOTHING
c277df42
IZ
2438 && NEXT_OFF(next))
2439 NEXT_OFF(oscan) += NEXT_OFF(next);
2440 }
c277df42 2441 continue;
653099ff 2442 default: /* REF and CLUMP only? */
c277df42 2443 if (flags & SCF_DO_SUBSTR) {
830247a4 2444 scan_commit(pRExC_state,data); /* Cannot expect anything... */
c277df42
IZ
2445 data->longest = &(data->longest_float);
2446 }
aca2d497 2447 is_inf = is_inf_internal = 1;
653099ff 2448 if (flags & SCF_DO_STCLASS_OR)
830247a4 2449 cl_anything(pRExC_state, data->start_class);
653099ff 2450 flags &= ~SCF_DO_STCLASS;
c277df42
IZ
2451 break;
2452 }
a0ed51b3 2453 }
bfed75c6 2454 else if (strchr((const char*)PL_simple,OP(scan))) {
9c5ffd7c 2455 int value = 0;
653099ff 2456
c277df42 2457 if (flags & SCF_DO_SUBSTR) {
830247a4 2458 scan_commit(pRExC_state,data);
c277df42
IZ
2459 data->pos_min++;
2460 }
2461 min++;
653099ff
GS
2462 if (flags & SCF_DO_STCLASS) {
2463 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
2464
2465 /* Some of the logic below assumes that switching
2466 locale on will only add false positives. */
2467 switch (PL_regkind[(U8)OP(scan)]) {
653099ff 2468 case SANY:
653099ff
GS
2469 default:
2470 do_default:
2471 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
2472 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
830247a4 2473 cl_anything(pRExC_state, data->start_class);
653099ff
GS
2474 break;
2475 case REG_ANY:
2476 if (OP(scan) == SANY)
2477 goto do_default;
2478 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
2479 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
2480 || (data->start_class->flags & ANYOF_CLASS));
830247a4 2481 cl_anything(pRExC_state, data->start_class);
653099ff
GS
2482 }
2483 if (flags & SCF_DO_STCLASS_AND || !value)
2484 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
2485 break;
2486 case ANYOF:
2487 if (flags & SCF_DO_STCLASS_AND)
2488 cl_and(data->start_class,
2489 (struct regnode_charclass_class*)scan);
2490 else
830247a4 2491 cl_or(pRExC_state, data->start_class,
653099ff
GS
2492 (struct regnode_charclass_class*)scan);
2493 break;
2494 case ALNUM:
2495 if (flags & SCF_DO_STCLASS_AND) {
2496 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2497 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
2498 for (value = 0; value < 256; value++)
2499 if (!isALNUM(value))
2500 ANYOF_BITMAP_CLEAR(data->start_class, value);
2501 }
2502 }
2503 else {
2504 if (data->start_class->flags & ANYOF_LOCALE)
2505 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
2506 else {
2507 for (value = 0; value < 256; value++)
2508 if (isALNUM(value))
b81d288d 2509 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
2510 }
2511 }
2512 break;
2513 case ALNUML:
2514 if (flags & SCF_DO_STCLASS_AND) {
2515 if (data->start_class->flags & ANYOF_LOCALE)
2516 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
2517 }
2518 else {
2519 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
2520 data->start_class->flags |= ANYOF_LOCALE;
2521 }
2522 break;
2523 case NALNUM:
2524 if (flags & SCF_DO_STCLASS_AND) {
2525 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2526 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
2527 for (value = 0; value < 256; value++)
2528 if (isALNUM(value))
2529 ANYOF_BITMAP_CLEAR(data->start_class, value);
2530 }
2531 }
2532 else {
2533 if (data->start_class->flags & ANYOF_LOCALE)
2534 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
2535 else {
2536 for (value = 0; value < 256; value++)
2537 if (!isALNUM(value))
b81d288d 2538 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
2539 }
2540 }
2541 break;
2542 case NALNUML:
2543 if (flags & SCF_DO_STCLASS_AND) {
2544 if (data->start_class->flags & ANYOF_LOCALE)
2545 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
2546 }
2547 else {
2548 data->start_class->flags |= ANYOF_LOCALE;
2549 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
2550 }
2551 break;
2552 case SPACE:
2553 if (flags & SCF_DO_STCLASS_AND) {
2554 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2555 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
2556 for (value = 0; value < 256; value++)
2557 if (!isSPACE(value))
2558 ANYOF_BITMAP_CLEAR(data->start_class, value);
2559 }
2560 }
2561 else {
2562 if (data->start_class->flags & ANYOF_LOCALE)
2563 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
2564 else {
2565 for (value = 0; value < 256; value++)
2566 if (isSPACE(value))
b81d288d 2567 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
2568 }
2569 }
2570 break;
2571 case SPACEL:
2572 if (flags & SCF_DO_STCLASS_AND) {
2573 if (data->start_class->flags & ANYOF_LOCALE)
2574 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
2575 }
2576 else {
2577 data->start_class->flags |= ANYOF_LOCALE;
2578 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
2579 }
2580 break;
2581 case NSPACE:
2582 if (flags & SCF_DO_STCLASS_AND) {
2583 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2584 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
2585 for (value = 0; value < 256; value++)
2586 if (isSPACE(value))
2587 ANYOF_BITMAP_CLEAR(data->start_class, value);
2588 }
2589 }
2590 else {
2591 if (data->start_class->flags & ANYOF_LOCALE)
2592 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
2593 else {
2594 for (value = 0; value < 256; value++)
2595 if (!isSPACE(value))
b81d288d 2596 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
2597 }
2598 }
2599 break;
2600 case NSPACEL:
2601 if (flags & SCF_DO_STCLASS_AND) {
2602 if (data->start_class->flags & ANYOF_LOCALE) {
2603 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
2604 for (value = 0; value < 256; value++)
2605 if (!isSPACE(value))
2606 ANYOF_BITMAP_CLEAR(data->start_class, value);
2607 }
2608 }
2609 else {
2610 data->start_class->flags |= ANYOF_LOCALE;
2611 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
2612 }
2613 break;
2614 case DIGIT:
2615 if (flags & SCF_DO_STCLASS_AND) {
2616 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
2617 for (value = 0; value < 256; value++)
2618 if (!isDIGIT(value))
2619 ANYOF_BITMAP_CLEAR(data->start_class, value);
2620 }
2621 else {
2622 if (data->start_class->flags & ANYOF_LOCALE)
2623 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
2624 else {
2625 for (value = 0; value < 256; value++)
2626 if (isDIGIT(value))
b81d288d 2627 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
2628 }
2629 }
2630 break;
2631 case NDIGIT:
2632 if (flags & SCF_DO_STCLASS_AND) {
2633 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
2634 for (value = 0; value < 256; value++)
2635 if (isDIGIT(value))
2636 ANYOF_BITMAP_CLEAR(data->start_class, value);
2637 }
2638 else {
2639 if (data->start_class->flags & ANYOF_LOCALE)
2640 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
2641 else {
2642 for (value = 0; value < 256; value++)
2643 if (!isDIGIT(value))
b81d288d 2644 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
2645 }
2646 }
2647 break;
2648 }
2649 if (flags & SCF_DO_STCLASS_OR)
2650 cl_and(data->start_class, &and_with);
2651 flags &= ~SCF_DO_STCLASS;
2652 }
a0ed51b3 2653 }
22c35a8c 2654 else if (PL_regkind[(U8)OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
c277df42
IZ
2655 data->flags |= (OP(scan) == MEOL
2656 ? SF_BEFORE_MEOL
2657 : SF_BEFORE_SEOL);
a0ed51b3 2658 }
653099ff
GS
2659 else if ( PL_regkind[(U8)OP(scan)] == BRANCHJ
2660 /* Lookbehind, or need to calculate parens/evals/stclass: */
2661 && (scan->flags || data || (flags & SCF_DO_STCLASS))
c277df42 2662 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
653099ff 2663 /* Lookahead/lookbehind */
cb434fcc 2664 I32 deltanext, minnext, fake = 0;
c277df42 2665 regnode *nscan;
653099ff
GS
2666 struct regnode_charclass_class intrnl;
2667 int f = 0;
c277df42
IZ
2668
2669 data_fake.flags = 0;
b81d288d 2670 if (data) {
2c2d71f5 2671 data_fake.whilem_c = data->whilem_c;
cb434fcc
IZ
2672 data_fake.last_closep = data->last_closep;
2673 }
2674 else
2675 data_fake.last_closep = &fake;
653099ff
GS
2676 if ( flags & SCF_DO_STCLASS && !scan->flags
2677 && OP(scan) == IFMATCH ) { /* Lookahead */
830247a4 2678 cl_init(pRExC_state, &intrnl);
653099ff 2679 data_fake.start_class = &intrnl;
e1901655 2680 f |= SCF_DO_STCLASS_AND;
653099ff 2681 }
e1901655
IZ
2682 if (flags & SCF_WHILEM_VISITED_POS)
2683 f |= SCF_WHILEM_VISITED_POS;
c277df42
IZ
2684 next = regnext(scan);
2685 nscan = NEXTOPER(NEXTOPER(scan));
a3621e74 2686 minnext = study_chunk(pRExC_state, &nscan, &deltanext, last, &data_fake, f,depth+1);
c277df42
IZ
2687 if (scan->flags) {
2688 if (deltanext) {
9baa0206 2689 vFAIL("Variable length lookbehind not implemented");
a0ed51b3
LW
2690 }
2691 else if (minnext > U8_MAX) {
9baa0206 2692 vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
c277df42 2693 }
eb160463 2694 scan->flags = (U8)minnext;
c277df42
IZ
2695 }
2696 if (data && data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
2697 pars++;
405ff068 2698 if (data && (data_fake.flags & SF_HAS_EVAL))
c277df42 2699 data->flags |= SF_HAS_EVAL;
2c2d71f5
JH
2700 if (data)
2701 data->whilem_c = data_fake.whilem_c;
e1901655 2702 if (f & SCF_DO_STCLASS_AND) {
653099ff
GS
2703 int was = (data->start_class->flags & ANYOF_EOS);
2704
2705 cl_and(data->start_class, &intrnl);
2706 if (was)
2707 data->start_class->flags |= ANYOF_EOS;
2708 }
a0ed51b3
LW
2709 }
2710 else if (OP(scan) == OPEN) {
c277df42 2711 pars++;
a0ed51b3 2712 }
cb434fcc 2713 else if (OP(scan) == CLOSE) {
eb160463 2714 if ((I32)ARG(scan) == is_par) {
cb434fcc 2715 next = regnext(scan);
c277df42 2716
cb434fcc
IZ
2717 if ( next && (OP(next) != WHILEM) && next < last)
2718 is_par = 0; /* Disable optimization */
2719 }
2720 if (data)
2721 *(data->last_closep) = ARG(scan);
a0ed51b3
LW
2722 }
2723 else if (OP(scan) == EVAL) {
c277df42
IZ
2724 if (data)
2725 data->flags |= SF_HAS_EVAL;
2726 }
96776eda 2727 else if (OP(scan) == LOGICAL && scan->flags == 2) { /* Embedded follows */
0f5d15d6 2728 if (flags & SCF_DO_SUBSTR) {
830247a4 2729 scan_commit(pRExC_state,data);
0f5d15d6
IZ
2730 data->longest = &(data->longest_float);
2731 }
2732 is_inf = is_inf_internal = 1;
653099ff 2733 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
830247a4 2734 cl_anything(pRExC_state, data->start_class);
96776eda 2735 flags &= ~SCF_DO_STCLASS;
0f5d15d6 2736 }
c277df42
IZ
2737 /* Else: zero-length, ignore. */
2738 scan = regnext(scan);
2739 }
2740
2741 finish:
2742 *scanp = scan;
aca2d497 2743 *deltap = is_inf_internal ? I32_MAX : delta;
b81d288d 2744 if (flags & SCF_DO_SUBSTR && is_inf)
c277df42
IZ
2745 data->pos_delta = I32_MAX - data->pos_min;
2746 if (is_par > U8_MAX)
2747 is_par = 0;
2748 if (is_par && pars==1 && data) {
2749 data->flags |= SF_IN_PAR;
2750 data->flags &= ~SF_HAS_PAR;
a0ed51b3
LW
2751 }
2752 else if (pars && data) {
c277df42
IZ
2753 data->flags |= SF_HAS_PAR;
2754 data->flags &= ~SF_IN_PAR;
2755 }
653099ff
GS
2756 if (flags & SCF_DO_STCLASS_OR)
2757 cl_and(data->start_class, &and_with);
c277df42
IZ
2758 return min;
2759}
2760
76e3520e 2761STATIC I32
bfed75c6 2762S_add_data(pTHX_ RExC_state_t *pRExC_state, I32 n, const char *s)
c277df42 2763{
830247a4 2764 if (RExC_rx->data) {
b81d288d
AB
2765 Renewc(RExC_rx->data,
2766 sizeof(*RExC_rx->data) + sizeof(void*) * (RExC_rx->data->count + n - 1),
c277df42 2767 char, struct reg_data);
830247a4
IZ
2768 Renew(RExC_rx->data->what, RExC_rx->data->count + n, U8);
2769 RExC_rx->data->count += n;
a0ed51b3
LW
2770 }
2771 else {
830247a4 2772 Newc(1207, RExC_rx->data, sizeof(*RExC_rx->data) + sizeof(void*) * (n - 1),
c277df42 2773 char, struct reg_data);
830247a4
IZ
2774 New(1208, RExC_rx->data->what, n, U8);
2775 RExC_rx->data->count = n;
c277df42 2776 }
830247a4
IZ
2777 Copy(s, RExC_rx->data->what + RExC_rx->data->count - n, n, U8);
2778 return RExC_rx->data->count - n;
c277df42
IZ
2779}
2780
d88dccdf 2781void
864dbfa3 2782Perl_reginitcolors(pTHX)
d88dccdf 2783{
d88dccdf
IZ
2784 int i = 0;
2785 char *s = PerlEnv_getenv("PERL_RE_COLORS");
b81d288d 2786
d88dccdf
IZ
2787 if (s) {
2788 PL_colors[0] = s = savepv(s);
2789 while (++i < 6) {
2790 s = strchr(s, '\t');
2791 if (s) {
2792 *s = '\0';
2793 PL_colors[i] = ++s;
2794 }
2795 else
dd374669 2796 PL_colors[i] = s = (char *)"";
d88dccdf
IZ
2797 }
2798 } else {
b81d288d 2799 while (i < 6)
06b5626a 2800 PL_colors[i++] = (char *)"";
d88dccdf
IZ
2801 }
2802 PL_colorset = 1;
2803}
2804
8615cb43 2805
a687059c 2806/*
e50aee73 2807 - pregcomp - compile a regular expression into internal code
a687059c
LW
2808 *
2809 * We can't allocate space until we know how big the compiled form will be,
2810 * but we can't compile it (and thus know how big it is) until we've got a
2811 * place to put the code. So we cheat: we compile it twice, once with code
2812 * generation turned off and size counting turned on, and once "for real".
2813 * This also means that we don't allocate space until we are sure that the
2814 * thing really will compile successfully, and we never have to move the
2815 * code and thus invalidate pointers into it. (Note that it has to be in
2816 * one piece because free() must be able to free it all.) [NB: not true in perl]
2817 *
2818 * Beware that the optimization-preparation code in here knows about some
2819 * of the structure of the compiled regexp. [I'll say.]
2820 */
2821regexp *
864dbfa3 2822Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
a687059c 2823{
a0d0e21e 2824 register regexp *r;
c277df42 2825 regnode *scan;
c277df42 2826 regnode *first;
a0d0e21e 2827 I32 flags;
a0d0e21e
LW
2828 I32 minlen = 0;
2829 I32 sawplus = 0;
2830 I32 sawopen = 0;
2c2d71f5 2831 scan_data_t data;
830247a4
IZ
2832 RExC_state_t RExC_state;
2833 RExC_state_t *pRExC_state = &RExC_state;
a0d0e21e 2834
a3621e74
YO
2835 GET_RE_DEBUG_FLAGS_DECL;
2836
a0d0e21e 2837 if (exp == NULL)
c277df42 2838 FAIL("NULL regexp argument");
a0d0e21e 2839
a5961de5 2840 RExC_utf8 = pm->op_pmdynflags & PMdf_CMP_UTF8;
a0ed51b3 2841
5cfc7842 2842 RExC_precomp = exp;
a3621e74
YO
2843 DEBUG_r(if (!PL_colorset) reginitcolors());
2844 DEBUG_COMPILE_r({
2845 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s \"%s%*s%s\"\n",
a5961de5
JH
2846 PL_colors[4],PL_colors[5],PL_colors[0],
2847 (int)(xend - exp), RExC_precomp, PL_colors[1]);
2848 });
e2509266 2849 RExC_flags = pm->op_pmflags;
830247a4 2850 RExC_sawback = 0;
bbce6d69 2851
830247a4
IZ
2852 RExC_seen = 0;
2853 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
2854 RExC_seen_evals = 0;
2855 RExC_extralen = 0;
c277df42 2856
bbce6d69 2857 /* First pass: determine size, legality. */
830247a4 2858 RExC_parse = exp;
fac92740 2859 RExC_start = exp;
830247a4
IZ
2860 RExC_end = xend;
2861 RExC_naughty = 0;
2862 RExC_npar = 1;
2863 RExC_size = 0L;
2864 RExC_emit = &PL_regdummy;
2865 RExC_whilem_seen = 0;
85ddcde9
JH
2866#if 0 /* REGC() is (currently) a NOP at the first pass.
2867 * Clever compilers notice this and complain. --jhi */
830247a4 2868 REGC((U8)REG_MAGIC, (char*)RExC_emit);
85ddcde9 2869#endif
830247a4 2870 if (reg(pRExC_state, 0, &flags) == NULL) {
830247a4 2871 RExC_precomp = Nullch;
a0d0e21e
LW
2872 return(NULL);
2873 }
a3621e74 2874 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "size %"IVdf" ", (IV)RExC_size));
c277df42 2875
c277df42
IZ
2876 /* Small enough for pointer-storage convention?
2877 If extralen==0, this means that we will not need long jumps. */
830247a4
IZ
2878 if (RExC_size >= 0x10000L && RExC_extralen)
2879 RExC_size += RExC_extralen;
c277df42 2880 else
830247a4
IZ
2881 RExC_extralen = 0;
2882 if (RExC_whilem_seen > 15)
2883 RExC_whilem_seen = 15;
a0d0e21e 2884
bbce6d69 2885 /* Allocate space and initialize. */
830247a4 2886 Newc(1001, r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode),
c277df42 2887 char, regexp);
a0d0e21e 2888 if (r == NULL)
b45f050a
JF
2889 FAIL("Regexp out of space");
2890
0f79a09d
GS
2891#ifdef DEBUGGING
2892 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
830247a4 2893 Zero(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode), char);
0f79a09d 2894#endif
c277df42 2895 r->refcnt = 1;
bbce6d69 2896 r->prelen = xend - exp;
5cfc7842 2897 r->precomp = savepvn(RExC_precomp, r->prelen);
cf93c79d 2898 r->subbeg = NULL;
ed252734
NC
2899#ifdef PERL_COPY_ON_WRITE
2900 r->saved_copy = Nullsv;
2901#endif
cf93c79d 2902 r->reganch = pm->op_pmflags & PMf_COMPILETIME;
830247a4 2903 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
4327152a
IZ
2904
2905 r->substrs = 0; /* Useful during FAIL. */
2906 r->startp = 0; /* Useful during FAIL. */
2907 r->endp = 0; /* Useful during FAIL. */
2908
fac92740
MJD
2909 Newz(1304, r->offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
2910 if (r->offsets) {
2911 r->offsets[0] = RExC_size;
2912 }
a3621e74 2913 DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
392fbf5d 2914 "%s %"UVuf" bytes for offset annotations.\n",
fac92740 2915 r->offsets ? "Got" : "Couldn't get",
392fbf5d 2916 (UV)((2*RExC_size+1) * sizeof(U32))));
fac92740 2917
830247a4 2918 RExC_rx = r;
bbce6d69 2919
2920 /* Second pass: emit code. */
e2509266 2921 RExC_flags = pm->op_pmflags; /* don't let top level (?i) bleed */
830247a4
IZ
2922 RExC_parse = exp;
2923 RExC_end = xend;
2924 RExC_naughty = 0;
2925 RExC_npar = 1;
fac92740 2926 RExC_emit_start = r->program;
830247a4 2927 RExC_emit = r->program;
2cd61cdb 2928 /* Store the count of eval-groups for security checks: */
eb160463 2929 RExC_emit->next_off = (U16)((RExC_seen_evals > U16_MAX) ? U16_MAX : RExC_seen_evals);
830247a4 2930 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
c277df42 2931 r->data = 0;
830247a4 2932 if (reg(pRExC_state, 0, &flags) == NULL)
a0d0e21e
LW
2933 return(NULL);
2934
a3621e74 2935
a0d0e21e 2936 /* Dig out information for optimizations. */
cf93c79d 2937 r->reganch = pm->op_pmflags & PMf_COMPILETIME; /* Again? */
e2509266 2938 pm->op_pmflags = RExC_flags;
a0ed51b3 2939 if (UTF)
5ff6fc6d 2940 r->reganch |= ROPT_UTF8; /* Unicode in it? */
c277df42 2941 r->regstclass = NULL;
830247a4 2942 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
a0ed51b3 2943 r->reganch |= ROPT_NAUGHTY;
c277df42 2944 scan = r->program + 1; /* First BRANCH. */
2779dcf1
IZ
2945
2946 /* XXXX To minimize changes to RE engine we always allocate
2947 3-units-long substrs field. */
2948 Newz(1004, r->substrs, 1, struct reg_substr_data);
2949
2c2d71f5 2950 StructCopy(&zero_scan_data, &data, scan_data_t);
653099ff 2951 /* XXXX Should not we check for something else? Usually it is OPEN1... */
c277df42 2952 if (OP(scan) != BRANCH) { /* Only one top-level choice. */
c277df42 2953 I32 fake;
c5254dd6 2954 STRLEN longest_float_length, longest_fixed_length;
653099ff
GS
2955 struct regnode_charclass_class ch_class;
2956 int stclass_flag;
cb434fcc 2957 I32 last_close = 0;
a0d0e21e
LW
2958
2959 first = scan;
c277df42 2960 /* Skip introductions and multiplicators >= 1. */
a0d0e21e 2961 while ((OP(first) == OPEN && (sawopen = 1)) ||
653099ff 2962 /* An OR of *one* alternative - should not happen now. */
a0d0e21e
LW
2963 (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) ||
2964 (OP(first) == PLUS) ||
2965 (OP(first) == MINMOD) ||
653099ff 2966 /* An {n,m} with n>0 */
22c35a8c 2967 (PL_regkind[(U8)OP(first)] == CURLY && ARG1(first) > 0) ) {
a0d0e21e
LW
2968 if (OP(first) == PLUS)
2969 sawplus = 1;
2970 else
2971 first += regarglen[(U8)OP(first)];
2972 first = NEXTOPER(first);
a687059c
LW
2973 }
2974
a0d0e21e
LW
2975 /* Starting-point info. */
2976 again:
653099ff 2977 if (PL_regkind[(U8)OP(first)] == EXACT) {
1aa99e6b
IH
2978 if (OP(first) == EXACT)
2979 ; /* Empty, get anchored substr later. */
2980 else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
b3c9acc1
IZ
2981 r->regstclass = first;
2982 }
bfed75c6 2983 else if (strchr((const char*)PL_simple,OP(first)))
a0d0e21e 2984 r->regstclass = first;
22c35a8c
GS
2985 else if (PL_regkind[(U8)OP(first)] == BOUND ||
2986 PL_regkind[(U8)OP(first)] == NBOUND)
a0d0e21e 2987 r->regstclass = first;
22c35a8c 2988 else if (PL_regkind[(U8)OP(first)] == BOL) {
cad2e5aa
JH
2989 r->reganch |= (OP(first) == MBOL
2990 ? ROPT_ANCH_MBOL
2991 : (OP(first) == SBOL
2992 ? ROPT_ANCH_SBOL
2993 : ROPT_ANCH_BOL));
a0d0e21e 2994 first = NEXTOPER(first);
774d564b 2995 goto again;
2996 }
2997 else if (OP(first) == GPOS) {
2998 r->reganch |= ROPT_ANCH_GPOS;
2999 first = NEXTOPER(first);
3000 goto again;
a0d0e21e 3001 }
e09294f4 3002 else if (!sawopen && (OP(first) == STAR &&
22c35a8c 3003 PL_regkind[(U8)OP(NEXTOPER(first))] == REG_ANY) &&
a0d0e21e
LW
3004 !(r->reganch & ROPT_ANCH) )
3005 {
3006 /* turn .* into ^.* with an implied $*=1 */
cad2e5aa
JH
3007 int type = OP(NEXTOPER(first));
3008
ffc61ed2 3009 if (type == REG_ANY)
cad2e5aa
JH
3010 type = ROPT_ANCH_MBOL;
3011 else
3012 type = ROPT_ANCH_SBOL;
3013
3014 r->reganch |= type | ROPT_IMPLICIT;
a0d0e21e 3015 first = NEXTOPER(first);
774d564b 3016 goto again;
a0d0e21e 3017 }
b81d288d 3018 if (sawplus && (!sawopen || !RExC_sawback)
830247a4 3019 && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
cad2e5aa
JH
3020 /* x+ must match at the 1st pos of run of x's */
3021 r->reganch |= ROPT_SKIP;
a0d0e21e 3022
c277df42 3023 /* Scan is after the zeroth branch, first is atomic matcher. */
a3621e74 3024 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
b900a521 3025 (IV)(first - scan + 1)));
a0d0e21e
LW
3026 /*
3027 * If there's something expensive in the r.e., find the
3028 * longest literal string that must appear and make it the
3029 * regmust. Resolve ties in favor of later strings, since
3030 * the regstart check works with the beginning of the r.e.
3031 * and avoiding duplication strengthens checking. Not a
3032 * strong reason, but sufficient in the absence of others.
3033 * [Now we resolve ties in favor of the earlier string if
c277df42 3034 * it happens that c_offset_min has been invalidated, since the
a0d0e21e
LW
3035 * earlier string may buy us something the later one won't.]
3036 */
a0d0e21e 3037 minlen = 0;
a687059c 3038
79cb57f6
GS
3039 data.longest_fixed = newSVpvn("",0);
3040 data.longest_float = newSVpvn("",0);
3041 data.last_found = newSVpvn("",0);
c277df42
IZ
3042 data.longest = &(data.longest_fixed);
3043 first = scan;
653099ff 3044 if (!r->regstclass) {
830247a4 3045 cl_init(pRExC_state, &ch_class);
653099ff
GS
3046 data.start_class = &ch_class;
3047 stclass_flag = SCF_DO_STCLASS_AND;
3048 } else /* XXXX Check for BOUND? */
3049 stclass_flag = 0;
cb434fcc 3050 data.last_closep = &last_close;
653099ff 3051
830247a4 3052 minlen = study_chunk(pRExC_state, &first, &fake, scan + RExC_size, /* Up to end */
a3621e74 3053 &data, SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
830247a4 3054 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
b81d288d 3055 && data.last_start_min == 0 && data.last_end > 0
830247a4
IZ
3056 && !RExC_seen_zerolen
3057 && (!(RExC_seen & REG_SEEN_GPOS) || (r->reganch & ROPT_ANCH_GPOS)))
c277df42 3058 r->reganch |= ROPT_CHECK_ALL;
830247a4 3059 scan_commit(pRExC_state, &data);
c277df42
IZ
3060 SvREFCNT_dec(data.last_found);
3061
a0ed51b3 3062 longest_float_length = CHR_SVLEN(data.longest_float);
c5254dd6 3063 if (longest_float_length
c277df42
IZ
3064 || (data.flags & SF_FL_BEFORE_EOL
3065 && (!(data.flags & SF_FL_BEFORE_MEOL)
e2509266 3066 || (RExC_flags & PMf_MULTILINE)))) {
cf93c79d
IZ
3067 int t;
3068
a0ed51b3 3069 if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
aca2d497
IZ
3070 && data.offset_fixed == data.offset_float_min
3071 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
3072 goto remove_float; /* As in (a)+. */
3073
33b8afdf
JH
3074 if (SvUTF8(data.longest_float)) {
3075 r->float_utf8 = data.longest_float;
3076 r->float_substr = Nullsv;
3077 } else {
3078 r->float_substr = data.longest_float;
3079 r->float_utf8 = Nullsv;
3080 }
c277df42
IZ
3081 r->float_min_offset = data.offset_float_min;
3082 r->float_max_offset = data.offset_float_max;
cf93c79d
IZ
3083 t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
3084 && (!(data.flags & SF_FL_BEFORE_MEOL)
e2509266 3085 || (RExC_flags & PMf_MULTILINE)));
33b8afdf 3086 fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
a0ed51b3
LW
3087 }
3088 else {
aca2d497 3089 remove_float:
33b8afdf 3090 r->float_substr = r->float_utf8 = Nullsv;
c277df42 3091 SvREFCNT_dec(data.longest_float);
c5254dd6 3092 longest_float_length = 0;
a0d0e21e 3093 }
c277df42 3094
a0ed51b3 3095 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
c5254dd6 3096 if (longest_fixed_length
c277df42
IZ
3097 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
3098 && (!(data.flags & SF_FIX_BEFORE_MEOL)
e2509266 3099 || (RExC_flags & PMf_MULTILINE)))) {
cf93c79d
IZ
3100 int t;
3101
33b8afdf
JH
3102 if (SvUTF8(data.longest_fixed)) {
3103 r->anchored_utf8 = data.longest_fixed;
3104 r->anchored_substr = Nullsv;
3105 } else {
3106 r->anchored_substr = data.longest_fixed;
3107 r->anchored_utf8 = Nullsv;
3108 }
c277df42 3109 r->anchored_offset = data.offset_fixed;
cf93c79d
IZ
3110 t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
3111 && (!(data.flags & SF_FIX_BEFORE_MEOL)
e2509266 3112 || (RExC_flags & PMf_MULTILINE)));
33b8afdf 3113 fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
a0ed51b3
LW
3114 }
3115 else {
33b8afdf 3116 r->anchored_substr = r->anchored_utf8 = Nullsv;
c277df42 3117 SvREFCNT_dec(data.longest_fixed);
c5254dd6 3118 longest_fixed_length = 0;
a0d0e21e 3119 }
b81d288d 3120 if (r->regstclass
ffc61ed2 3121 && (OP(r->regstclass) == REG_ANY || OP(r->regstclass) == SANY))
653099ff 3122 r->regstclass = NULL;
33b8afdf
JH
3123 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
3124 && stclass_flag
653099ff 3125 && !(data.start_class->flags & ANYOF_EOS)
eb160463
GS
3126 && !cl_is_anything(data.start_class))
3127 {
830247a4 3128 I32 n = add_data(pRExC_state, 1, "f");
653099ff 3129
b81d288d 3130 New(1006, RExC_rx->data->data[n], 1,
653099ff
GS
3131 struct regnode_charclass_class);
3132 StructCopy(data.start_class,
830247a4 3133 (struct regnode_charclass_class*)RExC_rx->data->data[n],
653099ff 3134 struct regnode_charclass_class);
830247a4 3135 r->regstclass = (regnode*)RExC_rx->data->data[n];
653099ff 3136 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
ffc61ed2 3137 PL_regdata = r->data; /* for regprop() */
a3621e74 3138 DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
9c5ffd7c
JH
3139 regprop(sv, (regnode*)data.start_class);
3140 PerlIO_printf(Perl_debug_log,
3141 "synthetic stclass `%s'.\n",
3142 SvPVX(sv));});
653099ff 3143 }
c277df42
IZ
3144
3145 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
c5254dd6 3146 if (longest_fixed_length > longest_float_length) {
c277df42 3147 r->check_substr = r->anchored_substr;
33b8afdf 3148 r->check_utf8 = r->anchored_utf8;
c277df42
IZ
3149 r->check_offset_min = r->check_offset_max = r->anchored_offset;
3150 if (r->reganch & ROPT_ANCH_SINGLE)
3151 r->reganch |= ROPT_NOSCAN;
a0ed51b3
LW
3152 }
3153 else {
c277df42 3154 r->check_substr = r->float_substr;
33b8afdf 3155 r->check_utf8 = r->float_utf8;
c277df42
IZ
3156 r->check_offset_min = data.offset_float_min;
3157 r->check_offset_max = data.offset_float_max;
a0d0e21e 3158 }
30382c73
IZ
3159 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
3160 This should be changed ASAP! */
33b8afdf 3161 if ((r->check_substr || r->check_utf8) && !(r->reganch & ROPT_ANCH_GPOS)) {
cad2e5aa 3162 r->reganch |= RE_USE_INTUIT;
33b8afdf 3163 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
cad2e5aa
JH
3164 r->reganch |= RE_INTUIT_TAIL;
3165 }
a0ed51b3
LW
3166 }
3167 else {
c277df42
IZ
3168 /* Several toplevels. Best we can is to set minlen. */
3169 I32 fake;
653099ff 3170 struct regnode_charclass_class ch_class;
cb434fcc 3171 I32 last_close = 0;
c277df42 3172
a3621e74 3173 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "\n"));
c277df42 3174 scan = r->program + 1;
830247a4 3175 cl_init(pRExC_state, &ch_class);
653099ff 3176 data.start_class = &ch_class;
cb434fcc 3177 data.last_closep = &last_close;
a3621e74 3178 minlen = study_chunk(pRExC_state, &scan, &fake, scan + RExC_size, &data, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
33b8afdf
JH
3179 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
3180 = r->float_substr = r->float_utf8 = Nullsv;
653099ff 3181 if (!(data.start_class->flags & ANYOF_EOS)
eb160463
GS
3182 && !cl_is_anything(data.start_class))
3183 {
830247a4 3184 I32 n = add_data(pRExC_state, 1, "f");
653099ff 3185
b81d288d 3186 New(1006, RExC_rx->data->data[n], 1,
653099ff
GS
3187 struct regnode_charclass_class);
3188 StructCopy(data.start_class,
830247a4 3189 (struct regnode_charclass_class*)RExC_rx->data->data[n],
653099ff 3190 struct regnode_charclass_class);
830247a4 3191 r->regstclass = (regnode*)RExC_rx->data->data[n];
653099ff 3192 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
a3621e74 3193 DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
9c5ffd7c
JH
3194 regprop(sv, (regnode*)data.start_class);
3195 PerlIO_printf(Perl_debug_log,
3196 "synthetic stclass `%s'.\n",
3197 SvPVX(sv));});
653099ff 3198 }
a0d0e21e
LW
3199 }
3200
a0d0e21e 3201 r->minlen = minlen;
b81d288d 3202 if (RExC_seen & REG_SEEN_GPOS)
c277df42 3203 r->reganch |= ROPT_GPOS_SEEN;
830247a4 3204 if (RExC_seen & REG_SEEN_LOOKBEHIND)
c277df42 3205 r->reganch |= ROPT_LOOKBEHIND_SEEN;
830247a4 3206 if (RExC_seen & REG_SEEN_EVAL)
ce862d02 3207 r->reganch |= ROPT_EVAL_SEEN;
f33976b4
DB
3208 if (RExC_seen & REG_SEEN_CANY)
3209 r->reganch |= ROPT_CANY_SEEN;
830247a4
IZ
3210 Newz(1002, r->startp, RExC_npar, I32);
3211 Newz(1002, r->endp, RExC_npar, I32);
ffc61ed2 3212 PL_regdata = r->data; /* for regprop() */
a3621e74 3213 DEBUG_COMPILE_r(regdump(r));
a0d0e21e 3214 return(r);
a687059c
LW
3215}
3216
3217/*
3218 - reg - regular expression, i.e. main body or parenthesized thing
3219 *
3220 * Caller must absorb opening parenthesis.
3221 *
3222 * Combining parenthesis handling with the base level of regular expression
3223 * is a trifle forced, but the need to tie the tails of the branches to what
3224 * follows makes it hard to avoid.
3225 */
76e3520e 3226STATIC regnode *
830247a4 3227S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp)
c277df42 3228 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
a687059c 3229{
c277df42
IZ
3230 register regnode *ret; /* Will be the head of the group. */
3231 register regnode *br;
3232 register regnode *lastbr;
3233 register regnode *ender = 0;
a0d0e21e 3234 register I32 parno = 0;
e2509266 3235 I32 flags, oregflags = RExC_flags, have_branch = 0, open = 0;
9d1d55b5
JP
3236
3237 /* for (?g), (?gc), and (?o) warnings; warning
3238 about (?c) will warn about (?g) -- japhy */
3239
3240 I32 wastedflags = 0x00,
3241 wasted_o = 0x01,
3242 wasted_g = 0x02,
3243 wasted_gc = 0x02 | 0x04,
3244 wasted_c = 0x04;
3245
fac92740 3246 char * parse_start = RExC_parse; /* MJD */
830247a4 3247 char *oregcomp_parse = RExC_parse;
c277df42 3248 char c;
a0d0e21e 3249
821b33a5 3250 *flagp = 0; /* Tentatively. */
a0d0e21e 3251
9d1d55b5 3252
a0d0e21e
LW
3253 /* Make an OPEN node, if parenthesized. */
3254 if (paren) {
fac92740 3255 if (*RExC_parse == '?') { /* (?...) */
2b36a5a0
JH
3256 U32 posflags = 0, negflags = 0;
3257 U32 *flagsp = &posflags;
0f5d15d6 3258 int logical = 0;
830247a4 3259 char *seqstart = RExC_parse;
ca9dfc88 3260
830247a4
IZ
3261 RExC_parse++;
3262 paren = *RExC_parse++;
c277df42 3263 ret = NULL; /* For look-ahead/behind. */
a0d0e21e 3264 switch (paren) {
fac92740 3265 case '<': /* (?<...) */
830247a4 3266 RExC_seen |= REG_SEEN_LOOKBEHIND;
b81d288d 3267 if (*RExC_parse == '!')
c277df42 3268 paren = ',';
b81d288d 3269 if (*RExC_parse != '=' && *RExC_parse != '!')
c277df42 3270 goto unknown;
830247a4 3271 RExC_parse++;
fac92740
MJD
3272 case '=': /* (?=...) */
3273 case '!': /* (?!...) */
830247a4 3274 RExC_seen_zerolen++;
fac92740
MJD
3275 case ':': /* (?:...) */
3276 case '>': /* (?>...) */
a0d0e21e 3277 break;
fac92740
MJD
3278 case '$': /* (?$...) */
3279 case '@': /* (?@...) */
8615cb43 3280 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
a0d0e21e 3281 break;
fac92740 3282 case '#': /* (?#...) */
830247a4
IZ
3283 while (*RExC_parse && *RExC_parse != ')')
3284 RExC_parse++;
3285 if (*RExC_parse != ')')
c277df42 3286 FAIL("Sequence (?#... not terminated");
830247a4 3287 nextchar(pRExC_state);
a0d0e21e
LW
3288 *flagp = TRYAGAIN;
3289 return NULL;
fac92740 3290 case 'p': /* (?p...) */
9014280d 3291 if (SIZE_ONLY && ckWARN2(WARN_DEPRECATED, WARN_REGEXP))
f9373011 3292 vWARNdep(RExC_parse, "(?p{}) is deprecated - use (??{})");
8c8ad484 3293 /* FALL THROUGH*/
fac92740 3294 case '?': /* (??...) */
0f5d15d6 3295 logical = 1;
438a3801
YST
3296 if (*RExC_parse != '{')
3297 goto unknown;
830247a4 3298 paren = *RExC_parse++;
0f5d15d6 3299 /* FALL THROUGH */
fac92740 3300 case '{': /* (?{...}) */
c277df42 3301 {
c277df42
IZ
3302 I32 count = 1, n = 0;
3303 char c;
830247a4 3304 char *s = RExC_parse;
c277df42
IZ
3305 SV *sv;
3306 OP_4tree *sop, *rop;
3307
830247a4
IZ
3308 RExC_seen_zerolen++;
3309 RExC_seen |= REG_SEEN_EVAL;
3310 while (count && (c = *RExC_parse)) {
3311 if (c == '\\' && RExC_parse[1])
3312 RExC_parse++;
b81d288d 3313 else if (c == '{')
c277df42 3314 count++;
b81d288d 3315 else if (c == '}')
c277df42 3316 count--;
830247a4 3317 RExC_parse++;
c277df42 3318 }
830247a4 3319 if (*RExC_parse != ')')
b45f050a 3320 {
b81d288d 3321 RExC_parse = s;
b45f050a
JF
3322 vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
3323 }
c277df42 3324 if (!SIZE_ONLY) {
f3548bdc 3325 PAD *pad;
b81d288d
AB
3326
3327 if (RExC_parse - 1 - s)
830247a4 3328 sv = newSVpvn(s, RExC_parse - 1 - s);
c277df42 3329 else
79cb57f6 3330 sv = newSVpvn("", 0);
c277df42 3331
569233ed
SB
3332 ENTER;
3333 Perl_save_re_context(aTHX);
f3548bdc 3334 rop = sv_compile_2op(sv, &sop, "re", &pad);
9b978d73
DM
3335 sop->op_private |= OPpREFCOUNTED;
3336 /* re_dup will OpREFCNT_inc */
3337 OpREFCNT_set(sop, 1);
569233ed 3338 LEAVE;
c277df42 3339
830247a4
IZ
3340 n = add_data(pRExC_state, 3, "nop");
3341 RExC_rx->data->data[n] = (void*)rop;
3342 RExC_rx->data->data[n+1] = (void*)sop;
f3548bdc 3343 RExC_rx->data->data[n+2] = (void*)pad;
c277df42 3344 SvREFCNT_dec(sv);
a0ed51b3 3345 }
e24b16f9 3346 else { /* First pass */
830247a4 3347 if (PL_reginterp_cnt < ++RExC_seen_evals
923e4eb5 3348 && IN_PERL_RUNTIME)
2cd61cdb
IZ
3349 /* No compiled RE interpolated, has runtime
3350 components ===> unsafe. */
3351 FAIL("Eval-group not allowed at runtime, use re 'eval'");
5b61d3f7 3352 if (PL_tainting && PL_tainted)
cc6b7395 3353 FAIL("Eval-group in insecure regular expression");
923e4eb5 3354 if (IN_PERL_COMPILETIME)
b5c19bd7 3355 PL_cv_has_eval = 1;
c277df42 3356 }
b5c19bd7 3357
830247a4 3358 nextchar(pRExC_state);
0f5d15d6 3359 if (logical) {
830247a4 3360 ret = reg_node(pRExC_state, LOGICAL);
0f5d15d6
IZ
3361 if (!SIZE_ONLY)
3362 ret->flags = 2;
830247a4 3363 regtail(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
fac92740 3364 /* deal with the length of this later - MJD */
0f5d15d6
IZ
3365 return ret;
3366 }
ccb2c380
MP
3367 ret = reganode(pRExC_state, EVAL, n);
3368 Set_Node_Length(ret, RExC_parse - parse_start + 1);
3369 Set_Node_Offset(ret, parse_start);
3370 return ret;
c277df42 3371 }
fac92740 3372 case '(': /* (?(?{...})...) and (?(?=...)...) */
c277df42 3373 {
fac92740 3374 if (RExC_parse[0] == '?') { /* (?(?...)) */
b81d288d
AB
3375 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
3376 || RExC_parse[1] == '<'
830247a4 3377 || RExC_parse[1] == '{') { /* Lookahead or eval. */
c277df42
IZ
3378 I32 flag;
3379
830247a4 3380 ret = reg_node(pRExC_state, LOGICAL);
0f5d15d6
IZ
3381 if (!SIZE_ONLY)
3382 ret->flags = 1;
830247a4 3383 regtail(pRExC_state, ret, reg(pRExC_state, 1, &flag));
c277df42 3384 goto insert_if;
b81d288d 3385 }
a0ed51b3 3386 }
830247a4 3387 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
fac92740 3388 /* (?(1)...) */
830247a4 3389 parno = atoi(RExC_parse++);
c277df42 3390
830247a4
IZ
3391 while (isDIGIT(*RExC_parse))
3392 RExC_parse++;
fac92740
MJD
3393 ret = reganode(pRExC_state, GROUPP, parno);
3394
830247a4 3395 if ((c = *nextchar(pRExC_state)) != ')')
b45f050a 3396 vFAIL("Switch condition not recognized");
c277df42 3397 insert_if:
830247a4
IZ
3398 regtail(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
3399 br = regbranch(pRExC_state, &flags, 1);
c277df42 3400 if (br == NULL)
830247a4 3401 br = reganode(pRExC_state, LONGJMP, 0);
c277df42 3402 else
830247a4
IZ
3403 regtail(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
3404 c = *nextchar(pRExC_state);
d1b80229
IZ
3405 if (flags&HASWIDTH)
3406 *flagp |= HASWIDTH;
c277df42 3407 if (c == '|') {
830247a4
IZ
3408 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
3409 regbranch(pRExC_state, &flags, 1);
3410 regtail(pRExC_state, ret, lastbr);
d1b80229
IZ
3411 if (flags&HASWIDTH)
3412 *flagp |= HASWIDTH;
830247a4 3413 c = *nextchar(pRExC_state);
a0ed51b3
LW
3414 }
3415 else
c277df42
IZ
3416 lastbr = NULL;
3417 if (c != ')')
8615cb43 3418 vFAIL("Switch (?(condition)... contains too many branches");
830247a4
IZ
3419 ender = reg_node(pRExC_state, TAIL);
3420 regtail(pRExC_state, br, ender);
c277df42 3421 if (lastbr) {
830247a4
IZ
3422 regtail(pRExC_state, lastbr, ender);
3423 regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
a0ed51b3
LW
3424 }
3425 else
830247a4 3426 regtail(pRExC_state, ret, ender);
c277df42 3427 return ret;
a0ed51b3
LW
3428 }
3429 else {
830247a4 3430 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
c277df42
IZ
3431 }
3432 }
1b1626e4 3433 case 0:
830247a4 3434 RExC_parse--; /* for vFAIL to print correctly */
8615cb43 3435 vFAIL("Sequence (? incomplete");
1b1626e4 3436 break;
a0d0e21e 3437 default:
830247a4 3438 --RExC_parse;
fac92740 3439 parse_flags: /* (?i) */
830247a4 3440 while (*RExC_parse && strchr("iogcmsx", *RExC_parse)) {
9d1d55b5
JP
3441 /* (?g), (?gc) and (?o) are useless here
3442 and must be globally applied -- japhy */
3443
3444 if (*RExC_parse == 'o' || *RExC_parse == 'g') {
3445 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
3446 I32 wflagbit = *RExC_parse == 'o' ? wasted_o : wasted_g;
3447 if (! (wastedflags & wflagbit) ) {
3448 wastedflags |= wflagbit;
3449 vWARN5(
3450 RExC_parse + 1,
3451 "Useless (%s%c) - %suse /%c modifier",
3452 flagsp == &negflags ? "?-" : "?",
3453 *RExC_parse,
3454 flagsp == &negflags ? "don't " : "",
3455 *RExC_parse
3456 );
3457 }
3458 }
3459 }
3460 else if (*RExC_parse == 'c') {
3461 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
3462 if (! (wastedflags & wasted_c) ) {
3463 wastedflags |= wasted_gc;
3464 vWARN3(
3465 RExC_parse + 1,
3466 "Useless (%sc) - %suse /gc modifier",
3467 flagsp == &negflags ? "?-" : "?",
3468 flagsp == &negflags ? "don't " : ""
3469 );
3470 }
3471 }
3472 }
3473 else { pmflag(flagsp, *RExC_parse); }
3474
830247a4 3475 ++RExC_parse;
ca9dfc88 3476 }
830247a4 3477 if (*RExC_parse == '-') {
ca9dfc88 3478 flagsp = &negflags;
9d1d55b5 3479 wastedflags = 0; /* reset so (?g-c) warns twice */
830247a4 3480 ++RExC_parse;
ca9dfc88 3481 goto parse_flags;
48c036b1 3482 }
e2509266
JH
3483 RExC_flags |= posflags;
3484 RExC_flags &= ~negflags;
830247a4
IZ
3485 if (*RExC_parse == ':') {
3486 RExC_parse++;
ca9dfc88
IZ
3487 paren = ':';
3488 break;
3489 }
c277df42 3490 unknown:
830247a4
IZ
3491 if (*RExC_parse != ')') {
3492 RExC_parse++;
3493 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
b45f050a 3494 }
830247a4 3495 nextchar(pRExC_state);
a0d0e21e
LW
3496 *flagp = TRYAGAIN;
3497 return NULL;
3498 }
3499 }
fac92740 3500 else { /* (...) */
830247a4
IZ
3501 parno = RExC_npar;
3502 RExC_npar++;
3503 ret = reganode(pRExC_state, OPEN, parno);
fac92740
MJD
3504 Set_Node_Length(ret, 1); /* MJD */
3505 Set_Node_Offset(ret, RExC_parse); /* MJD */
c277df42 3506 open = 1;
a0d0e21e 3507 }
a0ed51b3 3508 }
fac92740 3509 else /* ! paren */
a0d0e21e
LW
3510 ret = NULL;
3511
3512 /* Pick up the branches, linking them together. */
fac92740 3513 parse_start = RExC_parse; /* MJD */
830247a4 3514 br = regbranch(pRExC_state, &flags, 1);
fac92740
MJD
3515 /* branch_len = (paren != 0); */
3516
a0d0e21e
LW
3517 if (br == NULL)
3518 return(NULL);
830247a4
IZ
3519 if (*RExC_parse == '|') {
3520 if (!SIZE_ONLY && RExC_extralen) {
3521 reginsert(pRExC_state, BRANCHJ, br);
a0ed51b3 3522 }
fac92740 3523 else { /* MJD */
830247a4 3524 reginsert(pRExC_state, BRANCH, br);
fac92740
MJD
3525 Set_Node_Length(br, paren != 0);
3526 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
3527 }
c277df42
IZ
3528 have_branch = 1;
3529 if (SIZE_ONLY)
830247a4 3530 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
a0ed51b3
LW
3531 }
3532 else if (paren == ':') {
c277df42
IZ
3533 *flagp |= flags&SIMPLE;
3534 }
3535 if (open) { /* Starts with OPEN. */
830247a4 3536 regtail(pRExC_state, ret, br); /* OPEN -> first. */
a0ed51b3
LW
3537 }
3538 else if (paren != '?') /* Not Conditional */
a0d0e21e 3539 ret = br;
32a0ca98 3540 *flagp |= flags & (SPSTART | HASWIDTH);
c277df42 3541 lastbr = br;
830247a4
IZ
3542 while (*RExC_parse == '|') {
3543 if (!SIZE_ONLY && RExC_extralen) {
3544 ender = reganode(pRExC_state, LONGJMP,0);
3545 regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
c277df42
IZ
3546 }
3547 if (SIZE_ONLY)
830247a4
IZ
3548 RExC_extralen += 2; /* Account for LONGJMP. */
3549 nextchar(pRExC_state);
3550 br = regbranch(pRExC_state, &flags, 0);
fac92740 3551
a687059c 3552 if (br == NULL)
a0d0e21e 3553 return(NULL);
830247a4 3554 regtail(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
c277df42 3555 lastbr = br;
821b33a5
IZ
3556 if (flags&HASWIDTH)
3557 *flagp |= HASWIDTH;
a687059c 3558 *flagp |= flags&SPSTART;
a0d0e21e
LW
3559 }
3560
c277df42
IZ
3561 if (have_branch || paren != ':') {
3562 /* Make a closing node, and hook it on the end. */
3563 switch (paren) {
3564 case ':':
830247a4 3565 ender = reg_node(pRExC_state, TAIL);
c277df42
IZ
3566 break;
3567 case 1:
830247a4 3568 ender = reganode(pRExC_state, CLOSE, parno);
fac92740
MJD
3569 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
3570 Set_Node_Length(ender,1); /* MJD */
c277df42
IZ
3571 break;
3572 case '<':
c277df42
IZ
3573 case ',':
3574 case '=':
3575 case '!':
c277df42 3576 *flagp &= ~HASWIDTH;
821b33a5
IZ
3577 /* FALL THROUGH */
3578 case '>':
830247a4 3579 ender = reg_node(pRExC_state, SUCCEED);
c277df42
IZ
3580 break;
3581 case 0:
830247a4 3582 ender = reg_node(pRExC_state, END);
c277df42
IZ
3583 break;
3584 }
830247a4 3585 regtail(pRExC_state, lastbr, ender);
a0d0e21e 3586
c277df42
IZ
3587 if (have_branch) {
3588 /* Hook the tails of the branches to the closing node. */
3589 for (br = ret; br != NULL; br = regnext(br)) {
830247a4 3590 regoptail(pRExC_state, br, ender);
c277df42
IZ
3591 }
3592 }
a0d0e21e 3593 }
c277df42
IZ
3594
3595 {
e1ec3a88
AL
3596 const char *p;
3597 static const char parens[] = "=!<,>";
c277df42
IZ
3598
3599 if (paren && (p = strchr(parens, paren))) {
eb160463 3600 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
c277df42
IZ
3601 int flag = (p - parens) > 1;
3602
3603 if (paren == '>')
3604 node = SUSPEND, flag = 0;
830247a4 3605 reginsert(pRExC_state, node,ret);
45948336
EP
3606 Set_Node_Cur_Length(ret);
3607 Set_Node_Offset(ret, parse_start + 1);
c277df42 3608 ret->flags = flag;
830247a4 3609 regtail(pRExC_state, ret, reg_node(pRExC_state, TAIL));
c277df42 3610 }
a0d0e21e
LW
3611 }
3612
3613 /* Check for proper termination. */
ce3e6498 3614 if (paren) {
e2509266 3615 RExC_flags = oregflags;
830247a4
IZ
3616 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
3617 RExC_parse = oregcomp_parse;
380a0633 3618 vFAIL("Unmatched (");
ce3e6498 3619 }
a0ed51b3 3620 }
830247a4
IZ
3621 else if (!paren && RExC_parse < RExC_end) {
3622 if (*RExC_parse == ')') {
3623 RExC_parse++;
380a0633 3624 vFAIL("Unmatched )");
a0ed51b3
LW
3625 }
3626 else
b45f050a 3627 FAIL("Junk on end of regexp"); /* "Can't happen". */
a0d0e21e
LW
3628 /* NOTREACHED */
3629 }
a687059c 3630
a0d0e21e 3631 return(ret);
a687059c
LW
3632}
3633
3634/*
3635 - regbranch - one alternative of an | operator
3636 *
3637 * Implements the concatenation operator.
3638 */
76e3520e 3639STATIC regnode *
830247a4 3640S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first)
a687059c 3641{
c277df42
IZ
3642 register regnode *ret;
3643 register regnode *chain = NULL;
3644 register regnode *latest;
3645 I32 flags = 0, c = 0;
a0d0e21e 3646
b81d288d 3647 if (first)
c277df42
IZ
3648 ret = NULL;
3649 else {
b81d288d 3650 if (!SIZE_ONLY && RExC_extralen)
830247a4 3651 ret = reganode(pRExC_state, BRANCHJ,0);
fac92740 3652 else {
830247a4 3653 ret = reg_node(pRExC_state, BRANCH);
fac92740
MJD
3654 Set_Node_Length(ret, 1);
3655 }
c277df42
IZ
3656 }
3657
b81d288d 3658 if (!first && SIZE_ONLY)
830247a4 3659 RExC_extralen += 1; /* BRANCHJ */
b81d288d 3660
c277df42 3661 *flagp = WORST; /* Tentatively. */
a0d0e21e 3662
830247a4
IZ
3663 RExC_parse--;
3664 nextchar(pRExC_state);
3665 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
a0d0e21e 3666 flags &= ~TRYAGAIN;
830247a4 3667 latest = regpiece(pRExC_state, &flags);
a0d0e21e
LW
3668 if (latest == NULL) {
3669 if (flags & TRYAGAIN)
3670 continue;
3671 return(NULL);
a0ed51b3
LW
3672 }
3673 else if (ret == NULL)
c277df42 3674 ret = latest;
a0d0e21e 3675 *flagp |= flags&HASWIDTH;
c277df42 3676 if (chain == NULL) /* First piece. */
a0d0e21e
LW
3677 *flagp |= flags&SPSTART;
3678 else {
830247a4
IZ
3679 RExC_naughty++;
3680 regtail(pRExC_state, chain, latest);
a687059c 3681 }
a0d0e21e 3682 chain = latest;
c277df42
IZ
3683 c++;
3684 }
3685 if (chain == NULL) { /* Loop ran zero times. */
830247a4 3686 chain = reg_node(pRExC_state, NOTHING);
c277df42
IZ
3687 if (ret == NULL)
3688 ret = chain;
3689 }
3690 if (c == 1) {
3691 *flagp |= flags&SIMPLE;
a0d0e21e 3692 }
a687059c 3693
a0d0e21e 3694 return(ret);
a687059c
LW
3695}
3696
3697/*
3698 - regpiece - something followed by possible [*+?]
3699 *
3700 * Note that the branching code sequences used for ? and the general cases
3701 * of * and + are somewhat optimized: they use the same NOTHING node as
3702 * both the endmarker for their branch list and the body of the last branch.
3703 * It might seem that this node could be dispensed with entirely, but the
3704 * endmarker role is not redundant.
3705 */
76e3520e 3706STATIC regnode *
830247a4 3707S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
a687059c 3708{
c277df42 3709 register regnode *ret;
a0d0e21e
LW
3710 register char op;
3711 register char *next;
3712 I32 flags;
830247a4 3713 char *origparse = RExC_parse;
a0d0e21e
LW
3714 char *maxpos;
3715 I32 min;
c277df42 3716 I32 max = REG_INFTY;
fac92740 3717 char *parse_start;
a0d0e21e 3718
830247a4 3719 ret = regatom(pRExC_state, &flags);
a0d0e21e
LW
3720 if (ret == NULL) {
3721 if (flags & TRYAGAIN)
3722 *flagp |= TRYAGAIN;
3723 return(NULL);
3724 }
3725
830247a4 3726 op = *RExC_parse;
a0d0e21e 3727
830247a4 3728 if (op == '{' && regcurly(RExC_parse)) {
fac92740 3729 parse_start = RExC_parse; /* MJD */
830247a4 3730 next = RExC_parse + 1;
a0d0e21e
LW
3731 maxpos = Nullch;
3732 while (isDIGIT(*next) || *next == ',') {
3733 if (*next == ',') {
3734 if (maxpos)
3735 break;
3736 else
3737 maxpos = next;
a687059c 3738 }
a0d0e21e
LW
3739 next++;
3740 }
3741 if (*next == '}') { /* got one */
3742 if (!maxpos)
3743 maxpos = next;
830247a4
IZ
3744 RExC_parse++;
3745 min = atoi(RExC_parse);
a0d0e21e
LW
3746 if (*maxpos == ',')
3747 maxpos++;
3748 else
830247a4 3749 maxpos = RExC_parse;
a0d0e21e
LW
3750 max = atoi(maxpos);
3751 if (!max && *maxpos != '0')
c277df42
IZ
3752 max = REG_INFTY; /* meaning "infinity" */
3753 else if (max >= REG_INFTY)
8615cb43 3754 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
830247a4
IZ
3755 RExC_parse = next;
3756 nextchar(pRExC_state);
a0d0e21e
LW
3757
3758 do_curly:
3759 if ((flags&SIMPLE)) {
830247a4
IZ
3760 RExC_naughty += 2 + RExC_naughty / 2;
3761 reginsert(pRExC_state, CURLY, ret);
fac92740
MJD
3762 Set_Node_Offset(ret, parse_start+1); /* MJD */
3763 Set_Node_Cur_Length(ret);
a0d0e21e
LW
3764 }
3765 else {
830247a4 3766 regnode *w = reg_node(pRExC_state, WHILEM);
2c2d71f5
JH
3767
3768 w->flags = 0;
830247a4
IZ
3769 regtail(pRExC_state, ret, w);
3770 if (!SIZE_ONLY && RExC_extralen) {
3771 reginsert(pRExC_state, LONGJMP,ret);
3772 reginsert(pRExC_state, NOTHING,ret);
c277df42
IZ
3773 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
3774 }
830247a4 3775 reginsert(pRExC_state, CURLYX,ret);
fac92740
MJD
3776 /* MJD hk */
3777 Set_Node_Offset(ret, parse_start+1);
3778 Set_Node_Length(ret,
3779 op == '{' ? (RExC_parse - parse_start) : 1);
3780
830247a4 3781 if (!SIZE_ONLY && RExC_extralen)
c277df42 3782 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
830247a4 3783 regtail(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
c277df42 3784 if (SIZE_ONLY)
830247a4
IZ
3785 RExC_whilem_seen++, RExC_extralen += 3;
3786 RExC_naughty += 4 + RExC_naughty; /* compound interest */
a0d0e21e 3787 }
c277df42 3788 ret->flags = 0;
a0d0e21e
LW
3789
3790 if (min > 0)
821b33a5
IZ
3791 *flagp = WORST;
3792 if (max > 0)
3793 *flagp |= HASWIDTH;
a0d0e21e 3794 if (max && max < min)
8615cb43 3795 vFAIL("Can't do {n,m} with n > m");
c277df42 3796 if (!SIZE_ONLY) {
eb160463
GS
3797 ARG1_SET(ret, (U16)min);
3798 ARG2_SET(ret, (U16)max);
a687059c 3799 }
a687059c 3800
a0d0e21e 3801 goto nest_check;
a687059c 3802 }
a0d0e21e 3803 }
a687059c 3804
a0d0e21e
LW
3805 if (!ISMULT1(op)) {
3806 *flagp = flags;
a687059c 3807 return(ret);
a0d0e21e 3808 }
bb20fd44 3809
c277df42 3810#if 0 /* Now runtime fix should be reliable. */
b45f050a
JF
3811
3812 /* if this is reinstated, don't forget to put this back into perldiag:
3813
3814 =item Regexp *+ operand could be empty at {#} in regex m/%s/
3815
3816 (F) The part of the regexp subject to either the * or + quantifier
3817 could match an empty string. The {#} shows in the regular
3818 expression about where the problem was discovered.
3819
3820 */
3821
bb20fd44 3822 if (!(flags&HASWIDTH) && op != '?')
b45f050a 3823 vFAIL("Regexp *+ operand could be empty");
b81d288d 3824#endif
bb20fd44 3825
fac92740 3826 parse_start = RExC_parse;
830247a4 3827 nextchar(pRExC_state);
a0d0e21e 3828
821b33a5 3829 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
a0d0e21e
LW
3830
3831 if (op == '*' && (flags&SIMPLE)) {
830247a4 3832 reginsert(pRExC_state, STAR, ret);
c277df42 3833 ret->flags = 0;
830247a4 3834 RExC_naughty += 4;
a0d0e21e
LW
3835 }
3836 else if (op == '*') {
3837 min = 0;
3838 goto do_curly;
a0ed51b3
LW
3839 }
3840 else if (op == '+' && (flags&SIMPLE)) {
830247a4 3841 reginsert(pRExC_state, PLUS, ret);
c277df42 3842 ret->flags = 0;
830247a4 3843 RExC_naughty += 3;
a0d0e21e
LW
3844 }
3845 else if (op == '+') {
3846 min = 1;
3847 goto do_curly;
a0ed51b3
LW
3848 }
3849 else if (op == '?') {
a0d0e21e
LW
3850 min = 0; max = 1;
3851 goto do_curly;
3852 }
3853 nest_check:
e476b1b5 3854 if (ckWARN(WARN_REGEXP) && !SIZE_ONLY && !(flags&HASWIDTH) && max > REG_INFTY/3) {
830247a4 3855 vWARN3(RExC_parse,
b45f050a 3856 "%.*s matches null string many times",
830247a4 3857 RExC_parse - origparse,
b45f050a 3858 origparse);
a0d0e21e
LW
3859 }
3860
830247a4
IZ
3861 if (*RExC_parse == '?') {
3862 nextchar(pRExC_state);
3863 reginsert(pRExC_state, MINMOD, ret);
3864 regtail(pRExC_state, ret, ret + NODE_STEP_REGNODE);
a0d0e21e 3865 }
830247a4
IZ
3866 if (ISMULT2(RExC_parse)) {
3867 RExC_parse++;
b45f050a
JF
3868 vFAIL("Nested quantifiers");
3869 }
a0d0e21e
LW
3870
3871 return(ret);
a687059c
LW
3872}
3873
3874/*
3875 - regatom - the lowest level
3876 *
3877 * Optimization: gobbles an entire sequence of ordinary characters so that
3878 * it can turn them into a single node, which is smaller to store and
3879 * faster to run. Backslashed characters are exceptions, each becoming a
3880 * separate node; the code is simpler that way and it's not worth fixing.
3881 *
b45f050a 3882 * [Yes, it is worth fixing, some scripts can run twice the speed.] */
76e3520e 3883STATIC regnode *
830247a4 3884S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
a687059c 3885{
c277df42 3886 register regnode *ret = 0;
a0d0e21e 3887 I32 flags;
45948336 3888 char *parse_start = RExC_parse;
a0d0e21e
LW
3889
3890 *flagp = WORST; /* Tentatively. */
3891
3892tryagain:
830247a4 3893 switch (*RExC_parse) {
a0d0e21e 3894 case '^':
830247a4
IZ
3895 RExC_seen_zerolen++;
3896 nextchar(pRExC_state);
e2509266 3897 if (RExC_flags & PMf_MULTILINE)
830247a4 3898 ret = reg_node(pRExC_state, MBOL);
e2509266 3899 else if (RExC_flags & PMf_SINGLELINE)
830247a4 3900 ret = reg_node(pRExC_state, SBOL);
a0d0e21e 3901 else
830247a4 3902 ret = reg_node(pRExC_state, BOL);
fac92740 3903 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
3904 break;
3905 case '$':
830247a4 3906 nextchar(pRExC_state);
b81d288d 3907 if (*RExC_parse)
830247a4 3908 RExC_seen_zerolen++;
e2509266 3909 if (RExC_flags & PMf_MULTILINE)
830247a4 3910 ret = reg_node(pRExC_state, MEOL);
e2509266 3911 else if (RExC_flags & PMf_SINGLELINE)
830247a4 3912 ret = reg_node(pRExC_state, SEOL);
a0d0e21e 3913 else
830247a4 3914 ret = reg_node(pRExC_state, EOL);
fac92740 3915 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
3916 break;
3917 case '.':
830247a4 3918 nextchar(pRExC_state);
e2509266 3919 if (RExC_flags & PMf_SINGLELINE)
ffc61ed2
JH
3920 ret = reg_node(pRExC_state, SANY);
3921 else
3922 ret = reg_node(pRExC_state, REG_ANY);
3923 *flagp |= HASWIDTH|SIMPLE;
830247a4 3924 RExC_naughty++;
fac92740 3925 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
3926 break;
3927 case '[':
b45f050a 3928 {
830247a4 3929 char *oregcomp_parse = ++RExC_parse;
ffc61ed2 3930 ret = regclass(pRExC_state);
830247a4
IZ
3931 if (*RExC_parse != ']') {
3932 RExC_parse = oregcomp_parse;
b45f050a
JF
3933 vFAIL("Unmatched [");
3934 }
830247a4 3935 nextchar(pRExC_state);
a0d0e21e 3936 *flagp |= HASWIDTH|SIMPLE;
fac92740 3937 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
a0d0e21e 3938 break;
b45f050a 3939 }
a0d0e21e 3940 case '(':
830247a4
IZ
3941 nextchar(pRExC_state);
3942 ret = reg(pRExC_state, 1, &flags);
a0d0e21e 3943 if (ret == NULL) {
bf93d4cc 3944 if (flags & TRYAGAIN) {
830247a4 3945 if (RExC_parse == RExC_end) {
bf93d4cc
GS
3946 /* Make parent create an empty node if needed. */
3947 *flagp |= TRYAGAIN;
3948 return(NULL);
3949 }
a0d0e21e 3950 goto tryagain;
bf93d4cc 3951 }
a0d0e21e
LW
3952 return(NULL);
3953 }
c277df42 3954 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE);
a0d0e21e
LW
3955 break;
3956 case '|':
3957 case ')':
3958 if (flags & TRYAGAIN) {
3959 *flagp |= TRYAGAIN;
3960 return NULL;
3961 }
b45f050a 3962 vFAIL("Internal urp");
a0d0e21e
LW
3963 /* Supposed to be caught earlier. */
3964 break;
85afd4ae 3965 case '{':
830247a4
IZ
3966 if (!regcurly(RExC_parse)) {
3967 RExC_parse++;
85afd4ae
CS
3968 goto defchar;
3969 }
3970 /* FALL THROUGH */
a0d0e21e
LW
3971 case '?':
3972 case '+':
3973 case '*':
830247a4 3974 RExC_parse++;
b45f050a 3975 vFAIL("Quantifier follows nothing");
a0d0e21e
LW
3976 break;
3977 case '\\':
830247a4 3978 switch (*++RExC_parse) {
a0d0e21e 3979 case 'A':
830247a4
IZ
3980 RExC_seen_zerolen++;
3981 ret = reg_node(pRExC_state, SBOL);
a0d0e21e 3982 *flagp |= SIMPLE;
830247a4 3983 nextchar(pRExC_state);
fac92740 3984 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
3985 break;
3986 case 'G':
830247a4
IZ
3987 ret = reg_node(pRExC_state, GPOS);
3988 RExC_seen |= REG_SEEN_GPOS;
a0d0e21e 3989 *flagp |= SIMPLE;
830247a4 3990 nextchar(pRExC_state);
fac92740 3991 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
3992 break;
3993 case 'Z':
830247a4 3994 ret = reg_node(pRExC_state, SEOL);
a0d0e21e 3995 *flagp |= SIMPLE;
a1917ab9 3996 RExC_seen_zerolen++; /* Do not optimize RE away */
830247a4 3997 nextchar(pRExC_state);
a0d0e21e 3998 break;
b85d18e9 3999 case 'z':
830247a4 4000 ret = reg_node(pRExC_state, EOS);
b85d18e9 4001 *flagp |= SIMPLE;
830247a4
IZ
4002 RExC_seen_zerolen++; /* Do not optimize RE away */
4003 nextchar(pRExC_state);
fac92740 4004 Set_Node_Length(ret, 2); /* MJD */
b85d18e9 4005 break;
4a2d328f 4006 case 'C':
f33976b4
DB
4007 ret = reg_node(pRExC_state, CANY);
4008 RExC_seen |= REG_SEEN_CANY;
a0ed51b3 4009 *flagp |= HASWIDTH|SIMPLE;
830247a4 4010 nextchar(pRExC_state);
fac92740 4011 Set_Node_Length(ret, 2); /* MJD */
a0ed51b3
LW
4012 break;
4013 case 'X':
830247a4 4014 ret = reg_node(pRExC_state, CLUMP);
a0ed51b3 4015 *flagp |= HASWIDTH;
830247a4 4016 nextchar(pRExC_state);
fac92740 4017 Set_Node_Length(ret, 2); /* MJD */
a0ed51b3 4018 break;
a0d0e21e 4019 case 'w':
eb160463 4020 ret = reg_node(pRExC_state, (U8)(LOC ? ALNUML : ALNUM));
a0d0e21e 4021 *flagp |= HASWIDTH|SIMPLE;
830247a4 4022 nextchar(pRExC_state);
fac92740 4023 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
4024 break;
4025 case 'W':
eb160463 4026 ret = reg_node(pRExC_state, (U8)(LOC ? NALNUML : NALNUM));
a0d0e21e 4027 *flagp |= HASWIDTH|SIMPLE;
830247a4 4028 nextchar(pRExC_state);
fac92740 4029 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
4030 break;
4031 case 'b':
830247a4
IZ
4032 RExC_seen_zerolen++;
4033 RExC_seen |= REG_SEEN_LOOKBEHIND;
eb160463 4034 ret = reg_node(pRExC_state, (U8)(LOC ? BOUNDL : BOUND));
a0d0e21e 4035 *flagp |= SIMPLE;
830247a4 4036 nextchar(pRExC_state);
fac92740 4037 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
4038 break;
4039 case 'B':
830247a4
IZ
4040 RExC_seen_zerolen++;
4041 RExC_seen |= REG_SEEN_LOOKBEHIND;
eb160463 4042 ret = reg_node(pRExC_state, (U8)(LOC ? NBOUNDL : NBOUND));
a0d0e21e 4043 *flagp |= SIMPLE;
830247a4 4044 nextchar(pRExC_state);
fac92740 4045 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
4046 break;
4047 case 's':
eb160463 4048 ret = reg_node(pRExC_state, (U8)(LOC ? SPACEL : SPACE));
a0d0e21e 4049 *flagp |= HASWIDTH|SIMPLE;
830247a4 4050 nextchar(pRExC_state);
fac92740 4051 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
4052 break;
4053 case 'S':
eb160463 4054 ret = reg_node(pRExC_state, (U8)(LOC ? NSPACEL : NSPACE));
a0d0e21e 4055 *flagp |= HASWIDTH|SIMPLE;
830247a4 4056 nextchar(pRExC_state);
fac92740 4057 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
4058 break;
4059 case 'd':
ffc61ed2 4060 ret = reg_node(pRExC_state, DIGIT);
a0d0e21e 4061 *flagp |= HASWIDTH|SIMPLE;
830247a4 4062 nextchar(pRExC_state);
fac92740 4063 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
4064 break;
4065 case 'D':
ffc61ed2 4066 ret = reg_node(pRExC_state, NDIGIT);
a0d0e21e 4067 *flagp |= HASWIDTH|SIMPLE;
830247a4 4068 nextchar(pRExC_state);
fac92740 4069 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e 4070 break;
a14b48bc
LW
4071 case 'p':
4072 case 'P':
3568d838 4073 {
830247a4 4074 char* oldregxend = RExC_end;
ccb2c380 4075 char* parse_start = RExC_parse - 2;
a14b48bc 4076
830247a4 4077 if (RExC_parse[1] == '{') {
3568d838 4078 /* a lovely hack--pretend we saw [\pX] instead */
830247a4
IZ
4079 RExC_end = strchr(RExC_parse, '}');
4080 if (!RExC_end) {
0da60cf5 4081 U8 c = (U8)*RExC_parse;
830247a4
IZ
4082 RExC_parse += 2;
4083 RExC_end = oldregxend;
0da60cf5 4084 vFAIL2("Missing right brace on \\%c{}", c);
b45f050a 4085 }
830247a4 4086 RExC_end++;
a14b48bc 4087 }
af6f566e 4088 else {
830247a4 4089 RExC_end = RExC_parse + 2;
af6f566e
HS
4090 if (RExC_end > oldregxend)
4091 RExC_end = oldregxend;
4092 }
830247a4 4093 RExC_parse--;
a14b48bc 4094
ffc61ed2 4095 ret = regclass(pRExC_state);
a14b48bc 4096
830247a4
IZ
4097 RExC_end = oldregxend;
4098 RExC_parse--;
ccb2c380
MP
4099
4100 Set_Node_Offset(ret, parse_start + 2);
4101 Set_Node_Cur_Length(ret);
830247a4 4102 nextchar(pRExC_state);
a14b48bc
LW
4103 *flagp |= HASWIDTH|SIMPLE;
4104 }
4105 break;
a0d0e21e
LW
4106 case 'n':
4107 case 'r':
4108 case 't':
4109 case 'f':
4110 case 'e':
4111 case 'a':
4112 case 'x':
4113 case 'c':
4114 case '0':
4115 goto defchar;
4116 case '1': case '2': case '3': case '4':
4117 case '5': case '6': case '7': case '8': case '9':
4118 {
830247a4 4119 I32 num = atoi(RExC_parse);
a0d0e21e 4120
830247a4 4121 if (num > 9 && num >= RExC_npar)
a0d0e21e
LW
4122 goto defchar;
4123 else {
fac92740 4124 char * parse_start = RExC_parse - 1; /* MJD */
830247a4
IZ
4125 while (isDIGIT(*RExC_parse))
4126 RExC_parse++;
b45f050a 4127
eb160463 4128 if (!SIZE_ONLY && num > (I32)RExC_rx->nparens)
9baa0206 4129 vFAIL("Reference to nonexistent group");
830247a4 4130 RExC_sawback = 1;
eb160463
GS
4131 ret = reganode(pRExC_state,
4132 (U8)(FOLD ? (LOC ? REFFL : REFF) : REF),
4133 num);
a0d0e21e 4134 *flagp |= HASWIDTH;
fac92740
MJD
4135
4136 /* override incorrect value set in reganode MJD */
4137 Set_Node_Offset(ret, parse_start+1);
4138 Set_Node_Cur_Length(ret); /* MJD */
830247a4
IZ
4139 RExC_parse--;
4140 nextchar(pRExC_state);
a0d0e21e
LW
4141 }
4142 }
4143 break;
4144 case '\0':
830247a4 4145 if (RExC_parse >= RExC_end)
b45f050a 4146 FAIL("Trailing \\");
a0d0e21e
LW
4147 /* FALL THROUGH */
4148 default:
c9f97d15
IZ
4149 /* Do not generate `unrecognized' warnings here, we fall
4150 back into the quick-grab loop below */
45948336 4151 parse_start--;
a0d0e21e
LW
4152 goto defchar;
4153 }
4154 break;
4633a7c4
LW
4155
4156 case '#':
e2509266 4157 if (RExC_flags & PMf_EXTENDED) {
830247a4
IZ
4158 while (RExC_parse < RExC_end && *RExC_parse != '\n') RExC_parse++;
4159 if (RExC_parse < RExC_end)
4633a7c4
LW
4160 goto tryagain;
4161 }
4162 /* FALL THROUGH */
4163
a0d0e21e 4164 default: {
ba210ebe 4165 register STRLEN len;
58ae7d3f 4166 register UV ender;
a0d0e21e 4167 register char *p;
c277df42 4168 char *oldp, *s;
ba210ebe 4169 STRLEN numlen;
80aecb99 4170 STRLEN foldlen;
89ebb4a3 4171 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
f06dbbb7
JH
4172
4173 parse_start = RExC_parse - 1;
a0d0e21e 4174
830247a4 4175 RExC_parse++;
a0d0e21e
LW
4176
4177 defchar:
58ae7d3f 4178 ender = 0;
eb160463
GS
4179 ret = reg_node(pRExC_state,
4180 (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
cd439c50 4181 s = STRING(ret);
830247a4
IZ
4182 for (len = 0, p = RExC_parse - 1;
4183 len < 127 && p < RExC_end;
a0d0e21e
LW
4184 len++)
4185 {
4186 oldp = p;
5b5a24f7 4187
e2509266 4188 if (RExC_flags & PMf_EXTENDED)
830247a4 4189 p = regwhite(p, RExC_end);
a0d0e21e
LW
4190 switch (*p) {
4191 case '^':
4192 case '$':
4193 case '.':
4194 case '[':
4195 case '(':
4196 case ')':
4197 case '|':
4198 goto loopdone;
4199 case '\\':
4200 switch (*++p) {
4201 case 'A':
1ed8eac0
JF
4202 case 'C':
4203 case 'X':
a0d0e21e
LW
4204 case 'G':
4205 case 'Z':
b85d18e9 4206 case 'z':
a0d0e21e
LW
4207 case 'w':
4208 case 'W':
4209 case 'b':
4210 case 'B':
4211 case 's':
4212 case 'S':
4213 case 'd':
4214 case 'D':
a14b48bc
LW
4215 case 'p':
4216 case 'P':
a0d0e21e
LW
4217 --p;
4218 goto loopdone;
4219 case 'n':
4220 ender = '\n';
4221 p++;
a687059c 4222 break;
a0d0e21e
LW
4223 case 'r':
4224 ender = '\r';
4225 p++;
a687059c 4226 break;
a0d0e21e
LW
4227 case 't':
4228 ender = '\t';
4229 p++;
a687059c 4230 break;
a0d0e21e
LW
4231 case 'f':
4232 ender = '\f';
4233 p++;
a687059c 4234 break;
a0d0e21e 4235 case 'e':
c7f1f016 4236 ender = ASCII_TO_NATIVE('\033');
a0d0e21e 4237 p++;
a687059c 4238 break;
a0d0e21e 4239 case 'a':
c7f1f016 4240 ender = ASCII_TO_NATIVE('\007');
a0d0e21e 4241 p++;
a687059c 4242 break;
a0d0e21e 4243 case 'x':
a0ed51b3
LW
4244 if (*++p == '{') {
4245 char* e = strchr(p, '}');
b81d288d 4246
b45f050a 4247 if (!e) {
830247a4 4248 RExC_parse = p + 1;
b45f050a
JF
4249 vFAIL("Missing right brace on \\x{}");
4250 }
de5f0749 4251 else {
a4c04bdc
NC
4252 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
4253 | PERL_SCAN_DISALLOW_PREFIX;
53305cf1
NC
4254 numlen = e - p - 1;
4255 ender = grok_hex(p + 1, &numlen, &flags, NULL);
aaa80028
JH
4256 if (ender > 0xff)
4257 RExC_utf8 = 1;
a0ed51b3
LW
4258 p = e + 1;
4259 }
a0ed51b3
LW
4260 }
4261 else {
a4c04bdc 4262 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
53305cf1
NC
4263 numlen = 2;
4264 ender = grok_hex(p, &numlen, &flags, NULL);
a0ed51b3
LW
4265 p += numlen;
4266 }
a687059c 4267 break;
a0d0e21e
LW
4268 case 'c':
4269 p++;
bbce6d69 4270 ender = UCHARAT(p++);
4271 ender = toCTRL(ender);
a687059c 4272 break;
a0d0e21e
LW
4273 case '0': case '1': case '2': case '3':case '4':
4274 case '5': case '6': case '7': case '8':case '9':
4275 if (*p == '0' ||
830247a4 4276 (isDIGIT(p[1]) && atoi(p) >= RExC_npar) ) {
53305cf1
NC
4277 I32 flags = 0;
4278 numlen = 3;
4279 ender = grok_oct(p, &numlen, &flags, NULL);
a0d0e21e
LW
4280 p += numlen;
4281 }
4282 else {
4283 --p;
4284 goto loopdone;
a687059c
LW
4285 }
4286 break;
a0d0e21e 4287 case '\0':
830247a4 4288 if (p >= RExC_end)
b45f050a 4289 FAIL("Trailing \\");
a687059c 4290 /* FALL THROUGH */
a0d0e21e 4291 default:
e476b1b5 4292 if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(*p))
4193bef7 4293 vWARN2(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
a0ed51b3 4294 goto normal_default;
a0d0e21e
LW
4295 }
4296 break;
a687059c 4297 default:
a0ed51b3 4298 normal_default:
fd400ab9 4299 if (UTF8_IS_START(*p) && UTF) {
5e12f4fb 4300 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
ba210ebe 4301 &numlen, 0);
a0ed51b3
LW
4302 p += numlen;
4303 }
4304 else
4305 ender = *p++;
a0d0e21e 4306 break;
a687059c 4307 }
e2509266 4308 if (RExC_flags & PMf_EXTENDED)
830247a4 4309 p = regwhite(p, RExC_end);
60a8b682
JH
4310 if (UTF && FOLD) {
4311 /* Prime the casefolded buffer. */
ac7e0132 4312 ender = toFOLD_uni(ender, tmpbuf, &foldlen);
60a8b682 4313 }
a0d0e21e
LW
4314 if (ISMULT2(p)) { /* Back off on ?+*. */
4315 if (len)
4316 p = oldp;
16ea2a2e 4317 else if (UTF) {
0ebc6274
JH
4318 STRLEN unilen;
4319
80aecb99 4320 if (FOLD) {
60a8b682 4321 /* Emit all the Unicode characters. */
80aecb99
JH
4322 for (foldbuf = tmpbuf;
4323 foldlen;
4324 foldlen -= numlen) {
4325 ender = utf8_to_uvchr(foldbuf, &numlen);
9dc45d57 4326 if (numlen > 0) {
0ebc6274
JH
4327 reguni(pRExC_state, ender, s, &unilen);
4328 s += unilen;
4329 len += unilen;
4330 /* In EBCDIC the numlen
4331 * and unilen can differ. */
9dc45d57 4332 foldbuf += numlen;
47654450
JH
4333 if (numlen >= foldlen)
4334 break;
9dc45d57
JH
4335 }
4336 else
4337 break; /* "Can't happen." */
80aecb99
JH
4338 }
4339 }
4340 else {
0ebc6274 4341 reguni(pRExC_state, ender, s, &unilen);
9ede7db1 4342 if (unilen > 0) {
0ebc6274
JH
4343 s += unilen;
4344 len += unilen;
9dc45d57 4345 }
80aecb99 4346 }
a0ed51b3 4347 }
a0d0e21e
LW
4348 else {
4349 len++;
eb160463 4350 REGC((char)ender, s++);
a0d0e21e
LW
4351 }
4352 break;
a687059c 4353 }
16ea2a2e 4354 if (UTF) {
0ebc6274
JH
4355 STRLEN unilen;
4356
80aecb99 4357 if (FOLD) {
60a8b682 4358 /* Emit all the Unicode characters. */
80aecb99
JH
4359 for (foldbuf = tmpbuf;
4360 foldlen;
4361 foldlen -= numlen) {
4362 ender = utf8_to_uvchr(foldbuf, &numlen);
9dc45d57 4363 if (numlen > 0) {
0ebc6274
JH
4364 reguni(pRExC_state, ender, s, &unilen);
4365 len += unilen;
4366 s += unilen;
4367 /* In EBCDIC the numlen
4368 * and unilen can differ. */
9dc45d57 4369 foldbuf += numlen;
47654450
JH
4370 if (numlen >= foldlen)
4371 break;
9dc45d57
JH
4372 }
4373 else
4374 break;
80aecb99
JH
4375 }
4376 }
4377 else {
0ebc6274 4378 reguni(pRExC_state, ender, s, &unilen);
9ede7db1 4379 if (unilen > 0) {
0ebc6274
JH
4380 s += unilen;
4381 len += unilen;
9dc45d57 4382 }
80aecb99
JH
4383 }
4384 len--;
a0ed51b3
LW
4385 }
4386 else
eb160463 4387 REGC((char)ender, s++);
a0d0e21e
LW
4388 }
4389 loopdone:
830247a4 4390 RExC_parse = p - 1;
fac92740 4391 Set_Node_Cur_Length(ret); /* MJD */
830247a4 4392 nextchar(pRExC_state);
793db0cb
JH
4393 {
4394 /* len is STRLEN which is unsigned, need to copy to signed */
4395 IV iv = len;
4396 if (iv < 0)
4397 vFAIL("Internal disaster");
4398 }
a0d0e21e
LW
4399 if (len > 0)
4400 *flagp |= HASWIDTH;
090f7165 4401 if (len == 1 && UNI_IS_INVARIANT(ender))
a0d0e21e 4402 *flagp |= SIMPLE;
c277df42 4403 if (!SIZE_ONLY)
cd439c50
IZ
4404 STR_LEN(ret) = len;
4405 if (SIZE_ONLY)
830247a4 4406 RExC_size += STR_SZ(len);
cd439c50 4407 else
830247a4 4408 RExC_emit += STR_SZ(len);
a687059c 4409 }
a0d0e21e
LW
4410 break;
4411 }
a687059c 4412
60a8b682
JH
4413 /* If the encoding pragma is in effect recode the text of
4414 * any EXACT-kind nodes. */
22c54be3 4415 if (PL_encoding && PL_regkind[(U8)OP(ret)] == EXACT) {
d0063567
DK
4416 STRLEN oldlen = STR_LEN(ret);
4417 SV *sv = sv_2mortal(newSVpvn(STRING(ret), oldlen));
4418
4419 if (RExC_utf8)
4420 SvUTF8_on(sv);
4421 if (sv_utf8_downgrade(sv, TRUE)) {
4422 char *s = sv_recode_to_utf8(sv, PL_encoding);
4423 STRLEN newlen = SvCUR(sv);
4424
4425 if (SvUTF8(sv))
4426 RExC_utf8 = 1;
4427 if (!SIZE_ONLY) {
a3621e74
YO
4428 GET_RE_DEBUG_FLAGS_DECL;
4429 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "recode %*s to %*s\n",
d0063567
DK
4430 (int)oldlen, STRING(ret),
4431 (int)newlen, s));
4432 Copy(s, STRING(ret), newlen, char);
4433 STR_LEN(ret) += newlen - oldlen;
4434 RExC_emit += STR_SZ(newlen) - STR_SZ(oldlen);
4435 } else
4436 RExC_size += STR_SZ(newlen) - STR_SZ(oldlen);
4437 }
a72c7584
JH
4438 }
4439
a0d0e21e 4440 return(ret);
a687059c
LW
4441}
4442
873ef191 4443STATIC char *
cea2e8a9 4444S_regwhite(pTHX_ char *p, char *e)
5b5a24f7
CS
4445{
4446 while (p < e) {
4447 if (isSPACE(*p))
4448 ++p;
4449 else if (*p == '#') {
4450 do {
4451 p++;
4452 } while (p < e && *p != '\n');
4453 }
4454 else
4455 break;
4456 }
4457 return p;
4458}
4459
b8c5462f
JH
4460/* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
4461 Character classes ([:foo:]) can also be negated ([:^foo:]).
4462 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
4463 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
beeb77fc 4464 but trigger failures because they are currently unimplemented. */
9a86a77b
JH
4465
4466#define POSIXCC_DONE(c) ((c) == ':')
4467#define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
4468#define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
4469
b8c5462f 4470STATIC I32
830247a4 4471S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
620e46c5
JH
4472{
4473 char *posixcc = 0;
936ed897 4474 I32 namedclass = OOB_NAMEDCLASS;
620e46c5 4475
830247a4 4476 if (value == '[' && RExC_parse + 1 < RExC_end &&
620e46c5 4477 /* I smell either [: or [= or [. -- POSIX has been here, right? */
9a86a77b
JH
4478 POSIXCC(UCHARAT(RExC_parse))) {
4479 char c = UCHARAT(RExC_parse);
830247a4 4480 char* s = RExC_parse++;
b81d288d 4481
9a86a77b 4482 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
830247a4
IZ
4483 RExC_parse++;
4484 if (RExC_parse == RExC_end)
620e46c5 4485 /* Grandfather lone [:, [=, [. */
830247a4 4486 RExC_parse = s;
620e46c5 4487 else {
830247a4 4488 char* t = RExC_parse++; /* skip over the c */
b8c5462f 4489
80916619
NC
4490 assert(*t == c);
4491
9a86a77b 4492 if (UCHARAT(RExC_parse) == ']') {
830247a4 4493 RExC_parse++; /* skip over the ending ] */
b8c5462f
JH
4494 posixcc = s + 1;
4495 if (*s == ':') {
4496 I32 complement = *posixcc == '^' ? *posixcc++ : 0;
80916619
NC
4497 I32 skip = t - posixcc;
4498
4499 /* Initially switch on the length of the name. */
4500 switch (skip) {
4501 case 4:
4502 if (memEQ(posixcc, "word", 4)) {
4503 /* this is not POSIX, this is the Perl \w */;
4504 namedclass
4505 = complement ? ANYOF_NALNUM : ANYOF_ALNUM;
4506 }
cc4319de 4507 break;
80916619
NC
4508 case 5:
4509 /* Names all of length 5. */
4510 /* alnum alpha ascii blank cntrl digit graph lower
4511 print punct space upper */
4512 /* Offset 4 gives the best switch position. */
4513 switch (posixcc[4]) {
4514 case 'a':
4515 if (memEQ(posixcc, "alph", 4)) {
4516 /* a */
4517 namedclass
4518 = complement ? ANYOF_NALPHA : ANYOF_ALPHA;
4519 }
4520 break;
4521 case 'e':
4522 if (memEQ(posixcc, "spac", 4)) {
4523 /* e */
4524 namedclass
4525 = complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
4526 }
4527 break;
4528 case 'h':
4529 if (memEQ(posixcc, "grap", 4)) {
4530 /* h */
4531 namedclass
4532 = complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
4533 }
4534 break;
4535 case 'i':
4536 if (memEQ(posixcc, "asci", 4)) {
4537 /* i */
4538 namedclass
4539 = complement ? ANYOF_NASCII : ANYOF_ASCII;
4540 }
4541 break;
4542 case 'k':
4543 if (memEQ(posixcc, "blan", 4)) {
4544 /* k */
4545 namedclass
4546 = complement ? ANYOF_NBLANK : ANYOF_BLANK;
4547 }
4548 break;
4549 case 'l':
4550 if (memEQ(posixcc, "cntr", 4)) {
4551 /* l */
4552 namedclass
4553 = complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
4554 }
4555 break;
4556 case 'm':
4557 if (memEQ(posixcc, "alnu", 4)) {
4558 /* m */
4559 namedclass
4560 = complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
4561 }
4562 break;
4563 case 'r':
4564 if (memEQ(posixcc, "lowe", 4)) {
4565 /* r */
4566 namedclass
4567 = complement ? ANYOF_NLOWER : ANYOF_LOWER;
4568 }
4569 if (memEQ(posixcc, "uppe", 4)) {
8fdec511 4570 /* r */
80916619
NC
4571 namedclass
4572 = complement ? ANYOF_NUPPER : ANYOF_UPPER;
4573 }
4574 break;
4575 case 't':
4576 if (memEQ(posixcc, "digi", 4)) {
4577 /* t */
4578 namedclass
4579 = complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
4580 }
4581 if (memEQ(posixcc, "prin", 4)) {
8fdec511 4582 /* t */
80916619
NC
4583 namedclass
4584 = complement ? ANYOF_NPRINT : ANYOF_PRINT;
4585 }
4586 if (memEQ(posixcc, "punc", 4)) {
4587 /* t */
4588 namedclass
4589 = complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
4590 }
4591 break;
b8c5462f
JH
4592 }
4593 break;
80916619
NC
4594 case 6:
4595 if (memEQ(posixcc, "xdigit", 6)) {
4596 namedclass
4597 = complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
b8c5462f
JH
4598 }
4599 break;
4600 }
80916619
NC
4601
4602 if (namedclass == OOB_NAMEDCLASS)
b45f050a
JF
4603 {
4604 Simple_vFAIL3("POSIX class [:%.*s:] unknown",
4605 t - s - 1, s + 1);
4606 }
80916619
NC
4607 assert (posixcc[skip] == ':');
4608 assert (posixcc[skip+1] == ']');
b45f050a 4609 } else if (!SIZE_ONLY) {
b8c5462f 4610 /* [[=foo=]] and [[.foo.]] are still future. */
b45f050a 4611
830247a4 4612 /* adjust RExC_parse so the warning shows after
b45f050a 4613 the class closes */
9a86a77b 4614 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
830247a4 4615 RExC_parse++;
b45f050a
JF
4616 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
4617 }
b8c5462f
JH
4618 } else {
4619 /* Maternal grandfather:
4620 * "[:" ending in ":" but not in ":]" */
830247a4 4621 RExC_parse = s;
767d463e 4622 }
620e46c5
JH
4623 }
4624 }
4625
b8c5462f
JH
4626 return namedclass;
4627}
4628
4629STATIC void
830247a4 4630S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
b8c5462f 4631{
b938889d 4632 if (!SIZE_ONLY && POSIXCC(UCHARAT(RExC_parse))) {
830247a4 4633 char *s = RExC_parse;
93733859 4634 char c = *s++;
b8c5462f
JH
4635
4636 while(*s && isALNUM(*s))
4637 s++;
4638 if (*s && c == *s && s[1] == ']') {
cd84f5b2
RGS
4639 if (ckWARN(WARN_REGEXP))
4640 vWARN3(s+2,
4641 "POSIX syntax [%c %c] belongs inside character classes",
4642 c, c);
b45f050a
JF
4643
4644 /* [[=foo=]] and [[.foo.]] are still future. */
9a86a77b 4645 if (POSIXCC_NOTYET(c)) {
830247a4 4646 /* adjust RExC_parse so the error shows after
b45f050a 4647 the class closes */
9a86a77b 4648 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
b45f050a
JF
4649 ;
4650 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
4651 }
b8c5462f
JH
4652 }
4653 }
620e46c5
JH
4654}
4655
76e3520e 4656STATIC regnode *
830247a4 4657S_regclass(pTHX_ RExC_state_t *pRExC_state)
a687059c 4658{
ffc61ed2 4659 register UV value;
9a86a77b 4660 register UV nextvalue;
3568d838 4661 register IV prevvalue = OOB_UNICODE;
ffc61ed2 4662 register IV range = 0;
c277df42 4663 register regnode *ret;
ba210ebe 4664 STRLEN numlen;
ffc61ed2 4665 IV namedclass;
9c5ffd7c 4666 char *rangebegin = 0;
936ed897 4667 bool need_class = 0;
9c5ffd7c 4668 SV *listsv = Nullsv;
ffc61ed2
JH
4669 register char *e;
4670 UV n;
9e55ce06
JH
4671 bool optimize_invert = TRUE;
4672 AV* unicode_alternate = 0;
1b2d223b
JH
4673#ifdef EBCDIC
4674 UV literal_endpoint = 0;
4675#endif
ffc61ed2
JH
4676
4677 ret = reganode(pRExC_state, ANYOF, 0);
4678
4679 if (!SIZE_ONLY)
4680 ANYOF_FLAGS(ret) = 0;
4681
9a86a77b 4682 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
ffc61ed2
JH
4683 RExC_naughty++;
4684 RExC_parse++;
4685 if (!SIZE_ONLY)
4686 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
4687 }
a0d0e21e 4688
936ed897 4689 if (SIZE_ONLY)
830247a4 4690 RExC_size += ANYOF_SKIP;
936ed897 4691 else {
830247a4 4692 RExC_emit += ANYOF_SKIP;
936ed897
IZ
4693 if (FOLD)
4694 ANYOF_FLAGS(ret) |= ANYOF_FOLD;
4695 if (LOC)
4696 ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
ffc61ed2
JH
4697 ANYOF_BITMAP_ZERO(ret);
4698 listsv = newSVpvn("# comment\n", 10);
a0d0e21e 4699 }
b8c5462f 4700
9a86a77b
JH
4701 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
4702
b938889d 4703 if (!SIZE_ONLY && POSIXCC(nextvalue))
830247a4 4704 checkposixcc(pRExC_state);
b8c5462f 4705
f064b6ad
HS
4706 /* allow 1st char to be ] (allowing it to be - is dealt with later) */
4707 if (UCHARAT(RExC_parse) == ']')
4708 goto charclassloop;
ffc61ed2 4709
9a86a77b 4710 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
ffc61ed2
JH
4711
4712 charclassloop:
4713
4714 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
4715
73b437c8 4716 if (!range)
830247a4 4717 rangebegin = RExC_parse;
ffc61ed2 4718 if (UTF) {
5e12f4fb 4719 value = utf8n_to_uvchr((U8*)RExC_parse,
3568d838
JH
4720 RExC_end - RExC_parse,
4721 &numlen, 0);
ffc61ed2
JH
4722 RExC_parse += numlen;
4723 }
4724 else
4725 value = UCHARAT(RExC_parse++);
9a86a77b
JH
4726 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
4727 if (value == '[' && POSIXCC(nextvalue))
830247a4 4728 namedclass = regpposixcc(pRExC_state, value);
620e46c5 4729 else if (value == '\\') {
ffc61ed2 4730 if (UTF) {
5e12f4fb 4731 value = utf8n_to_uvchr((U8*)RExC_parse,
ffc61ed2
JH
4732 RExC_end - RExC_parse,
4733 &numlen, 0);
4734 RExC_parse += numlen;
4735 }
4736 else
4737 value = UCHARAT(RExC_parse++);
470c3474 4738 /* Some compilers cannot handle switching on 64-bit integer
ffc61ed2 4739 * values, therefore value cannot be an UV. Yes, this will
e2962f66
JH
4740 * be a problem later if we want switch on Unicode.
4741 * A similar issue a little bit later when switching on
4742 * namedclass. --jhi */
ffc61ed2 4743 switch ((I32)value) {
b8c5462f
JH
4744 case 'w': namedclass = ANYOF_ALNUM; break;
4745 case 'W': namedclass = ANYOF_NALNUM; break;
4746 case 's': namedclass = ANYOF_SPACE; break;
4747 case 'S': namedclass = ANYOF_NSPACE; break;
4748 case 'd': namedclass = ANYOF_DIGIT; break;
4749 case 'D': namedclass = ANYOF_NDIGIT; break;
ffc61ed2
JH
4750 case 'p':
4751 case 'P':
af6f566e 4752 if (RExC_parse >= RExC_end)
2a4859cd 4753 vFAIL2("Empty \\%c{}", (U8)value);
ffc61ed2 4754 if (*RExC_parse == '{') {
0da60cf5 4755 U8 c = (U8)value;
ffc61ed2
JH
4756 e = strchr(RExC_parse++, '}');
4757 if (!e)
0da60cf5 4758 vFAIL2("Missing right brace on \\%c{}", c);
ab13f0c7
JH
4759 while (isSPACE(UCHARAT(RExC_parse)))
4760 RExC_parse++;
4761 if (e == RExC_parse)
0da60cf5 4762 vFAIL2("Empty \\%c{}", c);
ffc61ed2 4763 n = e - RExC_parse;
ab13f0c7
JH
4764 while (isSPACE(UCHARAT(RExC_parse + n - 1)))
4765 n--;
ffc61ed2
JH
4766 }
4767 else {
4768 e = RExC_parse;
4769 n = 1;
4770 }
4771 if (!SIZE_ONLY) {
ab13f0c7
JH
4772 if (UCHARAT(RExC_parse) == '^') {
4773 RExC_parse++;
4774 n--;
4775 value = value == 'p' ? 'P' : 'p'; /* toggle */
4776 while (isSPACE(UCHARAT(RExC_parse))) {
4777 RExC_parse++;
4778 n--;
4779 }
4780 }
ffc61ed2 4781 if (value == 'p')
ab13f0c7
JH
4782 Perl_sv_catpvf(aTHX_ listsv,
4783 "+utf8::%.*s\n", (int)n, RExC_parse);
ffc61ed2 4784 else
ab13f0c7
JH
4785 Perl_sv_catpvf(aTHX_ listsv,
4786 "!utf8::%.*s\n", (int)n, RExC_parse);
ffc61ed2
JH
4787 }
4788 RExC_parse = e + 1;
4789 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
f81125e2
JP
4790 namedclass = ANYOF_MAX; /* no official name, but it's named */
4791 break;
b8c5462f
JH
4792 case 'n': value = '\n'; break;
4793 case 'r': value = '\r'; break;
4794 case 't': value = '\t'; break;
4795 case 'f': value = '\f'; break;
4796 case 'b': value = '\b'; break;
c7f1f016
NIS
4797 case 'e': value = ASCII_TO_NATIVE('\033');break;
4798 case 'a': value = ASCII_TO_NATIVE('\007');break;
b8c5462f 4799 case 'x':
ffc61ed2 4800 if (*RExC_parse == '{') {
a4c04bdc
NC
4801 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
4802 | PERL_SCAN_DISALLOW_PREFIX;
ffc61ed2 4803 e = strchr(RExC_parse++, '}');
b81d288d 4804 if (!e)
ffc61ed2 4805 vFAIL("Missing right brace on \\x{}");
53305cf1
NC
4806
4807 numlen = e - RExC_parse;
4808 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
ffc61ed2
JH
4809 RExC_parse = e + 1;
4810 }
4811 else {
a4c04bdc 4812 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
53305cf1
NC
4813 numlen = 2;
4814 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
ffc61ed2
JH
4815 RExC_parse += numlen;
4816 }
b8c5462f
JH
4817 break;
4818 case 'c':
830247a4 4819 value = UCHARAT(RExC_parse++);
b8c5462f
JH
4820 value = toCTRL(value);
4821 break;
4822 case '0': case '1': case '2': case '3': case '4':
4823 case '5': case '6': case '7': case '8': case '9':
53305cf1
NC
4824 {
4825 I32 flags = 0;
4826 numlen = 3;
4827 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
830247a4 4828 RExC_parse += numlen;
b8c5462f 4829 break;
53305cf1 4830 }
1028017a 4831 default:
e476b1b5 4832 if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(value))
ffc61ed2
JH
4833 vWARN2(RExC_parse,
4834 "Unrecognized escape \\%c in character class passed through",
4835 (int)value);
1028017a 4836 break;
b8c5462f 4837 }
ffc61ed2 4838 } /* end of \blah */
1b2d223b
JH
4839#ifdef EBCDIC
4840 else
4841 literal_endpoint++;
4842#endif
ffc61ed2
JH
4843
4844 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
4845
4846 if (!SIZE_ONLY && !need_class)
936ed897 4847 ANYOF_CLASS_ZERO(ret);
ffc61ed2 4848
936ed897 4849 need_class = 1;
ffc61ed2
JH
4850
4851 /* a bad range like a-\d, a-[:digit:] ? */
4852 if (range) {
73b437c8 4853 if (!SIZE_ONLY) {
e476b1b5 4854 if (ckWARN(WARN_REGEXP))
830247a4 4855 vWARN4(RExC_parse,
b45f050a 4856 "False [] range \"%*.*s\"",
830247a4
IZ
4857 RExC_parse - rangebegin,
4858 RExC_parse - rangebegin,
b45f050a 4859 rangebegin);
3568d838
JH
4860 if (prevvalue < 256) {
4861 ANYOF_BITMAP_SET(ret, prevvalue);
ffc61ed2
JH
4862 ANYOF_BITMAP_SET(ret, '-');
4863 }
4864 else {
4865 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
4866 Perl_sv_catpvf(aTHX_ listsv,
3568d838 4867 "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
ffc61ed2 4868 }
b8c5462f 4869 }
ffc61ed2
JH
4870
4871 range = 0; /* this was not a true range */
73b437c8 4872 }
ffc61ed2 4873
73b437c8 4874 if (!SIZE_ONLY) {
c49a72a9
NC
4875 const char *what = NULL;
4876 char yesno = 0;
4877
3568d838
JH
4878 if (namedclass > OOB_NAMEDCLASS)
4879 optimize_invert = FALSE;
e2962f66
JH
4880 /* Possible truncation here but in some 64-bit environments
4881 * the compiler gets heartburn about switch on 64-bit values.
4882 * A similar issue a little earlier when switching on value.
98f323fa 4883 * --jhi */
e2962f66 4884 switch ((I32)namedclass) {
73b437c8
JH
4885 case ANYOF_ALNUM:
4886 if (LOC)
936ed897 4887 ANYOF_CLASS_SET(ret, ANYOF_ALNUM);
73b437c8
JH
4888 else {
4889 for (value = 0; value < 256; value++)
4890 if (isALNUM(value))
936ed897 4891 ANYOF_BITMAP_SET(ret, value);
73b437c8 4892 }
c49a72a9
NC
4893 yesno = '+';
4894 what = "Word";
73b437c8
JH
4895 break;
4896 case ANYOF_NALNUM:
4897 if (LOC)
936ed897 4898 ANYOF_CLASS_SET(ret, ANYOF_NALNUM);
73b437c8
JH
4899 else {
4900 for (value = 0; value < 256; value++)
4901 if (!isALNUM(value))
936ed897 4902 ANYOF_BITMAP_SET(ret, value);
73b437c8 4903 }
c49a72a9
NC
4904 yesno = '!';
4905 what = "Word";
73b437c8 4906 break;
ffc61ed2 4907 case ANYOF_ALNUMC:
73b437c8 4908 if (LOC)
ffc61ed2 4909 ANYOF_CLASS_SET(ret, ANYOF_ALNUMC);
73b437c8
JH
4910 else {
4911 for (value = 0; value < 256; value++)
ffc61ed2 4912 if (isALNUMC(value))
936ed897 4913 ANYOF_BITMAP_SET(ret, value);
73b437c8 4914 }
c49a72a9
NC
4915 yesno = '+';
4916 what = "Alnum";
73b437c8
JH
4917 break;
4918 case ANYOF_NALNUMC:
4919 if (LOC)
936ed897 4920 ANYOF_CLASS_SET(ret, ANYOF_NALNUMC);
73b437c8
JH
4921 else {
4922 for (value = 0; value < 256; value++)
4923 if (!isALNUMC(value))
936ed897 4924 ANYOF_BITMAP_SET(ret, value);
73b437c8 4925 }
c49a72a9
NC
4926 yesno = '!';
4927 what = "Alnum";
73b437c8
JH
4928 break;
4929 case ANYOF_ALPHA:
4930 if (LOC)
936ed897 4931 ANYOF_CLASS_SET(ret, ANYOF_ALPHA);
73b437c8
JH
4932 else {
4933 for (value = 0; value < 256; value++)
4934 if (isALPHA(value))
936ed897 4935 ANYOF_BITMAP_SET(ret, value);
73b437c8 4936 }
c49a72a9
NC
4937 yesno = '+';
4938 what = "Alpha";
73b437c8
JH
4939 break;
4940 case ANYOF_NALPHA:
4941 if (LOC)
936ed897 4942 ANYOF_CLASS_SET(ret, ANYOF_NALPHA);
73b437c8
JH
4943 else {
4944 for (value = 0; value < 256; value++)
4945 if (!isALPHA(value))
936ed897 4946 ANYOF_BITMAP_SET(ret, value);
73b437c8 4947 }
c49a72a9
NC
4948 yesno = '!';
4949 what = "Alpha";
73b437c8
JH
4950 break;
4951 case ANYOF_ASCII:
4952 if (LOC)
936ed897 4953 ANYOF_CLASS_SET(ret, ANYOF_ASCII);
73b437c8 4954 else {
c7f1f016 4955#ifndef EBCDIC
1ba5c669
JH
4956 for (value = 0; value < 128; value++)
4957 ANYOF_BITMAP_SET(ret, value);
4958#else /* EBCDIC */
ffbc6a93 4959 for (value = 0; value < 256; value++) {
3a3c4447
JH
4960 if (isASCII(value))
4961 ANYOF_BITMAP_SET(ret, value);
ffbc6a93 4962 }
1ba5c669 4963#endif /* EBCDIC */
73b437c8 4964 }
c49a72a9
NC
4965 yesno = '+';
4966 what = "ASCII";
73b437c8
JH
4967 break;
4968 case ANYOF_NASCII:
4969 if (LOC)
936ed897 4970 ANYOF_CLASS_SET(ret, ANYOF_NASCII);
73b437c8 4971 else {
c7f1f016 4972#ifndef EBCDIC
1ba5c669
JH
4973 for (value = 128; value < 256; value++)
4974 ANYOF_BITMAP_SET(ret, value);
4975#else /* EBCDIC */
ffbc6a93 4976 for (value = 0; value < 256; value++) {
3a3c4447
JH
4977 if (!isASCII(value))
4978 ANYOF_BITMAP_SET(ret, value);
ffbc6a93 4979 }
1ba5c669 4980#endif /* EBCDIC */
73b437c8 4981 }
c49a72a9
NC
4982 yesno = '!';
4983 what = "ASCII";
73b437c8 4984 break;
aaa51d5e
JF
4985 case ANYOF_BLANK:
4986 if (LOC)
4987 ANYOF_CLASS_SET(ret, ANYOF_BLANK);
4988 else {
4989 for (value = 0; value < 256; value++)
4990 if (isBLANK(value))
4991 ANYOF_BITMAP_SET(ret, value);
4992 }
c49a72a9
NC
4993 yesno = '+';
4994 what = "Blank";
aaa51d5e
JF
4995 break;
4996 case ANYOF_NBLANK:
4997 if (LOC)
4998 ANYOF_CLASS_SET(ret, ANYOF_NBLANK);
4999 else {
5000 for (value = 0; value < 256; value++)
5001 if (!isBLANK(value))
5002 ANYOF_BITMAP_SET(ret, value);
5003 }
c49a72a9
NC
5004 yesno = '!';
5005 what = "Blank";
aaa51d5e 5006 break;
73b437c8
JH
5007 case ANYOF_CNTRL:
5008 if (LOC)
936ed897 5009 ANYOF_CLASS_SET(ret, ANYOF_CNTRL);
73b437c8
JH
5010 else {
5011 for (value = 0; value < 256; value++)
5012 if (isCNTRL(value))
936ed897 5013 ANYOF_BITMAP_SET(ret, value);
73b437c8 5014 }
c49a72a9
NC
5015 yesno = '+';
5016 what = "Cntrl";
73b437c8
JH
5017 break;
5018 case ANYOF_NCNTRL:
5019 if (LOC)
936ed897 5020 ANYOF_CLASS_SET(ret, ANYOF_NCNTRL);
73b437c8
JH
5021 else {
5022 for (value = 0; value < 256; value++)
5023 if (!isCNTRL(value))
936ed897 5024 ANYOF_BITMAP_SET(ret, value);
73b437c8 5025 }
c49a72a9
NC
5026 yesno = '!';
5027 what = "Cntrl";
ffc61ed2
JH
5028 break;
5029 case ANYOF_DIGIT:
5030 if (LOC)
5031 ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
5032 else {
5033 /* consecutive digits assumed */
5034 for (value = '0'; value <= '9'; value++)
5035 ANYOF_BITMAP_SET(ret, value);
5036 }
c49a72a9
NC
5037 yesno = '+';
5038 what = "Digit";
ffc61ed2
JH
5039 break;
5040 case ANYOF_NDIGIT:
5041 if (LOC)
5042 ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
5043 else {
5044 /* consecutive digits assumed */
5045 for (value = 0; value < '0'; value++)
5046 ANYOF_BITMAP_SET(ret, value);
5047 for (value = '9' + 1; value < 256; value++)
5048 ANYOF_BITMAP_SET(ret, value);
5049 }
c49a72a9
NC
5050 yesno = '!';
5051 what = "Digit";
73b437c8
JH
5052 break;
5053 case ANYOF_GRAPH:
5054 if (LOC)
936ed897 5055 ANYOF_CLASS_SET(ret, ANYOF_GRAPH);
73b437c8
JH
5056 else {
5057 for (value = 0; value < 256; value++)
5058 if (isGRAPH(value))
936ed897 5059 ANYOF_BITMAP_SET(ret, value);
73b437c8 5060 }
c49a72a9
NC
5061 yesno = '+';
5062 what = "Graph";
73b437c8
JH
5063 break;
5064 case ANYOF_NGRAPH:
5065 if (LOC)
936ed897 5066 ANYOF_CLASS_SET(ret, ANYOF_NGRAPH);
73b437c8
JH
5067 else {
5068 for (value = 0; value < 256; value++)
5069 if (!isGRAPH(value))
936ed897 5070 ANYOF_BITMAP_SET(ret, value);
73b437c8 5071 }
c49a72a9
NC
5072 yesno = '!';
5073 what = "Graph";
73b437c8
JH
5074 break;
5075 case ANYOF_LOWER:
5076 if (LOC)
936ed897 5077 ANYOF_CLASS_SET(ret, ANYOF_LOWER);
73b437c8
JH
5078 else {
5079 for (value = 0; value < 256; value++)
5080 if (isLOWER(value))
936ed897 5081 ANYOF_BITMAP_SET(ret, value);
73b437c8 5082 }
c49a72a9
NC
5083 yesno = '+';
5084 what = "Lower";
73b437c8
JH
5085 break;
5086 case ANYOF_NLOWER:
5087 if (LOC)
936ed897 5088 ANYOF_CLASS_SET(ret, ANYOF_NLOWER);
73b437c8
JH
5089 else {
5090 for (value = 0; value < 256; value++)
5091 if (!isLOWER(value))
936ed897 5092 ANYOF_BITMAP_SET(ret, value);
73b437c8 5093 }
c49a72a9
NC
5094 yesno = '!';
5095 what = "Lower";
73b437c8
JH
5096 break;
5097 case ANYOF_PRINT:
5098 if (LOC)
936ed897 5099 ANYOF_CLASS_SET(ret, ANYOF_PRINT);
73b437c8
JH
5100 else {
5101 for (value = 0; value < 256; value++)
5102 if (isPRINT(value))
936ed897 5103 ANYOF_BITMAP_SET(ret, value);
73b437c8 5104 }
c49a72a9
NC
5105 yesno = '+';
5106 what = "Print";
73b437c8
JH
5107 break;
5108 case ANYOF_NPRINT:
5109 if (LOC)
936ed897 5110 ANYOF_CLASS_SET(ret, ANYOF_NPRINT);
73b437c8
JH
5111 else {
5112 for (value = 0; value < 256; value++)
5113 if (!isPRINT(value))
936ed897 5114 ANYOF_BITMAP_SET(ret, value);
73b437c8 5115 }
c49a72a9
NC
5116 yesno = '!';
5117 what = "Print";
73b437c8 5118 break;
aaa51d5e
JF
5119 case ANYOF_PSXSPC:
5120 if (LOC)
5121 ANYOF_CLASS_SET(ret, ANYOF_PSXSPC);
5122 else {
5123 for (value = 0; value < 256; value++)
5124 if (isPSXSPC(value))
5125 ANYOF_BITMAP_SET(ret, value);
5126 }
c49a72a9
NC
5127 yesno = '+';
5128 what = "Space";
aaa51d5e
JF
5129 break;
5130 case ANYOF_NPSXSPC:
5131 if (LOC)
5132 ANYOF_CLASS_SET(ret, ANYOF_NPSXSPC);
5133 else {
5134 for (value = 0; value < 256; value++)
5135 if (!isPSXSPC(value))
5136 ANYOF_BITMAP_SET(ret, value);
5137 }
c49a72a9
NC
5138 yesno = '!';
5139 what = "Space";
aaa51d5e 5140 break;
73b437c8
JH
5141 case ANYOF_PUNCT:
5142 if (LOC)
936ed897 5143 ANYOF_CLASS_SET(ret, ANYOF_PUNCT);
73b437c8
JH
5144 else {
5145 for (value = 0; value < 256; value++)
5146 if (isPUNCT(value))
936ed897 5147 ANYOF_BITMAP_SET(ret, value);
73b437c8 5148 }
c49a72a9
NC
5149 yesno = '+';
5150 what = "Punct";
73b437c8
JH
5151 break;
5152 case ANYOF_NPUNCT:
5153 if (LOC)
936ed897 5154 ANYOF_CLASS_SET(ret, ANYOF_NPUNCT);
73b437c8
JH
5155 else {
5156 for (value = 0; value < 256; value++)
5157 if (!isPUNCT(value))
936ed897 5158 ANYOF_BITMAP_SET(ret, value);
73b437c8 5159 }
c49a72a9
NC
5160 yesno = '!';
5161 what = "Punct";
ffc61ed2
JH
5162 break;
5163 case ANYOF_SPACE:
5164 if (LOC)
5165 ANYOF_CLASS_SET(ret, ANYOF_SPACE);
5166 else {
5167 for (value = 0; value < 256; value++)
5168 if (isSPACE(value))
5169 ANYOF_BITMAP_SET(ret, value);
5170 }
c49a72a9
NC
5171 yesno = '+';
5172 what = "SpacePerl";
ffc61ed2
JH
5173 break;
5174 case ANYOF_NSPACE:
5175 if (LOC)
5176 ANYOF_CLASS_SET(ret, ANYOF_NSPACE);
5177 else {
5178 for (value = 0; value < 256; value++)
5179 if (!isSPACE(value))
5180 ANYOF_BITMAP_SET(ret, value);
5181 }
c49a72a9
NC
5182 yesno = '!';
5183 what = "SpacePerl";
73b437c8
JH
5184 break;
5185 case ANYOF_UPPER:
5186 if (LOC)
936ed897 5187 ANYOF_CLASS_SET(ret, ANYOF_UPPER);
73b437c8
JH
5188 else {
5189 for (value = 0; value < 256; value++)
5190 if (isUPPER(value))
936ed897 5191 ANYOF_BITMAP_SET(ret, value);
73b437c8 5192 }
c49a72a9
NC
5193 yesno = '+';
5194 what = "Upper";
73b437c8
JH
5195 break;
5196 case ANYOF_NUPPER:
5197 if (LOC)
936ed897 5198 ANYOF_CLASS_SET(ret, ANYOF_NUPPER);
73b437c8
JH
5199 else {
5200 for (value = 0; value < 256; value++)
5201 if (!isUPPER(value))
936ed897 5202 ANYOF_BITMAP_SET(ret, value);
73b437c8 5203 }
c49a72a9
NC
5204 yesno = '!';
5205 what = "Upper";
73b437c8
JH
5206 break;
5207 case ANYOF_XDIGIT:
5208 if (LOC)
936ed897 5209 ANYOF_CLASS_SET(ret, ANYOF_XDIGIT);
73b437c8
JH
5210 else {
5211 for (value = 0; value < 256; value++)
5212 if (isXDIGIT(value))
936ed897 5213 ANYOF_BITMAP_SET(ret, value);
73b437c8 5214 }
c49a72a9
NC
5215 yesno = '+';
5216 what = "XDigit";
73b437c8
JH
5217 break;
5218 case ANYOF_NXDIGIT:
5219 if (LOC)
936ed897 5220 ANYOF_CLASS_SET(ret, ANYOF_NXDIGIT);
73b437c8
JH
5221 else {
5222 for (value = 0; value < 256; value++)
5223 if (!isXDIGIT(value))
936ed897 5224 ANYOF_BITMAP_SET(ret, value);
73b437c8 5225 }
c49a72a9
NC
5226 yesno = '!';
5227 what = "XDigit";
73b437c8 5228 break;
f81125e2
JP
5229 case ANYOF_MAX:
5230 /* this is to handle \p and \P */
5231 break;
73b437c8 5232 default:
b45f050a 5233 vFAIL("Invalid [::] class");
73b437c8 5234 break;
b8c5462f 5235 }
c49a72a9
NC
5236 if (what) {
5237 /* Strings such as "+utf8::isWord\n" */
5238 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::Is%s\n", yesno, what);
5239 }
b8c5462f 5240 if (LOC)
936ed897 5241 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
73b437c8 5242 continue;
a0d0e21e 5243 }
ffc61ed2
JH
5244 } /* end of namedclass \blah */
5245
a0d0e21e 5246 if (range) {
eb160463 5247 if (prevvalue > (IV)value) /* b-a */ {
b45f050a 5248 Simple_vFAIL4("Invalid [] range \"%*.*s\"",
830247a4
IZ
5249 RExC_parse - rangebegin,
5250 RExC_parse - rangebegin,
b45f050a 5251 rangebegin);
3568d838 5252 range = 0; /* not a valid range */
73b437c8 5253 }
a0d0e21e
LW
5254 }
5255 else {
3568d838 5256 prevvalue = value; /* save the beginning of the range */
830247a4
IZ
5257 if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
5258 RExC_parse[1] != ']') {
5259 RExC_parse++;
ffc61ed2
JH
5260
5261 /* a bad range like \w-, [:word:]- ? */
5262 if (namedclass > OOB_NAMEDCLASS) {
e476b1b5 5263 if (ckWARN(WARN_REGEXP))
830247a4 5264 vWARN4(RExC_parse,
b45f050a 5265 "False [] range \"%*.*s\"",
830247a4
IZ
5266 RExC_parse - rangebegin,
5267 RExC_parse - rangebegin,
b45f050a 5268 rangebegin);
73b437c8 5269 if (!SIZE_ONLY)
936ed897 5270 ANYOF_BITMAP_SET(ret, '-');
73b437c8 5271 } else
ffc61ed2
JH
5272 range = 1; /* yeah, it's a range! */
5273 continue; /* but do it the next time */
a0d0e21e 5274 }
a687059c 5275 }
ffc61ed2 5276
93733859 5277 /* now is the next time */
ae5c130c 5278 if (!SIZE_ONLY) {
3568d838
JH
5279 IV i;
5280
5281 if (prevvalue < 256) {
5282 IV ceilvalue = value < 256 ? value : 255;
5283
5284#ifdef EBCDIC
1b2d223b
JH
5285 /* In EBCDIC [\x89-\x91] should include
5286 * the \x8e but [i-j] should not. */
5287 if (literal_endpoint == 2 &&
5288 ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
5289 (isUPPER(prevvalue) && isUPPER(ceilvalue))))
ffc61ed2 5290 {
3568d838
JH
5291 if (isLOWER(prevvalue)) {
5292 for (i = prevvalue; i <= ceilvalue; i++)
ffc61ed2
JH
5293 if (isLOWER(i))
5294 ANYOF_BITMAP_SET(ret, i);
5295 } else {
3568d838 5296 for (i = prevvalue; i <= ceilvalue; i++)
ffc61ed2
JH
5297 if (isUPPER(i))
5298 ANYOF_BITMAP_SET(ret, i);
5299 }
8ada0baa 5300 }
ffc61ed2 5301 else
8ada0baa 5302#endif
a5961de5
JH
5303 for (i = prevvalue; i <= ceilvalue; i++)
5304 ANYOF_BITMAP_SET(ret, i);
3568d838 5305 }
a5961de5 5306 if (value > 255 || UTF) {
b08decb7
JH
5307 UV prevnatvalue = NATIVE_TO_UNI(prevvalue);
5308 UV natvalue = NATIVE_TO_UNI(value);
5309
ffc61ed2 5310 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
b08decb7 5311 if (prevnatvalue < natvalue) { /* what about > ? */
ffc61ed2 5312 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
b08decb7
JH
5313 prevnatvalue, natvalue);
5314 }
5315 else if (prevnatvalue == natvalue) {
5316 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue);
09091399 5317 if (FOLD) {
89ebb4a3 5318 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
254ba52a 5319 STRLEN foldlen;
2f3bf011 5320 UV f = to_uni_fold(natvalue, foldbuf, &foldlen);
254ba52a 5321
c840d2a2
JH
5322 /* If folding and foldable and a single
5323 * character, insert also the folded version
5324 * to the charclass. */
9e55ce06 5325 if (f != value) {
eb160463 5326 if (foldlen == (STRLEN)UNISKIP(f))
9e55ce06
JH
5327 Perl_sv_catpvf(aTHX_ listsv,
5328 "%04"UVxf"\n", f);
5329 else {
5330 /* Any multicharacter foldings
5331 * require the following transform:
5332 * [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst)
5333 * where E folds into "pq" and F folds
5334 * into "rst", all other characters
5335 * fold to single characters. We save
5336 * away these multicharacter foldings,
5337 * to be later saved as part of the
5338 * additional "s" data. */
5339 SV *sv;
5340
5341 if (!unicode_alternate)
5342 unicode_alternate = newAV();
5343 sv = newSVpvn((char*)foldbuf, foldlen);
5344 SvUTF8_on(sv);
5345 av_push(unicode_alternate, sv);
5346 }
5347 }
254ba52a 5348
60a8b682
JH
5349 /* If folding and the value is one of the Greek
5350 * sigmas insert a few more sigmas to make the
5351 * folding rules of the sigmas to work right.
5352 * Note that not all the possible combinations
5353 * are handled here: some of them are handled
9e55ce06
JH
5354 * by the standard folding rules, and some of
5355 * them (literal or EXACTF cases) are handled
5356 * during runtime in regexec.c:S_find_byclass(). */
09091399
JH
5357 if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) {
5358 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 5359 (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA);
09091399 5360 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 5361 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
09091399
JH
5362 }
5363 else if (value == UNICODE_GREEK_CAPITAL_LETTER_SIGMA)
5364 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 5365 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
09091399
JH
5366 }
5367 }
ffc61ed2 5368 }
1b2d223b
JH
5369#ifdef EBCDIC
5370 literal_endpoint = 0;
5371#endif
8ada0baa 5372 }
ffc61ed2
JH
5373
5374 range = 0; /* this range (if it was one) is done now */
a0d0e21e 5375 }
ffc61ed2 5376
936ed897 5377 if (need_class) {
4f66b38d 5378 ANYOF_FLAGS(ret) |= ANYOF_LARGE;
936ed897 5379 if (SIZE_ONLY)
830247a4 5380 RExC_size += ANYOF_CLASS_ADD_SKIP;
936ed897 5381 else
830247a4 5382 RExC_emit += ANYOF_CLASS_ADD_SKIP;
936ed897 5383 }
ffc61ed2 5384
ae5c130c 5385 /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
b8c5462f 5386 if (!SIZE_ONLY &&
ffc61ed2 5387 /* If the only flag is folding (plus possibly inversion). */
516a5887
JH
5388 ((ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD)
5389 ) {
a0ed51b3 5390 for (value = 0; value < 256; ++value) {
936ed897 5391 if (ANYOF_BITMAP_TEST(ret, value)) {
eb160463 5392 UV fold = PL_fold[value];
ffc61ed2
JH
5393
5394 if (fold != value)
5395 ANYOF_BITMAP_SET(ret, fold);
ae5c130c
GS
5396 }
5397 }
936ed897 5398 ANYOF_FLAGS(ret) &= ~ANYOF_FOLD;
ae5c130c 5399 }
ffc61ed2 5400
ae5c130c 5401 /* optimize inverted simple patterns (e.g. [^a-z]) */
3568d838 5402 if (!SIZE_ONLY && optimize_invert &&
ffc61ed2
JH
5403 /* If the only flag is inversion. */
5404 (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
b8c5462f 5405 for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
936ed897 5406 ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL;
1aa99e6b 5407 ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
ae5c130c 5408 }
a0d0e21e 5409
b81d288d 5410 if (!SIZE_ONLY) {
fde631ed 5411 AV *av = newAV();
ffc61ed2
JH
5412 SV *rv;
5413
9e55ce06 5414 /* The 0th element stores the character class description
6a0407ee 5415 * in its textual form: used later (regexec.c:Perl_regclass_swash())
9e55ce06
JH
5416 * to initialize the appropriate swash (which gets stored in
5417 * the 1st element), and also useful for dumping the regnode.
5418 * The 2nd element stores the multicharacter foldings,
6a0407ee 5419 * used later (regexec.c:S_reginclass()). */
ffc61ed2
JH
5420 av_store(av, 0, listsv);
5421 av_store(av, 1, NULL);
9e55ce06 5422 av_store(av, 2, (SV*)unicode_alternate);
ffc61ed2 5423 rv = newRV_noinc((SV*)av);
19860706 5424 n = add_data(pRExC_state, 1, "s");
830247a4 5425 RExC_rx->data->data[n] = (void*)rv;
ffc61ed2 5426 ARG_SET(ret, n);
a0ed51b3
LW
5427 }
5428
5429 return ret;
5430}
5431
76e3520e 5432STATIC char*
830247a4 5433S_nextchar(pTHX_ RExC_state_t *pRExC_state)
a0d0e21e 5434{
830247a4 5435 char* retval = RExC_parse++;
a0d0e21e 5436
4633a7c4 5437 for (;;) {
830247a4
IZ
5438 if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
5439 RExC_parse[2] == '#') {
e994fd66
AE
5440 while (*RExC_parse != ')') {
5441 if (RExC_parse == RExC_end)
5442 FAIL("Sequence (?#... not terminated");
830247a4 5443 RExC_parse++;
e994fd66 5444 }
830247a4 5445 RExC_parse++;
4633a7c4
LW
5446 continue;
5447 }
e2509266 5448 if (RExC_flags & PMf_EXTENDED) {
830247a4
IZ
5449 if (isSPACE(*RExC_parse)) {
5450 RExC_parse++;
748a9306
LW
5451 continue;
5452 }
830247a4 5453 else if (*RExC_parse == '#') {
e994fd66
AE
5454 while (RExC_parse < RExC_end)
5455 if (*RExC_parse++ == '\n') break;
748a9306
LW
5456 continue;
5457 }
748a9306 5458 }
4633a7c4 5459 return retval;
a0d0e21e 5460 }
a687059c
LW
5461}
5462
5463/*
c277df42 5464- reg_node - emit a node
a0d0e21e 5465*/
76e3520e 5466STATIC regnode * /* Location. */
830247a4 5467S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
a687059c 5468{
c277df42
IZ
5469 register regnode *ret;
5470 register regnode *ptr;
a687059c 5471
830247a4 5472 ret = RExC_emit;
c277df42 5473 if (SIZE_ONLY) {
830247a4
IZ
5474 SIZE_ALIGN(RExC_size);
5475 RExC_size += 1;
a0d0e21e
LW
5476 return(ret);
5477 }
a687059c 5478
c277df42 5479 NODE_ALIGN_FILL(ret);
a0d0e21e 5480 ptr = ret;
c277df42 5481 FILL_ADVANCE_NODE(ptr, op);
fac92740 5482 if (RExC_offsets) { /* MJD */
ccb2c380 5483 MJD_OFFSET_DEBUG(("%s:%u: (op %s) %s %u <- %u (len %u) (max %u).\n",
fac92740
MJD
5484 "reg_node", __LINE__,
5485 reg_name[op],
5486 RExC_emit - RExC_emit_start > RExC_offsets[0]
5487 ? "Overwriting end of array!\n" : "OK",
5488 RExC_emit - RExC_emit_start,
5489 RExC_parse - RExC_start,
5490 RExC_offsets[0]));
ccb2c380 5491 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
fac92740
MJD
5492 }
5493
830247a4 5494 RExC_emit = ptr;
a687059c 5495
a0d0e21e 5496 return(ret);
a687059c
LW
5497}
5498
5499/*
a0d0e21e
LW
5500- reganode - emit a node with an argument
5501*/
76e3520e 5502STATIC regnode * /* Location. */
830247a4 5503S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
fe14fcc3 5504{
c277df42
IZ
5505 register regnode *ret;
5506 register regnode *ptr;
fe14fcc3 5507
830247a4 5508 ret = RExC_emit;
c277df42 5509 if (SIZE_ONLY) {
830247a4
IZ
5510 SIZE_ALIGN(RExC_size);
5511 RExC_size += 2;
a0d0e21e
LW
5512 return(ret);
5513 }
fe14fcc3 5514
c277df42 5515 NODE_ALIGN_FILL(ret);
a0d0e21e 5516 ptr = ret;
c277df42 5517 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
fac92740 5518 if (RExC_offsets) { /* MJD */
ccb2c380 5519 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %u <- %u (max %u).\n",
fac92740 5520 "reganode",
ccb2c380
MP
5521 __LINE__,
5522 reg_name[op],
fac92740
MJD
5523 RExC_emit - RExC_emit_start > RExC_offsets[0] ?
5524 "Overwriting end of array!\n" : "OK",
5525 RExC_emit - RExC_emit_start,
5526 RExC_parse - RExC_start,
5527 RExC_offsets[0]));
ccb2c380 5528 Set_Cur_Node_Offset;
fac92740
MJD
5529 }
5530
830247a4 5531 RExC_emit = ptr;
fe14fcc3 5532
a0d0e21e 5533 return(ret);
fe14fcc3
LW
5534}
5535
5536/*
cd439c50 5537- reguni - emit (if appropriate) a Unicode character
a0ed51b3
LW
5538*/
5539STATIC void
830247a4 5540S_reguni(pTHX_ RExC_state_t *pRExC_state, UV uv, char* s, STRLEN* lenp)
a0ed51b3 5541{
5e12f4fb 5542 *lenp = SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
a0ed51b3
LW
5543}
5544
5545/*
a0d0e21e
LW
5546- reginsert - insert an operator in front of already-emitted operand
5547*
5548* Means relocating the operand.
5549*/
76e3520e 5550STATIC void
830247a4 5551S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd)
a687059c 5552{
c277df42
IZ
5553 register regnode *src;
5554 register regnode *dst;
5555 register regnode *place;
5556 register int offset = regarglen[(U8)op];
b81d288d 5557
22c35a8c 5558/* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
c277df42
IZ
5559
5560 if (SIZE_ONLY) {
830247a4 5561 RExC_size += NODE_STEP_REGNODE + offset;
a0d0e21e
LW
5562 return;
5563 }
a687059c 5564
830247a4
IZ
5565 src = RExC_emit;
5566 RExC_emit += NODE_STEP_REGNODE + offset;
5567 dst = RExC_emit;
fac92740 5568 while (src > opnd) {
c277df42 5569 StructCopy(--src, --dst, regnode);
fac92740 5570 if (RExC_offsets) { /* MJD 20010112 */
ccb2c380 5571 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %u -> %u (max %u).\n",
fac92740 5572 "reg_insert",
ccb2c380
MP
5573 __LINE__,
5574 reg_name[op],
fac92740
MJD
5575 dst - RExC_emit_start > RExC_offsets[0]
5576 ? "Overwriting end of array!\n" : "OK",
5577 src - RExC_emit_start,
5578 dst - RExC_emit_start,
5579 RExC_offsets[0]));
ccb2c380
MP
5580 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
5581 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
fac92740
MJD
5582 }
5583 }
5584
a0d0e21e
LW
5585
5586 place = opnd; /* Op node, where operand used to be. */
fac92740 5587 if (RExC_offsets) { /* MJD */
ccb2c380 5588 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %u <- %u (max %u).\n",
fac92740 5589 "reginsert",
ccb2c380
MP
5590 __LINE__,
5591 reg_name[op],
fac92740
MJD
5592 place - RExC_emit_start > RExC_offsets[0]
5593 ? "Overwriting end of array!\n" : "OK",
5594 place - RExC_emit_start,
5595 RExC_parse - RExC_start,
5596 RExC_offsets[0]));
ccb2c380 5597 Set_Node_Offset(place, RExC_parse);
45948336 5598 Set_Node_Length(place, 1);
fac92740 5599 }
c277df42
IZ
5600 src = NEXTOPER(place);
5601 FILL_ADVANCE_NODE(place, op);
5602 Zero(src, offset, regnode);
a687059c
LW
5603}
5604
5605/*
c277df42 5606- regtail - set the next-pointer at the end of a node chain of p to val.
a0d0e21e 5607*/
76e3520e 5608STATIC void
830247a4 5609S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, regnode *val)
a687059c 5610{
c277df42
IZ
5611 register regnode *scan;
5612 register regnode *temp;
a0d0e21e 5613
c277df42 5614 if (SIZE_ONLY)
a0d0e21e
LW
5615 return;
5616
5617 /* Find last node. */
5618 scan = p;
5619 for (;;) {
5620 temp = regnext(scan);
5621 if (temp == NULL)
5622 break;
5623 scan = temp;
5624 }
a687059c 5625
c277df42
IZ
5626 if (reg_off_by_arg[OP(scan)]) {
5627 ARG_SET(scan, val - scan);
a0ed51b3
LW
5628 }
5629 else {
c277df42
IZ
5630 NEXT_OFF(scan) = val - scan;
5631 }
a687059c
LW
5632}
5633
5634/*
a0d0e21e
LW
5635- regoptail - regtail on operand of first argument; nop if operandless
5636*/
76e3520e 5637STATIC void
830247a4 5638S_regoptail(pTHX_ RExC_state_t *pRExC_state, regnode *p, regnode *val)
a687059c 5639{
a0d0e21e 5640 /* "Operandless" and "op != BRANCH" are synonymous in practice. */
c277df42
IZ
5641 if (p == NULL || SIZE_ONLY)
5642 return;
22c35a8c 5643 if (PL_regkind[(U8)OP(p)] == BRANCH) {
830247a4 5644 regtail(pRExC_state, NEXTOPER(p), val);
a0ed51b3 5645 }
22c35a8c 5646 else if ( PL_regkind[(U8)OP(p)] == BRANCHJ) {
830247a4 5647 regtail(pRExC_state, NEXTOPER(NEXTOPER(p)), val);
a0ed51b3
LW
5648 }
5649 else
a0d0e21e 5650 return;
a687059c
LW
5651}
5652
5653/*
5654 - regcurly - a little FSA that accepts {\d+,?\d*}
5655 */
79072805 5656STATIC I32
cea2e8a9 5657S_regcurly(pTHX_ register char *s)
a687059c
LW
5658{
5659 if (*s++ != '{')
5660 return FALSE;
f0fcb552 5661 if (!isDIGIT(*s))
a687059c 5662 return FALSE;
f0fcb552 5663 while (isDIGIT(*s))
a687059c
LW
5664 s++;
5665 if (*s == ',')
5666 s++;
f0fcb552 5667 while (isDIGIT(*s))
a687059c
LW
5668 s++;
5669 if (*s != '}')
5670 return FALSE;
5671 return TRUE;
5672}
5673
a687059c 5674
8fa7f367
JH
5675#ifdef DEBUGGING
5676
76e3520e 5677STATIC regnode *
cea2e8a9 5678S_dumpuntil(pTHX_ regnode *start, regnode *node, regnode *last, SV* sv, I32 l)
c277df42 5679{
f248d071 5680 register U8 op = EXACT; /* Arbitrary non-END op. */
155aba94 5681 register regnode *next;
c277df42
IZ
5682
5683 while (op != END && (!last || node < last)) {
5684 /* While that wasn't END last time... */
5685
5686 NODE_ALIGN(node);
5687 op = OP(node);
5688 if (op == CLOSE)
5689 l--;
5690 next = regnext(node);
5691 /* Where, what. */
5692 if (OP(node) == OPTIMIZED)
5693 goto after_print;
5694 regprop(sv, node);
b900a521 5695 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
f1dbda3d 5696 (int)(2*l + 1), "", SvPVX(sv));
c277df42
IZ
5697 if (next == NULL) /* Next ptr. */
5698 PerlIO_printf(Perl_debug_log, "(0)");
b81d288d 5699 else
b900a521 5700 PerlIO_printf(Perl_debug_log, "(%"IVdf")", (IV)(next - start));
c277df42
IZ
5701 (void)PerlIO_putc(Perl_debug_log, '\n');
5702 after_print:
22c35a8c 5703 if (PL_regkind[(U8)op] == BRANCHJ) {
b81d288d
AB
5704 register regnode *nnode = (OP(next) == LONGJMP
5705 ? regnext(next)
c277df42
IZ
5706 : next);
5707 if (last && nnode > last)
5708 nnode = last;
5709 node = dumpuntil(start, NEXTOPER(NEXTOPER(node)), nnode, sv, l + 1);
a0ed51b3 5710 }
22c35a8c 5711 else if (PL_regkind[(U8)op] == BRANCH) {
c277df42 5712 node = dumpuntil(start, NEXTOPER(node), next, sv, l + 1);
a0ed51b3 5713 }
a3621e74 5714 else if ( PL_regkind[(U8)op] == TRIE ) {
e1ec3a88
AL
5715 const I32 n = ARG(node);
5716 const reg_trie_data *trie = (reg_trie_data*)PL_regdata->data[n];
5717 const I32 arry_len = av_len(trie->words)+1;
a3621e74 5718 I32 word_idx;
a3621e74 5719 PerlIO_printf(Perl_debug_log,
e4584336 5720 "%*s[Words:%d Chars Stored:%d Unique Chars:%d States:%"IVdf"%s]\n",
a3621e74
YO
5721 (int)(2*(l+3)), "",
5722 trie->wordcount,
5723 trie->charcount,
5724 trie->uniquecharcount,
e4584336 5725 (IV)trie->laststate-1,
a3621e74
YO
5726 node->flags ? " EVAL mode" : "");
5727
5728 for (word_idx=0; word_idx < arry_len; word_idx++) {
5729 SV **elem_ptr=av_fetch(trie->words,word_idx,0);
5730 if (elem_ptr) {
5731 PerlIO_printf(Perl_debug_log, "%*s<%s%s%s>\n",
5732 (int)(2*(l+4)), "",
5733 PL_colors[0],
5734 SvPV_nolen(*elem_ptr),
5735 PL_colors[1]
5736 );
5737 /*
5738 if (next == NULL)
5739 PerlIO_printf(Perl_debug_log, "(0)\n");
5740 else
5741 PerlIO_printf(Perl_debug_log, "(%"IVdf")\n", (IV)(next - start));
5742 */
5743 }
5744
5745 }
5746
5747 node = NEXTOPER(node);
5748 node += regarglen[(U8)op];
5749
5750 }
a0ed51b3 5751 else if ( op == CURLY) { /* `next' might be very big: optimizer */
c277df42
IZ
5752 node = dumpuntil(start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
5753 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1, sv, l + 1);
a0ed51b3 5754 }
22c35a8c 5755 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
c277df42
IZ
5756 node = dumpuntil(start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
5757 next, sv, l + 1);
a0ed51b3
LW
5758 }
5759 else if ( op == PLUS || op == STAR) {
c277df42 5760 node = dumpuntil(start, NEXTOPER(node), NEXTOPER(node) + 1, sv, l + 1);
a0ed51b3
LW
5761 }
5762 else if (op == ANYOF) {
4f66b38d
HS
5763 /* arglen 1 + class block */
5764 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_LARGE)
5765 ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
5766 node = NEXTOPER(node);
a0ed51b3 5767 }
22c35a8c 5768 else if (PL_regkind[(U8)op] == EXACT) {
c277df42 5769 /* Literal string, where present. */
cd439c50 5770 node += NODE_SZ_STR(node) - 1;
c277df42 5771 node = NEXTOPER(node);
a0ed51b3
LW
5772 }
5773 else {
c277df42
IZ
5774 node = NEXTOPER(node);
5775 node += regarglen[(U8)op];
5776 }
5777 if (op == CURLYX || op == OPEN)
5778 l++;
5779 else if (op == WHILEM)
5780 l--;
5781 }
5782 return node;
5783}
5784
8fa7f367
JH
5785#endif /* DEBUGGING */
5786
a687059c 5787/*
fd181c75 5788 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
a687059c
LW
5789 */
5790void
864dbfa3 5791Perl_regdump(pTHX_ regexp *r)
a687059c 5792{
35ff7856 5793#ifdef DEBUGGING
46fc3d4c 5794 SV *sv = sv_newmortal();
a687059c 5795
c277df42 5796 (void)dumpuntil(r->program, r->program + 1, NULL, sv, 0);
a0d0e21e
LW
5797
5798 /* Header fields of interest. */
c277df42 5799 if (r->anchored_substr)
7b0972df 5800 PerlIO_printf(Perl_debug_log,
b81d288d 5801 "anchored `%s%.*s%s'%s at %"IVdf" ",
3280af22 5802 PL_colors[0],
7b0972df 5803 (int)(SvCUR(r->anchored_substr) - (SvTAIL(r->anchored_substr)!=0)),
b81d288d 5804 SvPVX(r->anchored_substr),
3280af22 5805 PL_colors[1],
c277df42 5806 SvTAIL(r->anchored_substr) ? "$" : "",
7b0972df 5807 (IV)r->anchored_offset);
33b8afdf
JH
5808 else if (r->anchored_utf8)
5809 PerlIO_printf(Perl_debug_log,
5810 "anchored utf8 `%s%.*s%s'%s at %"IVdf" ",
5811 PL_colors[0],
5812 (int)(SvCUR(r->anchored_utf8) - (SvTAIL(r->anchored_utf8)!=0)),
5813 SvPVX(r->anchored_utf8),
5814 PL_colors[1],
5815 SvTAIL(r->anchored_utf8) ? "$" : "",
5816 (IV)r->anchored_offset);
c277df42 5817 if (r->float_substr)
7b0972df 5818 PerlIO_printf(Perl_debug_log,
b81d288d 5819 "floating `%s%.*s%s'%s at %"IVdf"..%"UVuf" ",
3280af22 5820 PL_colors[0],
b81d288d 5821 (int)(SvCUR(r->float_substr) - (SvTAIL(r->float_substr)!=0)),
2c2d71f5 5822 SvPVX(r->float_substr),
3280af22 5823 PL_colors[1],
c277df42 5824 SvTAIL(r->float_substr) ? "$" : "",
7b0972df 5825 (IV)r->float_min_offset, (UV)r->float_max_offset);
33b8afdf
JH
5826 else if (r->float_utf8)
5827 PerlIO_printf(Perl_debug_log,
5828 "floating utf8 `%s%.*s%s'%s at %"IVdf"..%"UVuf" ",
5829 PL_colors[0],
5830 (int)(SvCUR(r->float_utf8) - (SvTAIL(r->float_utf8)!=0)),
5831 SvPVX(r->float_utf8),
5832 PL_colors[1],
5833 SvTAIL(r->float_utf8) ? "$" : "",
5834 (IV)r->float_min_offset, (UV)r->float_max_offset);
5835 if (r->check_substr || r->check_utf8)
b81d288d
AB
5836 PerlIO_printf(Perl_debug_log,
5837 r->check_substr == r->float_substr
33b8afdf 5838 && r->check_utf8 == r->float_utf8
c277df42
IZ
5839 ? "(checking floating" : "(checking anchored");
5840 if (r->reganch & ROPT_NOSCAN)
5841 PerlIO_printf(Perl_debug_log, " noscan");
5842 if (r->reganch & ROPT_CHECK_ALL)
5843 PerlIO_printf(Perl_debug_log, " isall");
33b8afdf 5844 if (r->check_substr || r->check_utf8)
c277df42
IZ
5845 PerlIO_printf(Perl_debug_log, ") ");
5846
46fc3d4c 5847 if (r->regstclass) {
5848 regprop(sv, r->regstclass);
5849 PerlIO_printf(Perl_debug_log, "stclass `%s' ", SvPVX(sv));
5850 }
774d564b 5851 if (r->reganch & ROPT_ANCH) {
5852 PerlIO_printf(Perl_debug_log, "anchored");
5853 if (r->reganch & ROPT_ANCH_BOL)
5854 PerlIO_printf(Perl_debug_log, "(BOL)");
c277df42
IZ
5855 if (r->reganch & ROPT_ANCH_MBOL)
5856 PerlIO_printf(Perl_debug_log, "(MBOL)");
cad2e5aa
JH
5857 if (r->reganch & ROPT_ANCH_SBOL)
5858 PerlIO_printf(Perl_debug_log, "(SBOL)");
774d564b 5859 if (r->reganch & ROPT_ANCH_GPOS)
5860 PerlIO_printf(Perl_debug_log, "(GPOS)");
5861 PerlIO_putc(Perl_debug_log, ' ');
5862 }
c277df42
IZ
5863 if (r->reganch & ROPT_GPOS_SEEN)
5864 PerlIO_printf(Perl_debug_log, "GPOS ");
a0d0e21e 5865 if (r->reganch & ROPT_SKIP)
760ac839 5866 PerlIO_printf(Perl_debug_log, "plus ");
a0d0e21e 5867 if (r->reganch & ROPT_IMPLICIT)
760ac839 5868 PerlIO_printf(Perl_debug_log, "implicit ");
760ac839 5869 PerlIO_printf(Perl_debug_log, "minlen %ld ", (long) r->minlen);
ce862d02
IZ
5870 if (r->reganch & ROPT_EVAL_SEEN)
5871 PerlIO_printf(Perl_debug_log, "with eval ");
760ac839 5872 PerlIO_printf(Perl_debug_log, "\n");
fac92740 5873 if (r->offsets) {
e4584336
RB
5874 U32 i;
5875 const U32 len = r->offsets[0];
a3621e74
YO
5876 GET_RE_DEBUG_FLAGS_DECL;
5877 DEBUG_OFFSETS_r({
e4584336
RB
5878 PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)r->offsets[0]);
5879 for (i = 1; i <= len; i++)
5880 PerlIO_printf(Perl_debug_log, "%"UVuf"[%"UVuf"] ",
5881 (UV)r->offsets[i*2-1], (UV)r->offsets[i*2]);
5882 PerlIO_printf(Perl_debug_log, "\n");
a3621e74 5883 });
fac92740 5884 }
17c3b450 5885#endif /* DEBUGGING */
a687059c
LW
5886}
5887
8fa7f367
JH
5888#ifdef DEBUGGING
5889
653099ff
GS
5890STATIC void
5891S_put_byte(pTHX_ SV *sv, int c)
5892{
7be5a6cf 5893 if (isCNTRL(c) || c == 255 || !isPRINT(c))
653099ff
GS
5894 Perl_sv_catpvf(aTHX_ sv, "\\%o", c);
5895 else if (c == '-' || c == ']' || c == '\\' || c == '^')
5896 Perl_sv_catpvf(aTHX_ sv, "\\%c", c);
5897 else
5898 Perl_sv_catpvf(aTHX_ sv, "%c", c);
5899}
5900
8fa7f367
JH
5901#endif /* DEBUGGING */
5902
a3621e74 5903
a687059c 5904/*
a0d0e21e
LW
5905- regprop - printable representation of opcode
5906*/
46fc3d4c 5907void
864dbfa3 5908Perl_regprop(pTHX_ SV *sv, regnode *o)
a687059c 5909{
35ff7856 5910#ifdef DEBUGGING
9b155405 5911 register int k;
a0d0e21e 5912
54dc92de 5913 sv_setpvn(sv, "", 0);
9b155405 5914 if (OP(o) >= reg_num) /* regnode.type is unsigned */
830247a4
IZ
5915 /* It would be nice to FAIL() here, but this may be called from
5916 regexec.c, and it would be hard to supply pRExC_state. */
5917 Perl_croak(aTHX_ "Corrupted regexp opcode");
bfed75c6 5918 sv_catpv(sv, reg_name[OP(o)]); /* Take off const! */
9b155405
IZ
5919
5920 k = PL_regkind[(U8)OP(o)];
5921
2a782b5b
JH
5922 if (k == EXACT) {
5923 SV *dsv = sv_2mortal(newSVpvn("", 0));
c728cb41
JH
5924 /* Using is_utf8_string() is a crude hack but it may
5925 * be the best for now since we have no flag "this EXACTish
5926 * node was UTF-8" --jhi */
5927 bool do_utf8 = is_utf8_string((U8*)STRING(o), STR_LEN(o));
8a989385 5928 char *s = do_utf8 ?
c728cb41
JH
5929 pv_uni_display(dsv, (U8*)STRING(o), STR_LEN(o), 60,
5930 UNI_DISPLAY_REGEX) :
2a782b5b 5931 STRING(o);
e1ec3a88 5932 const int len = do_utf8 ?
2a782b5b
JH
5933 strlen(s) :
5934 STR_LEN(o);
5935 Perl_sv_catpvf(aTHX_ sv, " <%s%.*s%s>",
5936 PL_colors[0],
5937 len, s,
5938 PL_colors[1]);
a3621e74
YO
5939 } else if (k == TRIE) {/*
5940 this isn't always safe, as Pl_regdata may not be for this regex yet
5941 (depending on where its called from) so its being moved to dumpuntil
5942 I32 n = ARG(o);
5943 reg_trie_data *trie=(reg_trie_data*)PL_regdata->data[n];
5944 Perl_sv_catpvf(aTHX_ sv, " (W:%d L:%d C:%d S:%d)",
5945 trie->wordcount,
5946 trie->charcount,
5947 trie->uniquecharcount,
5948 trie->laststate);
5949 */
5950 } else if (k == CURLY) {
cb434fcc 5951 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
cea2e8a9
GS
5952 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
5953 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
a0d0e21e 5954 }
2c2d71f5
JH
5955 else if (k == WHILEM && o->flags) /* Ordinal/of */
5956 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
9b155405 5957 else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP )
894356b3 5958 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
9b155405 5959 else if (k == LOGICAL)
04ebc1ab 5960 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
653099ff
GS
5961 else if (k == ANYOF) {
5962 int i, rangestart = -1;
ffc61ed2 5963 U8 flags = ANYOF_FLAGS(o);
a6d05634 5964 const char * const anyofs[] = { /* Should be synchronized with
19860706 5965 * ANYOF_ #xdefines in regcomp.h */
653099ff
GS
5966 "\\w",
5967 "\\W",
5968 "\\s",
5969 "\\S",
5970 "\\d",
5971 "\\D",
5972 "[:alnum:]",
5973 "[:^alnum:]",
5974 "[:alpha:]",
5975 "[:^alpha:]",
5976 "[:ascii:]",
5977 "[:^ascii:]",
5978 "[:ctrl:]",
5979 "[:^ctrl:]",
5980 "[:graph:]",
5981 "[:^graph:]",
5982 "[:lower:]",
5983 "[:^lower:]",
5984 "[:print:]",
5985 "[:^print:]",
5986 "[:punct:]",
5987 "[:^punct:]",
5988 "[:upper:]",
aaa51d5e 5989 "[:^upper:]",
653099ff 5990 "[:xdigit:]",
aaa51d5e
JF
5991 "[:^xdigit:]",
5992 "[:space:]",
5993 "[:^space:]",
5994 "[:blank:]",
5995 "[:^blank:]"
653099ff
GS
5996 };
5997
19860706 5998 if (flags & ANYOF_LOCALE)
653099ff 5999 sv_catpv(sv, "{loc}");
19860706 6000 if (flags & ANYOF_FOLD)
653099ff
GS
6001 sv_catpv(sv, "{i}");
6002 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
19860706 6003 if (flags & ANYOF_INVERT)
653099ff 6004 sv_catpv(sv, "^");
ffc61ed2
JH
6005 for (i = 0; i <= 256; i++) {
6006 if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
6007 if (rangestart == -1)
6008 rangestart = i;
6009 } else if (rangestart != -1) {
6010 if (i <= rangestart + 3)
6011 for (; rangestart < i; rangestart++)
653099ff 6012 put_byte(sv, rangestart);
ffc61ed2
JH
6013 else {
6014 put_byte(sv, rangestart);
6015 sv_catpv(sv, "-");
6016 put_byte(sv, i - 1);
653099ff 6017 }
ffc61ed2 6018 rangestart = -1;
653099ff 6019 }
847a199f 6020 }
ffc61ed2
JH
6021
6022 if (o->flags & ANYOF_CLASS)
6023 for (i = 0; i < sizeof(anyofs)/sizeof(char*); i++)
6024 if (ANYOF_CLASS_TEST(o,i))
6025 sv_catpv(sv, anyofs[i]);
6026
6027 if (flags & ANYOF_UNICODE)
6028 sv_catpv(sv, "{unicode}");
1aa99e6b 6029 else if (flags & ANYOF_UNICODE_ALL)
2a782b5b 6030 sv_catpv(sv, "{unicode_all}");
ffc61ed2
JH
6031
6032 {
6033 SV *lv;
9e55ce06 6034 SV *sw = regclass_swash(o, FALSE, &lv, 0);
b81d288d 6035
ffc61ed2
JH
6036 if (lv) {
6037 if (sw) {
89ebb4a3 6038 U8 s[UTF8_MAXBYTES_CASE+1];
b81d288d 6039
ffc61ed2 6040 for (i = 0; i <= 256; i++) { /* just the first 256 */
2b9d42f0 6041 U8 *e = uvchr_to_utf8(s, i);
ffc61ed2 6042
3568d838 6043 if (i < 256 && swash_fetch(sw, s, TRUE)) {
ffc61ed2
JH
6044 if (rangestart == -1)
6045 rangestart = i;
6046 } else if (rangestart != -1) {
6047 U8 *p;
b81d288d 6048
ffc61ed2
JH
6049 if (i <= rangestart + 3)
6050 for (; rangestart < i; rangestart++) {
2b9d42f0 6051 for(e = uvchr_to_utf8(s, rangestart), p = s; p < e; p++)
ffc61ed2
JH
6052 put_byte(sv, *p);
6053 }
6054 else {
2b9d42f0 6055 for (e = uvchr_to_utf8(s, rangestart), p = s; p < e; p++)
ffc61ed2
JH
6056 put_byte(sv, *p);
6057 sv_catpv(sv, "-");
2b9d42f0 6058 for (e = uvchr_to_utf8(s, i - 1), p = s; p < e; p++)
ffc61ed2
JH
6059 put_byte(sv, *p);
6060 }
6061 rangestart = -1;
6062 }
19860706 6063 }
ffc61ed2
JH
6064
6065 sv_catpv(sv, "..."); /* et cetera */
19860706 6066 }
fde631ed 6067
ffc61ed2 6068 {
2e0de35c 6069 char *s = savesvpv(lv);
ffc61ed2 6070 char *origs = s;
b81d288d 6071
ffc61ed2 6072 while(*s && *s != '\n') s++;
b81d288d 6073
ffc61ed2
JH
6074 if (*s == '\n') {
6075 char *t = ++s;
6076
6077 while (*s) {
6078 if (*s == '\n')
6079 *s = ' ';
6080 s++;
6081 }
6082 if (s[-1] == ' ')
6083 s[-1] = 0;
6084
6085 sv_catpv(sv, t);
fde631ed 6086 }
b81d288d 6087
ffc61ed2 6088 Safefree(origs);
fde631ed
JH
6089 }
6090 }
653099ff 6091 }
ffc61ed2 6092
653099ff
GS
6093 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
6094 }
9b155405 6095 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
cea2e8a9 6096 Perl_sv_catpvf(aTHX_ sv, "[-%d]", o->flags);
17c3b450 6097#endif /* DEBUGGING */
35ff7856 6098}
a687059c 6099
cad2e5aa
JH
6100SV *
6101Perl_re_intuit_string(pTHX_ regexp *prog)
6102{ /* Assume that RE_INTUIT is set */
a3621e74
YO
6103 GET_RE_DEBUG_FLAGS_DECL;
6104 DEBUG_COMPILE_r(
cad2e5aa 6105 { STRLEN n_a;
33b8afdf
JH
6106 char *s = SvPV(prog->check_substr
6107 ? prog->check_substr : prog->check_utf8, n_a);
cad2e5aa
JH
6108
6109 if (!PL_colorset) reginitcolors();
6110 PerlIO_printf(Perl_debug_log,
33b8afdf
JH
6111 "%sUsing REx %ssubstr:%s `%s%.60s%s%s'\n",
6112 PL_colors[4],
6113 prog->check_substr ? "" : "utf8 ",
6114 PL_colors[5],PL_colors[0],
cad2e5aa
JH
6115 s,
6116 PL_colors[1],
6117 (strlen(s) > 60 ? "..." : ""));
6118 } );
6119
33b8afdf 6120 return prog->check_substr ? prog->check_substr : prog->check_utf8;
cad2e5aa
JH
6121}
6122
2b69d0c2 6123void
864dbfa3 6124Perl_pregfree(pTHX_ struct regexp *r)
a687059c 6125{
9e55ce06
JH
6126#ifdef DEBUGGING
6127 SV *dsv = PERL_DEBUG_PAD_ZERO(0);
a3621e74 6128 SV *re_debug_flags=get_sv(RE_DEBUG_FLAGS,0);
9e55ce06 6129#endif
7821416a 6130
a3621e74 6131
7821416a
IZ
6132 if (!r || (--r->refcnt > 0))
6133 return;
a3621e74 6134 DEBUG_r(if (re_debug_flags && (SvIV(re_debug_flags) & RE_DEBUG_COMPILE)) {
e1ec3a88
AL
6135 const char *s = (r->reganch & ROPT_UTF8)
6136 ? pv_uni_display(dsv, (U8*)r->precomp, r->prelen, 60, UNI_DISPLAY_REGEX)
9f369894 6137 : pv_display(dsv, r->precomp, r->prelen, 0, 60);
e1ec3a88 6138 const int len = SvCUR(dsv);
9e55ce06
JH
6139 if (!PL_colorset)
6140 reginitcolors();
6141 PerlIO_printf(Perl_debug_log,
a3621e74 6142 "%sFreeing REx:%s %s%*.*s%s%s\n",
9e55ce06
JH
6143 PL_colors[4],PL_colors[5],PL_colors[0],
6144 len, len, s,
6145 PL_colors[1],
6146 len > 60 ? "..." : "");
6147 });
cad2e5aa 6148
c277df42 6149 if (r->precomp)
a0d0e21e 6150 Safefree(r->precomp);
fac92740
MJD
6151 if (r->offsets) /* 20010421 MJD */
6152 Safefree(r->offsets);
ed252734
NC
6153 RX_MATCH_COPY_FREE(r);
6154#ifdef PERL_COPY_ON_WRITE
6155 if (r->saved_copy)
6156 SvREFCNT_dec(r->saved_copy);
6157#endif
a193d654
GS
6158 if (r->substrs) {
6159 if (r->anchored_substr)
6160 SvREFCNT_dec(r->anchored_substr);
33b8afdf
JH
6161 if (r->anchored_utf8)
6162 SvREFCNT_dec(r->anchored_utf8);
a193d654
GS
6163 if (r->float_substr)
6164 SvREFCNT_dec(r->float_substr);
33b8afdf
JH
6165 if (r->float_utf8)
6166 SvREFCNT_dec(r->float_utf8);
2779dcf1 6167 Safefree(r->substrs);
a193d654 6168 }
c277df42
IZ
6169 if (r->data) {
6170 int n = r->data->count;
f3548bdc
DM
6171 PAD* new_comppad = NULL;
6172 PAD* old_comppad;
4026c95a 6173 PADOFFSET refcnt;
dfad63ad 6174
c277df42 6175 while (--n >= 0) {
261faec3 6176 /* If you add a ->what type here, update the comment in regcomp.h */
c277df42
IZ
6177 switch (r->data->what[n]) {
6178 case 's':
6179 SvREFCNT_dec((SV*)r->data->data[n]);
6180 break;
653099ff
GS
6181 case 'f':
6182 Safefree(r->data->data[n]);
6183 break;
dfad63ad
HS
6184 case 'p':
6185 new_comppad = (AV*)r->data->data[n];
6186 break;
c277df42 6187 case 'o':
dfad63ad 6188 if (new_comppad == NULL)
cea2e8a9 6189 Perl_croak(aTHX_ "panic: pregfree comppad");
f3548bdc
DM
6190 PAD_SAVE_LOCAL(old_comppad,
6191 /* Watch out for global destruction's random ordering. */
6192 (SvTYPE(new_comppad) == SVt_PVAV) ?
6193 new_comppad : Null(PAD *)
6194 );
b34c0dd4 6195 OP_REFCNT_LOCK;
4026c95a
SH
6196 refcnt = OpREFCNT_dec((OP_4tree*)r->data->data[n]);
6197 OP_REFCNT_UNLOCK;
6198 if (!refcnt)
9b978d73 6199 op_free((OP_4tree*)r->data->data[n]);
9b978d73 6200
f3548bdc 6201 PAD_RESTORE_LOCAL(old_comppad);
dfad63ad
HS
6202 SvREFCNT_dec((SV*)new_comppad);
6203 new_comppad = NULL;
c277df42
IZ
6204 break;
6205 case 'n':
9e55ce06 6206 break;
a3621e74
YO
6207 case 't':
6208 {
6209 reg_trie_data *trie=(reg_trie_data*)r->data->data[n];
6210 U32 refcount;
6211 OP_REFCNT_LOCK;
6212 refcount = trie->refcount--;
6213 OP_REFCNT_UNLOCK;
6214 if ( !refcount ) {
6215 if (trie->charmap)
6216 Safefree(trie->charmap);
6217 if (trie->widecharmap)
6218 SvREFCNT_dec((SV*)trie->widecharmap);
6219 if (trie->states)
6220 Safefree(trie->states);
6221 if (trie->trans)
6222 Safefree(trie->trans);
6223#ifdef DEBUGGING
6224 if (trie->words)
6225 SvREFCNT_dec((SV*)trie->words);
6226 if (trie->revcharmap)
6227 SvREFCNT_dec((SV*)trie->revcharmap);
6228#endif
6229 Safefree(r->data->data[n]); /* do this last!!!! */
6230 }
6231 break;
6232 }
c277df42 6233 default:
830247a4 6234 Perl_croak(aTHX_ "panic: regfree data code '%c'", r->data->what[n]);
c277df42
IZ
6235 }
6236 }
6237 Safefree(r->data->what);
6238 Safefree(r->data);
a0d0e21e
LW
6239 }
6240 Safefree(r->startp);
6241 Safefree(r->endp);
6242 Safefree(r);
a687059c 6243}
c277df42
IZ
6244
6245/*
6246 - regnext - dig the "next" pointer out of a node
c277df42
IZ
6247 */
6248regnode *
864dbfa3 6249Perl_regnext(pTHX_ register regnode *p)
c277df42
IZ
6250{
6251 register I32 offset;
6252
3280af22 6253 if (p == &PL_regdummy)
c277df42
IZ
6254 return(NULL);
6255
6256 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
6257 if (offset == 0)
6258 return(NULL);
6259
c277df42 6260 return(p+offset);
c277df42
IZ
6261}
6262
01f988be 6263STATIC void
cea2e8a9 6264S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
c277df42
IZ
6265{
6266 va_list args;
6267 STRLEN l1 = strlen(pat1);
6268 STRLEN l2 = strlen(pat2);
6269 char buf[512];
06bf62c7 6270 SV *msv;
73d840c0 6271 const char *message;
c277df42
IZ
6272
6273 if (l1 > 510)
6274 l1 = 510;
6275 if (l1 + l2 > 510)
6276 l2 = 510 - l1;
6277 Copy(pat1, buf, l1 , char);
6278 Copy(pat2, buf + l1, l2 , char);
3b818b81
GS
6279 buf[l1 + l2] = '\n';
6280 buf[l1 + l2 + 1] = '\0';
8736538c
AS
6281#ifdef I_STDARG
6282 /* ANSI variant takes additional second argument */
c277df42 6283 va_start(args, pat2);
8736538c
AS
6284#else
6285 va_start(args);
6286#endif
5a844595 6287 msv = vmess(buf, &args);
c277df42 6288 va_end(args);
06bf62c7 6289 message = SvPV(msv,l1);
c277df42
IZ
6290 if (l1 > 512)
6291 l1 = 512;
6292 Copy(message, buf, l1 , char);
197cf9b9 6293 buf[l1-1] = '\0'; /* Overwrite \n */
cea2e8a9 6294 Perl_croak(aTHX_ "%s", buf);
c277df42 6295}
a0ed51b3
LW
6296
6297/* XXX Here's a total kludge. But we need to re-enter for swash routines. */
6298
6299void
864dbfa3 6300Perl_save_re_context(pTHX)
b81d288d 6301{
830247a4 6302 SAVEI32(PL_reg_flags); /* from regexec.c */
a0ed51b3 6303 SAVEPPTR(PL_bostr);
a0ed51b3
LW
6304 SAVEPPTR(PL_reginput); /* String-input pointer. */
6305 SAVEPPTR(PL_regbol); /* Beginning of input, for ^ check. */
6306 SAVEPPTR(PL_regeol); /* End of input, for $ check. */
7766f137
GS
6307 SAVEVPTR(PL_regstartp); /* Pointer to startp array. */
6308 SAVEVPTR(PL_regendp); /* Ditto for endp. */
6309 SAVEVPTR(PL_reglastparen); /* Similarly for lastparen. */
a5db57d6 6310 SAVEVPTR(PL_reglastcloseparen); /* Similarly for lastcloseparen. */
a0ed51b3 6311 SAVEPPTR(PL_regtill); /* How far we are required to go. */
b81d288d 6312 SAVEGENERICPV(PL_reg_start_tmp); /* from regexec.c */
a0ed51b3 6313 PL_reg_start_tmp = 0;
a0ed51b3
LW
6314 SAVEI32(PL_reg_start_tmpl); /* from regexec.c */
6315 PL_reg_start_tmpl = 0;
7766f137 6316 SAVEVPTR(PL_regdata);
a0ed51b3
LW
6317 SAVEI32(PL_reg_eval_set); /* from regexec.c */
6318 SAVEI32(PL_regnarrate); /* from regexec.c */
7766f137 6319 SAVEVPTR(PL_regprogram); /* from regexec.c */
a0ed51b3 6320 SAVEINT(PL_regindent); /* from regexec.c */
7766f137
GS
6321 SAVEVPTR(PL_regcc); /* from regexec.c */
6322 SAVEVPTR(PL_curcop);
7766f137
GS
6323 SAVEVPTR(PL_reg_call_cc); /* from regexec.c */
6324 SAVEVPTR(PL_reg_re); /* from regexec.c */
54b6e2fa
IZ
6325 SAVEPPTR(PL_reg_ganch); /* from regexec.c */
6326 SAVESPTR(PL_reg_sv); /* from regexec.c */
9febdf04 6327 SAVEBOOL(PL_reg_match_utf8); /* from regexec.c */
7766f137 6328 SAVEVPTR(PL_reg_magic); /* from regexec.c */
54b6e2fa 6329 SAVEI32(PL_reg_oldpos); /* from regexec.c */
7766f137
GS
6330 SAVEVPTR(PL_reg_oldcurpm); /* from regexec.c */
6331 SAVEVPTR(PL_reg_curpm); /* from regexec.c */
a5db57d6
GS
6332 SAVEPPTR(PL_reg_oldsaved); /* old saved substr during match */
6333 PL_reg_oldsaved = Nullch;
6334 SAVEI32(PL_reg_oldsavedlen); /* old length of saved substr during match */
6335 PL_reg_oldsavedlen = 0;
ed252734
NC
6336#ifdef PERL_COPY_ON_WRITE
6337 SAVESPTR(PL_nrs);
6338 PL_nrs = Nullsv;
6339#endif
a5db57d6
GS
6340 SAVEI32(PL_reg_maxiter); /* max wait until caching pos */
6341 PL_reg_maxiter = 0;
6342 SAVEI32(PL_reg_leftiter); /* wait until caching pos */
6343 PL_reg_leftiter = 0;
6344 SAVEGENERICPV(PL_reg_poscache); /* cache of pos of WHILEM */
6345 PL_reg_poscache = Nullch;
6346 SAVEI32(PL_reg_poscache_size); /* size of pos cache of WHILEM */
6347 PL_reg_poscache_size = 0;
6348 SAVEPPTR(PL_regprecomp); /* uncompiled string. */
5fb7366e 6349 SAVEI32(PL_regnpar); /* () count. */
e49a9654 6350 SAVEI32(PL_regsize); /* from regexec.c */
ada6e8a9
AMS
6351
6352 {
6353 /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
a8514157 6354 U32 i;
ada6e8a9
AMS
6355 GV *mgv;
6356 REGEXP *rx;
cf28e18a 6357 char digits[TYPE_CHARS(long)];
ada6e8a9
AMS
6358
6359 if (PL_curpm && (rx = PM_GETRE(PL_curpm))) {
6360 for (i = 1; i <= rx->nparens; i++) {
d994d9a1 6361 sprintf(digits, "%lu", (long)i);
ada6e8a9
AMS
6362 if ((mgv = gv_fetchpv(digits, FALSE, SVt_PV)))
6363 save_scalar(mgv);
6364 }
6365 }
6366 }
6367
54b6e2fa 6368#ifdef DEBUGGING
b81d288d 6369 SAVEPPTR(PL_reg_starttry); /* from regexec.c */
54b6e2fa 6370#endif
a0ed51b3 6371}
51371543 6372
51371543 6373static void
acfe0abc 6374clear_re(pTHX_ void *r)
51371543
GS
6375{
6376 ReREFCNT_dec((regexp *)r);
6377}
ffbc6a93 6378
241d1a3b
NC
6379/*
6380 * Local variables:
6381 * c-indentation-style: bsd
6382 * c-basic-offset: 4
6383 * indent-tabs-mode: t
6384 * End:
6385 *
c49a72a9 6386 * vim: shiftwidth=4:
241d1a3b 6387*/