This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
literal string macros
[perl5.git] / regcomp.c
CommitLineData
a0d0e21e
LW
1/* regcomp.c
2 */
3
4/*
5 * "A fair jaw-cracker dwarf-language must be." --Samwise Gamgee
6 */
7
61296642
DM
8/* This file contains functions for compiling a regular expression. See
9 * also regexec.c which funnily enough, contains functions for executing
166f8a29 10 * a regular expression.
e4a054ea
DM
11 *
12 * This file is also copied at build time to ext/re/re_comp.c, where
13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14 * This causes the main functions to be compiled under new names and with
15 * debugging support added, which makes "use re 'debug'" work.
166f8a29
DM
16 */
17
a687059c
LW
18/* NOTE: this is derived from Henry Spencer's regexp code, and should not
19 * confused with the original package (see point 3 below). Thanks, Henry!
20 */
21
22/* Additional note: this code is very heavily munged from Henry's version
23 * in places. In some spots I've traded clarity for efficiency, so don't
24 * blame Henry for some of the lack of readability.
25 */
26
e50aee73
AD
27/* The names of the functions have been changed from regcomp and
28 * regexec to pregcomp and pregexec in order to avoid conflicts
29 * with the POSIX routines of the same names.
30*/
31
b9d5759e 32#ifdef PERL_EXT_RE_BUILD
54df2634 33#include "re_top.h"
b81d288d 34#endif
56953603 35
a687059c 36/*
e50aee73 37 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
38 *
39 * Copyright (c) 1986 by University of Toronto.
40 * Written by Henry Spencer. Not derived from licensed software.
41 *
42 * Permission is granted to anyone to use this software for any
43 * purpose on any computer system, and to redistribute it freely,
44 * subject to the following restrictions:
45 *
46 * 1. The author is not responsible for the consequences of use of
47 * this software, no matter how awful, even if they arise
48 * from defects in it.
49 *
50 * 2. The origin of this software must not be misrepresented, either
51 * by explicit claim or by omission.
52 *
53 * 3. Altered versions must be plainly marked as such, and must not
54 * be misrepresented as being the original software.
55 *
56 *
57 **** Alterations to Henry's code are...
58 ****
4bb101f2 59 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
b94e2f88 60 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, by Larry Wall and others
a687059c 61 ****
9ef589d8
LW
62 **** You may distribute under the terms of either the GNU General Public
63 **** License or the Artistic License, as specified in the README file.
64
a687059c
LW
65 *
66 * Beware that some of this code is subtly aware of the way operator
67 * precedence is structured in regular expressions. Serious changes in
68 * regular-expression syntax might require a total rethink.
69 */
70#include "EXTERN.h"
864dbfa3 71#define PERL_IN_REGCOMP_C
a687059c 72#include "perl.h"
d06ea78c 73
acfe0abc 74#ifndef PERL_IN_XSUB_RE
d06ea78c
GS
75# include "INTERN.h"
76#endif
c277df42
IZ
77
78#define REG_COMP_C
54df2634
NC
79#ifdef PERL_IN_XSUB_RE
80# include "re_comp.h"
81#else
82# include "regcomp.h"
83#endif
a687059c 84
d4cce5f1 85#ifdef op
11343788 86#undef op
d4cce5f1 87#endif /* op */
11343788 88
fe14fcc3 89#ifdef MSDOS
7e4e8c89 90# if defined(BUGGY_MSC6)
fe14fcc3 91 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
7e4e8c89 92# pragma optimize("a",off)
fe14fcc3 93 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
7e4e8c89
NC
94# pragma optimize("w",on )
95# endif /* BUGGY_MSC6 */
fe14fcc3
LW
96#endif /* MSDOS */
97
a687059c
LW
98#ifndef STATIC
99#define STATIC static
100#endif
101
830247a4 102typedef struct RExC_state_t {
e2509266 103 U32 flags; /* are we folding, multilining? */
830247a4
IZ
104 char *precomp; /* uncompiled string. */
105 regexp *rx;
fac92740 106 char *start; /* Start of input for compile */
830247a4
IZ
107 char *end; /* End of input for compile */
108 char *parse; /* Input-scan pointer. */
109 I32 whilem_seen; /* number of WHILEM in this expr */
fac92740 110 regnode *emit_start; /* Start of emitted-code area */
ffc61ed2 111 regnode *emit; /* Code-emit pointer; &regdummy = don't = compiling */
830247a4
IZ
112 I32 naughty; /* How bad is this pattern? */
113 I32 sawback; /* Did we see \1, ...? */
114 U32 seen;
115 I32 size; /* Code size. */
116 I32 npar; /* () count. */
117 I32 extralen;
118 I32 seen_zerolen;
119 I32 seen_evals;
1aa99e6b 120 I32 utf8;
830247a4
IZ
121#if ADD_TO_REGEXEC
122 char *starttry; /* -Dr: where regtry was called. */
123#define RExC_starttry (pRExC_state->starttry)
124#endif
125} RExC_state_t;
126
e2509266 127#define RExC_flags (pRExC_state->flags)
830247a4
IZ
128#define RExC_precomp (pRExC_state->precomp)
129#define RExC_rx (pRExC_state->rx)
fac92740 130#define RExC_start (pRExC_state->start)
830247a4
IZ
131#define RExC_end (pRExC_state->end)
132#define RExC_parse (pRExC_state->parse)
133#define RExC_whilem_seen (pRExC_state->whilem_seen)
fac92740 134#define RExC_offsets (pRExC_state->rx->offsets) /* I am not like the others */
830247a4 135#define RExC_emit (pRExC_state->emit)
fac92740 136#define RExC_emit_start (pRExC_state->emit_start)
830247a4
IZ
137#define RExC_naughty (pRExC_state->naughty)
138#define RExC_sawback (pRExC_state->sawback)
139#define RExC_seen (pRExC_state->seen)
140#define RExC_size (pRExC_state->size)
141#define RExC_npar (pRExC_state->npar)
142#define RExC_extralen (pRExC_state->extralen)
143#define RExC_seen_zerolen (pRExC_state->seen_zerolen)
144#define RExC_seen_evals (pRExC_state->seen_evals)
1aa99e6b 145#define RExC_utf8 (pRExC_state->utf8)
830247a4 146
a687059c
LW
147#define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
148#define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
149 ((*s) == '{' && regcurly(s)))
a687059c 150
35c8bce7
LW
151#ifdef SPSTART
152#undef SPSTART /* dratted cpp namespace... */
153#endif
a687059c
LW
154/*
155 * Flags to be passed up and down.
156 */
a687059c 157#define WORST 0 /* Worst case. */
821b33a5 158#define HASWIDTH 0x1 /* Known to match non-null strings. */
a0d0e21e
LW
159#define SIMPLE 0x2 /* Simple enough to be STAR/PLUS operand. */
160#define SPSTART 0x4 /* Starts with * or +. */
161#define TRYAGAIN 0x8 /* Weeded out a declaration. */
a687059c 162
2c2d71f5
JH
163/* Length of a variant. */
164
165typedef struct scan_data_t {
166 I32 len_min;
167 I32 len_delta;
168 I32 pos_min;
169 I32 pos_delta;
170 SV *last_found;
171 I32 last_end; /* min value, <0 unless valid. */
172 I32 last_start_min;
173 I32 last_start_max;
174 SV **longest; /* Either &l_fixed, or &l_float. */
175 SV *longest_fixed;
176 I32 offset_fixed;
177 SV *longest_float;
178 I32 offset_float_min;
179 I32 offset_float_max;
180 I32 flags;
181 I32 whilem_c;
cb434fcc 182 I32 *last_closep;
653099ff 183 struct regnode_charclass_class *start_class;
2c2d71f5
JH
184} scan_data_t;
185
a687059c 186/*
e50aee73 187 * Forward declarations for pregcomp()'s friends.
a687059c 188 */
a0d0e21e 189
27da23d5
JH
190static const scan_data_t zero_scan_data =
191 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
c277df42
IZ
192
193#define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
194#define SF_BEFORE_SEOL 0x1
195#define SF_BEFORE_MEOL 0x2
196#define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
197#define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
198
09b7f37c
CB
199#ifdef NO_UNARY_PLUS
200# define SF_FIX_SHIFT_EOL (0+2)
201# define SF_FL_SHIFT_EOL (0+4)
202#else
203# define SF_FIX_SHIFT_EOL (+2)
204# define SF_FL_SHIFT_EOL (+4)
205#endif
c277df42
IZ
206
207#define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
208#define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
209
210#define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
211#define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
212#define SF_IS_INF 0x40
213#define SF_HAS_PAR 0x80
214#define SF_IN_PAR 0x100
215#define SF_HAS_EVAL 0x200
4bfe0158 216#define SCF_DO_SUBSTR 0x400
653099ff
GS
217#define SCF_DO_STCLASS_AND 0x0800
218#define SCF_DO_STCLASS_OR 0x1000
219#define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
e1901655 220#define SCF_WHILEM_VISITED_POS 0x2000
c277df42 221
eb160463 222#define UTF (RExC_utf8 != 0)
e2509266
JH
223#define LOC ((RExC_flags & PMf_LOCALE) != 0)
224#define FOLD ((RExC_flags & PMf_FOLD) != 0)
a0ed51b3 225
ffc61ed2 226#define OOB_UNICODE 12345678
93733859 227#define OOB_NAMEDCLASS -1
b8c5462f 228
a0ed51b3
LW
229#define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
230#define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
231
8615cb43 232
b45f050a
JF
233/* length of regex to show in messages that don't mark a position within */
234#define RegexLengthToShowInErrorMessages 127
235
236/*
237 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
238 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
239 * op/pragma/warn/regcomp.
240 */
7253e4e3
RK
241#define MARKER1 "<-- HERE" /* marker as it appears in the description */
242#define MARKER2 " <-- HERE " /* marker as it appears within the regex */
b81d288d 243
7253e4e3 244#define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
b45f050a
JF
245
246/*
247 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
248 * arg. Show regex, up to a maximum length. If it's too long, chop and add
249 * "...".
250 */
ccb2c380 251#define FAIL(msg) STMT_START { \
bfed75c6 252 const char *ellipses = ""; \
ccb2c380
MP
253 IV len = RExC_end - RExC_precomp; \
254 \
255 if (!SIZE_ONLY) \
256 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
257 if (len > RegexLengthToShowInErrorMessages) { \
258 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
259 len = RegexLengthToShowInErrorMessages - 10; \
260 ellipses = "..."; \
261 } \
262 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
263 msg, (int)len, RExC_precomp, ellipses); \
264} STMT_END
8615cb43 265
b45f050a 266/*
b45f050a
JF
267 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
268 */
ccb2c380 269#define Simple_vFAIL(m) STMT_START { \
a28509cc 270 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
271 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
272 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
273} STMT_END
b45f050a
JF
274
275/*
276 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
277 */
ccb2c380
MP
278#define vFAIL(m) STMT_START { \
279 if (!SIZE_ONLY) \
280 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
281 Simple_vFAIL(m); \
282} STMT_END
b45f050a
JF
283
284/*
285 * Like Simple_vFAIL(), but accepts two arguments.
286 */
ccb2c380 287#define Simple_vFAIL2(m,a1) STMT_START { \
a28509cc 288 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
289 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
290 (int)offset, RExC_precomp, RExC_precomp + offset); \
291} STMT_END
b45f050a
JF
292
293/*
294 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
295 */
ccb2c380
MP
296#define vFAIL2(m,a1) STMT_START { \
297 if (!SIZE_ONLY) \
298 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
299 Simple_vFAIL2(m, a1); \
300} STMT_END
b45f050a
JF
301
302
303/*
304 * Like Simple_vFAIL(), but accepts three arguments.
305 */
ccb2c380 306#define Simple_vFAIL3(m, a1, a2) STMT_START { \
a28509cc 307 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
308 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
309 (int)offset, RExC_precomp, RExC_precomp + offset); \
310} STMT_END
b45f050a
JF
311
312/*
313 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
314 */
ccb2c380
MP
315#define vFAIL3(m,a1,a2) STMT_START { \
316 if (!SIZE_ONLY) \
317 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
318 Simple_vFAIL3(m, a1, a2); \
319} STMT_END
b45f050a
JF
320
321/*
322 * Like Simple_vFAIL(), but accepts four arguments.
323 */
ccb2c380 324#define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
a28509cc 325 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
326 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
327 (int)offset, RExC_precomp, RExC_precomp + offset); \
328} STMT_END
b45f050a 329
ccb2c380 330#define vWARN(loc,m) STMT_START { \
a28509cc 331 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
332 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \
333 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
334} STMT_END
335
336#define vWARNdep(loc,m) STMT_START { \
a28509cc 337 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
338 Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
339 "%s" REPORT_LOCATION, \
340 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
341} STMT_END
342
343
344#define vWARN2(loc, m, a1) STMT_START { \
a28509cc 345 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
346 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
347 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
348} STMT_END
349
350#define vWARN3(loc, m, a1, a2) STMT_START { \
a28509cc 351 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
352 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
353 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
354} STMT_END
355
356#define vWARN4(loc, m, a1, a2, a3) STMT_START { \
a28509cc 357 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
358 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
359 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
360} STMT_END
361
362#define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
a28509cc 363 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
364 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
365 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
366} STMT_END
9d1d55b5 367
8615cb43 368
cd439c50 369/* Allow for side effects in s */
ccb2c380
MP
370#define REGC(c,s) STMT_START { \
371 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
372} STMT_END
cd439c50 373
fac92740
MJD
374/* Macros for recording node offsets. 20001227 mjd@plover.com
375 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
376 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
377 * Element 0 holds the number n.
378 */
379
380#define MJD_OFFSET_DEBUG(x)
a3621e74 381/* #define MJD_OFFSET_DEBUG(x) DEBUG_r(Perl_warn_nocontext x) */
ccb2c380
MP
382
383
384#define Set_Node_Offset_To_R(node,byte) STMT_START { \
385 if (! SIZE_ONLY) { \
386 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
387 __LINE__, (node), (byte))); \
388 if((node) < 0) { \
551405c4 389 Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
ccb2c380
MP
390 } else { \
391 RExC_offsets[2*(node)-1] = (byte); \
392 } \
393 } \
394} STMT_END
395
396#define Set_Node_Offset(node,byte) \
397 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
398#define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
399
400#define Set_Node_Length_To_R(node,len) STMT_START { \
401 if (! SIZE_ONLY) { \
402 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
551405c4 403 __LINE__, (int)(node), (int)(len))); \
ccb2c380 404 if((node) < 0) { \
551405c4 405 Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
ccb2c380
MP
406 } else { \
407 RExC_offsets[2*(node)] = (len); \
408 } \
409 } \
410} STMT_END
411
412#define Set_Node_Length(node,len) \
413 Set_Node_Length_To_R((node)-RExC_emit_start, len)
414#define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
415#define Set_Node_Cur_Length(node) \
416 Set_Node_Length(node, RExC_parse - parse_start)
fac92740
MJD
417
418/* Get offsets and lengths */
419#define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
420#define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
421
acfe0abc 422static void clear_re(pTHX_ void *r);
4327152a 423
653099ff
GS
424/* Mark that we cannot extend a found fixed substring at this point.
425 Updata the longest found anchored substring and the longest found
426 floating substrings if needed. */
427
4327152a 428STATIC void
097eb12c 429S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data)
c277df42 430{
e1ec3a88
AL
431 const STRLEN l = CHR_SVLEN(data->last_found);
432 const STRLEN old_l = CHR_SVLEN(*data->longest);
b81d288d 433
c277df42 434 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
6b43b216 435 SvSetMagicSV(*data->longest, data->last_found);
c277df42
IZ
436 if (*data->longest == data->longest_fixed) {
437 data->offset_fixed = l ? data->last_start_min : data->pos_min;
438 if (data->flags & SF_BEFORE_EOL)
b81d288d 439 data->flags
c277df42
IZ
440 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
441 else
442 data->flags &= ~SF_FIX_BEFORE_EOL;
a0ed51b3
LW
443 }
444 else {
c277df42 445 data->offset_float_min = l ? data->last_start_min : data->pos_min;
b81d288d
AB
446 data->offset_float_max = (l
447 ? data->last_start_max
c277df42 448 : data->pos_min + data->pos_delta);
9051bda5
HS
449 if ((U32)data->offset_float_max > (U32)I32_MAX)
450 data->offset_float_max = I32_MAX;
c277df42 451 if (data->flags & SF_BEFORE_EOL)
b81d288d 452 data->flags
c277df42
IZ
453 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
454 else
455 data->flags &= ~SF_FL_BEFORE_EOL;
456 }
457 }
458 SvCUR_set(data->last_found, 0);
0eda9292 459 {
a28509cc 460 SV * const sv = data->last_found;
097eb12c
AL
461 if (SvUTF8(sv) && SvMAGICAL(sv)) {
462 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
463 if (mg)
464 mg->mg_len = 0;
465 }
0eda9292 466 }
c277df42
IZ
467 data->last_end = -1;
468 data->flags &= ~SF_BEFORE_EOL;
469}
470
653099ff
GS
471/* Can match anything (initialization) */
472STATIC void
097eb12c 473S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 474{
653099ff 475 ANYOF_CLASS_ZERO(cl);
f8bef550 476 ANYOF_BITMAP_SETALL(cl);
1aa99e6b 477 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
653099ff
GS
478 if (LOC)
479 cl->flags |= ANYOF_LOCALE;
480}
481
482/* Can match anything (initialization) */
483STATIC int
5f66b61c 484S_cl_is_anything(const struct regnode_charclass_class *cl)
653099ff
GS
485{
486 int value;
487
aaa51d5e 488 for (value = 0; value <= ANYOF_MAX; value += 2)
653099ff
GS
489 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
490 return 1;
1aa99e6b
IH
491 if (!(cl->flags & ANYOF_UNICODE_ALL))
492 return 0;
f8bef550
NC
493 if (!ANYOF_BITMAP_TESTALLSET(cl))
494 return 0;
653099ff
GS
495 return 1;
496}
497
498/* Can match anything (initialization) */
499STATIC void
097eb12c 500S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 501{
8ecf7187 502 Zero(cl, 1, struct regnode_charclass_class);
653099ff 503 cl->type = ANYOF;
830247a4 504 cl_anything(pRExC_state, cl);
653099ff
GS
505}
506
507STATIC void
097eb12c 508S_cl_init_zero(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 509{
8ecf7187 510 Zero(cl, 1, struct regnode_charclass_class);
653099ff 511 cl->type = ANYOF;
830247a4 512 cl_anything(pRExC_state, cl);
653099ff
GS
513 if (LOC)
514 cl->flags |= ANYOF_LOCALE;
515}
516
517/* 'And' a given class with another one. Can create false positives */
518/* We assume that cl is not inverted */
519STATIC void
5f66b61c 520S_cl_and(struct regnode_charclass_class *cl,
a28509cc 521 const struct regnode_charclass_class *and_with)
653099ff 522{
653099ff
GS
523 if (!(and_with->flags & ANYOF_CLASS)
524 && !(cl->flags & ANYOF_CLASS)
525 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
526 && !(and_with->flags & ANYOF_FOLD)
527 && !(cl->flags & ANYOF_FOLD)) {
528 int i;
529
530 if (and_with->flags & ANYOF_INVERT)
531 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
532 cl->bitmap[i] &= ~and_with->bitmap[i];
533 else
534 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
535 cl->bitmap[i] &= and_with->bitmap[i];
536 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
537 if (!(and_with->flags & ANYOF_EOS))
538 cl->flags &= ~ANYOF_EOS;
1aa99e6b 539
14ebb1a2
JH
540 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE &&
541 !(and_with->flags & ANYOF_INVERT)) {
1aa99e6b
IH
542 cl->flags &= ~ANYOF_UNICODE_ALL;
543 cl->flags |= ANYOF_UNICODE;
544 ARG_SET(cl, ARG(and_with));
545 }
14ebb1a2
JH
546 if (!(and_with->flags & ANYOF_UNICODE_ALL) &&
547 !(and_with->flags & ANYOF_INVERT))
1aa99e6b 548 cl->flags &= ~ANYOF_UNICODE_ALL;
14ebb1a2
JH
549 if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)) &&
550 !(and_with->flags & ANYOF_INVERT))
1aa99e6b 551 cl->flags &= ~ANYOF_UNICODE;
653099ff
GS
552}
553
554/* 'OR' a given class with another one. Can create false positives */
555/* We assume that cl is not inverted */
556STATIC void
097eb12c 557S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
653099ff 558{
653099ff
GS
559 if (or_with->flags & ANYOF_INVERT) {
560 /* We do not use
561 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
562 * <= (B1 | !B2) | (CL1 | !CL2)
563 * which is wasteful if CL2 is small, but we ignore CL2:
564 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
565 * XXXX Can we handle case-fold? Unclear:
566 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
567 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
568 */
569 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
570 && !(or_with->flags & ANYOF_FOLD)
571 && !(cl->flags & ANYOF_FOLD) ) {
572 int i;
573
574 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
575 cl->bitmap[i] |= ~or_with->bitmap[i];
576 } /* XXXX: logic is complicated otherwise */
577 else {
830247a4 578 cl_anything(pRExC_state, cl);
653099ff
GS
579 }
580 } else {
581 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
582 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
b81d288d 583 && (!(or_with->flags & ANYOF_FOLD)
653099ff
GS
584 || (cl->flags & ANYOF_FOLD)) ) {
585 int i;
586
587 /* OR char bitmap and class bitmap separately */
588 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
589 cl->bitmap[i] |= or_with->bitmap[i];
590 if (or_with->flags & ANYOF_CLASS) {
591 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
592 cl->classflags[i] |= or_with->classflags[i];
593 cl->flags |= ANYOF_CLASS;
594 }
595 }
596 else { /* XXXX: logic is complicated, leave it along for a moment. */
830247a4 597 cl_anything(pRExC_state, cl);
653099ff
GS
598 }
599 }
600 if (or_with->flags & ANYOF_EOS)
601 cl->flags |= ANYOF_EOS;
1aa99e6b
IH
602
603 if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
604 ARG(cl) != ARG(or_with)) {
605 cl->flags |= ANYOF_UNICODE_ALL;
606 cl->flags &= ~ANYOF_UNICODE;
607 }
608 if (or_with->flags & ANYOF_UNICODE_ALL) {
609 cl->flags |= ANYOF_UNICODE_ALL;
610 cl->flags &= ~ANYOF_UNICODE;
611 }
653099ff
GS
612}
613
5d1c421c 614/*
a3621e74
YO
615
616 make_trie(startbranch,first,last,tail,flags)
617 startbranch: the first branch in the whole branch sequence
618 first : start branch of sequence of branch-exact nodes.
619 May be the same as startbranch
620 last : Thing following the last branch.
621 May be the same as tail.
622 tail : item following the branch sequence
623 flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/
624
625Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
626
627A trie is an N'ary tree where the branches are determined by digital
628decomposition of the key. IE, at the root node you look up the 1st character and
629follow that branch repeat until you find the end of the branches. Nodes can be
630marked as "accepting" meaning they represent a complete word. Eg:
631
632 /he|she|his|hers/
633
634would convert into the following structure. Numbers represent states, letters
635following numbers represent valid transitions on the letter from that state, if
636the number is in square brackets it represents an accepting state, otherwise it
637will be in parenthesis.
638
639 +-h->+-e->[3]-+-r->(8)-+-s->[9]
640 | |
641 | (2)
642 | |
643 (1) +-i->(6)-+-s->[7]
644 |
645 +-s->(3)-+-h->(4)-+-e->[5]
646
647 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
648
649This shows that when matching against the string 'hers' we will begin at state 1
650read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
651then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
652is also accepting. Thus we know that we can match both 'he' and 'hers' with a
653single traverse. We store a mapping from accepting to state to which word was
654matched, and then when we have multiple possibilities we try to complete the
655rest of the regex in the order in which they occured in the alternation.
656
657The only prior NFA like behaviour that would be changed by the TRIE support is
658the silent ignoring of duplicate alternations which are of the form:
659
660 / (DUPE|DUPE) X? (?{ ... }) Y /x
661
662Thus EVAL blocks follwing a trie may be called a different number of times with
663and without the optimisation. With the optimisations dupes will be silently
664ignored. This inconsistant behaviour of EVAL type nodes is well established as
665the following demonstrates:
666
667 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
668
669which prints out 'word' three times, but
670
671 'words'=~/(word|word|word)(?{ print $1 })S/
672
673which doesnt print it out at all. This is due to other optimisations kicking in.
674
675Example of what happens on a structural level:
676
677The regexp /(ac|ad|ab)+/ will produce the folowing debug output:
678
679 1: CURLYM[1] {1,32767}(18)
680 5: BRANCH(8)
681 6: EXACT <ac>(16)
682 8: BRANCH(11)
683 9: EXACT <ad>(16)
684 11: BRANCH(14)
685 12: EXACT <ab>(16)
686 16: SUCCEED(0)
687 17: NOTHING(18)
688 18: END(0)
689
690This would be optimizable with startbranch=5, first=5, last=16, tail=16
691and should turn into:
692
693 1: CURLYM[1] {1,32767}(18)
694 5: TRIE(16)
695 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
696 <ac>
697 <ad>
698 <ab>
699 16: SUCCEED(0)
700 17: NOTHING(18)
701 18: END(0)
702
703Cases where tail != last would be like /(?foo|bar)baz/:
704
705 1: BRANCH(4)
706 2: EXACT <foo>(8)
707 4: BRANCH(7)
708 5: EXACT <bar>(8)
709 7: TAIL(8)
710 8: EXACT <baz>(10)
711 10: END(0)
712
713which would be optimizable with startbranch=1, first=1, last=7, tail=8
714and would end up looking like:
715
716 1: TRIE(8)
717 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
718 <foo>
719 <bar>
720 7: TAIL(8)
721 8: EXACT <baz>(10)
722 10: END(0)
723
724*/
725
726#define TRIE_DEBUG_CHAR \
727 DEBUG_TRIE_COMPILE_r({ \
728 SV *tmp; \
729 if ( UTF ) { \
6136c704 730 tmp = newSVpvs( "" ); \
a3621e74
YO
731 pv_uni_display( tmp, uc, len, 60, UNI_DISPLAY_REGEX ); \
732 } else { \
e4584336 733 tmp = Perl_newSVpvf_nocontext( "%c", (int)uvc ); \
a3621e74
YO
734 } \
735 av_push( trie->revcharmap, tmp ); \
736 })
737
738#define TRIE_READ_CHAR STMT_START { \
739 if ( UTF ) { \
740 if ( folder ) { \
741 if ( foldlen > 0 ) { \
742 uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags ); \
743 foldlen -= len; \
744 scan += len; \
745 len = 0; \
746 } else { \
e1ec3a88 747 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
a3621e74
YO
748 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
749 foldlen -= UNISKIP( uvc ); \
750 scan = foldbuf + UNISKIP( uvc ); \
751 } \
752 } else { \
e1ec3a88 753 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
a3621e74
YO
754 } \
755 } else { \
756 uvc = (U32)*uc; \
757 len = 1; \
758 } \
759} STMT_END
760
761
762#define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
763#define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
764#define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
765#define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
766
767#define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
768 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
769 TRIE_LIST_LEN( state ) *= 2; \
770 Renew( trie->states[ state ].trans.list, \
771 TRIE_LIST_LEN( state ), reg_trie_trans_le ); \
772 } \
773 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
774 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
775 TRIE_LIST_CUR( state )++; \
776} STMT_END
777
778#define TRIE_LIST_NEW(state) STMT_START { \
a02a5408 779 Newxz( trie->states[ state ].trans.list, \
a3621e74
YO
780 4, reg_trie_trans_le ); \
781 TRIE_LIST_CUR( state ) = 1; \
782 TRIE_LIST_LEN( state ) = 4; \
783} STMT_END
784
785STATIC I32
786S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 flags)
787{
27da23d5 788 dVAR;
a3621e74
YO
789 /* first pass, loop through and scan words */
790 reg_trie_data *trie;
791 regnode *cur;
9f7f3913 792 const U32 uniflags = UTF8_ALLOW_DEFAULT;
a3621e74
YO
793 STRLEN len = 0;
794 UV uvc = 0;
795 U16 curword = 0;
796 U32 next_alloc = 0;
797 /* we just use folder as a flag in utf8 */
e1ec3a88 798 const U8 * const folder = ( flags == EXACTF
a3621e74
YO
799 ? PL_fold
800 : ( flags == EXACTFL
801 ? PL_fold_locale
802 : NULL
803 )
804 );
805
e1ec3a88 806 const U32 data_slot = add_data( pRExC_state, 1, "t" );
a3621e74
YO
807 SV *re_trie_maxbuff;
808
809 GET_RE_DEBUG_FLAGS_DECL;
810
a02a5408 811 Newxz( trie, 1, reg_trie_data );
a3621e74
YO
812 trie->refcount = 1;
813 RExC_rx->data->data[ data_slot ] = (void*)trie;
a02a5408 814 Newxz( trie->charmap, 256, U16 );
a3621e74
YO
815 DEBUG_r({
816 trie->words = newAV();
817 trie->revcharmap = newAV();
818 });
819
820
0111c4fd 821 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
a3621e74 822 if (!SvIOK(re_trie_maxbuff)) {
0111c4fd 823 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
a3621e74
YO
824 }
825
826 /* -- First loop and Setup --
827
828 We first traverse the branches and scan each word to determine if it
829 contains widechars, and how many unique chars there are, this is
830 important as we have to build a table with at least as many columns as we
831 have unique chars.
832
833 We use an array of integers to represent the character codes 0..255
834 (trie->charmap) and we use a an HV* to store unicode characters. We use the
835 native representation of the character value as the key and IV's for the
836 coded index.
837
838 *TODO* If we keep track of how many times each character is used we can
839 remap the columns so that the table compression later on is more
840 efficient in terms of memory by ensuring most common value is in the
841 middle and the least common are on the outside. IMO this would be better
842 than a most to least common mapping as theres a decent chance the most
843 common letter will share a node with the least common, meaning the node
844 will not be compressable. With a middle is most common approach the worst
845 case is when we have the least common nodes twice.
846
847 */
848
849
850 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
c445ea15 851 regnode * const noper = NEXTOPER( cur );
e1ec3a88 852 const U8 *uc = (U8*)STRING( noper );
a28509cc 853 const U8 * const e = uc + STR_LEN( noper );
a3621e74
YO
854 STRLEN foldlen = 0;
855 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2af232bd 856 const U8 *scan = (U8*)NULL;
a3621e74
YO
857
858 for ( ; uc < e ; uc += len ) {
859 trie->charcount++;
860 TRIE_READ_CHAR;
861 if ( uvc < 256 ) {
862 if ( !trie->charmap[ uvc ] ) {
863 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
864 if ( folder )
865 trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
866 TRIE_DEBUG_CHAR;
867 }
868 } else {
869 SV** svpp;
870 if ( !trie->widecharmap )
871 trie->widecharmap = newHV();
872
873 svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 1 );
874
875 if ( !svpp )
e4584336 876 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
a3621e74
YO
877
878 if ( !SvTRUE( *svpp ) ) {
879 sv_setiv( *svpp, ++trie->uniquecharcount );
880 TRIE_DEBUG_CHAR;
881 }
882 }
883 }
884 trie->wordcount++;
885 } /* end first pass */
886 DEBUG_TRIE_COMPILE_r(
887 PerlIO_printf( Perl_debug_log, "TRIE(%s): W:%d C:%d Uq:%d \n",
888 ( trie->widecharmap ? "UTF8" : "NATIVE" ), trie->wordcount,
5d7488b2 889 (int)trie->charcount, trie->uniquecharcount )
a3621e74
YO
890 );
891
892
893 /*
894 We now know what we are dealing with in terms of unique chars and
895 string sizes so we can calculate how much memory a naive
0111c4fd
RGS
896 representation using a flat table will take. If it's over a reasonable
897 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
a3621e74
YO
898 conservative but potentially much slower representation using an array
899 of lists.
900
901 At the end we convert both representations into the same compressed
902 form that will be used in regexec.c for matching with. The latter
903 is a form that cannot be used to construct with but has memory
904 properties similar to the list form and access properties similar
905 to the table form making it both suitable for fast searches and
906 small enough that its feasable to store for the duration of a program.
907
908 See the comment in the code where the compressed table is produced
909 inplace from the flat tabe representation for an explanation of how
910 the compression works.
911
912 */
913
914
915 if ( (IV)( ( trie->charcount + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
916 /*
917 Second Pass -- Array Of Lists Representation
918
919 Each state will be represented by a list of charid:state records
920 (reg_trie_trans_le) the first such element holds the CUR and LEN
921 points of the allocated array. (See defines above).
922
923 We build the initial structure using the lists, and then convert
924 it into the compressed table form which allows faster lookups
925 (but cant be modified once converted).
926
927
928 */
929
930
931 STRLEN transcount = 1;
932
a02a5408 933 Newxz( trie->states, trie->charcount + 2, reg_trie_state );
a3621e74
YO
934 TRIE_LIST_NEW(1);
935 next_alloc = 2;
936
937 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
938
c445ea15
AL
939 regnode * const noper = NEXTOPER( cur );
940 U8 *uc = (U8*)STRING( noper );
941 const U8 * const e = uc + STR_LEN( noper );
942 U32 state = 1; /* required init */
943 U16 charid = 0; /* sanity init */
944 U8 *scan = (U8*)NULL; /* sanity init */
945 STRLEN foldlen = 0; /* required init */
946 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
947
948 for ( ; uc < e ; uc += len ) {
949
950 TRIE_READ_CHAR;
951
952 if ( uvc < 256 ) {
953 charid = trie->charmap[ uvc ];
954 } else {
955 SV** const svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 0);
956 if ( !svpp ) {
957 charid = 0;
958 } else {
959 charid=(U16)SvIV( *svpp );
960 }
961 }
962 if ( charid ) {
a3621e74 963
c445ea15
AL
964 U16 check;
965 U32 newstate = 0;
a3621e74 966
c445ea15
AL
967 charid--;
968 if ( !trie->states[ state ].trans.list ) {
969 TRIE_LIST_NEW( state );
970 }
971 for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
972 if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
973 newstate = TRIE_LIST_ITEM( state, check ).newstate;
974 break;
975 }
976 }
977 if ( ! newstate ) {
978 newstate = next_alloc++;
979 TRIE_LIST_PUSH( state, charid, newstate );
980 transcount++;
981 }
982 state = newstate;
983 } else {
984 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
a28509cc 985 }
c445ea15
AL
986 /* charid is now 0 if we dont know the char read, or nonzero if we do */
987 }
a3621e74 988
c445ea15
AL
989 if ( !trie->states[ state ].wordnum ) {
990 /* we havent inserted this word into the structure yet. */
991 trie->states[ state ].wordnum = ++curword;
a3621e74 992
c445ea15
AL
993 DEBUG_r({
994 /* store the word for dumping */
995 SV* tmp = newSVpvn( STRING( noper ), STR_LEN( noper ) );
996 if ( UTF ) SvUTF8_on( tmp );
997 av_push( trie->words, tmp );
998 });
a3621e74 999
c445ea15 1000 } else {
6f207bd3 1001 NOOP; /* It's a dupe. So ignore it. */
c445ea15 1002 }
a3621e74
YO
1003
1004 } /* end second pass */
1005
1006 trie->laststate = next_alloc;
1007 Renew( trie->states, next_alloc, reg_trie_state );
1008
1009 DEBUG_TRIE_COMPILE_MORE_r({
1010 U32 state;
a3621e74 1011
a28509cc 1012 /* print out the table precompression. */
a3621e74
YO
1013
1014 PerlIO_printf( Perl_debug_log, "\nState :Word | Transition Data\n" );
1015 PerlIO_printf( Perl_debug_log, "------:-----+-----------------" );
1016
1017 for( state=1 ; state < next_alloc ; state ++ ) {
a28509cc 1018 U16 charid;
a3621e74 1019
e4584336 1020 PerlIO_printf( Perl_debug_log, "\n %04"UVXf" :", (UV)state );
a3621e74
YO
1021 if ( ! trie->states[ state ].wordnum ) {
1022 PerlIO_printf( Perl_debug_log, "%5s| ","");
1023 } else {
e4584336 1024 PerlIO_printf( Perl_debug_log, "W%04x| ",
a3621e74
YO
1025 trie->states[ state ].wordnum
1026 );
1027 }
1028 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
1029 SV **tmp = av_fetch( trie->revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
e4584336 1030 PerlIO_printf( Perl_debug_log, "%s:%3X=%04"UVXf" | ",
cfd0369c 1031 SvPV_nolen_const( *tmp ),
a3621e74 1032 TRIE_LIST_ITEM(state,charid).forid,
e4584336 1033 (UV)TRIE_LIST_ITEM(state,charid).newstate
a3621e74
YO
1034 );
1035 }
1036
1037 }
1038 PerlIO_printf( Perl_debug_log, "\n\n" );
1039 });
1040
a02a5408 1041 Newxz( trie->trans, transcount ,reg_trie_trans );
a3621e74
YO
1042 {
1043 U32 state;
a3621e74
YO
1044 U32 tp = 0;
1045 U32 zp = 0;
1046
1047
1048 for( state=1 ; state < next_alloc ; state ++ ) {
1049 U32 base=0;
1050
1051 /*
1052 DEBUG_TRIE_COMPILE_MORE_r(
1053 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1054 );
1055 */
1056
1057 if (trie->states[state].trans.list) {
1058 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1059 U16 maxid=minid;
a28509cc 1060 U16 idx;
a3621e74
YO
1061
1062 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
c445ea15
AL
1063 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1064 if ( forid < minid ) {
1065 minid=forid;
1066 } else if ( forid > maxid ) {
1067 maxid=forid;
1068 }
a3621e74
YO
1069 }
1070 if ( transcount < tp + maxid - minid + 1) {
1071 transcount *= 2;
1072 Renew( trie->trans, transcount, reg_trie_trans );
1073 Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1074 }
1075 base = trie->uniquecharcount + tp - minid;
1076 if ( maxid == minid ) {
1077 U32 set = 0;
1078 for ( ; zp < tp ; zp++ ) {
1079 if ( ! trie->trans[ zp ].next ) {
1080 base = trie->uniquecharcount + zp - minid;
1081 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1082 trie->trans[ zp ].check = state;
1083 set = 1;
1084 break;
1085 }
1086 }
1087 if ( !set ) {
1088 trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1089 trie->trans[ tp ].check = state;
1090 tp++;
1091 zp = tp;
1092 }
1093 } else {
1094 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
c445ea15 1095 const U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
a3621e74
YO
1096 trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1097 trie->trans[ tid ].check = state;
1098 }
1099 tp += ( maxid - minid + 1 );
1100 }
1101 Safefree(trie->states[ state ].trans.list);
1102 }
1103 /*
1104 DEBUG_TRIE_COMPILE_MORE_r(
1105 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1106 );
1107 */
1108 trie->states[ state ].trans.base=base;
1109 }
cc601c31 1110 trie->lasttrans = tp + 1;
a3621e74
YO
1111 }
1112 } else {
1113 /*
1114 Second Pass -- Flat Table Representation.
1115
1116 we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1117 We know that we will need Charcount+1 trans at most to store the data
1118 (one row per char at worst case) So we preallocate both structures
1119 assuming worst case.
1120
1121 We then construct the trie using only the .next slots of the entry
1122 structs.
1123
1124 We use the .check field of the first entry of the node temporarily to
1125 make compression both faster and easier by keeping track of how many non
1126 zero fields are in the node.
1127
1128 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1129 transition.
1130
1131 There are two terms at use here: state as a TRIE_NODEIDX() which is a
1132 number representing the first entry of the node, and state as a
1133 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1134 TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1135 are 2 entrys per node. eg:
1136
1137 A B A B
1138 1. 2 4 1. 3 7
1139 2. 0 3 3. 0 5
1140 3. 0 0 5. 0 0
1141 4. 0 0 7. 0 0
1142
1143 The table is internally in the right hand, idx form. However as we also
1144 have to deal with the states array which is indexed by nodenum we have to
1145 use TRIE_NODENUM() to convert.
1146
1147 */
1148
a02a5408 1149 Newxz( trie->trans, ( trie->charcount + 1 ) * trie->uniquecharcount + 1,
a3621e74 1150 reg_trie_trans );
a02a5408 1151 Newxz( trie->states, trie->charcount + 2, reg_trie_state );
a3621e74
YO
1152 next_alloc = trie->uniquecharcount + 1;
1153
1154 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1155
c445ea15 1156 regnode * const noper = NEXTOPER( cur );
a28509cc
AL
1157 const U8 *uc = (U8*)STRING( noper );
1158 const U8 * const e = uc + STR_LEN( noper );
a3621e74
YO
1159
1160 U32 state = 1; /* required init */
1161
1162 U16 charid = 0; /* sanity init */
1163 U32 accept_state = 0; /* sanity init */
1164 U8 *scan = (U8*)NULL; /* sanity init */
1165
1166 STRLEN foldlen = 0; /* required init */
1167 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1168
1169
1170 for ( ; uc < e ; uc += len ) {
1171
1172 TRIE_READ_CHAR;
1173
1174 if ( uvc < 256 ) {
1175 charid = trie->charmap[ uvc ];
1176 } else {
c445ea15
AL
1177 SV* const * const svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 0);
1178 charid = svpp ? (U16)SvIV(*svpp) : 0;
a3621e74
YO
1179 }
1180 if ( charid ) {
1181 charid--;
1182 if ( !trie->trans[ state + charid ].next ) {
1183 trie->trans[ state + charid ].next = next_alloc;
1184 trie->trans[ state ].check++;
1185 next_alloc += trie->uniquecharcount;
1186 }
1187 state = trie->trans[ state + charid ].next;
1188 } else {
e4584336 1189 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
a3621e74
YO
1190 }
1191 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1192 }
1193
1194 accept_state = TRIE_NODENUM( state );
1195 if ( !trie->states[ accept_state ].wordnum ) {
1196 /* we havent inserted this word into the structure yet. */
1197 trie->states[ accept_state ].wordnum = ++curword;
1198
1199 DEBUG_r({
1200 /* store the word for dumping */
1201 SV* tmp = newSVpvn( STRING( noper ), STR_LEN( noper ) );
1202 if ( UTF ) SvUTF8_on( tmp );
1203 av_push( trie->words, tmp );
1204 });
1205
1206 } else {
6f207bd3 1207 NOOP; /* Its a dupe. So ignore it. */
a3621e74
YO
1208 }
1209
1210 } /* end second pass */
1211
1212 DEBUG_TRIE_COMPILE_MORE_r({
1213 /*
1214 print out the table precompression so that we can do a visual check
1215 that they are identical.
1216 */
1217 U32 state;
1218 U16 charid;
1219 PerlIO_printf( Perl_debug_log, "\nChar : " );
1220
1221 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1222 SV **tmp = av_fetch( trie->revcharmap, charid, 0);
1223 if ( tmp ) {
cfd0369c 1224 PerlIO_printf( Perl_debug_log, "%4.4s ", SvPV_nolen_const( *tmp ) );
a3621e74
YO
1225 }
1226 }
1227
1228 PerlIO_printf( Perl_debug_log, "\nState+-" );
1229
1230 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
1231 PerlIO_printf( Perl_debug_log, "%4s-", "----" );
1232 }
1233
1234 PerlIO_printf( Perl_debug_log, "\n" );
1235
1236 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1237
e4584336 1238 PerlIO_printf( Perl_debug_log, "%04"UVXf" : ", (UV)TRIE_NODENUM( state ) );
a3621e74
YO
1239
1240 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
e4584336
RB
1241 PerlIO_printf( Perl_debug_log, "%04"UVXf" ",
1242 (UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next ) );
a3621e74
YO
1243 }
1244 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
e4584336 1245 PerlIO_printf( Perl_debug_log, " (%04"UVXf")\n", (UV)trie->trans[ state ].check );
a3621e74 1246 } else {
e4584336 1247 PerlIO_printf( Perl_debug_log, " (%04"UVXf") W%04X\n", (UV)trie->trans[ state ].check,
a3621e74
YO
1248 trie->states[ TRIE_NODENUM( state ) ].wordnum );
1249 }
1250 }
1251 PerlIO_printf( Perl_debug_log, "\n\n" );
1252 });
1253 {
1254 /*
1255 * Inplace compress the table.*
1256
1257 For sparse data sets the table constructed by the trie algorithm will
1258 be mostly 0/FAIL transitions or to put it another way mostly empty.
1259 (Note that leaf nodes will not contain any transitions.)
1260
1261 This algorithm compresses the tables by eliminating most such
1262 transitions, at the cost of a modest bit of extra work during lookup:
1263
1264 - Each states[] entry contains a .base field which indicates the
1265 index in the state[] array wheres its transition data is stored.
1266
1267 - If .base is 0 there are no valid transitions from that node.
1268
1269 - If .base is nonzero then charid is added to it to find an entry in
1270 the trans array.
1271
1272 -If trans[states[state].base+charid].check!=state then the
1273 transition is taken to be a 0/Fail transition. Thus if there are fail
1274 transitions at the front of the node then the .base offset will point
1275 somewhere inside the previous nodes data (or maybe even into a node
1276 even earlier), but the .check field determines if the transition is
1277 valid.
1278
1279 The following process inplace converts the table to the compressed
1280 table: We first do not compress the root node 1,and mark its all its
1281 .check pointers as 1 and set its .base pointer as 1 as well. This
1282 allows to do a DFA construction from the compressed table later, and
1283 ensures that any .base pointers we calculate later are greater than
1284 0.
1285
1286 - We set 'pos' to indicate the first entry of the second node.
1287
1288 - We then iterate over the columns of the node, finding the first and
1289 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
1290 and set the .check pointers accordingly, and advance pos
1291 appropriately and repreat for the next node. Note that when we copy
1292 the next pointers we have to convert them from the original
1293 NODEIDX form to NODENUM form as the former is not valid post
1294 compression.
1295
1296 - If a node has no transitions used we mark its base as 0 and do not
1297 advance the pos pointer.
1298
1299 - If a node only has one transition we use a second pointer into the
1300 structure to fill in allocated fail transitions from other states.
1301 This pointer is independent of the main pointer and scans forward
1302 looking for null transitions that are allocated to a state. When it
1303 finds one it writes the single transition into the "hole". If the
1304 pointer doesnt find one the single transition is appeneded as normal.
1305
1306 - Once compressed we can Renew/realloc the structures to release the
1307 excess space.
1308
1309 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
1310 specifically Fig 3.47 and the associated pseudocode.
1311
1312 demq
1313 */
a3b680e6 1314 const U32 laststate = TRIE_NODENUM( next_alloc );
a28509cc 1315 U32 state, charid;
a3621e74
YO
1316 U32 pos = 0, zp=0;
1317 trie->laststate = laststate;
1318
1319 for ( state = 1 ; state < laststate ; state++ ) {
1320 U8 flag = 0;
a28509cc
AL
1321 const U32 stateidx = TRIE_NODEIDX( state );
1322 const U32 o_used = trie->trans[ stateidx ].check;
1323 U32 used = trie->trans[ stateidx ].check;
a3621e74
YO
1324 trie->trans[ stateidx ].check = 0;
1325
1326 for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
1327 if ( flag || trie->trans[ stateidx + charid ].next ) {
1328 if ( trie->trans[ stateidx + charid ].next ) {
1329 if (o_used == 1) {
1330 for ( ; zp < pos ; zp++ ) {
1331 if ( ! trie->trans[ zp ].next ) {
1332 break;
1333 }
1334 }
1335 trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
1336 trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1337 trie->trans[ zp ].check = state;
1338 if ( ++zp > pos ) pos = zp;
1339 break;
1340 }
1341 used--;
1342 }
1343 if ( !flag ) {
1344 flag = 1;
1345 trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
1346 }
1347 trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1348 trie->trans[ pos ].check = state;
1349 pos++;
1350 }
1351 }
1352 }
cc601c31 1353 trie->lasttrans = pos + 1;
a3621e74
YO
1354 Renew( trie->states, laststate + 1, reg_trie_state);
1355 DEBUG_TRIE_COMPILE_MORE_r(
e4584336
RB
1356 PerlIO_printf( Perl_debug_log,
1357 " Alloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
5d7488b2
AL
1358 (int)( ( trie->charcount + 1 ) * trie->uniquecharcount + 1 ),
1359 (IV)next_alloc,
1360 (IV)pos,
a3621e74
YO
1361 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
1362 );
1363
1364 } /* end table compress */
1365 }
cc601c31
YO
1366 /* resize the trans array to remove unused space */
1367 Renew( trie->trans, trie->lasttrans, reg_trie_trans);
a3621e74
YO
1368
1369 DEBUG_TRIE_COMPILE_r({
1370 U32 state;
1371 /*
1372 Now we print it out again, in a slightly different form as there is additional
1373 info we want to be able to see when its compressed. They are close enough for
1374 visual comparison though.
1375 */
1376 PerlIO_printf( Perl_debug_log, "\nChar : %-6s%-6s%-4s ","Match","Base","Ofs" );
1377
1378 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
1379 SV **tmp = av_fetch( trie->revcharmap, state, 0);
1380 if ( tmp ) {
cfd0369c 1381 PerlIO_printf( Perl_debug_log, "%4.4s ", SvPV_nolen_const( *tmp ) );
a3621e74
YO
1382 }
1383 }
1384 PerlIO_printf( Perl_debug_log, "\n-----:-----------------------");
cc601c31 1385
a3621e74
YO
1386 for( state = 0 ; state < trie->uniquecharcount ; state++ )
1387 PerlIO_printf( Perl_debug_log, "-----");
1388 PerlIO_printf( Perl_debug_log, "\n");
cc601c31 1389
a3621e74 1390 for( state = 1 ; state < trie->laststate ; state++ ) {
a28509cc 1391 const U32 base = trie->states[ state ].trans.base;
a3621e74 1392
e4584336 1393 PerlIO_printf( Perl_debug_log, "#%04"UVXf" ", (UV)state);
a3621e74
YO
1394
1395 if ( trie->states[ state ].wordnum ) {
1396 PerlIO_printf( Perl_debug_log, " W%04X", trie->states[ state ].wordnum );
1397 } else {
1398 PerlIO_printf( Perl_debug_log, "%6s", "" );
1399 }
1400
e4584336 1401 PerlIO_printf( Perl_debug_log, " @%04"UVXf" ", (UV)base );
a3621e74
YO
1402
1403 if ( base ) {
1404 U32 ofs = 0;
1405
cc601c31
YO
1406 while( ( base + ofs < trie->uniquecharcount ) ||
1407 ( base + ofs - trie->uniquecharcount < trie->lasttrans
1408 && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
a3621e74
YO
1409 ofs++;
1410
e4584336 1411 PerlIO_printf( Perl_debug_log, "+%02"UVXf"[ ", (UV)ofs);
a3621e74
YO
1412
1413 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
cc601c31
YO
1414 if ( ( base + ofs >= trie->uniquecharcount ) &&
1415 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
a3621e74
YO
1416 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1417 {
e4584336
RB
1418 PerlIO_printf( Perl_debug_log, "%04"UVXf" ",
1419 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
a3621e74
YO
1420 } else {
1421 PerlIO_printf( Perl_debug_log, "%4s "," 0" );
1422 }
1423 }
1424
e4584336 1425 PerlIO_printf( Perl_debug_log, "]");
a3621e74
YO
1426
1427 }
1428 PerlIO_printf( Perl_debug_log, "\n" );
1429 }
1430 });
1431
1432 {
1433 /* now finally we "stitch in" the new TRIE node
1434 This means we convert either the first branch or the first Exact,
1435 depending on whether the thing following (in 'last') is a branch
1436 or not and whther first is the startbranch (ie is it a sub part of
1437 the alternation or is it the whole thing.)
1438 Assuming its a sub part we conver the EXACT otherwise we convert
1439 the whole branch sequence, including the first.
1440 */
1441 regnode *convert;
1442
1443
1444
1445
1446 if ( first == startbranch && OP( last ) != BRANCH ) {
1447 convert = first;
1448 } else {
1449 convert = NEXTOPER( first );
1450 NEXT_OFF( first ) = (U16)(last - first);
1451 }
1452
1453 OP( convert ) = TRIE + (U8)( flags - EXACT );
1454 NEXT_OFF( convert ) = (U16)(tail - convert);
1455 ARG_SET( convert, data_slot );
1456
1457 /* tells us if we need to handle accept buffers specially */
1458 convert->flags = ( RExC_seen_evals ? 1 : 0 );
1459
1460
1461 /* needed for dumping*/
1462 DEBUG_r({
1463 regnode *optimize = convert + NODE_STEP_REGNODE + regarglen[ TRIE ];
1464 /* We now need to mark all of the space originally used by the
1465 branches as optimized away. This keeps the dumpuntil from
1466 throwing a wobbly as it doesnt use regnext() to traverse the
1467 opcodes.
1468 */
1469 while( optimize < last ) {
1470 OP( optimize ) = OPTIMIZED;
1471 optimize++;
1472 }
1473 });
1474 } /* end node insert */
1475 return 1;
1476}
1477
1478
1479
1480/*
5d1c421c
JH
1481 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
1482 * These need to be revisited when a newer toolchain becomes available.
1483 */
1484#if defined(__sparc64__) && defined(__GNUC__)
1485# if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
1486# undef SPARC64_GCC_WORKAROUND
1487# define SPARC64_GCC_WORKAROUND 1
1488# endif
1489#endif
1490
653099ff
GS
1491/* REx optimizer. Converts nodes into quickier variants "in place".
1492 Finds fixed substrings. */
1493
a0288114 1494/* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
c277df42
IZ
1495 to the position after last scanned or to NULL. */
1496
a3621e74 1497
76e3520e 1498STATIC I32
9a957fbc
AL
1499S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap,
1500 regnode *last, scan_data_t *data, U32 flags, U32 depth)
c277df42
IZ
1501 /* scanp: Start here (read-write). */
1502 /* deltap: Write maxlen-minlen here. */
1503 /* last: Stop before this one. */
1504{
97aff369 1505 dVAR;
c277df42
IZ
1506 I32 min = 0, pars = 0, code;
1507 regnode *scan = *scanp, *next;
1508 I32 delta = 0;
1509 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
aca2d497 1510 int is_inf_internal = 0; /* The studied chunk is infinite */
c277df42
IZ
1511 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
1512 scan_data_t data_fake;
653099ff 1513 struct regnode_charclass_class and_with; /* Valid if flags & SCF_DO_STCLASS_OR */
a3621e74
YO
1514 SV *re_trie_maxbuff = NULL;
1515
1516 GET_RE_DEBUG_FLAGS_DECL;
b81d288d 1517
c277df42
IZ
1518 while (scan && OP(scan) != END && scan < last) {
1519 /* Peephole optimizer: */
a3621e74 1520 DEBUG_OPTIMISE_r({
c445ea15 1521 SV * const mysv=sv_newmortal();
32fc9b6a 1522 regprop(RExC_rx, mysv, scan);
e4584336 1523 PerlIO_printf(Perl_debug_log, "%*speep: %s (0x%08"UVXf")\n",
cfd0369c 1524 (int)depth*2, "", SvPV_nolen_const(mysv), PTR2UV(scan));
a3621e74 1525 });
c277df42 1526
22c35a8c 1527 if (PL_regkind[(U8)OP(scan)] == EXACT) {
653099ff 1528 /* Merge several consecutive EXACTish nodes into one. */
c277df42
IZ
1529 regnode *n = regnext(scan);
1530 U32 stringok = 1;
1531#ifdef DEBUGGING
1532 regnode *stop = scan;
b81d288d 1533#endif
c277df42 1534
cd439c50 1535 next = scan + NODE_SZ_STR(scan);
c277df42
IZ
1536 /* Skip NOTHING, merge EXACT*. */
1537 while (n &&
b81d288d 1538 ( PL_regkind[(U8)OP(n)] == NOTHING ||
c277df42
IZ
1539 (stringok && (OP(n) == OP(scan))))
1540 && NEXT_OFF(n)
1541 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
1542 if (OP(n) == TAIL || n > next)
1543 stringok = 0;
22c35a8c 1544 if (PL_regkind[(U8)OP(n)] == NOTHING) {
c277df42
IZ
1545 NEXT_OFF(scan) += NEXT_OFF(n);
1546 next = n + NODE_STEP_REGNODE;
1547#ifdef DEBUGGING
1548 if (stringok)
1549 stop = n;
b81d288d 1550#endif
c277df42 1551 n = regnext(n);
a0ed51b3 1552 }
f49d4d0f 1553 else if (stringok) {
a3b680e6 1554 const int oldl = STR_LEN(scan);
c445ea15 1555 regnode * const nnext = regnext(n);
f49d4d0f 1556
b81d288d 1557 if (oldl + STR_LEN(n) > U8_MAX)
c277df42
IZ
1558 break;
1559 NEXT_OFF(scan) += NEXT_OFF(n);
cd439c50
IZ
1560 STR_LEN(scan) += STR_LEN(n);
1561 next = n + NODE_SZ_STR(n);
c277df42 1562 /* Now we can overwrite *n : */
f49d4d0f 1563 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
c277df42 1564#ifdef DEBUGGING
f49d4d0f 1565 stop = next - 1;
b81d288d 1566#endif
c277df42
IZ
1567 n = nnext;
1568 }
1569 }
61a36c01 1570
a3621e74 1571 if (UTF && ( OP(scan) == EXACTF ) && ( STR_LEN(scan) >= 6 ) ) {
61a36c01
JH
1572/*
1573 Two problematic code points in Unicode casefolding of EXACT nodes:
1574
1575 U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
1576 U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
1577
1578 which casefold to
1579
1580 Unicode UTF-8
1581
1582 U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
1583 U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
1584
1585 This means that in case-insensitive matching (or "loose matching",
1586 as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
1587 length of the above casefolded versions) can match a target string
1588 of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
1589 This would rather mess up the minimum length computation.
1590
1591 What we'll do is to look for the tail four bytes, and then peek
1592 at the preceding two bytes to see whether we need to decrease
1593 the minimum length by four (six minus two).
1594
1595 Thanks to the design of UTF-8, there cannot be false matches:
1596 A sequence of valid UTF-8 bytes cannot be a subsequence of
1597 another valid sequence of UTF-8 bytes.
1598
1599*/
c445ea15
AL
1600 char * const s0 = STRING(scan), *s, *t;
1601 char * const s1 = s0 + STR_LEN(scan) - 1;
1602 char * const s2 = s1 - 4;
d4c19fe8 1603 const char t0[] = "\xcc\x88\xcc\x81";
a28509cc 1604 const char * const t1 = t0 + 3;
2af232bd 1605
61a36c01
JH
1606 for (s = s0 + 2;
1607 s < s2 && (t = ninstr(s, s1, t0, t1));
1608 s = t + 4) {
1609 if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
1610 ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
1611 min -= 4;
1612 }
1613 }
1614
c277df42
IZ
1615#ifdef DEBUGGING
1616 /* Allow dumping */
cd439c50 1617 n = scan + NODE_SZ_STR(scan);
c277df42 1618 while (n <= stop) {
22c35a8c 1619 if (PL_regkind[(U8)OP(n)] != NOTHING || OP(n) == NOTHING) {
c277df42
IZ
1620 OP(n) = OPTIMIZED;
1621 NEXT_OFF(n) = 0;
1622 }
1623 n++;
1624 }
653099ff 1625#endif
c277df42 1626 }
a3621e74
YO
1627
1628
1629
653099ff
GS
1630 /* Follow the next-chain of the current node and optimize
1631 away all the NOTHINGs from it. */
c277df42 1632 if (OP(scan) != CURLYX) {
a3b680e6 1633 const int max = (reg_off_by_arg[OP(scan)]
048cfca1
GS
1634 ? I32_MAX
1635 /* I32 may be smaller than U16 on CRAYs! */
1636 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
c277df42
IZ
1637 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
1638 int noff;
1639 regnode *n = scan;
b81d288d 1640
c277df42
IZ
1641 /* Skip NOTHING and LONGJMP. */
1642 while ((n = regnext(n))
22c35a8c 1643 && ((PL_regkind[(U8)OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
c277df42
IZ
1644 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
1645 && off + noff < max)
1646 off += noff;
1647 if (reg_off_by_arg[OP(scan)])
1648 ARG(scan) = off;
b81d288d 1649 else
c277df42
IZ
1650 NEXT_OFF(scan) = off;
1651 }
a3621e74 1652
653099ff
GS
1653 /* The principal pseudo-switch. Cannot be a switch, since we
1654 look into several different things. */
b81d288d 1655 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
c277df42
IZ
1656 || OP(scan) == IFTHEN || OP(scan) == SUSPEND) {
1657 next = regnext(scan);
1658 code = OP(scan);
a3621e74 1659 /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
b81d288d
AB
1660
1661 if (OP(next) == code || code == IFTHEN || code == SUSPEND) {
c277df42 1662 I32 max1 = 0, min1 = I32_MAX, num = 0;
653099ff 1663 struct regnode_charclass_class accum;
d4c19fe8 1664 regnode * const startbranch=scan;
c277df42 1665
653099ff 1666 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
830247a4 1667 scan_commit(pRExC_state, data); /* Cannot merge strings after this. */
653099ff 1668 if (flags & SCF_DO_STCLASS)
830247a4 1669 cl_init_zero(pRExC_state, &accum);
a3621e74 1670
c277df42 1671 while (OP(scan) == code) {
830247a4 1672 I32 deltanext, minnext, f = 0, fake;
653099ff 1673 struct regnode_charclass_class this_class;
c277df42
IZ
1674
1675 num++;
1676 data_fake.flags = 0;
b81d288d 1677 if (data) {
2c2d71f5 1678 data_fake.whilem_c = data->whilem_c;
cb434fcc
IZ
1679 data_fake.last_closep = data->last_closep;
1680 }
1681 else
1682 data_fake.last_closep = &fake;
c277df42
IZ
1683 next = regnext(scan);
1684 scan = NEXTOPER(scan);
1685 if (code != BRANCH)
1686 scan = NEXTOPER(scan);
653099ff 1687 if (flags & SCF_DO_STCLASS) {
830247a4 1688 cl_init(pRExC_state, &this_class);
653099ff
GS
1689 data_fake.start_class = &this_class;
1690 f = SCF_DO_STCLASS_AND;
b81d288d 1691 }
e1901655
IZ
1692 if (flags & SCF_WHILEM_VISITED_POS)
1693 f |= SCF_WHILEM_VISITED_POS;
a3621e74 1694
653099ff 1695 /* we suppose the run is continuous, last=next...*/
830247a4 1696 minnext = study_chunk(pRExC_state, &scan, &deltanext,
a3621e74 1697 next, &data_fake, f,depth+1);
b81d288d 1698 if (min1 > minnext)
c277df42
IZ
1699 min1 = minnext;
1700 if (max1 < minnext + deltanext)
1701 max1 = minnext + deltanext;
1702 if (deltanext == I32_MAX)
aca2d497 1703 is_inf = is_inf_internal = 1;
c277df42
IZ
1704 scan = next;
1705 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
1706 pars++;
405ff068 1707 if (data && (data_fake.flags & SF_HAS_EVAL))
c277df42 1708 data->flags |= SF_HAS_EVAL;
2c2d71f5
JH
1709 if (data)
1710 data->whilem_c = data_fake.whilem_c;
653099ff 1711 if (flags & SCF_DO_STCLASS)
830247a4 1712 cl_or(pRExC_state, &accum, &this_class);
b81d288d 1713 if (code == SUSPEND)
c277df42
IZ
1714 break;
1715 }
1716 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
1717 min1 = 0;
1718 if (flags & SCF_DO_SUBSTR) {
1719 data->pos_min += min1;
1720 data->pos_delta += max1 - min1;
1721 if (max1 != min1 || is_inf)
1722 data->longest = &(data->longest_float);
1723 }
1724 min += min1;
1725 delta += max1 - min1;
653099ff 1726 if (flags & SCF_DO_STCLASS_OR) {
830247a4 1727 cl_or(pRExC_state, data->start_class, &accum);
653099ff
GS
1728 if (min1) {
1729 cl_and(data->start_class, &and_with);
1730 flags &= ~SCF_DO_STCLASS;
1731 }
1732 }
1733 else if (flags & SCF_DO_STCLASS_AND) {
de0c8cb8
GS
1734 if (min1) {
1735 cl_and(data->start_class, &accum);
653099ff 1736 flags &= ~SCF_DO_STCLASS;
de0c8cb8
GS
1737 }
1738 else {
b81d288d 1739 /* Switch to OR mode: cache the old value of
de0c8cb8
GS
1740 * data->start_class */
1741 StructCopy(data->start_class, &and_with,
1742 struct regnode_charclass_class);
1743 flags &= ~SCF_DO_STCLASS_AND;
1744 StructCopy(&accum, data->start_class,
1745 struct regnode_charclass_class);
1746 flags |= SCF_DO_STCLASS_OR;
1747 data->start_class->flags |= ANYOF_EOS;
1748 }
653099ff 1749 }
a3621e74
YO
1750
1751 /* demq.
1752
1753 Assuming this was/is a branch we are dealing with: 'scan' now
1754 points at the item that follows the branch sequence, whatever
1755 it is. We now start at the beginning of the sequence and look
1756 for subsequences of
1757
1758 BRANCH->EXACT=>X
1759 BRANCH->EXACT=>X
1760
1761 which would be constructed from a pattern like /A|LIST|OF|WORDS/
1762
1763 If we can find such a subseqence we need to turn the first
1764 element into a trie and then add the subsequent branch exact
1765 strings to the trie.
1766
1767 We have two cases
1768
1769 1. patterns where the whole set of branch can be converted to a trie,
1770
1771 2. patterns where only a subset of the alternations can be
1772 converted to a trie.
1773
1774 In case 1 we can replace the whole set with a single regop
1775 for the trie. In case 2 we need to keep the start and end
1776 branchs so
1777
1778 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
1779 becomes BRANCH TRIE; BRANCH X;
1780
1781 Hypthetically when we know the regex isnt anchored we can
1782 turn a case 1 into a DFA and let it rip... Every time it finds a match
1783 it would just call its tail, no WHILEM/CURLY needed.
1784
1785 */
0111c4fd
RGS
1786 if (DO_TRIE) {
1787 if (!re_trie_maxbuff) {
1788 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
1789 if (!SvIOK(re_trie_maxbuff))
1790 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
1791 }
a3621e74
YO
1792 if ( SvIV(re_trie_maxbuff)>=0 && OP( startbranch )==BRANCH ) {
1793 regnode *cur;
1794 regnode *first = (regnode *)NULL;
1795 regnode *last = (regnode *)NULL;
1796 regnode *tail = scan;
1797 U8 optype = 0;
1798 U32 count=0;
1799
1800#ifdef DEBUGGING
c445ea15 1801 SV * const mysv = sv_newmortal(); /* for dumping */
a3621e74
YO
1802#endif
1803 /* var tail is used because there may be a TAIL
1804 regop in the way. Ie, the exacts will point to the
1805 thing following the TAIL, but the last branch will
1806 point at the TAIL. So we advance tail. If we
1807 have nested (?:) we may have to move through several
1808 tails.
1809 */
1810
1811 while ( OP( tail ) == TAIL ) {
1812 /* this is the TAIL generated by (?:) */
1813 tail = regnext( tail );
1814 }
1815
1816 DEBUG_OPTIMISE_r({
32fc9b6a 1817 regprop(RExC_rx, mysv, tail );
a3621e74 1818 PerlIO_printf( Perl_debug_log, "%*s%s%s%s\n",
cfd0369c 1819 (int)depth * 2 + 2, "", "Tail node is:", SvPV_nolen_const( mysv ),
a3621e74
YO
1820 (RExC_seen_evals) ? "[EVAL]" : ""
1821 );
1822 });
1823 /*
1824
1825 step through the branches, cur represents each
1826 branch, noper is the first thing to be matched
1827 as part of that branch and noper_next is the
1828 regnext() of that node. if noper is an EXACT
1829 and noper_next is the same as scan (our current
1830 position in the regex) then the EXACT branch is
1831 a possible optimization target. Once we have
1832 two or more consequetive such branches we can
1833 create a trie of the EXACT's contents and stich
1834 it in place. If the sequence represents all of
1835 the branches we eliminate the whole thing and
1836 replace it with a single TRIE. If it is a
1837 subsequence then we need to stitch it in. This
1838 means the first branch has to remain, and needs
1839 to be repointed at the item on the branch chain
1840 following the last branch optimized. This could
1841 be either a BRANCH, in which case the
1842 subsequence is internal, or it could be the
1843 item following the branch sequence in which
1844 case the subsequence is at the end.
1845
1846 */
1847
1848 /* dont use tail as the end marker for this traverse */
1849 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
aec46f14
AL
1850 regnode * const noper = NEXTOPER( cur );
1851 regnode * const noper_next = regnext( noper );
a3621e74 1852
a3621e74 1853 DEBUG_OPTIMISE_r({
32fc9b6a 1854 regprop(RExC_rx, mysv, cur);
a3621e74 1855 PerlIO_printf( Perl_debug_log, "%*s%s",
cfd0369c 1856 (int)depth * 2 + 2," ", SvPV_nolen_const( mysv ) );
a3621e74 1857
32fc9b6a 1858 regprop(RExC_rx, mysv, noper);
a3621e74 1859 PerlIO_printf( Perl_debug_log, " -> %s",
cfd0369c 1860 SvPV_nolen_const(mysv));
a3621e74
YO
1861
1862 if ( noper_next ) {
32fc9b6a 1863 regprop(RExC_rx, mysv, noper_next );
a3621e74 1864 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
cfd0369c 1865 SvPV_nolen_const(mysv));
a3621e74
YO
1866 }
1867 PerlIO_printf( Perl_debug_log, "0x%p,0x%p,0x%p)\n",
ca0270c4 1868 (void*)first, (void*)last, (void*)cur );
a3621e74
YO
1869 });
1870 if ( ( first ? OP( noper ) == optype
1871 : PL_regkind[ (U8)OP( noper ) ] == EXACT )
1872 && noper_next == tail && count<U16_MAX)
1873 {
1874 count++;
1875 if ( !first ) {
1876 first = cur;
1877 optype = OP( noper );
1878 } else {
1879 DEBUG_OPTIMISE_r(
1880 if (!last ) {
32fc9b6a 1881 regprop(RExC_rx, mysv, first);
a3621e74 1882 PerlIO_printf( Perl_debug_log, "%*s%s",
cfd0369c 1883 (int)depth * 2 + 2, "F:", SvPV_nolen_const( mysv ) );
32fc9b6a 1884 regprop(RExC_rx, mysv, NEXTOPER(first) );
a3621e74 1885 PerlIO_printf( Perl_debug_log, " -> %s\n",
cfd0369c 1886 SvPV_nolen_const( mysv ) );
a3621e74
YO
1887 }
1888 );
1889 last = cur;
1890 DEBUG_OPTIMISE_r({
32fc9b6a 1891 regprop(RExC_rx, mysv, cur);
a3621e74 1892 PerlIO_printf( Perl_debug_log, "%*s%s",
cfd0369c 1893 (int)depth * 2 + 2, "N:", SvPV_nolen_const( mysv ) );
32fc9b6a 1894 regprop(RExC_rx, mysv, noper );
a3621e74 1895 PerlIO_printf( Perl_debug_log, " -> %s\n",
cfd0369c 1896 SvPV_nolen_const( mysv ) );
a3621e74
YO
1897 });
1898 }
1899 } else {
1900 if ( last ) {
1901 DEBUG_OPTIMISE_r(
1902 PerlIO_printf( Perl_debug_log, "%*s%s\n",
e4584336 1903 (int)depth * 2 + 2, "E:", "**END**" );
a3621e74
YO
1904 );
1905 make_trie( pRExC_state, startbranch, first, cur, tail, optype );
1906 }
1907 if ( PL_regkind[ (U8)OP( noper ) ] == EXACT
1908 && noper_next == tail )
1909 {
1910 count = 1;
1911 first = cur;
1912 optype = OP( noper );
1913 } else {
1914 count = 0;
1915 first = NULL;
1916 optype = 0;
1917 }
1918 last = NULL;
1919 }
1920 }
1921 DEBUG_OPTIMISE_r({
32fc9b6a 1922 regprop(RExC_rx, mysv, cur);
a3621e74 1923 PerlIO_printf( Perl_debug_log,
e4584336 1924 "%*s%s\t(0x%p,0x%p,0x%p)\n", (int)depth * 2 + 2,
ca0270c4 1925 " ", SvPV_nolen_const( mysv ), (void*)first, (void*)last, (void*)cur);
a3621e74
YO
1926
1927 });
1928 if ( last ) {
1929 DEBUG_OPTIMISE_r(
1930 PerlIO_printf( Perl_debug_log, "%*s%s\n",
e4584336 1931 (int)depth * 2 + 2, "E:", "==END==" );
a3621e74
YO
1932 );
1933 make_trie( pRExC_state, startbranch, first, scan, tail, optype );
1934 }
1935 }
1936 }
a0ed51b3 1937 }
a3621e74 1938 else if ( code == BRANCHJ ) { /* single branch is optimized. */
c277df42 1939 scan = NEXTOPER(NEXTOPER(scan));
a3621e74 1940 } else /* single branch is optimized. */
c277df42
IZ
1941 scan = NEXTOPER(scan);
1942 continue;
a0ed51b3
LW
1943 }
1944 else if (OP(scan) == EXACT) {
cd439c50 1945 I32 l = STR_LEN(scan);
c445ea15 1946 UV uc;
a0ed51b3 1947 if (UTF) {
a3b680e6 1948 const U8 * const s = (U8*)STRING(scan);
1aa99e6b 1949 l = utf8_length(s, s + l);
9041c2e3 1950 uc = utf8_to_uvchr(s, NULL);
c445ea15
AL
1951 } else {
1952 uc = *((U8*)STRING(scan));
a0ed51b3
LW
1953 }
1954 min += l;
c277df42 1955 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
c277df42
IZ
1956 /* The code below prefers earlier match for fixed
1957 offset, later match for variable offset. */
1958 if (data->last_end == -1) { /* Update the start info. */
1959 data->last_start_min = data->pos_min;
1960 data->last_start_max = is_inf
b81d288d 1961 ? I32_MAX : data->pos_min + data->pos_delta;
c277df42 1962 }
cd439c50 1963 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
45f47268
NC
1964 if (UTF)
1965 SvUTF8_on(data->last_found);
0eda9292 1966 {
9a957fbc 1967 SV * const sv = data->last_found;
a28509cc 1968 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
0eda9292
JH
1969 mg_find(sv, PERL_MAGIC_utf8) : NULL;
1970 if (mg && mg->mg_len >= 0)
5e43f467
JH
1971 mg->mg_len += utf8_length((U8*)STRING(scan),
1972 (U8*)STRING(scan)+STR_LEN(scan));
0eda9292 1973 }
c277df42
IZ
1974 data->last_end = data->pos_min + l;
1975 data->pos_min += l; /* As in the first entry. */
1976 data->flags &= ~SF_BEFORE_EOL;
1977 }
653099ff
GS
1978 if (flags & SCF_DO_STCLASS_AND) {
1979 /* Check whether it is compatible with what we know already! */
1980 int compat = 1;
1981
1aa99e6b 1982 if (uc >= 0x100 ||
516a5887 1983 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
1aa99e6b 1984 && !ANYOF_BITMAP_TEST(data->start_class, uc)
653099ff 1985 && (!(data->start_class->flags & ANYOF_FOLD)
1aa99e6b 1986 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
516a5887 1987 )
653099ff
GS
1988 compat = 0;
1989 ANYOF_CLASS_ZERO(data->start_class);
1990 ANYOF_BITMAP_ZERO(data->start_class);
1991 if (compat)
1aa99e6b 1992 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff 1993 data->start_class->flags &= ~ANYOF_EOS;
9b877dbb
IH
1994 if (uc < 0x100)
1995 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
653099ff
GS
1996 }
1997 else if (flags & SCF_DO_STCLASS_OR) {
1998 /* false positive possible if the class is case-folded */
1aa99e6b 1999 if (uc < 0x100)
9b877dbb
IH
2000 ANYOF_BITMAP_SET(data->start_class, uc);
2001 else
2002 data->start_class->flags |= ANYOF_UNICODE_ALL;
653099ff
GS
2003 data->start_class->flags &= ~ANYOF_EOS;
2004 cl_and(data->start_class, &and_with);
2005 }
2006 flags &= ~SCF_DO_STCLASS;
a0ed51b3 2007 }
653099ff 2008 else if (PL_regkind[(U8)OP(scan)] == EXACT) { /* But OP != EXACT! */
cd439c50 2009 I32 l = STR_LEN(scan);
1aa99e6b 2010 UV uc = *((U8*)STRING(scan));
653099ff
GS
2011
2012 /* Search for fixed substrings supports EXACT only. */
ecaa9b9c
NC
2013 if (flags & SCF_DO_SUBSTR) {
2014 assert(data);
830247a4 2015 scan_commit(pRExC_state, data);
ecaa9b9c 2016 }
a0ed51b3 2017 if (UTF) {
6136c704 2018 const U8 * const s = (U8 *)STRING(scan);
1aa99e6b 2019 l = utf8_length(s, s + l);
9041c2e3 2020 uc = utf8_to_uvchr(s, NULL);
a0ed51b3
LW
2021 }
2022 min += l;
ecaa9b9c 2023 if (flags & SCF_DO_SUBSTR)
a0ed51b3 2024 data->pos_min += l;
653099ff
GS
2025 if (flags & SCF_DO_STCLASS_AND) {
2026 /* Check whether it is compatible with what we know already! */
2027 int compat = 1;
2028
1aa99e6b 2029 if (uc >= 0x100 ||
516a5887 2030 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
1aa99e6b 2031 && !ANYOF_BITMAP_TEST(data->start_class, uc)
516a5887 2032 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
653099ff
GS
2033 compat = 0;
2034 ANYOF_CLASS_ZERO(data->start_class);
2035 ANYOF_BITMAP_ZERO(data->start_class);
2036 if (compat) {
1aa99e6b 2037 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff
GS
2038 data->start_class->flags &= ~ANYOF_EOS;
2039 data->start_class->flags |= ANYOF_FOLD;
2040 if (OP(scan) == EXACTFL)
2041 data->start_class->flags |= ANYOF_LOCALE;
2042 }
2043 }
2044 else if (flags & SCF_DO_STCLASS_OR) {
2045 if (data->start_class->flags & ANYOF_FOLD) {
2046 /* false positive possible if the class is case-folded.
2047 Assume that the locale settings are the same... */
1aa99e6b
IH
2048 if (uc < 0x100)
2049 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff
GS
2050 data->start_class->flags &= ~ANYOF_EOS;
2051 }
2052 cl_and(data->start_class, &and_with);
2053 }
2054 flags &= ~SCF_DO_STCLASS;
a0ed51b3 2055 }
bfed75c6 2056 else if (strchr((const char*)PL_varies,OP(scan))) {
9c5ffd7c 2057 I32 mincount, maxcount, minnext, deltanext, fl = 0;
aa7a4b56 2058 I32 f = flags, pos_before = 0;
d4c19fe8 2059 regnode * const oscan = scan;
653099ff
GS
2060 struct regnode_charclass_class this_class;
2061 struct regnode_charclass_class *oclass = NULL;
727f22e3 2062 I32 next_is_eval = 0;
653099ff 2063
22c35a8c 2064 switch (PL_regkind[(U8)OP(scan)]) {
653099ff 2065 case WHILEM: /* End of (?:...)* . */
c277df42
IZ
2066 scan = NEXTOPER(scan);
2067 goto finish;
2068 case PLUS:
653099ff 2069 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
c277df42 2070 next = NEXTOPER(scan);
653099ff 2071 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
b81d288d
AB
2072 mincount = 1;
2073 maxcount = REG_INFTY;
c277df42
IZ
2074 next = regnext(scan);
2075 scan = NEXTOPER(scan);
2076 goto do_curly;
2077 }
2078 }
2079 if (flags & SCF_DO_SUBSTR)
2080 data->pos_min++;
2081 min++;
2082 /* Fall through. */
2083 case STAR:
653099ff
GS
2084 if (flags & SCF_DO_STCLASS) {
2085 mincount = 0;
b81d288d 2086 maxcount = REG_INFTY;
653099ff
GS
2087 next = regnext(scan);
2088 scan = NEXTOPER(scan);
2089 goto do_curly;
2090 }
b81d288d 2091 is_inf = is_inf_internal = 1;
c277df42
IZ
2092 scan = regnext(scan);
2093 if (flags & SCF_DO_SUBSTR) {
830247a4 2094 scan_commit(pRExC_state, data); /* Cannot extend fixed substrings */
c277df42
IZ
2095 data->longest = &(data->longest_float);
2096 }
2097 goto optimize_curly_tail;
2098 case CURLY:
b81d288d 2099 mincount = ARG1(scan);
c277df42
IZ
2100 maxcount = ARG2(scan);
2101 next = regnext(scan);
cb434fcc
IZ
2102 if (OP(scan) == CURLYX) {
2103 I32 lp = (data ? *(data->last_closep) : 0);
a3621e74 2104 scan->flags = ((lp <= U8_MAX) ? (U8)lp : U8_MAX);
cb434fcc 2105 }
c277df42 2106 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
727f22e3 2107 next_is_eval = (OP(scan) == EVAL);
c277df42
IZ
2108 do_curly:
2109 if (flags & SCF_DO_SUBSTR) {
830247a4 2110 if (mincount == 0) scan_commit(pRExC_state,data); /* Cannot extend fixed substrings */
c277df42
IZ
2111 pos_before = data->pos_min;
2112 }
2113 if (data) {
2114 fl = data->flags;
2115 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
2116 if (is_inf)
2117 data->flags |= SF_IS_INF;
2118 }
653099ff 2119 if (flags & SCF_DO_STCLASS) {
830247a4 2120 cl_init(pRExC_state, &this_class);
653099ff
GS
2121 oclass = data->start_class;
2122 data->start_class = &this_class;
2123 f |= SCF_DO_STCLASS_AND;
2124 f &= ~SCF_DO_STCLASS_OR;
2125 }
e1901655
IZ
2126 /* These are the cases when once a subexpression
2127 fails at a particular position, it cannot succeed
2128 even after backtracking at the enclosing scope.
b81d288d 2129
e1901655
IZ
2130 XXXX what if minimal match and we are at the
2131 initial run of {n,m}? */
2132 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
2133 f &= ~SCF_WHILEM_VISITED_POS;
653099ff 2134
c277df42 2135 /* This will finish on WHILEM, setting scan, or on NULL: */
b81d288d 2136 minnext = study_chunk(pRExC_state, &scan, &deltanext, last, data,
a3621e74
YO
2137 (mincount == 0
2138 ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
653099ff
GS
2139
2140 if (flags & SCF_DO_STCLASS)
2141 data->start_class = oclass;
2142 if (mincount == 0 || minnext == 0) {
2143 if (flags & SCF_DO_STCLASS_OR) {
830247a4 2144 cl_or(pRExC_state, data->start_class, &this_class);
653099ff
GS
2145 }
2146 else if (flags & SCF_DO_STCLASS_AND) {
b81d288d 2147 /* Switch to OR mode: cache the old value of
653099ff
GS
2148 * data->start_class */
2149 StructCopy(data->start_class, &and_with,
2150 struct regnode_charclass_class);
2151 flags &= ~SCF_DO_STCLASS_AND;
2152 StructCopy(&this_class, data->start_class,
2153 struct regnode_charclass_class);
2154 flags |= SCF_DO_STCLASS_OR;
2155 data->start_class->flags |= ANYOF_EOS;
2156 }
2157 } else { /* Non-zero len */
2158 if (flags & SCF_DO_STCLASS_OR) {
830247a4 2159 cl_or(pRExC_state, data->start_class, &this_class);
653099ff
GS
2160 cl_and(data->start_class, &and_with);
2161 }
2162 else if (flags & SCF_DO_STCLASS_AND)
2163 cl_and(data->start_class, &this_class);
2164 flags &= ~SCF_DO_STCLASS;
2165 }
c277df42
IZ
2166 if (!scan) /* It was not CURLYX, but CURLY. */
2167 scan = next;
041457d9
DM
2168 if ( /* ? quantifier ok, except for (?{ ... }) */
2169 (next_is_eval || !(mincount == 0 && maxcount == 1))
84037bb0 2170 && (minnext == 0) && (deltanext == 0)
99799961 2171 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
041457d9
DM
2172 && maxcount <= REG_INFTY/3 /* Complement check for big count */
2173 && ckWARN(WARN_REGEXP))
b45f050a 2174 {
830247a4 2175 vWARN(RExC_parse,
b45f050a
JF
2176 "Quantifier unexpected on zero-length expression");
2177 }
2178
c277df42 2179 min += minnext * mincount;
b81d288d 2180 is_inf_internal |= ((maxcount == REG_INFTY
155aba94
GS
2181 && (minnext + deltanext) > 0)
2182 || deltanext == I32_MAX);
aca2d497 2183 is_inf |= is_inf_internal;
c277df42
IZ
2184 delta += (minnext + deltanext) * maxcount - minnext * mincount;
2185
2186 /* Try powerful optimization CURLYX => CURLYN. */
b81d288d 2187 if ( OP(oscan) == CURLYX && data
c277df42
IZ
2188 && data->flags & SF_IN_PAR
2189 && !(data->flags & SF_HAS_EVAL)
2190 && !deltanext && minnext == 1 ) {
2191 /* Try to optimize to CURLYN. */
2192 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
d4c19fe8 2193 regnode * const nxt1 = nxt;
497b47a8
JH
2194#ifdef DEBUGGING
2195 regnode *nxt2;
2196#endif
c277df42
IZ
2197
2198 /* Skip open. */
2199 nxt = regnext(nxt);
bfed75c6 2200 if (!strchr((const char*)PL_simple,OP(nxt))
22c35a8c 2201 && !(PL_regkind[(U8)OP(nxt)] == EXACT
b81d288d 2202 && STR_LEN(nxt) == 1))
c277df42 2203 goto nogo;
497b47a8 2204#ifdef DEBUGGING
c277df42 2205 nxt2 = nxt;
497b47a8 2206#endif
c277df42 2207 nxt = regnext(nxt);
b81d288d 2208 if (OP(nxt) != CLOSE)
c277df42
IZ
2209 goto nogo;
2210 /* Now we know that nxt2 is the only contents: */
eb160463 2211 oscan->flags = (U8)ARG(nxt);
c277df42
IZ
2212 OP(oscan) = CURLYN;
2213 OP(nxt1) = NOTHING; /* was OPEN. */
2214#ifdef DEBUGGING
2215 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
2216 NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
2217 NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
2218 OP(nxt) = OPTIMIZED; /* was CLOSE. */
2219 OP(nxt + 1) = OPTIMIZED; /* was count. */
2220 NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
b81d288d 2221#endif
c277df42 2222 }
c277df42
IZ
2223 nogo:
2224
2225 /* Try optimization CURLYX => CURLYM. */
b81d288d 2226 if ( OP(oscan) == CURLYX && data
c277df42 2227 && !(data->flags & SF_HAS_PAR)
c277df42 2228 && !(data->flags & SF_HAS_EVAL)
0e788c72
HS
2229 && !deltanext /* atom is fixed width */
2230 && minnext != 0 /* CURLYM can't handle zero width */
2231 ) {
c277df42
IZ
2232 /* XXXX How to optimize if data == 0? */
2233 /* Optimize to a simpler form. */
2234 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
2235 regnode *nxt2;
2236
2237 OP(oscan) = CURLYM;
2238 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
b81d288d 2239 && (OP(nxt2) != WHILEM))
c277df42
IZ
2240 nxt = nxt2;
2241 OP(nxt2) = SUCCEED; /* Whas WHILEM */
c277df42
IZ
2242 /* Need to optimize away parenths. */
2243 if (data->flags & SF_IN_PAR) {
2244 /* Set the parenth number. */
2245 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
2246
b81d288d 2247 if (OP(nxt) != CLOSE)
b45f050a 2248 FAIL("Panic opt close");
eb160463 2249 oscan->flags = (U8)ARG(nxt);
c277df42
IZ
2250 OP(nxt1) = OPTIMIZED; /* was OPEN. */
2251 OP(nxt) = OPTIMIZED; /* was CLOSE. */
2252#ifdef DEBUGGING
2253 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
2254 OP(nxt + 1) = OPTIMIZED; /* was count. */
2255 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
2256 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
b81d288d 2257#endif
c277df42
IZ
2258#if 0
2259 while ( nxt1 && (OP(nxt1) != WHILEM)) {
2260 regnode *nnxt = regnext(nxt1);
b81d288d 2261
c277df42
IZ
2262 if (nnxt == nxt) {
2263 if (reg_off_by_arg[OP(nxt1)])
2264 ARG_SET(nxt1, nxt2 - nxt1);
2265 else if (nxt2 - nxt1 < U16_MAX)
2266 NEXT_OFF(nxt1) = nxt2 - nxt1;
2267 else
2268 OP(nxt) = NOTHING; /* Cannot beautify */
2269 }
2270 nxt1 = nnxt;
2271 }
2272#endif
2273 /* Optimize again: */
b81d288d 2274 study_chunk(pRExC_state, &nxt1, &deltanext, nxt,
a3621e74 2275 NULL, 0,depth+1);
a0ed51b3
LW
2276 }
2277 else
c277df42 2278 oscan->flags = 0;
c277df42 2279 }
e1901655
IZ
2280 else if ((OP(oscan) == CURLYX)
2281 && (flags & SCF_WHILEM_VISITED_POS)
2282 /* See the comment on a similar expression above.
2283 However, this time it not a subexpression
2284 we care about, but the expression itself. */
2285 && (maxcount == REG_INFTY)
2286 && data && ++data->whilem_c < 16) {
2287 /* This stays as CURLYX, we can put the count/of pair. */
2c2d71f5
JH
2288 /* Find WHILEM (as in regexec.c) */
2289 regnode *nxt = oscan + NEXT_OFF(oscan);
2290
2291 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
2292 nxt += ARG(nxt);
eb160463
GS
2293 PREVOPER(nxt)->flags = (U8)(data->whilem_c
2294 | (RExC_whilem_seen << 4)); /* On WHILEM */
2c2d71f5 2295 }
b81d288d 2296 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
c277df42
IZ
2297 pars++;
2298 if (flags & SCF_DO_SUBSTR) {
c445ea15 2299 SV *last_str = NULL;
c277df42
IZ
2300 int counted = mincount != 0;
2301
2302 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
5d1c421c
JH
2303#if defined(SPARC64_GCC_WORKAROUND)
2304 I32 b = 0;
2305 STRLEN l = 0;
cfd0369c 2306 const char *s = NULL;
5d1c421c
JH
2307 I32 old = 0;
2308
2309 if (pos_before >= data->last_start_min)
2310 b = pos_before;
2311 else
2312 b = data->last_start_min;
2313
2314 l = 0;
cfd0369c 2315 s = SvPV_const(data->last_found, l);
5d1c421c
JH
2316 old = b - data->last_start_min;
2317
2318#else
b81d288d 2319 I32 b = pos_before >= data->last_start_min
c277df42
IZ
2320 ? pos_before : data->last_start_min;
2321 STRLEN l;
d4c19fe8 2322 const char * const s = SvPV_const(data->last_found, l);
a0ed51b3 2323 I32 old = b - data->last_start_min;
5d1c421c 2324#endif
a0ed51b3
LW
2325
2326 if (UTF)
2327 old = utf8_hop((U8*)s, old) - (U8*)s;
c277df42 2328
a0ed51b3 2329 l -= old;
c277df42 2330 /* Get the added string: */
79cb57f6 2331 last_str = newSVpvn(s + old, l);
0e933229
IH
2332 if (UTF)
2333 SvUTF8_on(last_str);
c277df42
IZ
2334 if (deltanext == 0 && pos_before == b) {
2335 /* What was added is a constant string */
2336 if (mincount > 1) {
2337 SvGROW(last_str, (mincount * l) + 1);
b81d288d 2338 repeatcpy(SvPVX(last_str) + l,
3f7c398e 2339 SvPVX_const(last_str), l, mincount - 1);
b162af07 2340 SvCUR_set(last_str, SvCUR(last_str) * mincount);
c277df42 2341 /* Add additional parts. */
b81d288d 2342 SvCUR_set(data->last_found,
c277df42
IZ
2343 SvCUR(data->last_found) - l);
2344 sv_catsv(data->last_found, last_str);
0eda9292
JH
2345 {
2346 SV * sv = data->last_found;
2347 MAGIC *mg =
2348 SvUTF8(sv) && SvMAGICAL(sv) ?
2349 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2350 if (mg && mg->mg_len >= 0)
2351 mg->mg_len += CHR_SVLEN(last_str);
2352 }
c277df42
IZ
2353 data->last_end += l * (mincount - 1);
2354 }
2a8d9689
HS
2355 } else {
2356 /* start offset must point into the last copy */
2357 data->last_start_min += minnext * (mincount - 1);
c152dc43
HS
2358 data->last_start_max += is_inf ? I32_MAX
2359 : (maxcount - 1) * (minnext + data->pos_delta);
c277df42
IZ
2360 }
2361 }
2362 /* It is counted once already... */
2363 data->pos_min += minnext * (mincount - counted);
2364 data->pos_delta += - counted * deltanext +
2365 (minnext + deltanext) * maxcount - minnext * mincount;
2366 if (mincount != maxcount) {
653099ff
GS
2367 /* Cannot extend fixed substrings found inside
2368 the group. */
830247a4 2369 scan_commit(pRExC_state,data);
c277df42 2370 if (mincount && last_str) {
d4c19fe8
AL
2371 SV * const sv = data->last_found;
2372 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
45f47268
NC
2373 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2374
2375 if (mg)
2376 mg->mg_len = -1;
2377 sv_setsv(sv, last_str);
c277df42 2378 data->last_end = data->pos_min;
b81d288d 2379 data->last_start_min =
a0ed51b3 2380 data->pos_min - CHR_SVLEN(last_str);
b81d288d
AB
2381 data->last_start_max = is_inf
2382 ? I32_MAX
c277df42 2383 : data->pos_min + data->pos_delta
a0ed51b3 2384 - CHR_SVLEN(last_str);
c277df42
IZ
2385 }
2386 data->longest = &(data->longest_float);
2387 }
aca2d497 2388 SvREFCNT_dec(last_str);
c277df42 2389 }
405ff068 2390 if (data && (fl & SF_HAS_EVAL))
c277df42
IZ
2391 data->flags |= SF_HAS_EVAL;
2392 optimize_curly_tail:
c277df42 2393 if (OP(oscan) != CURLYX) {
22c35a8c 2394 while (PL_regkind[(U8)OP(next = regnext(oscan))] == NOTHING
c277df42
IZ
2395 && NEXT_OFF(next))
2396 NEXT_OFF(oscan) += NEXT_OFF(next);
2397 }
c277df42 2398 continue;
653099ff 2399 default: /* REF and CLUMP only? */
c277df42 2400 if (flags & SCF_DO_SUBSTR) {
830247a4 2401 scan_commit(pRExC_state,data); /* Cannot expect anything... */
c277df42
IZ
2402 data->longest = &(data->longest_float);
2403 }
aca2d497 2404 is_inf = is_inf_internal = 1;
653099ff 2405 if (flags & SCF_DO_STCLASS_OR)
830247a4 2406 cl_anything(pRExC_state, data->start_class);
653099ff 2407 flags &= ~SCF_DO_STCLASS;
c277df42
IZ
2408 break;
2409 }
a0ed51b3 2410 }
bfed75c6 2411 else if (strchr((const char*)PL_simple,OP(scan))) {
9c5ffd7c 2412 int value = 0;
653099ff 2413
c277df42 2414 if (flags & SCF_DO_SUBSTR) {
830247a4 2415 scan_commit(pRExC_state,data);
c277df42
IZ
2416 data->pos_min++;
2417 }
2418 min++;
653099ff
GS
2419 if (flags & SCF_DO_STCLASS) {
2420 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
2421
2422 /* Some of the logic below assumes that switching
2423 locale on will only add false positives. */
2424 switch (PL_regkind[(U8)OP(scan)]) {
653099ff 2425 case SANY:
653099ff
GS
2426 default:
2427 do_default:
2428 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
2429 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
830247a4 2430 cl_anything(pRExC_state, data->start_class);
653099ff
GS
2431 break;
2432 case REG_ANY:
2433 if (OP(scan) == SANY)
2434 goto do_default;
2435 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
2436 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
2437 || (data->start_class->flags & ANYOF_CLASS));
830247a4 2438 cl_anything(pRExC_state, data->start_class);
653099ff
GS
2439 }
2440 if (flags & SCF_DO_STCLASS_AND || !value)
2441 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
2442 break;
2443 case ANYOF:
2444 if (flags & SCF_DO_STCLASS_AND)
2445 cl_and(data->start_class,
2446 (struct regnode_charclass_class*)scan);
2447 else
830247a4 2448 cl_or(pRExC_state, data->start_class,
653099ff
GS
2449 (struct regnode_charclass_class*)scan);
2450 break;
2451 case ALNUM:
2452 if (flags & SCF_DO_STCLASS_AND) {
2453 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2454 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
2455 for (value = 0; value < 256; value++)
2456 if (!isALNUM(value))
2457 ANYOF_BITMAP_CLEAR(data->start_class, value);
2458 }
2459 }
2460 else {
2461 if (data->start_class->flags & ANYOF_LOCALE)
2462 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
2463 else {
2464 for (value = 0; value < 256; value++)
2465 if (isALNUM(value))
b81d288d 2466 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
2467 }
2468 }
2469 break;
2470 case ALNUML:
2471 if (flags & SCF_DO_STCLASS_AND) {
2472 if (data->start_class->flags & ANYOF_LOCALE)
2473 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
2474 }
2475 else {
2476 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
2477 data->start_class->flags |= ANYOF_LOCALE;
2478 }
2479 break;
2480 case NALNUM:
2481 if (flags & SCF_DO_STCLASS_AND) {
2482 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2483 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
2484 for (value = 0; value < 256; value++)
2485 if (isALNUM(value))
2486 ANYOF_BITMAP_CLEAR(data->start_class, value);
2487 }
2488 }
2489 else {
2490 if (data->start_class->flags & ANYOF_LOCALE)
2491 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
2492 else {
2493 for (value = 0; value < 256; value++)
2494 if (!isALNUM(value))
b81d288d 2495 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
2496 }
2497 }
2498 break;
2499 case NALNUML:
2500 if (flags & SCF_DO_STCLASS_AND) {
2501 if (data->start_class->flags & ANYOF_LOCALE)
2502 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
2503 }
2504 else {
2505 data->start_class->flags |= ANYOF_LOCALE;
2506 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
2507 }
2508 break;
2509 case SPACE:
2510 if (flags & SCF_DO_STCLASS_AND) {
2511 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2512 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
2513 for (value = 0; value < 256; value++)
2514 if (!isSPACE(value))
2515 ANYOF_BITMAP_CLEAR(data->start_class, value);
2516 }
2517 }
2518 else {
2519 if (data->start_class->flags & ANYOF_LOCALE)
2520 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
2521 else {
2522 for (value = 0; value < 256; value++)
2523 if (isSPACE(value))
b81d288d 2524 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
2525 }
2526 }
2527 break;
2528 case SPACEL:
2529 if (flags & SCF_DO_STCLASS_AND) {
2530 if (data->start_class->flags & ANYOF_LOCALE)
2531 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
2532 }
2533 else {
2534 data->start_class->flags |= ANYOF_LOCALE;
2535 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
2536 }
2537 break;
2538 case NSPACE:
2539 if (flags & SCF_DO_STCLASS_AND) {
2540 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2541 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
2542 for (value = 0; value < 256; value++)
2543 if (isSPACE(value))
2544 ANYOF_BITMAP_CLEAR(data->start_class, value);
2545 }
2546 }
2547 else {
2548 if (data->start_class->flags & ANYOF_LOCALE)
2549 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
2550 else {
2551 for (value = 0; value < 256; value++)
2552 if (!isSPACE(value))
b81d288d 2553 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
2554 }
2555 }
2556 break;
2557 case NSPACEL:
2558 if (flags & SCF_DO_STCLASS_AND) {
2559 if (data->start_class->flags & ANYOF_LOCALE) {
2560 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
2561 for (value = 0; value < 256; value++)
2562 if (!isSPACE(value))
2563 ANYOF_BITMAP_CLEAR(data->start_class, value);
2564 }
2565 }
2566 else {
2567 data->start_class->flags |= ANYOF_LOCALE;
2568 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
2569 }
2570 break;
2571 case DIGIT:
2572 if (flags & SCF_DO_STCLASS_AND) {
2573 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
2574 for (value = 0; value < 256; value++)
2575 if (!isDIGIT(value))
2576 ANYOF_BITMAP_CLEAR(data->start_class, value);
2577 }
2578 else {
2579 if (data->start_class->flags & ANYOF_LOCALE)
2580 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
2581 else {
2582 for (value = 0; value < 256; value++)
2583 if (isDIGIT(value))
b81d288d 2584 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
2585 }
2586 }
2587 break;
2588 case NDIGIT:
2589 if (flags & SCF_DO_STCLASS_AND) {
2590 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
2591 for (value = 0; value < 256; value++)
2592 if (isDIGIT(value))
2593 ANYOF_BITMAP_CLEAR(data->start_class, value);
2594 }
2595 else {
2596 if (data->start_class->flags & ANYOF_LOCALE)
2597 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
2598 else {
2599 for (value = 0; value < 256; value++)
2600 if (!isDIGIT(value))
b81d288d 2601 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
2602 }
2603 }
2604 break;
2605 }
2606 if (flags & SCF_DO_STCLASS_OR)
2607 cl_and(data->start_class, &and_with);
2608 flags &= ~SCF_DO_STCLASS;
2609 }
a0ed51b3 2610 }
22c35a8c 2611 else if (PL_regkind[(U8)OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
c277df42
IZ
2612 data->flags |= (OP(scan) == MEOL
2613 ? SF_BEFORE_MEOL
2614 : SF_BEFORE_SEOL);
a0ed51b3 2615 }
653099ff
GS
2616 else if ( PL_regkind[(U8)OP(scan)] == BRANCHJ
2617 /* Lookbehind, or need to calculate parens/evals/stclass: */
2618 && (scan->flags || data || (flags & SCF_DO_STCLASS))
c277df42 2619 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
653099ff 2620 /* Lookahead/lookbehind */
cb434fcc 2621 I32 deltanext, minnext, fake = 0;
c277df42 2622 regnode *nscan;
653099ff
GS
2623 struct regnode_charclass_class intrnl;
2624 int f = 0;
c277df42
IZ
2625
2626 data_fake.flags = 0;
b81d288d 2627 if (data) {
2c2d71f5 2628 data_fake.whilem_c = data->whilem_c;
cb434fcc
IZ
2629 data_fake.last_closep = data->last_closep;
2630 }
2631 else
2632 data_fake.last_closep = &fake;
653099ff
GS
2633 if ( flags & SCF_DO_STCLASS && !scan->flags
2634 && OP(scan) == IFMATCH ) { /* Lookahead */
830247a4 2635 cl_init(pRExC_state, &intrnl);
653099ff 2636 data_fake.start_class = &intrnl;
e1901655 2637 f |= SCF_DO_STCLASS_AND;
653099ff 2638 }
e1901655
IZ
2639 if (flags & SCF_WHILEM_VISITED_POS)
2640 f |= SCF_WHILEM_VISITED_POS;
c277df42
IZ
2641 next = regnext(scan);
2642 nscan = NEXTOPER(NEXTOPER(scan));
a3621e74 2643 minnext = study_chunk(pRExC_state, &nscan, &deltanext, last, &data_fake, f,depth+1);
c277df42
IZ
2644 if (scan->flags) {
2645 if (deltanext) {
9baa0206 2646 vFAIL("Variable length lookbehind not implemented");
a0ed51b3
LW
2647 }
2648 else if (minnext > U8_MAX) {
9baa0206 2649 vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
c277df42 2650 }
eb160463 2651 scan->flags = (U8)minnext;
c277df42
IZ
2652 }
2653 if (data && data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
2654 pars++;
405ff068 2655 if (data && (data_fake.flags & SF_HAS_EVAL))
c277df42 2656 data->flags |= SF_HAS_EVAL;
2c2d71f5
JH
2657 if (data)
2658 data->whilem_c = data_fake.whilem_c;
e1901655 2659 if (f & SCF_DO_STCLASS_AND) {
a28509cc 2660 const int was = (data->start_class->flags & ANYOF_EOS);
653099ff
GS
2661
2662 cl_and(data->start_class, &intrnl);
2663 if (was)
2664 data->start_class->flags |= ANYOF_EOS;
2665 }
a0ed51b3
LW
2666 }
2667 else if (OP(scan) == OPEN) {
c277df42 2668 pars++;
a0ed51b3 2669 }
cb434fcc 2670 else if (OP(scan) == CLOSE) {
eb160463 2671 if ((I32)ARG(scan) == is_par) {
cb434fcc 2672 next = regnext(scan);
c277df42 2673
cb434fcc
IZ
2674 if ( next && (OP(next) != WHILEM) && next < last)
2675 is_par = 0; /* Disable optimization */
2676 }
2677 if (data)
2678 *(data->last_closep) = ARG(scan);
a0ed51b3
LW
2679 }
2680 else if (OP(scan) == EVAL) {
c277df42
IZ
2681 if (data)
2682 data->flags |= SF_HAS_EVAL;
2683 }
96776eda 2684 else if (OP(scan) == LOGICAL && scan->flags == 2) { /* Embedded follows */
0f5d15d6 2685 if (flags & SCF_DO_SUBSTR) {
830247a4 2686 scan_commit(pRExC_state,data);
0f5d15d6
IZ
2687 data->longest = &(data->longest_float);
2688 }
2689 is_inf = is_inf_internal = 1;
653099ff 2690 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
830247a4 2691 cl_anything(pRExC_state, data->start_class);
96776eda 2692 flags &= ~SCF_DO_STCLASS;
0f5d15d6 2693 }
c277df42
IZ
2694 /* Else: zero-length, ignore. */
2695 scan = regnext(scan);
2696 }
2697
2698 finish:
2699 *scanp = scan;
aca2d497 2700 *deltap = is_inf_internal ? I32_MAX : delta;
b81d288d 2701 if (flags & SCF_DO_SUBSTR && is_inf)
c277df42
IZ
2702 data->pos_delta = I32_MAX - data->pos_min;
2703 if (is_par > U8_MAX)
2704 is_par = 0;
2705 if (is_par && pars==1 && data) {
2706 data->flags |= SF_IN_PAR;
2707 data->flags &= ~SF_HAS_PAR;
a0ed51b3
LW
2708 }
2709 else if (pars && data) {
c277df42
IZ
2710 data->flags |= SF_HAS_PAR;
2711 data->flags &= ~SF_IN_PAR;
2712 }
653099ff
GS
2713 if (flags & SCF_DO_STCLASS_OR)
2714 cl_and(data->start_class, &and_with);
c277df42
IZ
2715 return min;
2716}
2717
76e3520e 2718STATIC I32
5f66b61c 2719S_add_data(RExC_state_t *pRExC_state, I32 n, const char *s)
c277df42 2720{
830247a4 2721 if (RExC_rx->data) {
b81d288d
AB
2722 Renewc(RExC_rx->data,
2723 sizeof(*RExC_rx->data) + sizeof(void*) * (RExC_rx->data->count + n - 1),
c277df42 2724 char, struct reg_data);
830247a4
IZ
2725 Renew(RExC_rx->data->what, RExC_rx->data->count + n, U8);
2726 RExC_rx->data->count += n;
a0ed51b3
LW
2727 }
2728 else {
a02a5408 2729 Newxc(RExC_rx->data, sizeof(*RExC_rx->data) + sizeof(void*) * (n - 1),
c277df42 2730 char, struct reg_data);
a02a5408 2731 Newx(RExC_rx->data->what, n, U8);
830247a4 2732 RExC_rx->data->count = n;
c277df42 2733 }
830247a4
IZ
2734 Copy(s, RExC_rx->data->what + RExC_rx->data->count - n, n, U8);
2735 return RExC_rx->data->count - n;
c277df42
IZ
2736}
2737
76234dfb 2738#ifndef PERL_IN_XSUB_RE
d88dccdf 2739void
864dbfa3 2740Perl_reginitcolors(pTHX)
d88dccdf 2741{
97aff369 2742 dVAR;
1df70142 2743 const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
d88dccdf 2744 if (s) {
1df70142
AL
2745 char *t = savepv(s);
2746 int i = 0;
2747 PL_colors[0] = t;
d88dccdf 2748 while (++i < 6) {
1df70142
AL
2749 t = strchr(t, '\t');
2750 if (t) {
2751 *t = '\0';
2752 PL_colors[i] = ++t;
d88dccdf
IZ
2753 }
2754 else
1df70142 2755 PL_colors[i] = t = (char *)"";
d88dccdf
IZ
2756 }
2757 } else {
1df70142 2758 int i = 0;
b81d288d 2759 while (i < 6)
06b5626a 2760 PL_colors[i++] = (char *)"";
d88dccdf
IZ
2761 }
2762 PL_colorset = 1;
2763}
76234dfb 2764#endif
8615cb43 2765
a687059c 2766/*
e50aee73 2767 - pregcomp - compile a regular expression into internal code
a687059c
LW
2768 *
2769 * We can't allocate space until we know how big the compiled form will be,
2770 * but we can't compile it (and thus know how big it is) until we've got a
2771 * place to put the code. So we cheat: we compile it twice, once with code
2772 * generation turned off and size counting turned on, and once "for real".
2773 * This also means that we don't allocate space until we are sure that the
2774 * thing really will compile successfully, and we never have to move the
2775 * code and thus invalidate pointers into it. (Note that it has to be in
2776 * one piece because free() must be able to free it all.) [NB: not true in perl]
2777 *
2778 * Beware that the optimization-preparation code in here knows about some
2779 * of the structure of the compiled regexp. [I'll say.]
2780 */
2781regexp *
864dbfa3 2782Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
a687059c 2783{
97aff369 2784 dVAR;
a0d0e21e 2785 register regexp *r;
c277df42 2786 regnode *scan;
c277df42 2787 regnode *first;
a0d0e21e 2788 I32 flags;
a0d0e21e
LW
2789 I32 minlen = 0;
2790 I32 sawplus = 0;
2791 I32 sawopen = 0;
2c2d71f5 2792 scan_data_t data;
830247a4
IZ
2793 RExC_state_t RExC_state;
2794 RExC_state_t *pRExC_state = &RExC_state;
a0d0e21e 2795
a3621e74
YO
2796 GET_RE_DEBUG_FLAGS_DECL;
2797
a0d0e21e 2798 if (exp == NULL)
c277df42 2799 FAIL("NULL regexp argument");
a0d0e21e 2800
a5961de5 2801 RExC_utf8 = pm->op_pmdynflags & PMdf_CMP_UTF8;
a0ed51b3 2802
5cfc7842 2803 RExC_precomp = exp;
a3621e74
YO
2804 DEBUG_r(if (!PL_colorset) reginitcolors());
2805 DEBUG_COMPILE_r({
2806 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s \"%s%*s%s\"\n",
a5961de5
JH
2807 PL_colors[4],PL_colors[5],PL_colors[0],
2808 (int)(xend - exp), RExC_precomp, PL_colors[1]);
2809 });
e2509266 2810 RExC_flags = pm->op_pmflags;
830247a4 2811 RExC_sawback = 0;
bbce6d69 2812
830247a4
IZ
2813 RExC_seen = 0;
2814 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
2815 RExC_seen_evals = 0;
2816 RExC_extralen = 0;
c277df42 2817
bbce6d69 2818 /* First pass: determine size, legality. */
830247a4 2819 RExC_parse = exp;
fac92740 2820 RExC_start = exp;
830247a4
IZ
2821 RExC_end = xend;
2822 RExC_naughty = 0;
2823 RExC_npar = 1;
2824 RExC_size = 0L;
2825 RExC_emit = &PL_regdummy;
2826 RExC_whilem_seen = 0;
85ddcde9
JH
2827#if 0 /* REGC() is (currently) a NOP at the first pass.
2828 * Clever compilers notice this and complain. --jhi */
830247a4 2829 REGC((U8)REG_MAGIC, (char*)RExC_emit);
85ddcde9 2830#endif
830247a4 2831 if (reg(pRExC_state, 0, &flags) == NULL) {
c445ea15 2832 RExC_precomp = NULL;
a0d0e21e
LW
2833 return(NULL);
2834 }
a3621e74 2835 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "size %"IVdf" ", (IV)RExC_size));
c277df42 2836
c277df42
IZ
2837 /* Small enough for pointer-storage convention?
2838 If extralen==0, this means that we will not need long jumps. */
830247a4
IZ
2839 if (RExC_size >= 0x10000L && RExC_extralen)
2840 RExC_size += RExC_extralen;
c277df42 2841 else
830247a4
IZ
2842 RExC_extralen = 0;
2843 if (RExC_whilem_seen > 15)
2844 RExC_whilem_seen = 15;
a0d0e21e 2845
bbce6d69 2846 /* Allocate space and initialize. */
a02a5408 2847 Newxc(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode),
c277df42 2848 char, regexp);
a0d0e21e 2849 if (r == NULL)
b45f050a
JF
2850 FAIL("Regexp out of space");
2851
0f79a09d
GS
2852#ifdef DEBUGGING
2853 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
830247a4 2854 Zero(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode), char);
0f79a09d 2855#endif
c277df42 2856 r->refcnt = 1;
bbce6d69 2857 r->prelen = xend - exp;
5cfc7842 2858 r->precomp = savepvn(RExC_precomp, r->prelen);
cf93c79d 2859 r->subbeg = NULL;
f8c7b90f 2860#ifdef PERL_OLD_COPY_ON_WRITE
c445ea15 2861 r->saved_copy = NULL;
ed252734 2862#endif
cf93c79d 2863 r->reganch = pm->op_pmflags & PMf_COMPILETIME;
830247a4 2864 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
38d1b06f 2865 r->lastparen = 0; /* mg.c reads this. */
4327152a
IZ
2866
2867 r->substrs = 0; /* Useful during FAIL. */
2868 r->startp = 0; /* Useful during FAIL. */
2869 r->endp = 0; /* Useful during FAIL. */
2870
a02a5408 2871 Newxz(r->offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
fac92740 2872 if (r->offsets) {
2af232bd 2873 r->offsets[0] = RExC_size;
fac92740 2874 }
a3621e74 2875 DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
2af232bd
SS
2876 "%s %"UVuf" bytes for offset annotations.\n",
2877 r->offsets ? "Got" : "Couldn't get",
392fbf5d 2878 (UV)((2*RExC_size+1) * sizeof(U32))));
fac92740 2879
830247a4 2880 RExC_rx = r;
bbce6d69 2881
2882 /* Second pass: emit code. */
e2509266 2883 RExC_flags = pm->op_pmflags; /* don't let top level (?i) bleed */
830247a4
IZ
2884 RExC_parse = exp;
2885 RExC_end = xend;
2886 RExC_naughty = 0;
2887 RExC_npar = 1;
fac92740 2888 RExC_emit_start = r->program;
830247a4 2889 RExC_emit = r->program;
2cd61cdb 2890 /* Store the count of eval-groups for security checks: */
eb160463 2891 RExC_emit->next_off = (U16)((RExC_seen_evals > U16_MAX) ? U16_MAX : RExC_seen_evals);
830247a4 2892 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
c277df42 2893 r->data = 0;
830247a4 2894 if (reg(pRExC_state, 0, &flags) == NULL)
a0d0e21e
LW
2895 return(NULL);
2896
a3621e74 2897
a0d0e21e 2898 /* Dig out information for optimizations. */
cf93c79d 2899 r->reganch = pm->op_pmflags & PMf_COMPILETIME; /* Again? */
e2509266 2900 pm->op_pmflags = RExC_flags;
a0ed51b3 2901 if (UTF)
5ff6fc6d 2902 r->reganch |= ROPT_UTF8; /* Unicode in it? */
c277df42 2903 r->regstclass = NULL;
830247a4 2904 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
a0ed51b3 2905 r->reganch |= ROPT_NAUGHTY;
c277df42 2906 scan = r->program + 1; /* First BRANCH. */
2779dcf1
IZ
2907
2908 /* XXXX To minimize changes to RE engine we always allocate
2909 3-units-long substrs field. */
a02a5408 2910 Newxz(r->substrs, 1, struct reg_substr_data);
2779dcf1 2911
2c2d71f5 2912 StructCopy(&zero_scan_data, &data, scan_data_t);
653099ff 2913 /* XXXX Should not we check for something else? Usually it is OPEN1... */
c277df42 2914 if (OP(scan) != BRANCH) { /* Only one top-level choice. */
c277df42 2915 I32 fake;
c5254dd6 2916 STRLEN longest_float_length, longest_fixed_length;
653099ff
GS
2917 struct regnode_charclass_class ch_class;
2918 int stclass_flag;
cb434fcc 2919 I32 last_close = 0;
a0d0e21e
LW
2920
2921 first = scan;
c277df42 2922 /* Skip introductions and multiplicators >= 1. */
a0d0e21e 2923 while ((OP(first) == OPEN && (sawopen = 1)) ||
653099ff 2924 /* An OR of *one* alternative - should not happen now. */
a0d0e21e
LW
2925 (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) ||
2926 (OP(first) == PLUS) ||
2927 (OP(first) == MINMOD) ||
653099ff 2928 /* An {n,m} with n>0 */
22c35a8c 2929 (PL_regkind[(U8)OP(first)] == CURLY && ARG1(first) > 0) ) {
a0d0e21e
LW
2930 if (OP(first) == PLUS)
2931 sawplus = 1;
2932 else
2933 first += regarglen[(U8)OP(first)];
2934 first = NEXTOPER(first);
a687059c
LW
2935 }
2936
a0d0e21e
LW
2937 /* Starting-point info. */
2938 again:
653099ff 2939 if (PL_regkind[(U8)OP(first)] == EXACT) {
1aa99e6b 2940 if (OP(first) == EXACT)
6f207bd3 2941 NOOP; /* Empty, get anchored substr later. */
1aa99e6b 2942 else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
b3c9acc1
IZ
2943 r->regstclass = first;
2944 }
bfed75c6 2945 else if (strchr((const char*)PL_simple,OP(first)))
a0d0e21e 2946 r->regstclass = first;
22c35a8c
GS
2947 else if (PL_regkind[(U8)OP(first)] == BOUND ||
2948 PL_regkind[(U8)OP(first)] == NBOUND)
a0d0e21e 2949 r->regstclass = first;
22c35a8c 2950 else if (PL_regkind[(U8)OP(first)] == BOL) {
cad2e5aa
JH
2951 r->reganch |= (OP(first) == MBOL
2952 ? ROPT_ANCH_MBOL
2953 : (OP(first) == SBOL
2954 ? ROPT_ANCH_SBOL
2955 : ROPT_ANCH_BOL));
a0d0e21e 2956 first = NEXTOPER(first);
774d564b 2957 goto again;
2958 }
2959 else if (OP(first) == GPOS) {
2960 r->reganch |= ROPT_ANCH_GPOS;
2961 first = NEXTOPER(first);
2962 goto again;
a0d0e21e 2963 }
e09294f4 2964 else if (!sawopen && (OP(first) == STAR &&
22c35a8c 2965 PL_regkind[(U8)OP(NEXTOPER(first))] == REG_ANY) &&
a0d0e21e
LW
2966 !(r->reganch & ROPT_ANCH) )
2967 {
2968 /* turn .* into ^.* with an implied $*=1 */
1df70142
AL
2969 const int type =
2970 (OP(NEXTOPER(first)) == REG_ANY)
2971 ? ROPT_ANCH_MBOL
2972 : ROPT_ANCH_SBOL;
cad2e5aa 2973 r->reganch |= type | ROPT_IMPLICIT;
a0d0e21e 2974 first = NEXTOPER(first);
774d564b 2975 goto again;
a0d0e21e 2976 }
b81d288d 2977 if (sawplus && (!sawopen || !RExC_sawback)
830247a4 2978 && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
cad2e5aa
JH
2979 /* x+ must match at the 1st pos of run of x's */
2980 r->reganch |= ROPT_SKIP;
a0d0e21e 2981
c277df42 2982 /* Scan is after the zeroth branch, first is atomic matcher. */
a3621e74 2983 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
b900a521 2984 (IV)(first - scan + 1)));
a0d0e21e
LW
2985 /*
2986 * If there's something expensive in the r.e., find the
2987 * longest literal string that must appear and make it the
2988 * regmust. Resolve ties in favor of later strings, since
2989 * the regstart check works with the beginning of the r.e.
2990 * and avoiding duplication strengthens checking. Not a
2991 * strong reason, but sufficient in the absence of others.
2992 * [Now we resolve ties in favor of the earlier string if
c277df42 2993 * it happens that c_offset_min has been invalidated, since the
a0d0e21e
LW
2994 * earlier string may buy us something the later one won't.]
2995 */
a0d0e21e 2996 minlen = 0;
a687059c 2997
396482e1
GA
2998 data.longest_fixed = newSVpvs("");
2999 data.longest_float = newSVpvs("");
3000 data.last_found = newSVpvs("");
c277df42
IZ
3001 data.longest = &(data.longest_fixed);
3002 first = scan;
653099ff 3003 if (!r->regstclass) {
830247a4 3004 cl_init(pRExC_state, &ch_class);
653099ff
GS
3005 data.start_class = &ch_class;
3006 stclass_flag = SCF_DO_STCLASS_AND;
3007 } else /* XXXX Check for BOUND? */
3008 stclass_flag = 0;
cb434fcc 3009 data.last_closep = &last_close;
653099ff 3010
830247a4 3011 minlen = study_chunk(pRExC_state, &first, &fake, scan + RExC_size, /* Up to end */
a3621e74 3012 &data, SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
830247a4 3013 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
b81d288d 3014 && data.last_start_min == 0 && data.last_end > 0
830247a4
IZ
3015 && !RExC_seen_zerolen
3016 && (!(RExC_seen & REG_SEEN_GPOS) || (r->reganch & ROPT_ANCH_GPOS)))
c277df42 3017 r->reganch |= ROPT_CHECK_ALL;
830247a4 3018 scan_commit(pRExC_state, &data);
c277df42
IZ
3019 SvREFCNT_dec(data.last_found);
3020
a0ed51b3 3021 longest_float_length = CHR_SVLEN(data.longest_float);
c5254dd6 3022 if (longest_float_length
c277df42
IZ
3023 || (data.flags & SF_FL_BEFORE_EOL
3024 && (!(data.flags & SF_FL_BEFORE_MEOL)
e2509266 3025 || (RExC_flags & PMf_MULTILINE)))) {
cf93c79d
IZ
3026 int t;
3027
a0ed51b3 3028 if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
aca2d497
IZ
3029 && data.offset_fixed == data.offset_float_min
3030 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
3031 goto remove_float; /* As in (a)+. */
3032
33b8afdf
JH
3033 if (SvUTF8(data.longest_float)) {
3034 r->float_utf8 = data.longest_float;
c445ea15 3035 r->float_substr = NULL;
33b8afdf
JH
3036 } else {
3037 r->float_substr = data.longest_float;
c445ea15 3038 r->float_utf8 = NULL;
33b8afdf 3039 }
c277df42
IZ
3040 r->float_min_offset = data.offset_float_min;
3041 r->float_max_offset = data.offset_float_max;
cf93c79d
IZ
3042 t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
3043 && (!(data.flags & SF_FL_BEFORE_MEOL)
e2509266 3044 || (RExC_flags & PMf_MULTILINE)));
33b8afdf 3045 fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
a0ed51b3
LW
3046 }
3047 else {
aca2d497 3048 remove_float:
c445ea15 3049 r->float_substr = r->float_utf8 = NULL;
c277df42 3050 SvREFCNT_dec(data.longest_float);
c5254dd6 3051 longest_float_length = 0;
a0d0e21e 3052 }
c277df42 3053
a0ed51b3 3054 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
c5254dd6 3055 if (longest_fixed_length
c277df42
IZ
3056 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
3057 && (!(data.flags & SF_FIX_BEFORE_MEOL)
e2509266 3058 || (RExC_flags & PMf_MULTILINE)))) {
cf93c79d
IZ
3059 int t;
3060
33b8afdf
JH
3061 if (SvUTF8(data.longest_fixed)) {
3062 r->anchored_utf8 = data.longest_fixed;
c445ea15 3063 r->anchored_substr = NULL;
33b8afdf
JH
3064 } else {
3065 r->anchored_substr = data.longest_fixed;
c445ea15 3066 r->anchored_utf8 = NULL;
33b8afdf 3067 }
c277df42 3068 r->anchored_offset = data.offset_fixed;
cf93c79d
IZ
3069 t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
3070 && (!(data.flags & SF_FIX_BEFORE_MEOL)
e2509266 3071 || (RExC_flags & PMf_MULTILINE)));
33b8afdf 3072 fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
a0ed51b3
LW
3073 }
3074 else {
c445ea15 3075 r->anchored_substr = r->anchored_utf8 = NULL;
c277df42 3076 SvREFCNT_dec(data.longest_fixed);
c5254dd6 3077 longest_fixed_length = 0;
a0d0e21e 3078 }
b81d288d 3079 if (r->regstclass
ffc61ed2 3080 && (OP(r->regstclass) == REG_ANY || OP(r->regstclass) == SANY))
653099ff 3081 r->regstclass = NULL;
33b8afdf
JH
3082 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
3083 && stclass_flag
653099ff 3084 && !(data.start_class->flags & ANYOF_EOS)
eb160463
GS
3085 && !cl_is_anything(data.start_class))
3086 {
1df70142 3087 const I32 n = add_data(pRExC_state, 1, "f");
653099ff 3088
a02a5408 3089 Newx(RExC_rx->data->data[n], 1,
653099ff
GS
3090 struct regnode_charclass_class);
3091 StructCopy(data.start_class,
830247a4 3092 (struct regnode_charclass_class*)RExC_rx->data->data[n],
653099ff 3093 struct regnode_charclass_class);
830247a4 3094 r->regstclass = (regnode*)RExC_rx->data->data[n];
653099ff 3095 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
a3621e74 3096 DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
32fc9b6a 3097 regprop(r, sv, (regnode*)data.start_class);
9c5ffd7c 3098 PerlIO_printf(Perl_debug_log,
a0288114 3099 "synthetic stclass \"%s\".\n",
3f7c398e 3100 SvPVX_const(sv));});
653099ff 3101 }
c277df42
IZ
3102
3103 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
c5254dd6 3104 if (longest_fixed_length > longest_float_length) {
c277df42 3105 r->check_substr = r->anchored_substr;
33b8afdf 3106 r->check_utf8 = r->anchored_utf8;
c277df42
IZ
3107 r->check_offset_min = r->check_offset_max = r->anchored_offset;
3108 if (r->reganch & ROPT_ANCH_SINGLE)
3109 r->reganch |= ROPT_NOSCAN;
a0ed51b3
LW
3110 }
3111 else {
c277df42 3112 r->check_substr = r->float_substr;
33b8afdf 3113 r->check_utf8 = r->float_utf8;
c277df42
IZ
3114 r->check_offset_min = data.offset_float_min;
3115 r->check_offset_max = data.offset_float_max;
a0d0e21e 3116 }
30382c73
IZ
3117 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
3118 This should be changed ASAP! */
33b8afdf 3119 if ((r->check_substr || r->check_utf8) && !(r->reganch & ROPT_ANCH_GPOS)) {
cad2e5aa 3120 r->reganch |= RE_USE_INTUIT;
33b8afdf 3121 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
cad2e5aa
JH
3122 r->reganch |= RE_INTUIT_TAIL;
3123 }
a0ed51b3
LW
3124 }
3125 else {
c277df42
IZ
3126 /* Several toplevels. Best we can is to set minlen. */
3127 I32 fake;
653099ff 3128 struct regnode_charclass_class ch_class;
cb434fcc 3129 I32 last_close = 0;
c277df42 3130
a3621e74 3131 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "\n"));
c277df42 3132 scan = r->program + 1;
830247a4 3133 cl_init(pRExC_state, &ch_class);
653099ff 3134 data.start_class = &ch_class;
cb434fcc 3135 data.last_closep = &last_close;
a3621e74 3136 minlen = study_chunk(pRExC_state, &scan, &fake, scan + RExC_size, &data, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
33b8afdf 3137 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
c445ea15 3138 = r->float_substr = r->float_utf8 = NULL;
653099ff 3139 if (!(data.start_class->flags & ANYOF_EOS)
eb160463
GS
3140 && !cl_is_anything(data.start_class))
3141 {
1df70142 3142 const I32 n = add_data(pRExC_state, 1, "f");
653099ff 3143
a02a5408 3144 Newx(RExC_rx->data->data[n], 1,
653099ff
GS
3145 struct regnode_charclass_class);
3146 StructCopy(data.start_class,
830247a4 3147 (struct regnode_charclass_class*)RExC_rx->data->data[n],
653099ff 3148 struct regnode_charclass_class);
830247a4 3149 r->regstclass = (regnode*)RExC_rx->data->data[n];
653099ff 3150 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
a3621e74 3151 DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
32fc9b6a 3152 regprop(r, sv, (regnode*)data.start_class);
9c5ffd7c 3153 PerlIO_printf(Perl_debug_log,
a0288114 3154 "synthetic stclass \"%s\".\n",
3f7c398e 3155 SvPVX_const(sv));});
653099ff 3156 }
a0d0e21e
LW
3157 }
3158
a0d0e21e 3159 r->minlen = minlen;
b81d288d 3160 if (RExC_seen & REG_SEEN_GPOS)
c277df42 3161 r->reganch |= ROPT_GPOS_SEEN;
830247a4 3162 if (RExC_seen & REG_SEEN_LOOKBEHIND)
c277df42 3163 r->reganch |= ROPT_LOOKBEHIND_SEEN;
830247a4 3164 if (RExC_seen & REG_SEEN_EVAL)
ce862d02 3165 r->reganch |= ROPT_EVAL_SEEN;
f33976b4
DB
3166 if (RExC_seen & REG_SEEN_CANY)
3167 r->reganch |= ROPT_CANY_SEEN;
a02a5408
JC
3168 Newxz(r->startp, RExC_npar, I32);
3169 Newxz(r->endp, RExC_npar, I32);
a3621e74 3170 DEBUG_COMPILE_r(regdump(r));
a0d0e21e 3171 return(r);
a687059c
LW
3172}
3173
3174/*
3175 - reg - regular expression, i.e. main body or parenthesized thing
3176 *
3177 * Caller must absorb opening parenthesis.
3178 *
3179 * Combining parenthesis handling with the base level of regular expression
3180 * is a trifle forced, but the need to tie the tails of the branches to what
3181 * follows makes it hard to avoid.
3182 */
76e3520e 3183STATIC regnode *
830247a4 3184S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp)
c277df42 3185 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
a687059c 3186{
27da23d5 3187 dVAR;
c277df42
IZ
3188 register regnode *ret; /* Will be the head of the group. */
3189 register regnode *br;
3190 register regnode *lastbr;
cbbf8932 3191 register regnode *ender = NULL;
a0d0e21e 3192 register I32 parno = 0;
cbbf8932
AL
3193 I32 flags;
3194 const I32 oregflags = RExC_flags;
6136c704
AL
3195 bool have_branch = 0;
3196 bool is_open = 0;
9d1d55b5
JP
3197
3198 /* for (?g), (?gc), and (?o) warnings; warning
3199 about (?c) will warn about (?g) -- japhy */
3200
6136c704
AL
3201#define WASTED_O 0x01
3202#define WASTED_G 0x02
3203#define WASTED_C 0x04
3204#define WASTED_GC (0x02|0x04)
cbbf8932 3205 I32 wastedflags = 0x00;
9d1d55b5 3206
fac92740 3207 char * parse_start = RExC_parse; /* MJD */
a28509cc 3208 char * const oregcomp_parse = RExC_parse;
a0d0e21e 3209
821b33a5 3210 *flagp = 0; /* Tentatively. */
a0d0e21e 3211
9d1d55b5 3212
a0d0e21e
LW
3213 /* Make an OPEN node, if parenthesized. */
3214 if (paren) {
fac92740 3215 if (*RExC_parse == '?') { /* (?...) */
2b36a5a0
JH
3216 U32 posflags = 0, negflags = 0;
3217 U32 *flagsp = &posflags;
6136c704 3218 bool is_logical = 0;
a28509cc 3219 const char * const seqstart = RExC_parse;
ca9dfc88 3220
830247a4
IZ
3221 RExC_parse++;
3222 paren = *RExC_parse++;
c277df42 3223 ret = NULL; /* For look-ahead/behind. */
a0d0e21e 3224 switch (paren) {
fac92740 3225 case '<': /* (?<...) */
830247a4 3226 RExC_seen |= REG_SEEN_LOOKBEHIND;
b81d288d 3227 if (*RExC_parse == '!')
c277df42 3228 paren = ',';
b81d288d 3229 if (*RExC_parse != '=' && *RExC_parse != '!')
c277df42 3230 goto unknown;
830247a4 3231 RExC_parse++;
fac92740
MJD
3232 case '=': /* (?=...) */
3233 case '!': /* (?!...) */
830247a4 3234 RExC_seen_zerolen++;
fac92740
MJD
3235 case ':': /* (?:...) */
3236 case '>': /* (?>...) */
a0d0e21e 3237 break;
fac92740
MJD
3238 case '$': /* (?$...) */
3239 case '@': /* (?@...) */
8615cb43 3240 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
a0d0e21e 3241 break;
fac92740 3242 case '#': /* (?#...) */
830247a4
IZ
3243 while (*RExC_parse && *RExC_parse != ')')
3244 RExC_parse++;
3245 if (*RExC_parse != ')')
c277df42 3246 FAIL("Sequence (?#... not terminated");
830247a4 3247 nextchar(pRExC_state);
a0d0e21e
LW
3248 *flagp = TRYAGAIN;
3249 return NULL;
fac92740 3250 case 'p': /* (?p...) */
9014280d 3251 if (SIZE_ONLY && ckWARN2(WARN_DEPRECATED, WARN_REGEXP))
f9373011 3252 vWARNdep(RExC_parse, "(?p{}) is deprecated - use (??{})");
8c8ad484 3253 /* FALL THROUGH*/
fac92740 3254 case '?': /* (??...) */
6136c704 3255 is_logical = 1;
438a3801
YST
3256 if (*RExC_parse != '{')
3257 goto unknown;
830247a4 3258 paren = *RExC_parse++;
0f5d15d6 3259 /* FALL THROUGH */
fac92740 3260 case '{': /* (?{...}) */
c277df42 3261 {
c277df42
IZ
3262 I32 count = 1, n = 0;
3263 char c;
830247a4 3264 char *s = RExC_parse;
c277df42 3265
830247a4
IZ
3266 RExC_seen_zerolen++;
3267 RExC_seen |= REG_SEEN_EVAL;
3268 while (count && (c = *RExC_parse)) {
6136c704
AL
3269 if (c == '\\') {
3270 if (RExC_parse[1])
3271 RExC_parse++;
3272 }
b81d288d 3273 else if (c == '{')
c277df42 3274 count++;
b81d288d 3275 else if (c == '}')
c277df42 3276 count--;
830247a4 3277 RExC_parse++;
c277df42 3278 }
6136c704 3279 if (*RExC_parse != ')') {
b81d288d 3280 RExC_parse = s;
b45f050a
JF
3281 vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
3282 }
c277df42 3283 if (!SIZE_ONLY) {
f3548bdc 3284 PAD *pad;
6136c704
AL
3285 OP_4tree *sop, *rop;
3286 SV * const sv = newSVpvn(s, RExC_parse - 1 - s);
c277df42 3287
569233ed
SB
3288 ENTER;
3289 Perl_save_re_context(aTHX);
f3548bdc 3290 rop = sv_compile_2op(sv, &sop, "re", &pad);
9b978d73
DM
3291 sop->op_private |= OPpREFCOUNTED;
3292 /* re_dup will OpREFCNT_inc */
3293 OpREFCNT_set(sop, 1);
569233ed 3294 LEAVE;
c277df42 3295
830247a4
IZ
3296 n = add_data(pRExC_state, 3, "nop");
3297 RExC_rx->data->data[n] = (void*)rop;
3298 RExC_rx->data->data[n+1] = (void*)sop;
f3548bdc 3299 RExC_rx->data->data[n+2] = (void*)pad;
c277df42 3300 SvREFCNT_dec(sv);
a0ed51b3 3301 }
e24b16f9 3302 else { /* First pass */
830247a4 3303 if (PL_reginterp_cnt < ++RExC_seen_evals
923e4eb5 3304 && IN_PERL_RUNTIME)
2cd61cdb
IZ
3305 /* No compiled RE interpolated, has runtime
3306 components ===> unsafe. */
3307 FAIL("Eval-group not allowed at runtime, use re 'eval'");
5b61d3f7 3308 if (PL_tainting && PL_tainted)
cc6b7395 3309 FAIL("Eval-group in insecure regular expression");
54df2634 3310#if PERL_VERSION > 8
923e4eb5 3311 if (IN_PERL_COMPILETIME)
b5c19bd7 3312 PL_cv_has_eval = 1;
54df2634 3313#endif
c277df42 3314 }
b5c19bd7 3315
830247a4 3316 nextchar(pRExC_state);
6136c704 3317 if (is_logical) {
830247a4 3318 ret = reg_node(pRExC_state, LOGICAL);
0f5d15d6
IZ
3319 if (!SIZE_ONLY)
3320 ret->flags = 2;
830247a4 3321 regtail(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
fac92740 3322 /* deal with the length of this later - MJD */
0f5d15d6
IZ
3323 return ret;
3324 }
ccb2c380
MP
3325 ret = reganode(pRExC_state, EVAL, n);
3326 Set_Node_Length(ret, RExC_parse - parse_start + 1);
3327 Set_Node_Offset(ret, parse_start);
3328 return ret;
c277df42 3329 }
fac92740 3330 case '(': /* (?(?{...})...) and (?(?=...)...) */
c277df42 3331 {
fac92740 3332 if (RExC_parse[0] == '?') { /* (?(?...)) */
b81d288d
AB
3333 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
3334 || RExC_parse[1] == '<'
830247a4 3335 || RExC_parse[1] == '{') { /* Lookahead or eval. */
c277df42
IZ
3336 I32 flag;
3337
830247a4 3338 ret = reg_node(pRExC_state, LOGICAL);
0f5d15d6
IZ
3339 if (!SIZE_ONLY)
3340 ret->flags = 1;
830247a4 3341 regtail(pRExC_state, ret, reg(pRExC_state, 1, &flag));
c277df42 3342 goto insert_if;
b81d288d 3343 }
a0ed51b3 3344 }
830247a4 3345 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
fac92740 3346 /* (?(1)...) */
6136c704 3347 char c;
830247a4 3348 parno = atoi(RExC_parse++);
c277df42 3349
830247a4
IZ
3350 while (isDIGIT(*RExC_parse))
3351 RExC_parse++;
fac92740 3352 ret = reganode(pRExC_state, GROUPP, parno);
2af232bd 3353
830247a4 3354 if ((c = *nextchar(pRExC_state)) != ')')
b45f050a 3355 vFAIL("Switch condition not recognized");
c277df42 3356 insert_if:
830247a4
IZ
3357 regtail(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
3358 br = regbranch(pRExC_state, &flags, 1);
c277df42 3359 if (br == NULL)
830247a4 3360 br = reganode(pRExC_state, LONGJMP, 0);
c277df42 3361 else
830247a4
IZ
3362 regtail(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
3363 c = *nextchar(pRExC_state);
d1b80229
IZ
3364 if (flags&HASWIDTH)
3365 *flagp |= HASWIDTH;
c277df42 3366 if (c == '|') {
830247a4
IZ
3367 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
3368 regbranch(pRExC_state, &flags, 1);
3369 regtail(pRExC_state, ret, lastbr);
d1b80229
IZ
3370 if (flags&HASWIDTH)
3371 *flagp |= HASWIDTH;
830247a4 3372 c = *nextchar(pRExC_state);
a0ed51b3
LW
3373 }
3374 else
c277df42
IZ
3375 lastbr = NULL;
3376 if (c != ')')
8615cb43 3377 vFAIL("Switch (?(condition)... contains too many branches");
830247a4
IZ
3378 ender = reg_node(pRExC_state, TAIL);
3379 regtail(pRExC_state, br, ender);
c277df42 3380 if (lastbr) {
830247a4
IZ
3381 regtail(pRExC_state, lastbr, ender);
3382 regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
a0ed51b3
LW
3383 }
3384 else
830247a4 3385 regtail(pRExC_state, ret, ender);
c277df42 3386 return ret;
a0ed51b3
LW
3387 }
3388 else {
830247a4 3389 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
c277df42
IZ
3390 }
3391 }
1b1626e4 3392 case 0:
830247a4 3393 RExC_parse--; /* for vFAIL to print correctly */
8615cb43 3394 vFAIL("Sequence (? incomplete");
1b1626e4 3395 break;
a0d0e21e 3396 default:
830247a4 3397 --RExC_parse;
fac92740 3398 parse_flags: /* (?i) */
830247a4 3399 while (*RExC_parse && strchr("iogcmsx", *RExC_parse)) {
9d1d55b5
JP
3400 /* (?g), (?gc) and (?o) are useless here
3401 and must be globally applied -- japhy */
3402
3403 if (*RExC_parse == 'o' || *RExC_parse == 'g') {
3404 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6136c704 3405 const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G;
9d1d55b5
JP
3406 if (! (wastedflags & wflagbit) ) {
3407 wastedflags |= wflagbit;
3408 vWARN5(
3409 RExC_parse + 1,
3410 "Useless (%s%c) - %suse /%c modifier",
3411 flagsp == &negflags ? "?-" : "?",
3412 *RExC_parse,
3413 flagsp == &negflags ? "don't " : "",
3414 *RExC_parse
3415 );
3416 }
3417 }
3418 }
3419 else if (*RExC_parse == 'c') {
3420 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6136c704
AL
3421 if (! (wastedflags & WASTED_C) ) {
3422 wastedflags |= WASTED_GC;
9d1d55b5
JP
3423 vWARN3(
3424 RExC_parse + 1,
3425 "Useless (%sc) - %suse /gc modifier",
3426 flagsp == &negflags ? "?-" : "?",
3427 flagsp == &negflags ? "don't " : ""
3428 );
3429 }
3430 }
3431 }
3432 else { pmflag(flagsp, *RExC_parse); }
3433
830247a4 3434 ++RExC_parse;
ca9dfc88 3435 }
830247a4 3436 if (*RExC_parse == '-') {
ca9dfc88 3437 flagsp = &negflags;
9d1d55b5 3438 wastedflags = 0; /* reset so (?g-c) warns twice */
830247a4 3439 ++RExC_parse;
ca9dfc88 3440 goto parse_flags;
48c036b1 3441 }
e2509266
JH
3442 RExC_flags |= posflags;
3443 RExC_flags &= ~negflags;
830247a4
IZ
3444 if (*RExC_parse == ':') {
3445 RExC_parse++;
ca9dfc88
IZ
3446 paren = ':';
3447 break;
3448 }
c277df42 3449 unknown:
830247a4
IZ
3450 if (*RExC_parse != ')') {
3451 RExC_parse++;
3452 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
b45f050a 3453 }
830247a4 3454 nextchar(pRExC_state);
a0d0e21e
LW
3455 *flagp = TRYAGAIN;
3456 return NULL;
3457 }
3458 }
fac92740 3459 else { /* (...) */
830247a4
IZ
3460 parno = RExC_npar;
3461 RExC_npar++;
3462 ret = reganode(pRExC_state, OPEN, parno);
fac92740
MJD
3463 Set_Node_Length(ret, 1); /* MJD */
3464 Set_Node_Offset(ret, RExC_parse); /* MJD */
6136c704 3465 is_open = 1;
a0d0e21e 3466 }
a0ed51b3 3467 }
fac92740 3468 else /* ! paren */
a0d0e21e
LW
3469 ret = NULL;
3470
3471 /* Pick up the branches, linking them together. */
fac92740 3472 parse_start = RExC_parse; /* MJD */
830247a4 3473 br = regbranch(pRExC_state, &flags, 1);
fac92740 3474 /* branch_len = (paren != 0); */
2af232bd 3475
a0d0e21e
LW
3476 if (br == NULL)
3477 return(NULL);
830247a4
IZ
3478 if (*RExC_parse == '|') {
3479 if (!SIZE_ONLY && RExC_extralen) {
3480 reginsert(pRExC_state, BRANCHJ, br);
a0ed51b3 3481 }
fac92740 3482 else { /* MJD */
830247a4 3483 reginsert(pRExC_state, BRANCH, br);
fac92740
MJD
3484 Set_Node_Length(br, paren != 0);
3485 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
3486 }
c277df42
IZ
3487 have_branch = 1;
3488 if (SIZE_ONLY)
830247a4 3489 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
a0ed51b3
LW
3490 }
3491 else if (paren == ':') {
c277df42
IZ
3492 *flagp |= flags&SIMPLE;
3493 }
6136c704 3494 if (is_open) { /* Starts with OPEN. */
830247a4 3495 regtail(pRExC_state, ret, br); /* OPEN -> first. */
a0ed51b3
LW
3496 }
3497 else if (paren != '?') /* Not Conditional */
a0d0e21e 3498 ret = br;
32a0ca98 3499 *flagp |= flags & (SPSTART | HASWIDTH);
c277df42 3500 lastbr = br;
830247a4
IZ
3501 while (*RExC_parse == '|') {
3502 if (!SIZE_ONLY && RExC_extralen) {
3503 ender = reganode(pRExC_state, LONGJMP,0);
3504 regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
c277df42
IZ
3505 }
3506 if (SIZE_ONLY)
830247a4
IZ
3507 RExC_extralen += 2; /* Account for LONGJMP. */
3508 nextchar(pRExC_state);
3509 br = regbranch(pRExC_state, &flags, 0);
2af232bd 3510
a687059c 3511 if (br == NULL)
a0d0e21e 3512 return(NULL);
830247a4 3513 regtail(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
c277df42 3514 lastbr = br;
821b33a5
IZ
3515 if (flags&HASWIDTH)
3516 *flagp |= HASWIDTH;
a687059c 3517 *flagp |= flags&SPSTART;
a0d0e21e
LW
3518 }
3519
c277df42
IZ
3520 if (have_branch || paren != ':') {
3521 /* Make a closing node, and hook it on the end. */
3522 switch (paren) {
3523 case ':':
830247a4 3524 ender = reg_node(pRExC_state, TAIL);
c277df42
IZ
3525 break;
3526 case 1:
830247a4 3527 ender = reganode(pRExC_state, CLOSE, parno);
fac92740
MJD
3528 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
3529 Set_Node_Length(ender,1); /* MJD */
c277df42
IZ
3530 break;
3531 case '<':
c277df42
IZ
3532 case ',':
3533 case '=':
3534 case '!':
c277df42 3535 *flagp &= ~HASWIDTH;
821b33a5
IZ
3536 /* FALL THROUGH */
3537 case '>':
830247a4 3538 ender = reg_node(pRExC_state, SUCCEED);
c277df42
IZ
3539 break;
3540 case 0:
830247a4 3541 ender = reg_node(pRExC_state, END);
c277df42
IZ
3542 break;
3543 }
830247a4 3544 regtail(pRExC_state, lastbr, ender);
a0d0e21e 3545
9674d46a 3546 if (have_branch && !SIZE_ONLY) {
c277df42 3547 /* Hook the tails of the branches to the closing node. */
9674d46a
AL
3548 for (br = ret; br; br = regnext(br)) {
3549 const U8 op = PL_regkind[OP(br)];
3550 if (op == BRANCH) {
3551 regtail(pRExC_state, NEXTOPER(br), ender);
3552 }
3553 else if (op == BRANCHJ) {
3554 regtail(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
3555 }
c277df42
IZ
3556 }
3557 }
a0d0e21e 3558 }
c277df42
IZ
3559
3560 {
e1ec3a88
AL
3561 const char *p;
3562 static const char parens[] = "=!<,>";
c277df42
IZ
3563
3564 if (paren && (p = strchr(parens, paren))) {
eb160463 3565 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
c277df42
IZ
3566 int flag = (p - parens) > 1;
3567
3568 if (paren == '>')
3569 node = SUSPEND, flag = 0;
830247a4 3570 reginsert(pRExC_state, node,ret);
45948336
EP
3571 Set_Node_Cur_Length(ret);
3572 Set_Node_Offset(ret, parse_start + 1);
c277df42 3573 ret->flags = flag;
830247a4 3574 regtail(pRExC_state, ret, reg_node(pRExC_state, TAIL));
c277df42 3575 }
a0d0e21e
LW
3576 }
3577
3578 /* Check for proper termination. */
ce3e6498 3579 if (paren) {
e2509266 3580 RExC_flags = oregflags;
830247a4
IZ
3581 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
3582 RExC_parse = oregcomp_parse;
380a0633 3583 vFAIL("Unmatched (");
ce3e6498 3584 }
a0ed51b3 3585 }
830247a4
IZ
3586 else if (!paren && RExC_parse < RExC_end) {
3587 if (*RExC_parse == ')') {
3588 RExC_parse++;
380a0633 3589 vFAIL("Unmatched )");
a0ed51b3
LW
3590 }
3591 else
b45f050a 3592 FAIL("Junk on end of regexp"); /* "Can't happen". */
a0d0e21e
LW
3593 /* NOTREACHED */
3594 }
a687059c 3595
a0d0e21e 3596 return(ret);
a687059c
LW
3597}
3598
3599/*
3600 - regbranch - one alternative of an | operator
3601 *
3602 * Implements the concatenation operator.
3603 */
76e3520e 3604STATIC regnode *
830247a4 3605S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first)
a687059c 3606{
97aff369 3607 dVAR;
c277df42
IZ
3608 register regnode *ret;
3609 register regnode *chain = NULL;
3610 register regnode *latest;
3611 I32 flags = 0, c = 0;
a0d0e21e 3612
b81d288d 3613 if (first)
c277df42
IZ
3614 ret = NULL;
3615 else {
b81d288d 3616 if (!SIZE_ONLY && RExC_extralen)
830247a4 3617 ret = reganode(pRExC_state, BRANCHJ,0);
fac92740 3618 else {
830247a4 3619 ret = reg_node(pRExC_state, BRANCH);
fac92740
MJD
3620 Set_Node_Length(ret, 1);
3621 }
c277df42
IZ
3622 }
3623
b81d288d 3624 if (!first && SIZE_ONLY)
830247a4 3625 RExC_extralen += 1; /* BRANCHJ */
b81d288d 3626
c277df42 3627 *flagp = WORST; /* Tentatively. */
a0d0e21e 3628
830247a4
IZ
3629 RExC_parse--;
3630 nextchar(pRExC_state);
3631 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
a0d0e21e 3632 flags &= ~TRYAGAIN;
830247a4 3633 latest = regpiece(pRExC_state, &flags);
a0d0e21e
LW
3634 if (latest == NULL) {
3635 if (flags & TRYAGAIN)
3636 continue;
3637 return(NULL);
a0ed51b3
LW
3638 }
3639 else if (ret == NULL)
c277df42 3640 ret = latest;
a0d0e21e 3641 *flagp |= flags&HASWIDTH;
c277df42 3642 if (chain == NULL) /* First piece. */
a0d0e21e
LW
3643 *flagp |= flags&SPSTART;
3644 else {
830247a4
IZ
3645 RExC_naughty++;
3646 regtail(pRExC_state, chain, latest);
a687059c 3647 }
a0d0e21e 3648 chain = latest;
c277df42
IZ
3649 c++;
3650 }
3651 if (chain == NULL) { /* Loop ran zero times. */
830247a4 3652 chain = reg_node(pRExC_state, NOTHING);
c277df42
IZ
3653 if (ret == NULL)
3654 ret = chain;
3655 }
3656 if (c == 1) {
3657 *flagp |= flags&SIMPLE;
a0d0e21e 3658 }
a687059c 3659
d4c19fe8 3660 return ret;
a687059c
LW
3661}
3662
3663/*
3664 - regpiece - something followed by possible [*+?]
3665 *
3666 * Note that the branching code sequences used for ? and the general cases
3667 * of * and + are somewhat optimized: they use the same NOTHING node as
3668 * both the endmarker for their branch list and the body of the last branch.
3669 * It might seem that this node could be dispensed with entirely, but the
3670 * endmarker role is not redundant.
3671 */
76e3520e 3672STATIC regnode *
830247a4 3673S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
a687059c 3674{
97aff369 3675 dVAR;
c277df42 3676 register regnode *ret;
a0d0e21e
LW
3677 register char op;
3678 register char *next;
3679 I32 flags;
1df70142 3680 const char * const origparse = RExC_parse;
a0d0e21e
LW
3681 char *maxpos;
3682 I32 min;
c277df42 3683 I32 max = REG_INFTY;
fac92740 3684 char *parse_start;
a0d0e21e 3685
830247a4 3686 ret = regatom(pRExC_state, &flags);
a0d0e21e
LW
3687 if (ret == NULL) {
3688 if (flags & TRYAGAIN)
3689 *flagp |= TRYAGAIN;
3690 return(NULL);
3691 }
3692
830247a4 3693 op = *RExC_parse;
a0d0e21e 3694
830247a4 3695 if (op == '{' && regcurly(RExC_parse)) {
fac92740 3696 parse_start = RExC_parse; /* MJD */
830247a4 3697 next = RExC_parse + 1;
c445ea15 3698 maxpos = NULL;
a0d0e21e
LW
3699 while (isDIGIT(*next) || *next == ',') {
3700 if (*next == ',') {
3701 if (maxpos)
3702 break;
3703 else
3704 maxpos = next;
a687059c 3705 }
a0d0e21e
LW
3706 next++;
3707 }
3708 if (*next == '}') { /* got one */
3709 if (!maxpos)
3710 maxpos = next;
830247a4
IZ
3711 RExC_parse++;
3712 min = atoi(RExC_parse);
a0d0e21e
LW
3713 if (*maxpos == ',')
3714 maxpos++;
3715 else
830247a4 3716 maxpos = RExC_parse;
a0d0e21e
LW
3717 max = atoi(maxpos);
3718 if (!max && *maxpos != '0')
c277df42
IZ
3719 max = REG_INFTY; /* meaning "infinity" */
3720 else if (max >= REG_INFTY)
8615cb43 3721 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
830247a4
IZ
3722 RExC_parse = next;
3723 nextchar(pRExC_state);
a0d0e21e
LW
3724
3725 do_curly:
3726 if ((flags&SIMPLE)) {
830247a4
IZ
3727 RExC_naughty += 2 + RExC_naughty / 2;
3728 reginsert(pRExC_state, CURLY, ret);
fac92740
MJD
3729 Set_Node_Offset(ret, parse_start+1); /* MJD */
3730 Set_Node_Cur_Length(ret);
a0d0e21e
LW
3731 }
3732 else {
830247a4 3733 regnode *w = reg_node(pRExC_state, WHILEM);
2c2d71f5
JH
3734
3735 w->flags = 0;
830247a4
IZ
3736 regtail(pRExC_state, ret, w);
3737 if (!SIZE_ONLY && RExC_extralen) {
3738 reginsert(pRExC_state, LONGJMP,ret);
3739 reginsert(pRExC_state, NOTHING,ret);
c277df42
IZ
3740 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
3741 }
830247a4 3742 reginsert(pRExC_state, CURLYX,ret);
fac92740
MJD
3743 /* MJD hk */
3744 Set_Node_Offset(ret, parse_start+1);
2af232bd 3745 Set_Node_Length(ret,
fac92740 3746 op == '{' ? (RExC_parse - parse_start) : 1);
2af232bd 3747
830247a4 3748 if (!SIZE_ONLY && RExC_extralen)
c277df42 3749 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
830247a4 3750 regtail(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
c277df42 3751 if (SIZE_ONLY)
830247a4
IZ
3752 RExC_whilem_seen++, RExC_extralen += 3;
3753 RExC_naughty += 4 + RExC_naughty; /* compound interest */
a0d0e21e 3754 }
c277df42 3755 ret->flags = 0;
a0d0e21e
LW
3756
3757 if (min > 0)
821b33a5
IZ
3758 *flagp = WORST;
3759 if (max > 0)
3760 *flagp |= HASWIDTH;
a0d0e21e 3761 if (max && max < min)
8615cb43 3762 vFAIL("Can't do {n,m} with n > m");
c277df42 3763 if (!SIZE_ONLY) {
eb160463
GS
3764 ARG1_SET(ret, (U16)min);
3765 ARG2_SET(ret, (U16)max);
a687059c 3766 }
a687059c 3767
a0d0e21e 3768 goto nest_check;
a687059c 3769 }
a0d0e21e 3770 }
a687059c 3771
a0d0e21e
LW
3772 if (!ISMULT1(op)) {
3773 *flagp = flags;
a687059c 3774 return(ret);
a0d0e21e 3775 }
bb20fd44 3776
c277df42 3777#if 0 /* Now runtime fix should be reliable. */
b45f050a
JF
3778
3779 /* if this is reinstated, don't forget to put this back into perldiag:
3780
3781 =item Regexp *+ operand could be empty at {#} in regex m/%s/
3782
3783 (F) The part of the regexp subject to either the * or + quantifier
3784 could match an empty string. The {#} shows in the regular
3785 expression about where the problem was discovered.
3786
3787 */
3788
bb20fd44 3789 if (!(flags&HASWIDTH) && op != '?')
b45f050a 3790 vFAIL("Regexp *+ operand could be empty");
b81d288d 3791#endif
bb20fd44 3792
fac92740 3793 parse_start = RExC_parse;
830247a4 3794 nextchar(pRExC_state);
a0d0e21e 3795
821b33a5 3796 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
a0d0e21e
LW
3797
3798 if (op == '*' && (flags&SIMPLE)) {
830247a4 3799 reginsert(pRExC_state, STAR, ret);
c277df42 3800 ret->flags = 0;
830247a4 3801 RExC_naughty += 4;
a0d0e21e
LW
3802 }
3803 else if (op == '*') {
3804 min = 0;
3805 goto do_curly;
a0ed51b3
LW
3806 }
3807 else if (op == '+' && (flags&SIMPLE)) {
830247a4 3808 reginsert(pRExC_state, PLUS, ret);
c277df42 3809 ret->flags = 0;
830247a4 3810 RExC_naughty += 3;
a0d0e21e
LW
3811 }
3812 else if (op == '+') {
3813 min = 1;
3814 goto do_curly;
a0ed51b3
LW
3815 }
3816 else if (op == '?') {
a0d0e21e
LW
3817 min = 0; max = 1;
3818 goto do_curly;
3819 }
3820 nest_check:
041457d9 3821 if (!SIZE_ONLY && !(flags&HASWIDTH) && max > REG_INFTY/3 && ckWARN(WARN_REGEXP)) {
830247a4 3822 vWARN3(RExC_parse,
b45f050a 3823 "%.*s matches null string many times",
afd78fd5 3824 (int)(RExC_parse >= origparse ? RExC_parse - origparse : 0),
b45f050a 3825 origparse);
a0d0e21e
LW
3826 }
3827
830247a4
IZ
3828 if (*RExC_parse == '?') {
3829 nextchar(pRExC_state);
3830 reginsert(pRExC_state, MINMOD, ret);
3831 regtail(pRExC_state, ret, ret + NODE_STEP_REGNODE);
a0d0e21e 3832 }
830247a4
IZ
3833 if (ISMULT2(RExC_parse)) {
3834 RExC_parse++;
b45f050a
JF
3835 vFAIL("Nested quantifiers");
3836 }
a0d0e21e
LW
3837
3838 return(ret);
a687059c
LW
3839}
3840
3841/*
3842 - regatom - the lowest level
3843 *
3844 * Optimization: gobbles an entire sequence of ordinary characters so that
3845 * it can turn them into a single node, which is smaller to store and
3846 * faster to run. Backslashed characters are exceptions, each becoming a
3847 * separate node; the code is simpler that way and it's not worth fixing.
3848 *
7f6f358c
YO
3849 * [Yes, it is worth fixing, some scripts can run twice the speed.]
3850 * [It looks like its ok, as in S_study_chunk we merge adjacent EXACT nodes]
3851 */
76e3520e 3852STATIC regnode *
830247a4 3853S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
a687059c 3854{
97aff369 3855 dVAR;
cbbf8932 3856 register regnode *ret = NULL;
a0d0e21e 3857 I32 flags;
45948336 3858 char *parse_start = RExC_parse;
a0d0e21e
LW
3859
3860 *flagp = WORST; /* Tentatively. */
3861
3862tryagain:
830247a4 3863 switch (*RExC_parse) {
a0d0e21e 3864 case '^':
830247a4
IZ
3865 RExC_seen_zerolen++;
3866 nextchar(pRExC_state);
e2509266 3867 if (RExC_flags & PMf_MULTILINE)
830247a4 3868 ret = reg_node(pRExC_state, MBOL);
e2509266 3869 else if (RExC_flags & PMf_SINGLELINE)
830247a4 3870 ret = reg_node(pRExC_state, SBOL);
a0d0e21e 3871 else
830247a4 3872 ret = reg_node(pRExC_state, BOL);
fac92740 3873 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
3874 break;
3875 case '$':
830247a4 3876 nextchar(pRExC_state);
b81d288d 3877 if (*RExC_parse)
830247a4 3878 RExC_seen_zerolen++;
e2509266 3879 if (RExC_flags & PMf_MULTILINE)
830247a4 3880 ret = reg_node(pRExC_state, MEOL);
e2509266 3881 else if (RExC_flags & PMf_SINGLELINE)
830247a4 3882 ret = reg_node(pRExC_state, SEOL);
a0d0e21e 3883 else
830247a4 3884 ret = reg_node(pRExC_state, EOL);
fac92740 3885 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
3886 break;
3887 case '.':
830247a4 3888 nextchar(pRExC_state);
e2509266 3889 if (RExC_flags & PMf_SINGLELINE)
ffc61ed2
JH
3890 ret = reg_node(pRExC_state, SANY);
3891 else
3892 ret = reg_node(pRExC_state, REG_ANY);
3893 *flagp |= HASWIDTH|SIMPLE;
830247a4 3894 RExC_naughty++;
fac92740 3895 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
3896 break;
3897 case '[':
b45f050a 3898 {
830247a4 3899 char *oregcomp_parse = ++RExC_parse;
ffc61ed2 3900 ret = regclass(pRExC_state);
830247a4
IZ
3901 if (*RExC_parse != ']') {
3902 RExC_parse = oregcomp_parse;
b45f050a
JF
3903 vFAIL("Unmatched [");
3904 }
830247a4 3905 nextchar(pRExC_state);
a0d0e21e 3906 *flagp |= HASWIDTH|SIMPLE;
fac92740 3907 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
a0d0e21e 3908 break;
b45f050a 3909 }
a0d0e21e 3910 case '(':
830247a4
IZ
3911 nextchar(pRExC_state);
3912 ret = reg(pRExC_state, 1, &flags);
a0d0e21e 3913 if (ret == NULL) {
bf93d4cc 3914 if (flags & TRYAGAIN) {
830247a4 3915 if (RExC_parse == RExC_end) {
bf93d4cc
GS
3916 /* Make parent create an empty node if needed. */
3917 *flagp |= TRYAGAIN;
3918 return(NULL);
3919 }
a0d0e21e 3920 goto tryagain;
bf93d4cc 3921 }
a0d0e21e
LW
3922 return(NULL);
3923 }
c277df42 3924 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE);
a0d0e21e
LW
3925 break;
3926 case '|':
3927 case ')':
3928 if (flags & TRYAGAIN) {
3929 *flagp |= TRYAGAIN;
3930 return NULL;
3931 }
b45f050a 3932 vFAIL("Internal urp");
a0d0e21e
LW
3933 /* Supposed to be caught earlier. */
3934 break;
85afd4ae 3935 case '{':
830247a4
IZ
3936 if (!regcurly(RExC_parse)) {
3937 RExC_parse++;
85afd4ae
CS
3938 goto defchar;
3939 }
3940 /* FALL THROUGH */
a0d0e21e
LW
3941 case '?':
3942 case '+':
3943 case '*':
830247a4 3944 RExC_parse++;
b45f050a 3945 vFAIL("Quantifier follows nothing");
a0d0e21e
LW
3946 break;
3947 case '\\':
830247a4 3948 switch (*++RExC_parse) {
a0d0e21e 3949 case 'A':
830247a4
IZ
3950 RExC_seen_zerolen++;
3951 ret = reg_node(pRExC_state, SBOL);
a0d0e21e 3952 *flagp |= SIMPLE;
830247a4 3953 nextchar(pRExC_state);
fac92740 3954 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
3955 break;
3956 case 'G':
830247a4
IZ
3957 ret = reg_node(pRExC_state, GPOS);
3958 RExC_seen |= REG_SEEN_GPOS;
a0d0e21e 3959 *flagp |= SIMPLE;
830247a4 3960 nextchar(pRExC_state);
fac92740 3961 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
3962 break;
3963 case 'Z':
830247a4 3964 ret = reg_node(pRExC_state, SEOL);
a0d0e21e 3965 *flagp |= SIMPLE;
a1917ab9 3966 RExC_seen_zerolen++; /* Do not optimize RE away */
830247a4 3967 nextchar(pRExC_state);
a0d0e21e 3968 break;
b85d18e9 3969 case 'z':
830247a4 3970 ret = reg_node(pRExC_state, EOS);
b85d18e9 3971 *flagp |= SIMPLE;
830247a4
IZ
3972 RExC_seen_zerolen++; /* Do not optimize RE away */
3973 nextchar(pRExC_state);
fac92740 3974 Set_Node_Length(ret, 2); /* MJD */
b85d18e9 3975 break;
4a2d328f 3976 case 'C':
f33976b4
DB
3977 ret = reg_node(pRExC_state, CANY);
3978 RExC_seen |= REG_SEEN_CANY;
a0ed51b3 3979 *flagp |= HASWIDTH|SIMPLE;
830247a4 3980 nextchar(pRExC_state);
fac92740 3981 Set_Node_Length(ret, 2); /* MJD */
a0ed51b3
LW
3982 break;
3983 case 'X':
830247a4 3984 ret = reg_node(pRExC_state, CLUMP);
a0ed51b3 3985 *flagp |= HASWIDTH;
830247a4 3986 nextchar(pRExC_state);
fac92740 3987 Set_Node_Length(ret, 2); /* MJD */
a0ed51b3 3988 break;
a0d0e21e 3989 case 'w':
eb160463 3990 ret = reg_node(pRExC_state, (U8)(LOC ? ALNUML : ALNUM));
a0d0e21e 3991 *flagp |= HASWIDTH|SIMPLE;
830247a4 3992 nextchar(pRExC_state);
fac92740 3993 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
3994 break;
3995 case 'W':
eb160463 3996 ret = reg_node(pRExC_state, (U8)(LOC ? NALNUML : NALNUM));
a0d0e21e 3997 *flagp |= HASWIDTH|SIMPLE;
830247a4 3998 nextchar(pRExC_state);
fac92740 3999 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
4000 break;
4001 case 'b':
830247a4
IZ
4002 RExC_seen_zerolen++;
4003 RExC_seen |= REG_SEEN_LOOKBEHIND;
eb160463 4004 ret = reg_node(pRExC_state, (U8)(LOC ? BOUNDL : BOUND));
a0d0e21e 4005 *flagp |= SIMPLE;
830247a4 4006 nextchar(pRExC_state);
fac92740 4007 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
4008 break;
4009 case 'B':
830247a4
IZ
4010 RExC_seen_zerolen++;
4011 RExC_seen |= REG_SEEN_LOOKBEHIND;
eb160463 4012 ret = reg_node(pRExC_state, (U8)(LOC ? NBOUNDL : NBOUND));
a0d0e21e 4013 *flagp |= SIMPLE;
830247a4 4014 nextchar(pRExC_state);
fac92740 4015 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
4016 break;
4017 case 's':
eb160463 4018 ret = reg_node(pRExC_state, (U8)(LOC ? SPACEL : SPACE));
a0d0e21e 4019 *flagp |= HASWIDTH|SIMPLE;
830247a4 4020 nextchar(pRExC_state);
fac92740 4021 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
4022 break;
4023 case 'S':
eb160463 4024 ret = reg_node(pRExC_state, (U8)(LOC ? NSPACEL : NSPACE));
a0d0e21e 4025 *flagp |= HASWIDTH|SIMPLE;
830247a4 4026 nextchar(pRExC_state);
fac92740 4027 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
4028 break;
4029 case 'd':
ffc61ed2 4030 ret = reg_node(pRExC_state, DIGIT);
a0d0e21e 4031 *flagp |= HASWIDTH|SIMPLE;
830247a4 4032 nextchar(pRExC_state);
fac92740 4033 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
4034 break;
4035 case 'D':
ffc61ed2 4036 ret = reg_node(pRExC_state, NDIGIT);
a0d0e21e 4037 *flagp |= HASWIDTH|SIMPLE;
830247a4 4038 nextchar(pRExC_state);
fac92740 4039 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e 4040 break;
a14b48bc
LW
4041 case 'p':
4042 case 'P':
3568d838 4043 {
830247a4 4044 char* oldregxend = RExC_end;
ccb2c380 4045 char* parse_start = RExC_parse - 2;
a14b48bc 4046
830247a4 4047 if (RExC_parse[1] == '{') {
3568d838 4048 /* a lovely hack--pretend we saw [\pX] instead */
830247a4
IZ
4049 RExC_end = strchr(RExC_parse, '}');
4050 if (!RExC_end) {
0da60cf5 4051 U8 c = (U8)*RExC_parse;
830247a4
IZ
4052 RExC_parse += 2;
4053 RExC_end = oldregxend;
0da60cf5 4054 vFAIL2("Missing right brace on \\%c{}", c);
b45f050a 4055 }
830247a4 4056 RExC_end++;
a14b48bc 4057 }
af6f566e 4058 else {
830247a4 4059 RExC_end = RExC_parse + 2;
af6f566e
HS
4060 if (RExC_end > oldregxend)
4061 RExC_end = oldregxend;
4062 }
830247a4 4063 RExC_parse--;
a14b48bc 4064
ffc61ed2 4065 ret = regclass(pRExC_state);
a14b48bc 4066
830247a4
IZ
4067 RExC_end = oldregxend;
4068 RExC_parse--;
ccb2c380
MP
4069
4070 Set_Node_Offset(ret, parse_start + 2);
4071 Set_Node_Cur_Length(ret);
830247a4 4072 nextchar(pRExC_state);
a14b48bc
LW
4073 *flagp |= HASWIDTH|SIMPLE;
4074 }
4075 break;
a0d0e21e
LW
4076 case 'n':
4077 case 'r':
4078 case 't':
4079 case 'f':
4080 case 'e':
4081 case 'a':
4082 case 'x':
4083 case 'c':
4084 case '0':
4085 goto defchar;
4086 case '1': case '2': case '3': case '4':
4087 case '5': case '6': case '7': case '8': case '9':
4088 {
1df70142 4089 const I32 num = atoi(RExC_parse);
a0d0e21e 4090
830247a4 4091 if (num > 9 && num >= RExC_npar)
a0d0e21e
LW
4092 goto defchar;
4093 else {
fac92740 4094 char * parse_start = RExC_parse - 1; /* MJD */
830247a4
IZ
4095 while (isDIGIT(*RExC_parse))
4096 RExC_parse++;
b45f050a 4097
eb160463 4098 if (!SIZE_ONLY && num > (I32)RExC_rx->nparens)
9baa0206 4099 vFAIL("Reference to nonexistent group");
830247a4 4100 RExC_sawback = 1;
eb160463
GS
4101 ret = reganode(pRExC_state,
4102 (U8)(FOLD ? (LOC ? REFFL : REFF) : REF),
4103 num);
a0d0e21e 4104 *flagp |= HASWIDTH;
2af232bd 4105
fac92740 4106 /* override incorrect value set in reganode MJD */
2af232bd 4107 Set_Node_Offset(ret, parse_start+1);
fac92740 4108 Set_Node_Cur_Length(ret); /* MJD */
830247a4
IZ
4109 RExC_parse--;
4110 nextchar(pRExC_state);
a0d0e21e
LW
4111 }
4112 }
4113 break;
4114 case '\0':
830247a4 4115 if (RExC_parse >= RExC_end)
b45f050a 4116 FAIL("Trailing \\");
a0d0e21e
LW
4117 /* FALL THROUGH */
4118 default:
a0288114 4119 /* Do not generate "unrecognized" warnings here, we fall
c9f97d15 4120 back into the quick-grab loop below */
45948336 4121 parse_start--;
a0d0e21e
LW
4122 goto defchar;
4123 }
4124 break;
4633a7c4
LW
4125
4126 case '#':
e2509266 4127 if (RExC_flags & PMf_EXTENDED) {
830247a4
IZ
4128 while (RExC_parse < RExC_end && *RExC_parse != '\n') RExC_parse++;
4129 if (RExC_parse < RExC_end)
4633a7c4
LW
4130 goto tryagain;
4131 }
4132 /* FALL THROUGH */
4133
a0d0e21e 4134 default: {
ba210ebe 4135 register STRLEN len;
58ae7d3f 4136 register UV ender;
a0d0e21e 4137 register char *p;
c277df42 4138 char *oldp, *s;
80aecb99 4139 STRLEN foldlen;
89ebb4a3 4140 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
f06dbbb7
JH
4141
4142 parse_start = RExC_parse - 1;
a0d0e21e 4143
830247a4 4144 RExC_parse++;
a0d0e21e
LW
4145
4146 defchar:
58ae7d3f 4147 ender = 0;
eb160463
GS
4148 ret = reg_node(pRExC_state,
4149 (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
cd439c50 4150 s = STRING(ret);
830247a4
IZ
4151 for (len = 0, p = RExC_parse - 1;
4152 len < 127 && p < RExC_end;
a0d0e21e
LW
4153 len++)
4154 {
4155 oldp = p;
5b5a24f7 4156
e2509266 4157 if (RExC_flags & PMf_EXTENDED)
830247a4 4158 p = regwhite(p, RExC_end);
a0d0e21e
LW
4159 switch (*p) {
4160 case '^':
4161 case '$':
4162 case '.':
4163 case '[':
4164 case '(':
4165 case ')':
4166 case '|':
4167 goto loopdone;
4168 case '\\':
4169 switch (*++p) {
4170 case 'A':
1ed8eac0
JF
4171 case 'C':
4172 case 'X':
a0d0e21e
LW
4173 case 'G':
4174 case 'Z':
b85d18e9 4175 case 'z':
a0d0e21e
LW
4176 case 'w':
4177 case 'W':
4178 case 'b':
4179 case 'B':
4180 case 's':
4181 case 'S':
4182 case 'd':
4183 case 'D':
a14b48bc
LW
4184 case 'p':
4185 case 'P':
a0d0e21e
LW
4186 --p;
4187 goto loopdone;
4188 case 'n':
4189 ender = '\n';
4190 p++;
a687059c 4191 break;
a0d0e21e
LW
4192 case 'r':
4193 ender = '\r';
4194 p++;
a687059c 4195 break;
a0d0e21e
LW
4196 case 't':
4197 ender = '\t';
4198 p++;
a687059c 4199 break;
a0d0e21e
LW
4200 case 'f':
4201 ender = '\f';
4202 p++;
a687059c 4203 break;
a0d0e21e 4204 case 'e':
c7f1f016 4205 ender = ASCII_TO_NATIVE('\033');
a0d0e21e 4206 p++;
a687059c 4207 break;
a0d0e21e 4208 case 'a':
c7f1f016 4209 ender = ASCII_TO_NATIVE('\007');
a0d0e21e 4210 p++;
a687059c 4211 break;
a0d0e21e 4212 case 'x':
a0ed51b3 4213 if (*++p == '{') {
1df70142 4214 char* const e = strchr(p, '}');
b81d288d 4215
b45f050a 4216 if (!e) {
830247a4 4217 RExC_parse = p + 1;
b45f050a
JF
4218 vFAIL("Missing right brace on \\x{}");
4219 }
de5f0749 4220 else {
a4c04bdc
NC
4221 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
4222 | PERL_SCAN_DISALLOW_PREFIX;
1df70142 4223 STRLEN numlen = e - p - 1;
53305cf1 4224 ender = grok_hex(p + 1, &numlen, &flags, NULL);
aaa80028
JH
4225 if (ender > 0xff)
4226 RExC_utf8 = 1;
a0ed51b3
LW
4227 p = e + 1;
4228 }
a0ed51b3
LW
4229 }
4230 else {
a4c04bdc 4231 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
1df70142 4232 STRLEN numlen = 2;
53305cf1 4233 ender = grok_hex(p, &numlen, &flags, NULL);
a0ed51b3
LW
4234 p += numlen;
4235 }
a687059c 4236 break;
a0d0e21e
LW
4237 case 'c':
4238 p++;
bbce6d69 4239 ender = UCHARAT(p++);
4240 ender = toCTRL(ender);
a687059c 4241 break;
a0d0e21e
LW
4242 case '0': case '1': case '2': case '3':case '4':
4243 case '5': case '6': case '7': case '8':case '9':
4244 if (*p == '0' ||
830247a4 4245 (isDIGIT(p[1]) && atoi(p) >= RExC_npar) ) {
53305cf1 4246 I32 flags = 0;
1df70142 4247 STRLEN numlen = 3;
53305cf1 4248 ender = grok_oct(p, &numlen, &flags, NULL);
a0d0e21e
LW
4249 p += numlen;
4250 }
4251 else {
4252 --p;
4253 goto loopdone;
a687059c
LW
4254 }
4255 break;
a0d0e21e 4256 case '\0':
830247a4 4257 if (p >= RExC_end)
b45f050a 4258 FAIL("Trailing \\");
a687059c 4259 /* FALL THROUGH */
a0d0e21e 4260 default:
041457d9 4261 if (!SIZE_ONLY&& isALPHA(*p) && ckWARN(WARN_REGEXP))
4193bef7 4262 vWARN2(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
a0ed51b3 4263 goto normal_default;
a0d0e21e
LW
4264 }
4265 break;
a687059c 4266 default:
a0ed51b3 4267 normal_default:
fd400ab9 4268 if (UTF8_IS_START(*p) && UTF) {
1df70142 4269 STRLEN numlen;
5e12f4fb 4270 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
9f7f3913 4271 &numlen, UTF8_ALLOW_DEFAULT);
a0ed51b3
LW
4272 p += numlen;
4273 }
4274 else
4275 ender = *p++;
a0d0e21e 4276 break;
a687059c 4277 }
e2509266 4278 if (RExC_flags & PMf_EXTENDED)
830247a4 4279 p = regwhite(p, RExC_end);
60a8b682
JH
4280 if (UTF && FOLD) {
4281 /* Prime the casefolded buffer. */
ac7e0132 4282 ender = toFOLD_uni(ender, tmpbuf, &foldlen);
60a8b682 4283 }
a0d0e21e
LW
4284 if (ISMULT2(p)) { /* Back off on ?+*. */
4285 if (len)
4286 p = oldp;
16ea2a2e 4287 else if (UTF) {
80aecb99 4288 if (FOLD) {
60a8b682 4289 /* Emit all the Unicode characters. */
1df70142 4290 STRLEN numlen;
80aecb99
JH
4291 for (foldbuf = tmpbuf;
4292 foldlen;
4293 foldlen -= numlen) {
4294 ender = utf8_to_uvchr(foldbuf, &numlen);
9dc45d57 4295 if (numlen > 0) {
71207a34 4296 const STRLEN unilen = reguni(pRExC_state, ender, s);
0ebc6274
JH
4297 s += unilen;
4298 len += unilen;
4299 /* In EBCDIC the numlen
4300 * and unilen can differ. */
9dc45d57 4301 foldbuf += numlen;
47654450
JH
4302 if (numlen >= foldlen)
4303 break;
9dc45d57
JH
4304 }
4305 else
4306 break; /* "Can't happen." */
80aecb99
JH
4307 }
4308 }
4309 else {
71207a34 4310 const STRLEN unilen = reguni(pRExC_state, ender, s);
9ede7db1 4311 if (unilen > 0) {
0ebc6274
JH
4312 s += unilen;
4313 len += unilen;
9dc45d57 4314 }
80aecb99 4315 }
a0ed51b3 4316 }
a0d0e21e
LW
4317 else {
4318 len++;
eb160463 4319 REGC((char)ender, s++);
a0d0e21e
LW
4320 }
4321 break;
a687059c 4322 }
16ea2a2e 4323 if (UTF) {
80aecb99 4324 if (FOLD) {
60a8b682 4325 /* Emit all the Unicode characters. */
1df70142 4326 STRLEN numlen;
80aecb99
JH
4327 for (foldbuf = tmpbuf;
4328 foldlen;
4329 foldlen -= numlen) {
4330 ender = utf8_to_uvchr(foldbuf, &numlen);
9dc45d57 4331 if (numlen > 0) {
71207a34 4332 const STRLEN unilen = reguni(pRExC_state, ender, s);
0ebc6274
JH
4333 len += unilen;
4334 s += unilen;
4335 /* In EBCDIC the numlen
4336 * and unilen can differ. */
9dc45d57 4337 foldbuf += numlen;
47654450
JH
4338 if (numlen >= foldlen)
4339 break;
9dc45d57
JH
4340 }
4341 else
4342 break;
80aecb99
JH
4343 }
4344 }
4345 else {
71207a34 4346 const STRLEN unilen = reguni(pRExC_state, ender, s);
9ede7db1 4347 if (unilen > 0) {
0ebc6274
JH
4348 s += unilen;
4349 len += unilen;
9dc45d57 4350 }
80aecb99
JH
4351 }
4352 len--;
a0ed51b3
LW
4353 }
4354 else
eb160463 4355 REGC((char)ender, s++);
a0d0e21e
LW
4356 }
4357 loopdone:
830247a4 4358 RExC_parse = p - 1;
fac92740 4359 Set_Node_Cur_Length(ret); /* MJD */
830247a4 4360 nextchar(pRExC_state);
793db0cb
JH
4361 {
4362 /* len is STRLEN which is unsigned, need to copy to signed */
4363 IV iv = len;
4364 if (iv < 0)
4365 vFAIL("Internal disaster");
4366 }
a0d0e21e
LW
4367 if (len > 0)
4368 *flagp |= HASWIDTH;
090f7165 4369 if (len == 1 && UNI_IS_INVARIANT(ender))
a0d0e21e 4370 *flagp |= SIMPLE;
c277df42 4371 if (!SIZE_ONLY)
cd439c50
IZ
4372 STR_LEN(ret) = len;
4373 if (SIZE_ONLY)
830247a4 4374 RExC_size += STR_SZ(len);
cd439c50 4375 else
830247a4 4376 RExC_emit += STR_SZ(len);
a687059c 4377 }
a0d0e21e
LW
4378 break;
4379 }
a687059c 4380
60a8b682
JH
4381 /* If the encoding pragma is in effect recode the text of
4382 * any EXACT-kind nodes. */
22c54be3 4383 if (PL_encoding && PL_regkind[(U8)OP(ret)] == EXACT) {
d0063567
DK
4384 STRLEN oldlen = STR_LEN(ret);
4385 SV *sv = sv_2mortal(newSVpvn(STRING(ret), oldlen));
4386
4387 if (RExC_utf8)
4388 SvUTF8_on(sv);
4389 if (sv_utf8_downgrade(sv, TRUE)) {
1df70142
AL
4390 const char * const s = sv_recode_to_utf8(sv, PL_encoding);
4391 const STRLEN newlen = SvCUR(sv);
d0063567
DK
4392
4393 if (SvUTF8(sv))
4394 RExC_utf8 = 1;
4395 if (!SIZE_ONLY) {
a3621e74
YO
4396 GET_RE_DEBUG_FLAGS_DECL;
4397 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "recode %*s to %*s\n",
d0063567
DK
4398 (int)oldlen, STRING(ret),
4399 (int)newlen, s));
4400 Copy(s, STRING(ret), newlen, char);
4401 STR_LEN(ret) += newlen - oldlen;
4402 RExC_emit += STR_SZ(newlen) - STR_SZ(oldlen);
4403 } else
4404 RExC_size += STR_SZ(newlen) - STR_SZ(oldlen);
4405 }
a72c7584
JH
4406 }
4407
a0d0e21e 4408 return(ret);
a687059c
LW
4409}
4410
873ef191 4411STATIC char *
5f66b61c 4412S_regwhite(char *p, const char *e)
5b5a24f7
CS
4413{
4414 while (p < e) {
4415 if (isSPACE(*p))
4416 ++p;
4417 else if (*p == '#') {
4418 do {
4419 p++;
4420 } while (p < e && *p != '\n');
4421 }
4422 else
4423 break;
4424 }
4425 return p;
4426}
4427
b8c5462f
JH
4428/* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
4429 Character classes ([:foo:]) can also be negated ([:^foo:]).
4430 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
4431 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
beeb77fc 4432 but trigger failures because they are currently unimplemented. */
9a86a77b
JH
4433
4434#define POSIXCC_DONE(c) ((c) == ':')
4435#define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
4436#define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
4437
b8c5462f 4438STATIC I32
830247a4 4439S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
620e46c5 4440{
97aff369 4441 dVAR;
936ed897 4442 I32 namedclass = OOB_NAMEDCLASS;
620e46c5 4443
830247a4 4444 if (value == '[' && RExC_parse + 1 < RExC_end &&
620e46c5 4445 /* I smell either [: or [= or [. -- POSIX has been here, right? */
9a86a77b 4446 POSIXCC(UCHARAT(RExC_parse))) {
1df70142 4447 const char c = UCHARAT(RExC_parse);
097eb12c 4448 char* const s = RExC_parse++;
b81d288d 4449
9a86a77b 4450 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
830247a4
IZ
4451 RExC_parse++;
4452 if (RExC_parse == RExC_end)
620e46c5 4453 /* Grandfather lone [:, [=, [. */
830247a4 4454 RExC_parse = s;
620e46c5 4455 else {
1df70142 4456 const char* t = RExC_parse++; /* skip over the c */
a28509cc 4457 const char *posixcc;
b8c5462f 4458
80916619
NC
4459 assert(*t == c);
4460
9a86a77b 4461 if (UCHARAT(RExC_parse) == ']') {
830247a4 4462 RExC_parse++; /* skip over the ending ] */
b8c5462f
JH
4463 posixcc = s + 1;
4464 if (*s == ':') {
1df70142
AL
4465 const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
4466 const I32 skip = t - posixcc;
80916619
NC
4467
4468 /* Initially switch on the length of the name. */
4469 switch (skip) {
4470 case 4:
4471 if (memEQ(posixcc, "word", 4)) {
4472 /* this is not POSIX, this is the Perl \w */;
4473 namedclass
4474 = complement ? ANYOF_NALNUM : ANYOF_ALNUM;
4475 }
cc4319de 4476 break;
80916619
NC
4477 case 5:
4478 /* Names all of length 5. */
4479 /* alnum alpha ascii blank cntrl digit graph lower
4480 print punct space upper */
4481 /* Offset 4 gives the best switch position. */
4482 switch (posixcc[4]) {
4483 case 'a':
4484 if (memEQ(posixcc, "alph", 4)) {
4485 /* a */
4486 namedclass
4487 = complement ? ANYOF_NALPHA : ANYOF_ALPHA;
4488 }
4489 break;
4490 case 'e':
4491 if (memEQ(posixcc, "spac", 4)) {
4492 /* e */
4493 namedclass
4494 = complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
4495 }
4496 break;
4497 case 'h':
4498 if (memEQ(posixcc, "grap", 4)) {
4499 /* h */
4500 namedclass
4501 = complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
4502 }
4503 break;
4504 case 'i':
4505 if (memEQ(posixcc, "asci", 4)) {
4506 /* i */
4507 namedclass
4508 = complement ? ANYOF_NASCII : ANYOF_ASCII;
4509 }
4510 break;
4511 case 'k':
4512 if (memEQ(posixcc, "blan", 4)) {
4513 /* k */
4514 namedclass
4515 = complement ? ANYOF_NBLANK : ANYOF_BLANK;
4516 }
4517 break;
4518 case 'l':
4519 if (memEQ(posixcc, "cntr", 4)) {
4520 /* l */
4521 namedclass
4522 = complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
4523 }
4524 break;
4525 case 'm':
4526 if (memEQ(posixcc, "alnu", 4)) {
4527 /* m */
4528 namedclass
4529 = complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
4530 }
4531 break;
4532 case 'r':
4533 if (memEQ(posixcc, "lowe", 4)) {
4534 /* r */
4535 namedclass
4536 = complement ? ANYOF_NLOWER : ANYOF_LOWER;
4537 }
4538 if (memEQ(posixcc, "uppe", 4)) {
8fdec511 4539 /* r */
80916619
NC
4540 namedclass
4541 = complement ? ANYOF_NUPPER : ANYOF_UPPER;
4542 }
4543 break;
4544 case 't':
4545 if (memEQ(posixcc, "digi", 4)) {
4546 /* t */
4547 namedclass
4548 = complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
4549 }
4550 if (memEQ(posixcc, "prin", 4)) {
8fdec511 4551 /* t */
80916619
NC
4552 namedclass
4553 = complement ? ANYOF_NPRINT : ANYOF_PRINT;
4554 }
4555 if (memEQ(posixcc, "punc", 4)) {
4556 /* t */
4557 namedclass
4558 = complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
4559 }
4560 break;
b8c5462f
JH
4561 }
4562 break;
80916619
NC
4563 case 6:
4564 if (memEQ(posixcc, "xdigit", 6)) {
4565 namedclass
4566 = complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
b8c5462f
JH
4567 }
4568 break;
4569 }
80916619
NC
4570
4571 if (namedclass == OOB_NAMEDCLASS)
b45f050a
JF
4572 {
4573 Simple_vFAIL3("POSIX class [:%.*s:] unknown",
4574 t - s - 1, s + 1);
4575 }
80916619
NC
4576 assert (posixcc[skip] == ':');
4577 assert (posixcc[skip+1] == ']');
b45f050a 4578 } else if (!SIZE_ONLY) {
b8c5462f 4579 /* [[=foo=]] and [[.foo.]] are still future. */
b45f050a 4580
830247a4 4581 /* adjust RExC_parse so the warning shows after
b45f050a 4582 the class closes */
9a86a77b 4583 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
830247a4 4584 RExC_parse++;
b45f050a
JF
4585 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
4586 }
b8c5462f
JH
4587 } else {
4588 /* Maternal grandfather:
4589 * "[:" ending in ":" but not in ":]" */
830247a4 4590 RExC_parse = s;
767d463e 4591 }
620e46c5
JH
4592 }
4593 }
4594
b8c5462f
JH
4595 return namedclass;
4596}
4597
4598STATIC void
830247a4 4599S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
b8c5462f 4600{
97aff369 4601 dVAR;
b938889d 4602 if (!SIZE_ONLY && POSIXCC(UCHARAT(RExC_parse))) {
1df70142
AL
4603 const char *s = RExC_parse;
4604 const char c = *s++;
b8c5462f
JH
4605
4606 while(*s && isALNUM(*s))
4607 s++;
4608 if (*s && c == *s && s[1] == ']') {
cd84f5b2
RGS
4609 if (ckWARN(WARN_REGEXP))
4610 vWARN3(s+2,
4611 "POSIX syntax [%c %c] belongs inside character classes",
4612 c, c);
b45f050a
JF
4613
4614 /* [[=foo=]] and [[.foo.]] are still future. */
9a86a77b 4615 if (POSIXCC_NOTYET(c)) {
830247a4 4616 /* adjust RExC_parse so the error shows after
b45f050a 4617 the class closes */
9a86a77b 4618 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
b45f050a
JF
4619 ;
4620 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
4621 }
b8c5462f
JH
4622 }
4623 }
620e46c5
JH
4624}
4625
7f6f358c
YO
4626
4627/*
4628 parse a class specification and produce either an ANYOF node that
4629 matches the pattern. If the pattern matches a single char only and
4630 that char is < 256 then we produce an EXACT node instead.
4631*/
76e3520e 4632STATIC regnode *
830247a4 4633S_regclass(pTHX_ RExC_state_t *pRExC_state)
a687059c 4634{
97aff369 4635 dVAR;
ffc61ed2 4636 register UV value;
9a86a77b 4637 register UV nextvalue;
3568d838 4638 register IV prevvalue = OOB_UNICODE;
ffc61ed2 4639 register IV range = 0;
c277df42 4640 register regnode *ret;
ba210ebe 4641 STRLEN numlen;
ffc61ed2 4642 IV namedclass;
cbbf8932 4643 char *rangebegin = NULL;
936ed897 4644 bool need_class = 0;
c445ea15 4645 SV *listsv = NULL;
ffc61ed2
JH
4646 register char *e;
4647 UV n;
9e55ce06 4648 bool optimize_invert = TRUE;
cbbf8932 4649 AV* unicode_alternate = NULL;
1b2d223b
JH
4650#ifdef EBCDIC
4651 UV literal_endpoint = 0;
4652#endif
7f6f358c 4653 UV stored = 0; /* number of chars stored in the class */
ffc61ed2 4654
7f6f358c
YO
4655 regnode *orig_emit = RExC_emit; /* Save the original RExC_emit in
4656 case we need to change the emitted regop to an EXACT. */
4657
4658 /* Assume we are going to generate an ANYOF node. */
ffc61ed2
JH
4659 ret = reganode(pRExC_state, ANYOF, 0);
4660
4661 if (!SIZE_ONLY)
4662 ANYOF_FLAGS(ret) = 0;
4663
9a86a77b 4664 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
ffc61ed2
JH
4665 RExC_naughty++;
4666 RExC_parse++;
4667 if (!SIZE_ONLY)
4668 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
4669 }
a0d0e21e 4670
73060fc4 4671 if (SIZE_ONLY) {
830247a4 4672 RExC_size += ANYOF_SKIP;
73060fc4
JH
4673 listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
4674 }
936ed897 4675 else {
830247a4 4676 RExC_emit += ANYOF_SKIP;
936ed897
IZ
4677 if (FOLD)
4678 ANYOF_FLAGS(ret) |= ANYOF_FOLD;
4679 if (LOC)
4680 ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
ffc61ed2 4681 ANYOF_BITMAP_ZERO(ret);
396482e1 4682 listsv = newSVpvs("# comment\n");
a0d0e21e 4683 }
b8c5462f 4684
9a86a77b
JH
4685 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
4686
b938889d 4687 if (!SIZE_ONLY && POSIXCC(nextvalue))
830247a4 4688 checkposixcc(pRExC_state);
b8c5462f 4689
f064b6ad
HS
4690 /* allow 1st char to be ] (allowing it to be - is dealt with later) */
4691 if (UCHARAT(RExC_parse) == ']')
4692 goto charclassloop;
ffc61ed2 4693
9a86a77b 4694 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
ffc61ed2
JH
4695
4696 charclassloop:
4697
4698 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
4699
73b437c8 4700 if (!range)
830247a4 4701 rangebegin = RExC_parse;
ffc61ed2 4702 if (UTF) {
5e12f4fb 4703 value = utf8n_to_uvchr((U8*)RExC_parse,
3568d838 4704 RExC_end - RExC_parse,
9f7f3913 4705 &numlen, UTF8_ALLOW_DEFAULT);
ffc61ed2
JH
4706 RExC_parse += numlen;
4707 }
4708 else
4709 value = UCHARAT(RExC_parse++);
7f6f358c 4710
9a86a77b
JH
4711 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
4712 if (value == '[' && POSIXCC(nextvalue))
830247a4 4713 namedclass = regpposixcc(pRExC_state, value);
620e46c5 4714 else if (value == '\\') {
ffc61ed2 4715 if (UTF) {
5e12f4fb 4716 value = utf8n_to_uvchr((U8*)RExC_parse,
ffc61ed2 4717 RExC_end - RExC_parse,
9f7f3913 4718 &numlen, UTF8_ALLOW_DEFAULT);
ffc61ed2
JH
4719 RExC_parse += numlen;
4720 }
4721 else
4722 value = UCHARAT(RExC_parse++);
470c3474 4723 /* Some compilers cannot handle switching on 64-bit integer
ffc61ed2 4724 * values, therefore value cannot be an UV. Yes, this will
e2962f66
JH
4725 * be a problem later if we want switch on Unicode.
4726 * A similar issue a little bit later when switching on
4727 * namedclass. --jhi */
ffc61ed2 4728 switch ((I32)value) {
b8c5462f
JH
4729 case 'w': namedclass = ANYOF_ALNUM; break;
4730 case 'W': namedclass = ANYOF_NALNUM; break;
4731 case 's': namedclass = ANYOF_SPACE; break;
4732 case 'S': namedclass = ANYOF_NSPACE; break;
4733 case 'd': namedclass = ANYOF_DIGIT; break;
4734 case 'D': namedclass = ANYOF_NDIGIT; break;
ffc61ed2
JH
4735 case 'p':
4736 case 'P':
af6f566e 4737 if (RExC_parse >= RExC_end)
2a4859cd 4738 vFAIL2("Empty \\%c{}", (U8)value);
ffc61ed2 4739 if (*RExC_parse == '{') {
1df70142 4740 const U8 c = (U8)value;
ffc61ed2
JH
4741 e = strchr(RExC_parse++, '}');
4742 if (!e)
0da60cf5 4743 vFAIL2("Missing right brace on \\%c{}", c);
ab13f0c7
JH
4744 while (isSPACE(UCHARAT(RExC_parse)))
4745 RExC_parse++;
4746 if (e == RExC_parse)
0da60cf5 4747 vFAIL2("Empty \\%c{}", c);
ffc61ed2 4748 n = e - RExC_parse;
ab13f0c7
JH
4749 while (isSPACE(UCHARAT(RExC_parse + n - 1)))
4750 n--;
ffc61ed2
JH
4751 }
4752 else {
4753 e = RExC_parse;
4754 n = 1;
4755 }
4756 if (!SIZE_ONLY) {
ab13f0c7
JH
4757 if (UCHARAT(RExC_parse) == '^') {
4758 RExC_parse++;
4759 n--;
4760 value = value == 'p' ? 'P' : 'p'; /* toggle */
4761 while (isSPACE(UCHARAT(RExC_parse))) {
4762 RExC_parse++;
4763 n--;
4764 }
4765 }
097eb12c
AL
4766 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%.*s\n",
4767 (value=='p' ? '+' : '!'), (int)n, RExC_parse);
ffc61ed2
JH
4768 }
4769 RExC_parse = e + 1;
4770 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
f81125e2
JP
4771 namedclass = ANYOF_MAX; /* no official name, but it's named */
4772 break;
b8c5462f
JH
4773 case 'n': value = '\n'; break;
4774 case 'r': value = '\r'; break;
4775 case 't': value = '\t'; break;
4776 case 'f': value = '\f'; break;
4777 case 'b': value = '\b'; break;
c7f1f016
NIS
4778 case 'e': value = ASCII_TO_NATIVE('\033');break;
4779 case 'a': value = ASCII_TO_NATIVE('\007');break;
b8c5462f 4780 case 'x':
ffc61ed2 4781 if (*RExC_parse == '{') {
a4c04bdc
NC
4782 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
4783 | PERL_SCAN_DISALLOW_PREFIX;
ffc61ed2 4784 e = strchr(RExC_parse++, '}');
b81d288d 4785 if (!e)
ffc61ed2 4786 vFAIL("Missing right brace on \\x{}");
53305cf1
NC
4787
4788 numlen = e - RExC_parse;
4789 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
ffc61ed2
JH
4790 RExC_parse = e + 1;
4791 }
4792 else {
a4c04bdc 4793 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
53305cf1
NC
4794 numlen = 2;
4795 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
ffc61ed2
JH
4796 RExC_parse += numlen;
4797 }
b8c5462f
JH
4798 break;
4799 case 'c':
830247a4 4800 value = UCHARAT(RExC_parse++);
b8c5462f
JH
4801 value = toCTRL(value);
4802 break;
4803 case '0': case '1': case '2': case '3': case '4':
4804 case '5': case '6': case '7': case '8': case '9':
53305cf1
NC
4805 {
4806 I32 flags = 0;
4807 numlen = 3;
4808 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
830247a4 4809 RExC_parse += numlen;
b8c5462f 4810 break;
53305cf1 4811 }
1028017a 4812 default:
041457d9 4813 if (!SIZE_ONLY && isALPHA(value) && ckWARN(WARN_REGEXP))
ffc61ed2
JH
4814 vWARN2(RExC_parse,
4815 "Unrecognized escape \\%c in character class passed through",
4816 (int)value);
1028017a 4817 break;
b8c5462f 4818 }
ffc61ed2 4819 } /* end of \blah */
1b2d223b
JH
4820#ifdef EBCDIC
4821 else
4822 literal_endpoint++;
4823#endif
ffc61ed2
JH
4824
4825 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
4826
4827 if (!SIZE_ONLY && !need_class)
936ed897 4828 ANYOF_CLASS_ZERO(ret);
ffc61ed2 4829
936ed897 4830 need_class = 1;
ffc61ed2
JH
4831
4832 /* a bad range like a-\d, a-[:digit:] ? */
4833 if (range) {
73b437c8 4834 if (!SIZE_ONLY) {
afd78fd5 4835 if (ckWARN(WARN_REGEXP)) {
097eb12c 4836 const int w =
afd78fd5
JH
4837 RExC_parse >= rangebegin ?
4838 RExC_parse - rangebegin : 0;
830247a4 4839 vWARN4(RExC_parse,
b45f050a 4840 "False [] range \"%*.*s\"",
097eb12c 4841 w, w, rangebegin);
afd78fd5 4842 }
3568d838
JH
4843 if (prevvalue < 256) {
4844 ANYOF_BITMAP_SET(ret, prevvalue);
ffc61ed2
JH
4845 ANYOF_BITMAP_SET(ret, '-');
4846 }
4847 else {
4848 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
4849 Perl_sv_catpvf(aTHX_ listsv,
3568d838 4850 "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
ffc61ed2 4851 }
b8c5462f 4852 }
ffc61ed2
JH
4853
4854 range = 0; /* this was not a true range */
73b437c8 4855 }
ffc61ed2 4856
73b437c8 4857 if (!SIZE_ONLY) {
c49a72a9
NC
4858 const char *what = NULL;
4859 char yesno = 0;
4860
3568d838
JH
4861 if (namedclass > OOB_NAMEDCLASS)
4862 optimize_invert = FALSE;
e2962f66
JH
4863 /* Possible truncation here but in some 64-bit environments
4864 * the compiler gets heartburn about switch on 64-bit values.
4865 * A similar issue a little earlier when switching on value.
98f323fa 4866 * --jhi */
e2962f66 4867 switch ((I32)namedclass) {
73b437c8
JH
4868 case ANYOF_ALNUM:
4869 if (LOC)
936ed897 4870 ANYOF_CLASS_SET(ret, ANYOF_ALNUM);
73b437c8
JH
4871 else {
4872 for (value = 0; value < 256; value++)
4873 if (isALNUM(value))
936ed897 4874 ANYOF_BITMAP_SET(ret, value);
73b437c8 4875 }
c49a72a9
NC
4876 yesno = '+';
4877 what = "Word";
73b437c8
JH
4878 break;
4879 case ANYOF_NALNUM:
4880 if (LOC)
936ed897 4881 ANYOF_CLASS_SET(ret, ANYOF_NALNUM);
73b437c8
JH
4882 else {
4883 for (value = 0; value < 256; value++)
4884 if (!isALNUM(value))
936ed897 4885 ANYOF_BITMAP_SET(ret, value);
73b437c8 4886 }
c49a72a9
NC
4887 yesno = '!';
4888 what = "Word";
73b437c8 4889 break;
ffc61ed2 4890 case ANYOF_ALNUMC:
73b437c8 4891 if (LOC)
ffc61ed2 4892 ANYOF_CLASS_SET(ret, ANYOF_ALNUMC);
73b437c8
JH
4893 else {
4894 for (value = 0; value < 256; value++)
ffc61ed2 4895 if (isALNUMC(value))
936ed897 4896 ANYOF_BITMAP_SET(ret, value);
73b437c8 4897 }
c49a72a9
NC
4898 yesno = '+';
4899 what = "Alnum";
73b437c8
JH
4900 break;
4901 case ANYOF_NALNUMC:
4902 if (LOC)
936ed897 4903 ANYOF_CLASS_SET(ret, ANYOF_NALNUMC);
73b437c8
JH
4904 else {
4905 for (value = 0; value < 256; value++)
4906 if (!isALNUMC(value))
936ed897 4907 ANYOF_BITMAP_SET(ret, value);
73b437c8 4908 }
c49a72a9
NC
4909 yesno = '!';
4910 what = "Alnum";
73b437c8
JH
4911 break;
4912 case ANYOF_ALPHA:
4913 if (LOC)
936ed897 4914 ANYOF_CLASS_SET(ret, ANYOF_ALPHA);
73b437c8
JH
4915 else {
4916 for (value = 0; value < 256; value++)
4917 if (isALPHA(value))
936ed897 4918 ANYOF_BITMAP_SET(ret, value);
73b437c8 4919 }
c49a72a9
NC
4920 yesno = '+';
4921 what = "Alpha";
73b437c8
JH
4922 break;
4923 case ANYOF_NALPHA:
4924 if (LOC)
936ed897 4925 ANYOF_CLASS_SET(ret, ANYOF_NALPHA);
73b437c8
JH
4926 else {
4927 for (value = 0; value < 256; value++)
4928 if (!isALPHA(value))
936ed897 4929 ANYOF_BITMAP_SET(ret, value);
73b437c8 4930 }
c49a72a9
NC
4931 yesno = '!';
4932 what = "Alpha";
73b437c8
JH
4933 break;
4934 case ANYOF_ASCII:
4935 if (LOC)
936ed897 4936 ANYOF_CLASS_SET(ret, ANYOF_ASCII);
73b437c8 4937 else {
c7f1f016 4938#ifndef EBCDIC
1ba5c669
JH
4939 for (value = 0; value < 128; value++)
4940 ANYOF_BITMAP_SET(ret, value);
4941#else /* EBCDIC */
ffbc6a93 4942 for (value = 0; value < 256; value++) {
3a3c4447
JH
4943 if (isASCII(value))
4944 ANYOF_BITMAP_SET(ret, value);
ffbc6a93 4945 }
1ba5c669 4946#endif /* EBCDIC */
73b437c8 4947 }
c49a72a9
NC
4948 yesno = '+';
4949 what = "ASCII";
73b437c8
JH
4950 break;
4951 case ANYOF_NASCII:
4952 if (LOC)
936ed897 4953 ANYOF_CLASS_SET(ret, ANYOF_NASCII);
73b437c8 4954 else {
c7f1f016 4955#ifndef EBCDIC
1ba5c669
JH
4956 for (value = 128; value < 256; value++)
4957 ANYOF_BITMAP_SET(ret, value);
4958#else /* EBCDIC */
ffbc6a93 4959 for (value = 0; value < 256; value++) {
3a3c4447
JH
4960 if (!isASCII(value))
4961 ANYOF_BITMAP_SET(ret, value);
ffbc6a93 4962 }
1ba5c669 4963#endif /* EBCDIC */
73b437c8 4964 }
c49a72a9
NC
4965 yesno = '!';
4966 what = "ASCII";
73b437c8 4967 break;
aaa51d5e
JF
4968 case ANYOF_BLANK:
4969 if (LOC)
4970 ANYOF_CLASS_SET(ret, ANYOF_BLANK);
4971 else {
4972 for (value = 0; value < 256; value++)
4973 if (isBLANK(value))
4974 ANYOF_BITMAP_SET(ret, value);
4975 }
c49a72a9
NC
4976 yesno = '+';
4977 what = "Blank";
aaa51d5e
JF
4978 break;
4979 case ANYOF_NBLANK:
4980 if (LOC)
4981 ANYOF_CLASS_SET(ret, ANYOF_NBLANK);
4982 else {
4983 for (value = 0; value < 256; value++)
4984 if (!isBLANK(value))
4985 ANYOF_BITMAP_SET(ret, value);
4986 }
c49a72a9
NC
4987 yesno = '!';
4988 what = "Blank";
aaa51d5e 4989 break;
73b437c8
JH
4990 case ANYOF_CNTRL:
4991 if (LOC)
936ed897 4992 ANYOF_CLASS_SET(ret, ANYOF_CNTRL);
73b437c8
JH
4993 else {
4994 for (value = 0; value < 256; value++)
4995 if (isCNTRL(value))
936ed897 4996 ANYOF_BITMAP_SET(ret, value);
73b437c8 4997 }
c49a72a9
NC
4998 yesno = '+';
4999 what = "Cntrl";
73b437c8
JH
5000 break;
5001 case ANYOF_NCNTRL:
5002 if (LOC)
936ed897 5003 ANYOF_CLASS_SET(ret, ANYOF_NCNTRL);
73b437c8
JH
5004 else {
5005 for (value = 0; value < 256; value++)
5006 if (!isCNTRL(value))
936ed897 5007 ANYOF_BITMAP_SET(ret, value);
73b437c8 5008 }
c49a72a9
NC
5009 yesno = '!';
5010 what = "Cntrl";
ffc61ed2
JH
5011 break;
5012 case ANYOF_DIGIT:
5013 if (LOC)
5014 ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
5015 else {
5016 /* consecutive digits assumed */
5017 for (value = '0'; value <= '9'; value++)
5018 ANYOF_BITMAP_SET(ret, value);
5019 }
c49a72a9
NC
5020 yesno = '+';
5021 what = "Digit";
ffc61ed2
JH
5022 break;
5023 case ANYOF_NDIGIT:
5024 if (LOC)
5025 ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
5026 else {
5027 /* consecutive digits assumed */
5028 for (value = 0; value < '0'; value++)
5029 ANYOF_BITMAP_SET(ret, value);
5030 for (value = '9' + 1; value < 256; value++)
5031 ANYOF_BITMAP_SET(ret, value);
5032 }
c49a72a9
NC
5033 yesno = '!';
5034 what = "Digit";
73b437c8
JH
5035 break;
5036 case ANYOF_GRAPH:
5037 if (LOC)
936ed897 5038 ANYOF_CLASS_SET(ret, ANYOF_GRAPH);
73b437c8
JH
5039 else {
5040 for (value = 0; value < 256; value++)
5041 if (isGRAPH(value))
936ed897 5042 ANYOF_BITMAP_SET(ret, value);
73b437c8 5043 }
c49a72a9
NC
5044 yesno = '+';
5045 what = "Graph";
73b437c8
JH
5046 break;
5047 case ANYOF_NGRAPH:
5048 if (LOC)
936ed897 5049 ANYOF_CLASS_SET(ret, ANYOF_NGRAPH);
73b437c8
JH
5050 else {
5051 for (value = 0; value < 256; value++)
5052 if (!isGRAPH(value))
936ed897 5053 ANYOF_BITMAP_SET(ret, value);
73b437c8 5054 }
c49a72a9
NC
5055 yesno = '!';
5056 what = "Graph";
73b437c8
JH
5057 break;
5058 case ANYOF_LOWER:
5059 if (LOC)
936ed897 5060 ANYOF_CLASS_SET(ret, ANYOF_LOWER);
73b437c8
JH
5061 else {
5062 for (value = 0; value < 256; value++)
5063 if (isLOWER(value))
936ed897 5064 ANYOF_BITMAP_SET(ret, value);
73b437c8 5065 }
c49a72a9
NC
5066 yesno = '+';
5067 what = "Lower";
73b437c8
JH
5068 break;
5069 case ANYOF_NLOWER:
5070 if (LOC)
936ed897 5071 ANYOF_CLASS_SET(ret, ANYOF_NLOWER);
73b437c8
JH
5072 else {
5073 for (value = 0; value < 256; value++)
5074 if (!isLOWER(value))
936ed897 5075 ANYOF_BITMAP_SET(ret, value);
73b437c8 5076 }
c49a72a9
NC
5077 yesno = '!';
5078 what = "Lower";
73b437c8
JH
5079 break;
5080 case ANYOF_PRINT:
5081 if (LOC)
936ed897 5082 ANYOF_CLASS_SET(ret, ANYOF_PRINT);
73b437c8
JH
5083 else {
5084 for (value = 0; value < 256; value++)
5085 if (isPRINT(value))
936ed897 5086 ANYOF_BITMAP_SET(ret, value);
73b437c8 5087 }
c49a72a9
NC
5088 yesno = '+';
5089 what = "Print";
73b437c8
JH
5090 break;
5091 case ANYOF_NPRINT:
5092 if (LOC)
936ed897 5093 ANYOF_CLASS_SET(ret, ANYOF_NPRINT);
73b437c8
JH
5094 else {
5095 for (value = 0; value < 256; value++)
5096 if (!isPRINT(value))
936ed897 5097 ANYOF_BITMAP_SET(ret, value);
73b437c8 5098 }
c49a72a9
NC
5099 yesno = '!';
5100 what = "Print";
73b437c8 5101 break;
aaa51d5e
JF
5102 case ANYOF_PSXSPC:
5103 if (LOC)
5104 ANYOF_CLASS_SET(ret, ANYOF_PSXSPC);
5105 else {
5106 for (value = 0; value < 256; value++)
5107 if (isPSXSPC(value))
5108 ANYOF_BITMAP_SET(ret, value);
5109 }
c49a72a9
NC
5110 yesno = '+';
5111 what = "Space";
aaa51d5e
JF
5112 break;
5113 case ANYOF_NPSXSPC:
5114 if (LOC)
5115 ANYOF_CLASS_SET(ret, ANYOF_NPSXSPC);
5116 else {
5117 for (value = 0; value < 256; value++)
5118 if (!isPSXSPC(value))
5119 ANYOF_BITMAP_SET(ret, value);
5120 }
c49a72a9
NC
5121 yesno = '!';
5122 what = "Space";
aaa51d5e 5123 break;
73b437c8
JH
5124 case ANYOF_PUNCT:
5125 if (LOC)
936ed897 5126 ANYOF_CLASS_SET(ret, ANYOF_PUNCT);
73b437c8
JH
5127 else {
5128 for (value = 0; value < 256; value++)
5129 if (isPUNCT(value))
936ed897 5130 ANYOF_BITMAP_SET(ret, value);
73b437c8 5131 }
c49a72a9
NC
5132 yesno = '+';
5133 what = "Punct";
73b437c8
JH
5134 break;
5135 case ANYOF_NPUNCT:
5136 if (LOC)
936ed897 5137 ANYOF_CLASS_SET(ret, ANYOF_NPUNCT);
73b437c8
JH
5138 else {
5139 for (value = 0; value < 256; value++)
5140 if (!isPUNCT(value))
936ed897 5141 ANYOF_BITMAP_SET(ret, value);
73b437c8 5142 }
c49a72a9
NC
5143 yesno = '!';
5144 what = "Punct";
ffc61ed2
JH
5145 break;
5146 case ANYOF_SPACE:
5147 if (LOC)
5148 ANYOF_CLASS_SET(ret, ANYOF_SPACE);
5149 else {
5150 for (value = 0; value < 256; value++)
5151 if (isSPACE(value))
5152 ANYOF_BITMAP_SET(ret, value);
5153 }
c49a72a9
NC
5154 yesno = '+';
5155 what = "SpacePerl";
ffc61ed2
JH
5156 break;
5157 case ANYOF_NSPACE:
5158 if (LOC)
5159 ANYOF_CLASS_SET(ret, ANYOF_NSPACE);
5160 else {
5161 for (value = 0; value < 256; value++)
5162 if (!isSPACE(value))
5163 ANYOF_BITMAP_SET(ret, value);
5164 }
c49a72a9
NC
5165 yesno = '!';
5166 what = "SpacePerl";
73b437c8
JH
5167 break;
5168 case ANYOF_UPPER:
5169 if (LOC)
936ed897 5170 ANYOF_CLASS_SET(ret, ANYOF_UPPER);
73b437c8
JH
5171 else {
5172 for (value = 0; value < 256; value++)
5173 if (isUPPER(value))
936ed897 5174 ANYOF_BITMAP_SET(ret, value);
73b437c8 5175 }
c49a72a9
NC
5176 yesno = '+';
5177 what = "Upper";
73b437c8
JH
5178 break;
5179 case ANYOF_NUPPER:
5180 if (LOC)
936ed897 5181 ANYOF_CLASS_SET(ret, ANYOF_NUPPER);
73b437c8
JH
5182 else {
5183 for (value = 0; value < 256; value++)
5184 if (!isUPPER(value))
936ed897 5185 ANYOF_BITMAP_SET(ret, value);
73b437c8 5186 }
c49a72a9
NC
5187 yesno = '!';
5188 what = "Upper";
73b437c8
JH
5189 break;
5190 case ANYOF_XDIGIT:
5191 if (LOC)
936ed897 5192 ANYOF_CLASS_SET(ret, ANYOF_XDIGIT);
73b437c8
JH
5193 else {
5194 for (value = 0; value < 256; value++)
5195 if (isXDIGIT(value))
936ed897 5196 ANYOF_BITMAP_SET(ret, value);
73b437c8 5197 }
c49a72a9
NC
5198 yesno = '+';
5199 what = "XDigit";
73b437c8
JH
5200 break;
5201 case ANYOF_NXDIGIT:
5202 if (LOC)
936ed897 5203 ANYOF_CLASS_SET(ret, ANYOF_NXDIGIT);
73b437c8
JH
5204 else {
5205 for (value = 0; value < 256; value++)
5206 if (!isXDIGIT(value))
936ed897 5207 ANYOF_BITMAP_SET(ret, value);
73b437c8 5208 }
c49a72a9
NC
5209 yesno = '!';
5210 what = "XDigit";
73b437c8 5211 break;
f81125e2
JP
5212 case ANYOF_MAX:
5213 /* this is to handle \p and \P */
5214 break;
73b437c8 5215 default:
b45f050a 5216 vFAIL("Invalid [::] class");
73b437c8 5217 break;
b8c5462f 5218 }
c49a72a9
NC
5219 if (what) {
5220 /* Strings such as "+utf8::isWord\n" */
5221 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::Is%s\n", yesno, what);
5222 }
b8c5462f 5223 if (LOC)
936ed897 5224 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
73b437c8 5225 continue;
a0d0e21e 5226 }
ffc61ed2
JH
5227 } /* end of namedclass \blah */
5228
a0d0e21e 5229 if (range) {
eb160463 5230 if (prevvalue > (IV)value) /* b-a */ {
d4c19fe8
AL
5231 const int w = RExC_parse - rangebegin;
5232 Simple_vFAIL4("Invalid [] range \"%*.*s\"", w, w, rangebegin);
3568d838 5233 range = 0; /* not a valid range */
73b437c8 5234 }
a0d0e21e
LW
5235 }
5236 else {
3568d838 5237 prevvalue = value; /* save the beginning of the range */
830247a4
IZ
5238 if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
5239 RExC_parse[1] != ']') {
5240 RExC_parse++;
ffc61ed2
JH
5241
5242 /* a bad range like \w-, [:word:]- ? */
5243 if (namedclass > OOB_NAMEDCLASS) {
afd78fd5 5244 if (ckWARN(WARN_REGEXP)) {
d4c19fe8 5245 const int w =
afd78fd5
JH
5246 RExC_parse >= rangebegin ?
5247 RExC_parse - rangebegin : 0;
830247a4 5248 vWARN4(RExC_parse,
b45f050a 5249 "False [] range \"%*.*s\"",
097eb12c 5250 w, w, rangebegin);
afd78fd5 5251 }
73b437c8 5252 if (!SIZE_ONLY)
936ed897 5253 ANYOF_BITMAP_SET(ret, '-');
73b437c8 5254 } else
ffc61ed2
JH
5255 range = 1; /* yeah, it's a range! */
5256 continue; /* but do it the next time */
a0d0e21e 5257 }
a687059c 5258 }
ffc61ed2 5259
93733859 5260 /* now is the next time */
7f6f358c 5261 stored += (value - prevvalue + 1);
ae5c130c 5262 if (!SIZE_ONLY) {
3568d838 5263 IV i;
3568d838 5264 if (prevvalue < 256) {
1df70142 5265 const IV ceilvalue = value < 256 ? value : 255;
3568d838
JH
5266
5267#ifdef EBCDIC
1b2d223b
JH
5268 /* In EBCDIC [\x89-\x91] should include
5269 * the \x8e but [i-j] should not. */
5270 if (literal_endpoint == 2 &&
5271 ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
5272 (isUPPER(prevvalue) && isUPPER(ceilvalue))))
ffc61ed2 5273 {
3568d838
JH
5274 if (isLOWER(prevvalue)) {
5275 for (i = prevvalue; i <= ceilvalue; i++)
ffc61ed2
JH
5276 if (isLOWER(i))
5277 ANYOF_BITMAP_SET(ret, i);
5278 } else {
3568d838 5279 for (i = prevvalue; i <= ceilvalue; i++)
ffc61ed2
JH
5280 if (isUPPER(i))
5281 ANYOF_BITMAP_SET(ret, i);
5282 }
8ada0baa 5283 }
ffc61ed2 5284 else
8ada0baa 5285#endif
a5961de5
JH
5286 for (i = prevvalue; i <= ceilvalue; i++)
5287 ANYOF_BITMAP_SET(ret, i);
3568d838 5288 }
a5961de5 5289 if (value > 255 || UTF) {
1df70142
AL
5290 const UV prevnatvalue = NATIVE_TO_UNI(prevvalue);
5291 const UV natvalue = NATIVE_TO_UNI(value);
b08decb7 5292
ffc61ed2 5293 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
b08decb7 5294 if (prevnatvalue < natvalue) { /* what about > ? */
ffc61ed2 5295 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
b08decb7
JH
5296 prevnatvalue, natvalue);
5297 }
5298 else if (prevnatvalue == natvalue) {
5299 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue);
09091399 5300 if (FOLD) {
89ebb4a3 5301 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
254ba52a 5302 STRLEN foldlen;
1df70142 5303 const UV f = to_uni_fold(natvalue, foldbuf, &foldlen);
254ba52a 5304
c840d2a2
JH
5305 /* If folding and foldable and a single
5306 * character, insert also the folded version
5307 * to the charclass. */
9e55ce06 5308 if (f != value) {
eb160463 5309 if (foldlen == (STRLEN)UNISKIP(f))
9e55ce06
JH
5310 Perl_sv_catpvf(aTHX_ listsv,
5311 "%04"UVxf"\n", f);
5312 else {
5313 /* Any multicharacter foldings
5314 * require the following transform:
5315 * [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst)
5316 * where E folds into "pq" and F folds
5317 * into "rst", all other characters
5318 * fold to single characters. We save
5319 * away these multicharacter foldings,
5320 * to be later saved as part of the
5321 * additional "s" data. */
5322 SV *sv;
5323
5324 if (!unicode_alternate)
5325 unicode_alternate = newAV();
5326 sv = newSVpvn((char*)foldbuf, foldlen);
5327 SvUTF8_on(sv);
5328 av_push(unicode_alternate, sv);
5329 }
5330 }
254ba52a 5331
60a8b682
JH
5332 /* If folding and the value is one of the Greek
5333 * sigmas insert a few more sigmas to make the
5334 * folding rules of the sigmas to work right.
5335 * Note that not all the possible combinations
5336 * are handled here: some of them are handled
9e55ce06
JH
5337 * by the standard folding rules, and some of
5338 * them (literal or EXACTF cases) are handled
5339 * during runtime in regexec.c:S_find_byclass(). */
09091399
JH
5340 if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) {
5341 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 5342 (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA);
09091399 5343 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 5344 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
09091399
JH
5345 }
5346 else if (value == UNICODE_GREEK_CAPITAL_LETTER_SIGMA)
5347 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 5348 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
09091399
JH
5349 }
5350 }
ffc61ed2 5351 }
1b2d223b
JH
5352#ifdef EBCDIC
5353 literal_endpoint = 0;
5354#endif
8ada0baa 5355 }
ffc61ed2
JH
5356
5357 range = 0; /* this range (if it was one) is done now */
a0d0e21e 5358 }
ffc61ed2 5359
936ed897 5360 if (need_class) {
4f66b38d 5361 ANYOF_FLAGS(ret) |= ANYOF_LARGE;
936ed897 5362 if (SIZE_ONLY)
830247a4 5363 RExC_size += ANYOF_CLASS_ADD_SKIP;
936ed897 5364 else
830247a4 5365 RExC_emit += ANYOF_CLASS_ADD_SKIP;
936ed897 5366 }
ffc61ed2 5367
7f6f358c
YO
5368
5369 if (SIZE_ONLY)
5370 return ret;
5371 /****** !SIZE_ONLY AFTER HERE *********/
5372
5373 if( stored == 1 && value < 256
5374 && !( ANYOF_FLAGS(ret) & ( ANYOF_FLAGS_ALL ^ ANYOF_FOLD ) )
5375 ) {
5376 /* optimize single char class to an EXACT node
5377 but *only* when its not a UTF/high char */
5378 RExC_emit = orig_emit;
5379 ret = reg_node(pRExC_state,
5380 (U8)((ANYOF_FLAGS(ret) & ANYOF_FOLD) ? EXACTF : EXACT));
5381 *STRING(ret)= (char)value;
5382 STR_LEN(ret)= 1;
5383 RExC_emit += STR_SZ(1);
5384 return ret;
5385 }
ae5c130c 5386 /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
7f6f358c 5387 if ( /* If the only flag is folding (plus possibly inversion). */
516a5887
JH
5388 ((ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD)
5389 ) {
a0ed51b3 5390 for (value = 0; value < 256; ++value) {
936ed897 5391 if (ANYOF_BITMAP_TEST(ret, value)) {
eb160463 5392 UV fold = PL_fold[value];
ffc61ed2
JH
5393
5394 if (fold != value)
5395 ANYOF_BITMAP_SET(ret, fold);
ae5c130c
GS
5396 }
5397 }
936ed897 5398 ANYOF_FLAGS(ret) &= ~ANYOF_FOLD;
ae5c130c 5399 }
ffc61ed2 5400
ae5c130c 5401 /* optimize inverted simple patterns (e.g. [^a-z]) */
7f6f358c 5402 if (optimize_invert &&
ffc61ed2
JH
5403 /* If the only flag is inversion. */
5404 (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
b8c5462f 5405 for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
936ed897 5406 ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL;
1aa99e6b 5407 ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
ae5c130c 5408 }
7f6f358c 5409 {
097eb12c 5410 AV * const av = newAV();
ffc61ed2 5411 SV *rv;
9e55ce06 5412 /* The 0th element stores the character class description
6a0407ee 5413 * in its textual form: used later (regexec.c:Perl_regclass_swash())
9e55ce06
JH
5414 * to initialize the appropriate swash (which gets stored in
5415 * the 1st element), and also useful for dumping the regnode.
5416 * The 2nd element stores the multicharacter foldings,
6a0407ee 5417 * used later (regexec.c:S_reginclass()). */
ffc61ed2
JH
5418 av_store(av, 0, listsv);
5419 av_store(av, 1, NULL);
9e55ce06 5420 av_store(av, 2, (SV*)unicode_alternate);
ffc61ed2 5421 rv = newRV_noinc((SV*)av);
19860706 5422 n = add_data(pRExC_state, 1, "s");
830247a4 5423 RExC_rx->data->data[n] = (void*)rv;
ffc61ed2 5424 ARG_SET(ret, n);
a0ed51b3 5425 }
a0ed51b3
LW
5426 return ret;
5427}
5428
76e3520e 5429STATIC char*
830247a4 5430S_nextchar(pTHX_ RExC_state_t *pRExC_state)
a0d0e21e 5431{
097eb12c 5432 char* const retval = RExC_parse++;
a0d0e21e 5433
4633a7c4 5434 for (;;) {
830247a4
IZ
5435 if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
5436 RExC_parse[2] == '#') {
e994fd66
AE
5437 while (*RExC_parse != ')') {
5438 if (RExC_parse == RExC_end)
5439 FAIL("Sequence (?#... not terminated");
830247a4 5440 RExC_parse++;
e994fd66 5441 }
830247a4 5442 RExC_parse++;
4633a7c4
LW
5443 continue;
5444 }
e2509266 5445 if (RExC_flags & PMf_EXTENDED) {
830247a4
IZ
5446 if (isSPACE(*RExC_parse)) {
5447 RExC_parse++;
748a9306
LW
5448 continue;
5449 }
830247a4 5450 else if (*RExC_parse == '#') {
e994fd66
AE
5451 while (RExC_parse < RExC_end)
5452 if (*RExC_parse++ == '\n') break;
748a9306
LW
5453 continue;
5454 }
748a9306 5455 }
4633a7c4 5456 return retval;
a0d0e21e 5457 }
a687059c
LW
5458}
5459
5460/*
c277df42 5461- reg_node - emit a node
a0d0e21e 5462*/
76e3520e 5463STATIC regnode * /* Location. */
830247a4 5464S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
a687059c 5465{
97aff369 5466 dVAR;
c277df42 5467 register regnode *ptr;
504618e9 5468 regnode * const ret = RExC_emit;
a687059c 5469
c277df42 5470 if (SIZE_ONLY) {
830247a4
IZ
5471 SIZE_ALIGN(RExC_size);
5472 RExC_size += 1;
a0d0e21e
LW
5473 return(ret);
5474 }
a687059c 5475
c277df42 5476 NODE_ALIGN_FILL(ret);
a0d0e21e 5477 ptr = ret;
c277df42 5478 FILL_ADVANCE_NODE(ptr, op);
fac92740 5479 if (RExC_offsets) { /* MJD */
ccb2c380 5480 MJD_OFFSET_DEBUG(("%s:%u: (op %s) %s %u <- %u (len %u) (max %u).\n",
fac92740
MJD
5481 "reg_node", __LINE__,
5482 reg_name[op],
5483 RExC_emit - RExC_emit_start > RExC_offsets[0]
5484 ? "Overwriting end of array!\n" : "OK",
5485 RExC_emit - RExC_emit_start,
5486 RExC_parse - RExC_start,
5487 RExC_offsets[0]));
ccb2c380 5488 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
fac92740
MJD
5489 }
5490
830247a4 5491 RExC_emit = ptr;
a687059c 5492
a0d0e21e 5493 return(ret);
a687059c
LW
5494}
5495
5496/*
a0d0e21e
LW
5497- reganode - emit a node with an argument
5498*/
76e3520e 5499STATIC regnode * /* Location. */
830247a4 5500S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
fe14fcc3 5501{
97aff369 5502 dVAR;
c277df42 5503 register regnode *ptr;
504618e9 5504 regnode * const ret = RExC_emit;
fe14fcc3 5505
c277df42 5506 if (SIZE_ONLY) {
830247a4
IZ
5507 SIZE_ALIGN(RExC_size);
5508 RExC_size += 2;
a0d0e21e
LW
5509 return(ret);
5510 }
fe14fcc3 5511
c277df42 5512 NODE_ALIGN_FILL(ret);
a0d0e21e 5513 ptr = ret;
c277df42 5514 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
fac92740 5515 if (RExC_offsets) { /* MJD */
ccb2c380 5516 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %u <- %u (max %u).\n",
fac92740 5517 "reganode",
ccb2c380
MP
5518 __LINE__,
5519 reg_name[op],
fac92740
MJD
5520 RExC_emit - RExC_emit_start > RExC_offsets[0] ?
5521 "Overwriting end of array!\n" : "OK",
5522 RExC_emit - RExC_emit_start,
5523 RExC_parse - RExC_start,
5524 RExC_offsets[0]));
ccb2c380 5525 Set_Cur_Node_Offset;
fac92740
MJD
5526 }
5527
830247a4 5528 RExC_emit = ptr;
fe14fcc3 5529
a0d0e21e 5530 return(ret);
fe14fcc3
LW
5531}
5532
5533/*
cd439c50 5534- reguni - emit (if appropriate) a Unicode character
a0ed51b3 5535*/
71207a34
AL
5536STATIC STRLEN
5537S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
a0ed51b3 5538{
97aff369 5539 dVAR;
71207a34 5540 return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
a0ed51b3
LW
5541}
5542
5543/*
a0d0e21e
LW
5544- reginsert - insert an operator in front of already-emitted operand
5545*
5546* Means relocating the operand.
5547*/
76e3520e 5548STATIC void
830247a4 5549S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd)
a687059c 5550{
97aff369 5551 dVAR;
c277df42
IZ
5552 register regnode *src;
5553 register regnode *dst;
5554 register regnode *place;
504618e9 5555 const int offset = regarglen[(U8)op];
b81d288d 5556
22c35a8c 5557/* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
c277df42
IZ
5558
5559 if (SIZE_ONLY) {
830247a4 5560 RExC_size += NODE_STEP_REGNODE + offset;
a0d0e21e
LW
5561 return;
5562 }
a687059c 5563
830247a4
IZ
5564 src = RExC_emit;
5565 RExC_emit += NODE_STEP_REGNODE + offset;
5566 dst = RExC_emit;
fac92740 5567 while (src > opnd) {
c277df42 5568 StructCopy(--src, --dst, regnode);
fac92740 5569 if (RExC_offsets) { /* MJD 20010112 */
ccb2c380 5570 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %u -> %u (max %u).\n",
fac92740 5571 "reg_insert",
ccb2c380
MP
5572 __LINE__,
5573 reg_name[op],
fac92740
MJD
5574 dst - RExC_emit_start > RExC_offsets[0]
5575 ? "Overwriting end of array!\n" : "OK",
5576 src - RExC_emit_start,
5577 dst - RExC_emit_start,
5578 RExC_offsets[0]));
ccb2c380
MP
5579 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
5580 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
fac92740
MJD
5581 }
5582 }
5583
a0d0e21e
LW
5584
5585 place = opnd; /* Op node, where operand used to be. */
fac92740 5586 if (RExC_offsets) { /* MJD */
ccb2c380 5587 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %u <- %u (max %u).\n",
fac92740 5588 "reginsert",
ccb2c380
MP
5589 __LINE__,
5590 reg_name[op],
fac92740
MJD
5591 place - RExC_emit_start > RExC_offsets[0]
5592 ? "Overwriting end of array!\n" : "OK",
5593 place - RExC_emit_start,
5594 RExC_parse - RExC_start,
5595 RExC_offsets[0]));
ccb2c380 5596 Set_Node_Offset(place, RExC_parse);
45948336 5597 Set_Node_Length(place, 1);
fac92740 5598 }
c277df42
IZ
5599 src = NEXTOPER(place);
5600 FILL_ADVANCE_NODE(place, op);
5601 Zero(src, offset, regnode);
a687059c
LW
5602}
5603
5604/*
c277df42 5605- regtail - set the next-pointer at the end of a node chain of p to val.
a0d0e21e 5606*/
097eb12c 5607/* TODO: All three parms should be const */
76e3520e 5608STATIC void
097eb12c 5609S_regtail(pTHX_ const RExC_state_t *pRExC_state, regnode *p, const regnode *val)
a687059c 5610{
97aff369 5611 dVAR;
c277df42 5612 register regnode *scan;
a0d0e21e 5613
c277df42 5614 if (SIZE_ONLY)
a0d0e21e
LW
5615 return;
5616
5617 /* Find last node. */
5618 scan = p;
5619 for (;;) {
504618e9 5620 regnode * const temp = regnext(scan);
a0d0e21e
LW
5621 if (temp == NULL)
5622 break;
5623 scan = temp;
5624 }
a687059c 5625
c277df42
IZ
5626 if (reg_off_by_arg[OP(scan)]) {
5627 ARG_SET(scan, val - scan);
a0ed51b3
LW
5628 }
5629 else {
c277df42
IZ
5630 NEXT_OFF(scan) = val - scan;
5631 }
a687059c
LW
5632}
5633
5634/*
a687059c
LW
5635 - regcurly - a little FSA that accepts {\d+,?\d*}
5636 */
79072805 5637STATIC I32
5f66b61c 5638S_regcurly(register const char *s)
a687059c
LW
5639{
5640 if (*s++ != '{')
5641 return FALSE;
f0fcb552 5642 if (!isDIGIT(*s))
a687059c 5643 return FALSE;
f0fcb552 5644 while (isDIGIT(*s))
a687059c
LW
5645 s++;
5646 if (*s == ',')
5647 s++;
f0fcb552 5648 while (isDIGIT(*s))
a687059c
LW
5649 s++;
5650 if (*s != '}')
5651 return FALSE;
5652 return TRUE;
5653}
5654
a687059c
LW
5655
5656/*
fd181c75 5657 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
a687059c
LW
5658 */
5659void
097eb12c 5660Perl_regdump(pTHX_ const regexp *r)
a687059c 5661{
35ff7856 5662#ifdef DEBUGGING
97aff369 5663 dVAR;
c445ea15 5664 SV * const sv = sv_newmortal();
a687059c 5665
4f639d21 5666 (void)dumpuntil(r, r->program, r->program + 1, NULL, sv, 0);
a0d0e21e
LW
5667
5668 /* Header fields of interest. */
c277df42 5669 if (r->anchored_substr)
7b0972df 5670 PerlIO_printf(Perl_debug_log,
a0288114 5671 "anchored \"%s%.*s%s\"%s at %"IVdf" ",
3280af22 5672 PL_colors[0],
7b0972df 5673 (int)(SvCUR(r->anchored_substr) - (SvTAIL(r->anchored_substr)!=0)),
3f7c398e 5674 SvPVX_const(r->anchored_substr),
3280af22 5675 PL_colors[1],
c277df42 5676 SvTAIL(r->anchored_substr) ? "$" : "",
7b0972df 5677 (IV)r->anchored_offset);
33b8afdf
JH
5678 else if (r->anchored_utf8)
5679 PerlIO_printf(Perl_debug_log,
a0288114 5680 "anchored utf8 \"%s%.*s%s\"%s at %"IVdf" ",
33b8afdf
JH
5681 PL_colors[0],
5682 (int)(SvCUR(r->anchored_utf8) - (SvTAIL(r->anchored_utf8)!=0)),
3f7c398e 5683 SvPVX_const(r->anchored_utf8),
33b8afdf
JH
5684 PL_colors[1],
5685 SvTAIL(r->anchored_utf8) ? "$" : "",
5686 (IV)r->anchored_offset);
c277df42 5687 if (r->float_substr)
7b0972df 5688 PerlIO_printf(Perl_debug_log,
a0288114 5689 "floating \"%s%.*s%s\"%s at %"IVdf"..%"UVuf" ",
3280af22 5690 PL_colors[0],
b81d288d 5691 (int)(SvCUR(r->float_substr) - (SvTAIL(r->float_substr)!=0)),
3f7c398e 5692 SvPVX_const(r->float_substr),
3280af22 5693 PL_colors[1],
c277df42 5694 SvTAIL(r->float_substr) ? "$" : "",
7b0972df 5695 (IV)r->float_min_offset, (UV)r->float_max_offset);
33b8afdf
JH
5696 else if (r->float_utf8)
5697 PerlIO_printf(Perl_debug_log,
a0288114 5698 "floating utf8 \"%s%.*s%s\"%s at %"IVdf"..%"UVuf" ",
33b8afdf
JH
5699 PL_colors[0],
5700 (int)(SvCUR(r->float_utf8) - (SvTAIL(r->float_utf8)!=0)),
3f7c398e 5701 SvPVX_const(r->float_utf8),
33b8afdf
JH
5702 PL_colors[1],
5703 SvTAIL(r->float_utf8) ? "$" : "",
5704 (IV)r->float_min_offset, (UV)r->float_max_offset);
5705 if (r->check_substr || r->check_utf8)
b81d288d
AB
5706 PerlIO_printf(Perl_debug_log,
5707 r->check_substr == r->float_substr
33b8afdf 5708 && r->check_utf8 == r->float_utf8
c277df42
IZ
5709 ? "(checking floating" : "(checking anchored");
5710 if (r->reganch & ROPT_NOSCAN)
5711 PerlIO_printf(Perl_debug_log, " noscan");
5712 if (r->reganch & ROPT_CHECK_ALL)
5713 PerlIO_printf(Perl_debug_log, " isall");
33b8afdf 5714 if (r->check_substr || r->check_utf8)
c277df42
IZ
5715 PerlIO_printf(Perl_debug_log, ") ");
5716
46fc3d4c 5717 if (r->regstclass) {
32fc9b6a 5718 regprop(r, sv, r->regstclass);
3f7c398e 5719 PerlIO_printf(Perl_debug_log, "stclass \"%s\" ", SvPVX_const(sv));
46fc3d4c 5720 }
774d564b 5721 if (r->reganch & ROPT_ANCH) {
5722 PerlIO_printf(Perl_debug_log, "anchored");
5723 if (r->reganch & ROPT_ANCH_BOL)
5724 PerlIO_printf(Perl_debug_log, "(BOL)");
c277df42
IZ
5725 if (r->reganch & ROPT_ANCH_MBOL)
5726 PerlIO_printf(Perl_debug_log, "(MBOL)");
cad2e5aa
JH
5727 if (r->reganch & ROPT_ANCH_SBOL)
5728 PerlIO_printf(Perl_debug_log, "(SBOL)");
774d564b 5729 if (r->reganch & ROPT_ANCH_GPOS)
5730 PerlIO_printf(Perl_debug_log, "(GPOS)");
5731 PerlIO_putc(Perl_debug_log, ' ');
5732 }
c277df42
IZ
5733 if (r->reganch & ROPT_GPOS_SEEN)
5734 PerlIO_printf(Perl_debug_log, "GPOS ");
a0d0e21e 5735 if (r->reganch & ROPT_SKIP)
760ac839 5736 PerlIO_printf(Perl_debug_log, "plus ");
a0d0e21e 5737 if (r->reganch & ROPT_IMPLICIT)
760ac839 5738 PerlIO_printf(Perl_debug_log, "implicit ");
760ac839 5739 PerlIO_printf(Perl_debug_log, "minlen %ld ", (long) r->minlen);
ce862d02
IZ
5740 if (r->reganch & ROPT_EVAL_SEEN)
5741 PerlIO_printf(Perl_debug_log, "with eval ");
760ac839 5742 PerlIO_printf(Perl_debug_log, "\n");
fac92740 5743 if (r->offsets) {
e4584336 5744 const U32 len = r->offsets[0];
a3621e74
YO
5745 GET_RE_DEBUG_FLAGS_DECL;
5746 DEBUG_OFFSETS_r({
1df70142 5747 U32 i;
e4584336
RB
5748 PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)r->offsets[0]);
5749 for (i = 1; i <= len; i++)
5750 PerlIO_printf(Perl_debug_log, "%"UVuf"[%"UVuf"] ",
5751 (UV)r->offsets[i*2-1], (UV)r->offsets[i*2]);
5752 PerlIO_printf(Perl_debug_log, "\n");
a3621e74 5753 });
fac92740 5754 }
65e66c80 5755#else
96a5add6 5756 PERL_UNUSED_CONTEXT;
65e66c80 5757 PERL_UNUSED_ARG(r);
17c3b450 5758#endif /* DEBUGGING */
a687059c
LW
5759}
5760
5761/*
a0d0e21e
LW
5762- regprop - printable representation of opcode
5763*/
46fc3d4c 5764void
32fc9b6a 5765Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
a687059c 5766{
35ff7856 5767#ifdef DEBUGGING
97aff369 5768 dVAR;
9b155405 5769 register int k;
a0d0e21e 5770
54dc92de 5771 sv_setpvn(sv, "", 0);
9b155405 5772 if (OP(o) >= reg_num) /* regnode.type is unsigned */
830247a4
IZ
5773 /* It would be nice to FAIL() here, but this may be called from
5774 regexec.c, and it would be hard to supply pRExC_state. */
5775 Perl_croak(aTHX_ "Corrupted regexp opcode");
bfed75c6 5776 sv_catpv(sv, reg_name[OP(o)]); /* Take off const! */
9b155405
IZ
5777
5778 k = PL_regkind[(U8)OP(o)];
5779
2a782b5b 5780 if (k == EXACT) {
396482e1 5781 SV * const dsv = sv_2mortal(newSVpvs(""));
c728cb41
JH
5782 /* Using is_utf8_string() is a crude hack but it may
5783 * be the best for now since we have no flag "this EXACTish
5784 * node was UTF-8" --jhi */
1df70142 5785 const bool do_utf8 = is_utf8_string((U8*)STRING(o), STR_LEN(o));
2d03de9c 5786 const char * const s = do_utf8 ?
c728cb41
JH
5787 pv_uni_display(dsv, (U8*)STRING(o), STR_LEN(o), 60,
5788 UNI_DISPLAY_REGEX) :
2a782b5b 5789 STRING(o);
e1ec3a88 5790 const int len = do_utf8 ?
2a782b5b
JH
5791 strlen(s) :
5792 STR_LEN(o);
5793 Perl_sv_catpvf(aTHX_ sv, " <%s%.*s%s>",
5794 PL_colors[0],
5795 len, s,
5796 PL_colors[1]);
bb263b4e 5797 } else if (k == TRIE) {
6f207bd3 5798 NOOP;
4f639d21
DM
5799 /* print the details od the trie in dumpuntil instead, as
5800 * prog->data isn't available here */
a3621e74 5801 } else if (k == CURLY) {
cb434fcc 5802 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
cea2e8a9
GS
5803 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
5804 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
a0d0e21e 5805 }
2c2d71f5
JH
5806 else if (k == WHILEM && o->flags) /* Ordinal/of */
5807 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
9b155405 5808 else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP )
894356b3 5809 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
9b155405 5810 else if (k == LOGICAL)
04ebc1ab 5811 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
653099ff
GS
5812 else if (k == ANYOF) {
5813 int i, rangestart = -1;
2d03de9c 5814 const U8 flags = ANYOF_FLAGS(o);
0bd48802
AL
5815
5816 /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
5817 static const char * const anyofs[] = {
653099ff
GS
5818 "\\w",
5819 "\\W",
5820 "\\s",
5821 "\\S",
5822 "\\d",
5823 "\\D",
5824 "[:alnum:]",
5825 "[:^alnum:]",
5826 "[:alpha:]",
5827 "[:^alpha:]",
5828 "[:ascii:]",
5829 "[:^ascii:]",
5830 "[:ctrl:]",
5831 "[:^ctrl:]",
5832 "[:graph:]",
5833 "[:^graph:]",
5834 "[:lower:]",
5835 "[:^lower:]",
5836 "[:print:]",
5837 "[:^print:]",
5838 "[:punct:]",
5839 "[:^punct:]",
5840 "[:upper:]",
aaa51d5e 5841 "[:^upper:]",
653099ff 5842 "[:xdigit:]",
aaa51d5e
JF
5843 "[:^xdigit:]",
5844 "[:space:]",
5845 "[:^space:]",
5846 "[:blank:]",
5847 "[:^blank:]"
653099ff
GS
5848 };
5849
19860706 5850 if (flags & ANYOF_LOCALE)
396482e1 5851 sv_catpvs(sv, "{loc}");
19860706 5852 if (flags & ANYOF_FOLD)
396482e1 5853 sv_catpvs(sv, "{i}");
653099ff 5854 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
19860706 5855 if (flags & ANYOF_INVERT)
396482e1 5856 sv_catpvs(sv, "^");
ffc61ed2
JH
5857 for (i = 0; i <= 256; i++) {
5858 if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
5859 if (rangestart == -1)
5860 rangestart = i;
5861 } else if (rangestart != -1) {
5862 if (i <= rangestart + 3)
5863 for (; rangestart < i; rangestart++)
653099ff 5864 put_byte(sv, rangestart);
ffc61ed2
JH
5865 else {
5866 put_byte(sv, rangestart);
396482e1 5867 sv_catpvs(sv, "-");
ffc61ed2 5868 put_byte(sv, i - 1);
653099ff 5869 }
ffc61ed2 5870 rangestart = -1;
653099ff 5871 }
847a199f 5872 }
ffc61ed2
JH
5873
5874 if (o->flags & ANYOF_CLASS)
bb7a0f54 5875 for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++)
ffc61ed2
JH
5876 if (ANYOF_CLASS_TEST(o,i))
5877 sv_catpv(sv, anyofs[i]);
5878
5879 if (flags & ANYOF_UNICODE)
396482e1 5880 sv_catpvs(sv, "{unicode}");
1aa99e6b 5881 else if (flags & ANYOF_UNICODE_ALL)
396482e1 5882 sv_catpvs(sv, "{unicode_all}");
ffc61ed2
JH
5883
5884 {
5885 SV *lv;
32fc9b6a 5886 SV * const sw = regclass_swash(prog, o, FALSE, &lv, 0);
b81d288d 5887
ffc61ed2
JH
5888 if (lv) {
5889 if (sw) {
89ebb4a3 5890 U8 s[UTF8_MAXBYTES_CASE+1];
b81d288d 5891
ffc61ed2 5892 for (i = 0; i <= 256; i++) { /* just the first 256 */
1df70142 5893 uvchr_to_utf8(s, i);
ffc61ed2 5894
3568d838 5895 if (i < 256 && swash_fetch(sw, s, TRUE)) {
ffc61ed2
JH
5896 if (rangestart == -1)
5897 rangestart = i;
5898 } else if (rangestart != -1) {
ffc61ed2
JH
5899 if (i <= rangestart + 3)
5900 for (; rangestart < i; rangestart++) {
2d03de9c
AL
5901 const U8 * const e = uvchr_to_utf8(s,rangestart);
5902 U8 *p;
5903 for(p = s; p < e; p++)
ffc61ed2
JH
5904 put_byte(sv, *p);
5905 }
5906 else {
2d03de9c
AL
5907 const U8 *e = uvchr_to_utf8(s,rangestart);
5908 U8 *p;
5909 for (p = s; p < e; p++)
ffc61ed2 5910 put_byte(sv, *p);
396482e1 5911 sv_catpvs(sv, "-");
2d03de9c
AL
5912 e = uvchr_to_utf8(s, i-1);
5913 for (p = s; p < e; p++)
1df70142 5914 put_byte(sv, *p);
ffc61ed2
JH
5915 }
5916 rangestart = -1;
5917 }
19860706 5918 }
ffc61ed2 5919
396482e1 5920 sv_catpvs(sv, "..."); /* et cetera */
19860706 5921 }
fde631ed 5922
ffc61ed2 5923 {
2e0de35c 5924 char *s = savesvpv(lv);
c445ea15 5925 char * const origs = s;
b81d288d 5926
ffc61ed2 5927 while(*s && *s != '\n') s++;
b81d288d 5928
ffc61ed2 5929 if (*s == '\n') {
2d03de9c 5930 const char * const t = ++s;
ffc61ed2
JH
5931
5932 while (*s) {
5933 if (*s == '\n')
5934 *s = ' ';
5935 s++;
5936 }
5937 if (s[-1] == ' ')
5938 s[-1] = 0;
5939
5940 sv_catpv(sv, t);
fde631ed 5941 }
b81d288d 5942
ffc61ed2 5943 Safefree(origs);
fde631ed
JH
5944 }
5945 }
653099ff 5946 }
ffc61ed2 5947
653099ff
GS
5948 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
5949 }
9b155405 5950 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
cea2e8a9 5951 Perl_sv_catpvf(aTHX_ sv, "[-%d]", o->flags);
65e66c80 5952#else
96a5add6 5953 PERL_UNUSED_CONTEXT;
65e66c80
SP
5954 PERL_UNUSED_ARG(sv);
5955 PERL_UNUSED_ARG(o);
17c3b450 5956#endif /* DEBUGGING */
35ff7856 5957}
a687059c 5958
cad2e5aa
JH
5959SV *
5960Perl_re_intuit_string(pTHX_ regexp *prog)
5961{ /* Assume that RE_INTUIT is set */
97aff369 5962 dVAR;
a3621e74 5963 GET_RE_DEBUG_FLAGS_DECL;
96a5add6
AL
5964 PERL_UNUSED_CONTEXT;
5965
a3621e74 5966 DEBUG_COMPILE_r(
cfd0369c 5967 {
2d03de9c 5968 const char * const s = SvPV_nolen_const(prog->check_substr
cfd0369c 5969 ? prog->check_substr : prog->check_utf8);
cad2e5aa
JH
5970
5971 if (!PL_colorset) reginitcolors();
5972 PerlIO_printf(Perl_debug_log,
a0288114 5973 "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
33b8afdf
JH
5974 PL_colors[4],
5975 prog->check_substr ? "" : "utf8 ",
5976 PL_colors[5],PL_colors[0],
cad2e5aa
JH
5977 s,
5978 PL_colors[1],
5979 (strlen(s) > 60 ? "..." : ""));
5980 } );
5981
33b8afdf 5982 return prog->check_substr ? prog->check_substr : prog->check_utf8;
cad2e5aa
JH
5983}
5984
2b69d0c2 5985void
864dbfa3 5986Perl_pregfree(pTHX_ struct regexp *r)
a687059c 5987{
27da23d5 5988 dVAR;
9e55ce06 5989#ifdef DEBUGGING
c445ea15 5990 SV * const dsv = PERL_DEBUG_PAD_ZERO(0);
9e55ce06 5991#endif
fc32ee4a 5992 GET_RE_DEBUG_FLAGS_DECL;
a3621e74 5993
7821416a
IZ
5994 if (!r || (--r->refcnt > 0))
5995 return;
a3621e74 5996 DEBUG_r(if (re_debug_flags && (SvIV(re_debug_flags) & RE_DEBUG_COMPILE)) {
c445ea15 5997 const char * const s = (r->reganch & ROPT_UTF8)
e1ec3a88 5998 ? pv_uni_display(dsv, (U8*)r->precomp, r->prelen, 60, UNI_DISPLAY_REGEX)
9f369894 5999 : pv_display(dsv, r->precomp, r->prelen, 0, 60);
e1ec3a88 6000 const int len = SvCUR(dsv);
9e55ce06
JH
6001 if (!PL_colorset)
6002 reginitcolors();
6003 PerlIO_printf(Perl_debug_log,
a3621e74 6004 "%sFreeing REx:%s %s%*.*s%s%s\n",
9e55ce06
JH
6005 PL_colors[4],PL_colors[5],PL_colors[0],
6006 len, len, s,
6007 PL_colors[1],
6008 len > 60 ? "..." : "");
6009 });
cad2e5aa 6010
43c5f42d
NC
6011 /* gcov results gave these as non-null 100% of the time, so there's no
6012 optimisation in checking them before calling Safefree */
6013 Safefree(r->precomp);
6014 Safefree(r->offsets); /* 20010421 MJD */
ed252734 6015 RX_MATCH_COPY_FREE(r);
f8c7b90f 6016#ifdef PERL_OLD_COPY_ON_WRITE
ed252734
NC
6017 if (r->saved_copy)
6018 SvREFCNT_dec(r->saved_copy);
6019#endif
a193d654
GS
6020 if (r->substrs) {
6021 if (r->anchored_substr)
6022 SvREFCNT_dec(r->anchored_substr);
33b8afdf
JH
6023 if (r->anchored_utf8)
6024 SvREFCNT_dec(r->anchored_utf8);
a193d654
GS
6025 if (r->float_substr)
6026 SvREFCNT_dec(r->float_substr);
33b8afdf
JH
6027 if (r->float_utf8)
6028 SvREFCNT_dec(r->float_utf8);
2779dcf1 6029 Safefree(r->substrs);
a193d654 6030 }
c277df42
IZ
6031 if (r->data) {
6032 int n = r->data->count;
f3548bdc
DM
6033 PAD* new_comppad = NULL;
6034 PAD* old_comppad;
4026c95a 6035 PADOFFSET refcnt;
dfad63ad 6036
c277df42 6037 while (--n >= 0) {
261faec3 6038 /* If you add a ->what type here, update the comment in regcomp.h */
c277df42
IZ
6039 switch (r->data->what[n]) {
6040 case 's':
6041 SvREFCNT_dec((SV*)r->data->data[n]);
6042 break;
653099ff
GS
6043 case 'f':
6044 Safefree(r->data->data[n]);
6045 break;
dfad63ad
HS
6046 case 'p':
6047 new_comppad = (AV*)r->data->data[n];
6048 break;
c277df42 6049 case 'o':
dfad63ad 6050 if (new_comppad == NULL)
cea2e8a9 6051 Perl_croak(aTHX_ "panic: pregfree comppad");
f3548bdc
DM
6052 PAD_SAVE_LOCAL(old_comppad,
6053 /* Watch out for global destruction's random ordering. */
c445ea15 6054 (SvTYPE(new_comppad) == SVt_PVAV) ? new_comppad : NULL
f3548bdc 6055 );
b34c0dd4 6056 OP_REFCNT_LOCK;
4026c95a
SH
6057 refcnt = OpREFCNT_dec((OP_4tree*)r->data->data[n]);
6058 OP_REFCNT_UNLOCK;
6059 if (!refcnt)
9b978d73 6060 op_free((OP_4tree*)r->data->data[n]);
9b978d73 6061
f3548bdc 6062 PAD_RESTORE_LOCAL(old_comppad);
dfad63ad
HS
6063 SvREFCNT_dec((SV*)new_comppad);
6064 new_comppad = NULL;
c277df42
IZ
6065 break;
6066 case 'n':
9e55ce06 6067 break;
a3621e74
YO
6068 case 't':
6069 {
c445ea15 6070 reg_trie_data * const trie=(reg_trie_data*)r->data->data[n];
a3621e74
YO
6071 U32 refcount;
6072 OP_REFCNT_LOCK;
e27afef8 6073 refcount = --trie->refcount;
a3621e74
YO
6074 OP_REFCNT_UNLOCK;
6075 if ( !refcount ) {
43c5f42d 6076 Safefree(trie->charmap);
a3621e74
YO
6077 if (trie->widecharmap)
6078 SvREFCNT_dec((SV*)trie->widecharmap);
43c5f42d
NC
6079 Safefree(trie->states);
6080 Safefree(trie->trans);
a3621e74
YO
6081#ifdef DEBUGGING
6082 if (trie->words)
6083 SvREFCNT_dec((SV*)trie->words);
6084 if (trie->revcharmap)
6085 SvREFCNT_dec((SV*)trie->revcharmap);
6086#endif
6087 Safefree(r->data->data[n]); /* do this last!!!! */
6088 }
6089 break;
6090 }
c277df42 6091 default:
830247a4 6092 Perl_croak(aTHX_ "panic: regfree data code '%c'", r->data->what[n]);
c277df42
IZ
6093 }
6094 }
6095 Safefree(r->data->what);
6096 Safefree(r->data);
a0d0e21e
LW
6097 }
6098 Safefree(r->startp);
6099 Safefree(r->endp);
6100 Safefree(r);
a687059c 6101}
c277df42 6102
76234dfb 6103#ifndef PERL_IN_XSUB_RE
c277df42
IZ
6104/*
6105 - regnext - dig the "next" pointer out of a node
c277df42
IZ
6106 */
6107regnode *
864dbfa3 6108Perl_regnext(pTHX_ register regnode *p)
c277df42 6109{
97aff369 6110 dVAR;
c277df42
IZ
6111 register I32 offset;
6112
3280af22 6113 if (p == &PL_regdummy)
c277df42
IZ
6114 return(NULL);
6115
6116 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
6117 if (offset == 0)
6118 return(NULL);
6119
c277df42 6120 return(p+offset);
c277df42 6121}
76234dfb 6122#endif
c277df42 6123
01f988be 6124STATIC void
cea2e8a9 6125S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
c277df42
IZ
6126{
6127 va_list args;
6128 STRLEN l1 = strlen(pat1);
6129 STRLEN l2 = strlen(pat2);
6130 char buf[512];
06bf62c7 6131 SV *msv;
73d840c0 6132 const char *message;
c277df42
IZ
6133
6134 if (l1 > 510)
6135 l1 = 510;
6136 if (l1 + l2 > 510)
6137 l2 = 510 - l1;
6138 Copy(pat1, buf, l1 , char);
6139 Copy(pat2, buf + l1, l2 , char);
3b818b81
GS
6140 buf[l1 + l2] = '\n';
6141 buf[l1 + l2 + 1] = '\0';
8736538c
AS
6142#ifdef I_STDARG
6143 /* ANSI variant takes additional second argument */
c277df42 6144 va_start(args, pat2);
8736538c
AS
6145#else
6146 va_start(args);
6147#endif
5a844595 6148 msv = vmess(buf, &args);
c277df42 6149 va_end(args);
cfd0369c 6150 message = SvPV_const(msv,l1);
c277df42
IZ
6151 if (l1 > 512)
6152 l1 = 512;
6153 Copy(message, buf, l1 , char);
197cf9b9 6154 buf[l1-1] = '\0'; /* Overwrite \n */
cea2e8a9 6155 Perl_croak(aTHX_ "%s", buf);
c277df42 6156}
a0ed51b3
LW
6157
6158/* XXX Here's a total kludge. But we need to re-enter for swash routines. */
6159
76234dfb 6160#ifndef PERL_IN_XSUB_RE
a0ed51b3 6161void
864dbfa3 6162Perl_save_re_context(pTHX)
b81d288d 6163{
97aff369 6164 dVAR;
1ade1aa1
NC
6165
6166 struct re_save_state *state;
6167
6168 SAVEVPTR(PL_curcop);
6169 SSGROW(SAVESTACK_ALLOC_FOR_RE_SAVE_STATE + 1);
6170
6171 state = (struct re_save_state *)(PL_savestack + PL_savestack_ix);
6172 PL_savestack_ix += SAVESTACK_ALLOC_FOR_RE_SAVE_STATE;
6173 SSPUSHINT(SAVEt_RE_STATE);
6174
46ab3289 6175 Copy(&PL_reg_state, state, 1, struct re_save_state);
1ade1aa1 6176
a0ed51b3 6177 PL_reg_start_tmp = 0;
a0ed51b3 6178 PL_reg_start_tmpl = 0;
c445ea15 6179 PL_reg_oldsaved = NULL;
a5db57d6 6180 PL_reg_oldsavedlen = 0;
a5db57d6 6181 PL_reg_maxiter = 0;
a5db57d6 6182 PL_reg_leftiter = 0;
c445ea15 6183 PL_reg_poscache = NULL;
a5db57d6 6184 PL_reg_poscache_size = 0;
1ade1aa1
NC
6185#ifdef PERL_OLD_COPY_ON_WRITE
6186 PL_nrs = NULL;
6187#endif
ada6e8a9 6188
c445ea15
AL
6189 /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
6190 if (PL_curpm) {
6191 const REGEXP * const rx = PM_GETRE(PL_curpm);
6192 if (rx) {
1df70142 6193 U32 i;
ada6e8a9 6194 for (i = 1; i <= rx->nparens; i++) {
1df70142 6195 char digits[TYPE_CHARS(long)];
d9fad198 6196 const STRLEN len = my_snprintf(digits, sizeof(digits), "%lu", (long)i);
49f27e4b
NC
6197 GV *const *const gvp
6198 = (GV**)hv_fetch(PL_defstash, digits, len, 0);
6199
b37c2d43
AL
6200 if (gvp) {
6201 GV * const gv = *gvp;
6202 if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
6203 save_scalar(gv);
49f27e4b 6204 }
ada6e8a9
AMS
6205 }
6206 }
6207 }
a0ed51b3 6208}
76234dfb 6209#endif
51371543 6210
51371543 6211static void
acfe0abc 6212clear_re(pTHX_ void *r)
51371543 6213{
97aff369 6214 dVAR;
51371543
GS
6215 ReREFCNT_dec((regexp *)r);
6216}
ffbc6a93 6217
a28509cc
AL
6218#ifdef DEBUGGING
6219
6220STATIC void
6221S_put_byte(pTHX_ SV *sv, int c)
6222{
6223 if (isCNTRL(c) || c == 255 || !isPRINT(c))
6224 Perl_sv_catpvf(aTHX_ sv, "\\%o", c);
6225 else if (c == '-' || c == ']' || c == '\\' || c == '^')
6226 Perl_sv_catpvf(aTHX_ sv, "\\%c", c);
6227 else
6228 Perl_sv_catpvf(aTHX_ sv, "%c", c);
6229}
6230
6231
b5a2f8d8
NC
6232STATIC const regnode *
6233S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
6234 const regnode *last, SV* sv, I32 l)
a28509cc 6235{
97aff369 6236 dVAR;
a28509cc 6237 register U8 op = EXACT; /* Arbitrary non-END op. */
b5a2f8d8 6238 register const regnode *next;
a28509cc
AL
6239
6240 while (op != END && (!last || node < last)) {
6241 /* While that wasn't END last time... */
6242
6243 NODE_ALIGN(node);
6244 op = OP(node);
6245 if (op == CLOSE)
6246 l--;
b5a2f8d8 6247 next = regnext((regnode *)node);
a28509cc
AL
6248 /* Where, what. */
6249 if (OP(node) == OPTIMIZED)
6250 goto after_print;
32fc9b6a 6251 regprop(r, sv, node);
a28509cc
AL
6252 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
6253 (int)(2*l + 1), "", SvPVX_const(sv));
6254 if (next == NULL) /* Next ptr. */
6255 PerlIO_printf(Perl_debug_log, "(0)");
6256 else
6257 PerlIO_printf(Perl_debug_log, "(%"IVdf")", (IV)(next - start));
6258 (void)PerlIO_putc(Perl_debug_log, '\n');
6259 after_print:
6260 if (PL_regkind[(U8)op] == BRANCHJ) {
b5a2f8d8
NC
6261 register const regnode *nnode = (OP(next) == LONGJMP
6262 ? regnext((regnode *)next)
6263 : next);
a28509cc
AL
6264 if (last && nnode > last)
6265 nnode = last;
4f639d21 6266 node = dumpuntil(r, start, NEXTOPER(NEXTOPER(node)), nnode, sv, l + 1);
a28509cc
AL
6267 }
6268 else if (PL_regkind[(U8)op] == BRANCH) {
4f639d21 6269 node = dumpuntil(r, start, NEXTOPER(node), next, sv, l + 1);
a28509cc
AL
6270 }
6271 else if ( PL_regkind[(U8)op] == TRIE ) {
6272 const I32 n = ARG(node);
4f639d21 6273 const reg_trie_data * const trie = (reg_trie_data*)r->data->data[n];
a28509cc
AL
6274 const I32 arry_len = av_len(trie->words)+1;
6275 I32 word_idx;
6276 PerlIO_printf(Perl_debug_log,
6277 "%*s[Words:%d Chars Stored:%d Unique Chars:%d States:%"IVdf"%s]\n",
6278 (int)(2*(l+3)),
6279 "",
6280 trie->wordcount,
6281 (int)trie->charcount,
6282 trie->uniquecharcount,
6283 (IV)trie->laststate-1,
6284 node->flags ? " EVAL mode" : "");
6285
6286 for (word_idx=0; word_idx < arry_len; word_idx++) {
097eb12c 6287 SV ** const elem_ptr = av_fetch(trie->words,word_idx,0);
a28509cc
AL
6288 if (elem_ptr) {
6289 PerlIO_printf(Perl_debug_log, "%*s<%s%s%s>\n",
6290 (int)(2*(l+4)), "",
6291 PL_colors[0],
cfd0369c 6292 SvPV_nolen_const(*elem_ptr),
a28509cc
AL
6293 PL_colors[1]
6294 );
6295 /*
6296 if (next == NULL)
6297 PerlIO_printf(Perl_debug_log, "(0)\n");
6298 else
6299 PerlIO_printf(Perl_debug_log, "(%"IVdf")\n", (IV)(next - start));
6300 */
6301 }
6302
6303 }
6304
6305 node = NEXTOPER(node);
6306 node += regarglen[(U8)op];
6307
6308 }
6309 else if ( op == CURLY) { /* "next" might be very big: optimizer */
4f639d21 6310 node = dumpuntil(r, start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
a28509cc
AL
6311 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1, sv, l + 1);
6312 }
6313 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
4f639d21 6314 node = dumpuntil(r, start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
a28509cc
AL
6315 next, sv, l + 1);
6316 }
6317 else if ( op == PLUS || op == STAR) {
4f639d21 6318 node = dumpuntil(r, start, NEXTOPER(node), NEXTOPER(node) + 1, sv, l + 1);
a28509cc
AL
6319 }
6320 else if (op == ANYOF) {
6321 /* arglen 1 + class block */
6322 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_LARGE)
6323 ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
6324 node = NEXTOPER(node);
6325 }
6326 else if (PL_regkind[(U8)op] == EXACT) {
6327 /* Literal string, where present. */
6328 node += NODE_SZ_STR(node) - 1;
6329 node = NEXTOPER(node);
6330 }
6331 else {
6332 node = NEXTOPER(node);
6333 node += regarglen[(U8)op];
6334 }
6335 if (op == CURLYX || op == OPEN)
6336 l++;
6337 else if (op == WHILEM)
6338 l--;
6339 }
6340 return node;
6341}
6342
6343#endif /* DEBUGGING */
6344
241d1a3b
NC
6345/*
6346 * Local variables:
6347 * c-indentation-style: bsd
6348 * c-basic-offset: 4
6349 * indent-tabs-mode: t
6350 * End:
6351 *
37442d52
RGS
6352 * ex: set ts=8 sts=4 sw=4 noet:
6353 */