This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Upgrade to Module::CoreList 2.09:
[perl5.git] / regcomp.c
CommitLineData
a0d0e21e
LW
1/* regcomp.c
2 */
3
4/*
5 * "A fair jaw-cracker dwarf-language must be." --Samwise Gamgee
6 */
7
61296642
DM
8/* This file contains functions for compiling a regular expression. See
9 * also regexec.c which funnily enough, contains functions for executing
166f8a29 10 * a regular expression.
e4a054ea
DM
11 *
12 * This file is also copied at build time to ext/re/re_comp.c, where
13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14 * This causes the main functions to be compiled under new names and with
15 * debugging support added, which makes "use re 'debug'" work.
166f8a29
DM
16 */
17
a687059c
LW
18/* NOTE: this is derived from Henry Spencer's regexp code, and should not
19 * confused with the original package (see point 3 below). Thanks, Henry!
20 */
21
22/* Additional note: this code is very heavily munged from Henry's version
23 * in places. In some spots I've traded clarity for efficiency, so don't
24 * blame Henry for some of the lack of readability.
25 */
26
e50aee73
AD
27/* The names of the functions have been changed from regcomp and
28 * regexec to pregcomp and pregexec in order to avoid conflicts
29 * with the POSIX routines of the same names.
30*/
31
b9d5759e 32#ifdef PERL_EXT_RE_BUILD
54df2634 33#include "re_top.h"
b81d288d 34#endif
56953603 35
a687059c 36/*
e50aee73 37 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
38 *
39 * Copyright (c) 1986 by University of Toronto.
40 * Written by Henry Spencer. Not derived from licensed software.
41 *
42 * Permission is granted to anyone to use this software for any
43 * purpose on any computer system, and to redistribute it freely,
44 * subject to the following restrictions:
45 *
46 * 1. The author is not responsible for the consequences of use of
47 * this software, no matter how awful, even if they arise
48 * from defects in it.
49 *
50 * 2. The origin of this software must not be misrepresented, either
51 * by explicit claim or by omission.
52 *
53 * 3. Altered versions must be plainly marked as such, and must not
54 * be misrepresented as being the original software.
55 *
56 *
57 **** Alterations to Henry's code are...
58 ****
4bb101f2 59 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
b94e2f88 60 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, by Larry Wall and others
a687059c 61 ****
9ef589d8
LW
62 **** You may distribute under the terms of either the GNU General Public
63 **** License or the Artistic License, as specified in the README file.
64
a687059c
LW
65 *
66 * Beware that some of this code is subtly aware of the way operator
67 * precedence is structured in regular expressions. Serious changes in
68 * regular-expression syntax might require a total rethink.
69 */
70#include "EXTERN.h"
864dbfa3 71#define PERL_IN_REGCOMP_C
a687059c 72#include "perl.h"
d06ea78c 73
acfe0abc 74#ifndef PERL_IN_XSUB_RE
d06ea78c
GS
75# include "INTERN.h"
76#endif
c277df42
IZ
77
78#define REG_COMP_C
54df2634
NC
79#ifdef PERL_IN_XSUB_RE
80# include "re_comp.h"
81#else
82# include "regcomp.h"
83#endif
a687059c 84
d4cce5f1 85#ifdef op
11343788 86#undef op
d4cce5f1 87#endif /* op */
11343788 88
fe14fcc3 89#ifdef MSDOS
7e4e8c89 90# if defined(BUGGY_MSC6)
fe14fcc3 91 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
7e4e8c89 92# pragma optimize("a",off)
fe14fcc3 93 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
7e4e8c89
NC
94# pragma optimize("w",on )
95# endif /* BUGGY_MSC6 */
fe14fcc3
LW
96#endif /* MSDOS */
97
a687059c
LW
98#ifndef STATIC
99#define STATIC static
100#endif
101
830247a4 102typedef struct RExC_state_t {
e2509266 103 U32 flags; /* are we folding, multilining? */
830247a4
IZ
104 char *precomp; /* uncompiled string. */
105 regexp *rx;
fac92740 106 char *start; /* Start of input for compile */
830247a4
IZ
107 char *end; /* End of input for compile */
108 char *parse; /* Input-scan pointer. */
109 I32 whilem_seen; /* number of WHILEM in this expr */
fac92740 110 regnode *emit_start; /* Start of emitted-code area */
ffc61ed2 111 regnode *emit; /* Code-emit pointer; &regdummy = don't = compiling */
830247a4
IZ
112 I32 naughty; /* How bad is this pattern? */
113 I32 sawback; /* Did we see \1, ...? */
114 U32 seen;
115 I32 size; /* Code size. */
116 I32 npar; /* () count. */
117 I32 extralen;
118 I32 seen_zerolen;
119 I32 seen_evals;
1aa99e6b 120 I32 utf8;
fc8cd66c 121 HV *charnames; /* cache of named sequences */
830247a4
IZ
122#if ADD_TO_REGEXEC
123 char *starttry; /* -Dr: where regtry was called. */
124#define RExC_starttry (pRExC_state->starttry)
125#endif
3dab1dad 126#ifdef DEBUGGING
be8e71aa 127 const char *lastparse;
3dab1dad
YO
128 I32 lastnum;
129#define RExC_lastparse (pRExC_state->lastparse)
130#define RExC_lastnum (pRExC_state->lastnum)
131#endif
830247a4
IZ
132} RExC_state_t;
133
e2509266 134#define RExC_flags (pRExC_state->flags)
830247a4
IZ
135#define RExC_precomp (pRExC_state->precomp)
136#define RExC_rx (pRExC_state->rx)
fac92740 137#define RExC_start (pRExC_state->start)
830247a4
IZ
138#define RExC_end (pRExC_state->end)
139#define RExC_parse (pRExC_state->parse)
140#define RExC_whilem_seen (pRExC_state->whilem_seen)
fac92740 141#define RExC_offsets (pRExC_state->rx->offsets) /* I am not like the others */
830247a4 142#define RExC_emit (pRExC_state->emit)
fac92740 143#define RExC_emit_start (pRExC_state->emit_start)
830247a4
IZ
144#define RExC_naughty (pRExC_state->naughty)
145#define RExC_sawback (pRExC_state->sawback)
146#define RExC_seen (pRExC_state->seen)
147#define RExC_size (pRExC_state->size)
148#define RExC_npar (pRExC_state->npar)
149#define RExC_extralen (pRExC_state->extralen)
150#define RExC_seen_zerolen (pRExC_state->seen_zerolen)
151#define RExC_seen_evals (pRExC_state->seen_evals)
1aa99e6b 152#define RExC_utf8 (pRExC_state->utf8)
fc8cd66c 153#define RExC_charnames (pRExC_state->charnames)
830247a4 154
a687059c
LW
155#define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
156#define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
157 ((*s) == '{' && regcurly(s)))
a687059c 158
35c8bce7
LW
159#ifdef SPSTART
160#undef SPSTART /* dratted cpp namespace... */
161#endif
a687059c
LW
162/*
163 * Flags to be passed up and down.
164 */
a687059c 165#define WORST 0 /* Worst case. */
821b33a5 166#define HASWIDTH 0x1 /* Known to match non-null strings. */
a0d0e21e
LW
167#define SIMPLE 0x2 /* Simple enough to be STAR/PLUS operand. */
168#define SPSTART 0x4 /* Starts with * or +. */
169#define TRYAGAIN 0x8 /* Weeded out a declaration. */
a687059c 170
3dab1dad
YO
171#define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
172
07be1b83
YO
173/* whether trie related optimizations are enabled */
174#if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
175#define TRIE_STUDY_OPT
786e8c11 176#define FULL_TRIE_STUDY
07be1b83
YO
177#define TRIE_STCLASS
178#endif
1de06328
YO
179
180
181/* About scan_data_t.
182
183 During optimisation we recurse through the regexp program performing
184 various inplace (keyhole style) optimisations. In addition study_chunk
185 and scan_commit populate this data structure with information about
186 what strings MUST appear in the pattern. We look for the longest
187 string that must appear for at a fixed location, and we look for the
188 longest string that may appear at a floating location. So for instance
189 in the pattern:
190
191 /FOO[xX]A.*B[xX]BAR/
192
193 Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
194 strings (because they follow a .* construct). study_chunk will identify
195 both FOO and BAR as being the longest fixed and floating strings respectively.
196
197 The strings can be composites, for instance
198
199 /(f)(o)(o)/
200
201 will result in a composite fixed substring 'foo'.
202
203 For each string some basic information is maintained:
204
205 - offset or min_offset
206 This is the position the string must appear at, or not before.
207 It also implicitly (when combined with minlenp) tells us how many
208 character must match before the string we are searching.
209 Likewise when combined with minlenp and the length of the string
210 tells us how many characters must appear after the string we have
211 found.
212
213 - max_offset
214 Only used for floating strings. This is the rightmost point that
215 the string can appear at. Ifset to I32 max it indicates that the
216 string can occur infinitely far to the right.
217
218 - minlenp
219 A pointer to the minimum length of the pattern that the string
220 was found inside. This is important as in the case of positive
221 lookahead or positive lookbehind we can have multiple patterns
222 involved. Consider
223
224 /(?=FOO).*F/
225
226 The minimum length of the pattern overall is 3, the minimum length
227 of the lookahead part is 3, but the minimum length of the part that
228 will actually match is 1. So 'FOO's minimum length is 3, but the
229 minimum length for the F is 1. This is important as the minimum length
230 is used to determine offsets in front of and behind the string being
231 looked for. Since strings can be composites this is the length of the
232 pattern at the time it was commited with a scan_commit. Note that
233 the length is calculated by study_chunk, so that the minimum lengths
234 are not known until the full pattern has been compiled, thus the
235 pointer to the value.
236
237 - lookbehind
238
239 In the case of lookbehind the string being searched for can be
240 offset past the start point of the final matching string.
241 If this value was just blithely removed from the min_offset it would
242 invalidate some of the calculations for how many chars must match
243 before or after (as they are derived from min_offset and minlen and
244 the length of the string being searched for).
245 When the final pattern is compiled and the data is moved from the
246 scan_data_t structure into the regexp structure the information
247 about lookbehind is factored in, with the information that would
248 have been lost precalculated in the end_shift field for the
249 associated string.
250
251 The fields pos_min and pos_delta are used to store the minimum offset
252 and the delta to the maximum offset at the current point in the pattern.
253
254*/
2c2d71f5
JH
255
256typedef struct scan_data_t {
1de06328
YO
257 /*I32 len_min; unused */
258 /*I32 len_delta; unused */
2c2d71f5
JH
259 I32 pos_min;
260 I32 pos_delta;
261 SV *last_found;
1de06328 262 I32 last_end; /* min value, <0 unless valid. */
2c2d71f5
JH
263 I32 last_start_min;
264 I32 last_start_max;
1de06328
YO
265 SV **longest; /* Either &l_fixed, or &l_float. */
266 SV *longest_fixed; /* longest fixed string found in pattern */
267 I32 offset_fixed; /* offset where it starts */
268 I32 *minlen_fixed; /* pointer to the minlen relevent to the string */
269 I32 lookbehind_fixed; /* is the position of the string modfied by LB */
270 SV *longest_float; /* longest floating string found in pattern */
271 I32 offset_float_min; /* earliest point in string it can appear */
272 I32 offset_float_max; /* latest point in string it can appear */
273 I32 *minlen_float; /* pointer to the minlen relevent to the string */
274 I32 lookbehind_float; /* is the position of the string modified by LB */
2c2d71f5
JH
275 I32 flags;
276 I32 whilem_c;
cb434fcc 277 I32 *last_closep;
653099ff 278 struct regnode_charclass_class *start_class;
2c2d71f5
JH
279} scan_data_t;
280
a687059c 281/*
e50aee73 282 * Forward declarations for pregcomp()'s friends.
a687059c 283 */
a0d0e21e 284
27da23d5 285static const scan_data_t zero_scan_data =
1de06328 286 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
c277df42
IZ
287
288#define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
07be1b83
YO
289#define SF_BEFORE_SEOL 0x0001
290#define SF_BEFORE_MEOL 0x0002
c277df42
IZ
291#define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
292#define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
293
09b7f37c
CB
294#ifdef NO_UNARY_PLUS
295# define SF_FIX_SHIFT_EOL (0+2)
296# define SF_FL_SHIFT_EOL (0+4)
297#else
298# define SF_FIX_SHIFT_EOL (+2)
299# define SF_FL_SHIFT_EOL (+4)
300#endif
c277df42
IZ
301
302#define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
303#define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
304
305#define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
306#define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
07be1b83
YO
307#define SF_IS_INF 0x0040
308#define SF_HAS_PAR 0x0080
309#define SF_IN_PAR 0x0100
310#define SF_HAS_EVAL 0x0200
311#define SCF_DO_SUBSTR 0x0400
653099ff
GS
312#define SCF_DO_STCLASS_AND 0x0800
313#define SCF_DO_STCLASS_OR 0x1000
314#define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
e1901655 315#define SCF_WHILEM_VISITED_POS 0x2000
c277df42 316
786e8c11
YO
317#define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */
318
07be1b83 319
eb160463 320#define UTF (RExC_utf8 != 0)
e2509266
JH
321#define LOC ((RExC_flags & PMf_LOCALE) != 0)
322#define FOLD ((RExC_flags & PMf_FOLD) != 0)
a0ed51b3 323
ffc61ed2 324#define OOB_UNICODE 12345678
93733859 325#define OOB_NAMEDCLASS -1
b8c5462f 326
a0ed51b3
LW
327#define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
328#define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
329
8615cb43 330
b45f050a
JF
331/* length of regex to show in messages that don't mark a position within */
332#define RegexLengthToShowInErrorMessages 127
333
334/*
335 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
336 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
337 * op/pragma/warn/regcomp.
338 */
7253e4e3
RK
339#define MARKER1 "<-- HERE" /* marker as it appears in the description */
340#define MARKER2 " <-- HERE " /* marker as it appears within the regex */
b81d288d 341
7253e4e3 342#define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
b45f050a
JF
343
344/*
345 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
346 * arg. Show regex, up to a maximum length. If it's too long, chop and add
347 * "...".
348 */
ccb2c380 349#define FAIL(msg) STMT_START { \
bfed75c6 350 const char *ellipses = ""; \
ccb2c380
MP
351 IV len = RExC_end - RExC_precomp; \
352 \
353 if (!SIZE_ONLY) \
354 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
355 if (len > RegexLengthToShowInErrorMessages) { \
356 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
357 len = RegexLengthToShowInErrorMessages - 10; \
358 ellipses = "..."; \
359 } \
360 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
361 msg, (int)len, RExC_precomp, ellipses); \
362} STMT_END
8615cb43 363
b45f050a 364/*
b45f050a
JF
365 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
366 */
ccb2c380 367#define Simple_vFAIL(m) STMT_START { \
a28509cc 368 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
369 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
370 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
371} STMT_END
b45f050a
JF
372
373/*
374 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
375 */
ccb2c380
MP
376#define vFAIL(m) STMT_START { \
377 if (!SIZE_ONLY) \
378 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
379 Simple_vFAIL(m); \
380} STMT_END
b45f050a
JF
381
382/*
383 * Like Simple_vFAIL(), but accepts two arguments.
384 */
ccb2c380 385#define Simple_vFAIL2(m,a1) STMT_START { \
a28509cc 386 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
387 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
388 (int)offset, RExC_precomp, RExC_precomp + offset); \
389} STMT_END
b45f050a
JF
390
391/*
392 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
393 */
ccb2c380
MP
394#define vFAIL2(m,a1) STMT_START { \
395 if (!SIZE_ONLY) \
396 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
397 Simple_vFAIL2(m, a1); \
398} STMT_END
b45f050a
JF
399
400
401/*
402 * Like Simple_vFAIL(), but accepts three arguments.
403 */
ccb2c380 404#define Simple_vFAIL3(m, a1, a2) STMT_START { \
a28509cc 405 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
406 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
407 (int)offset, RExC_precomp, RExC_precomp + offset); \
408} STMT_END
b45f050a
JF
409
410/*
411 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
412 */
ccb2c380
MP
413#define vFAIL3(m,a1,a2) STMT_START { \
414 if (!SIZE_ONLY) \
415 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
416 Simple_vFAIL3(m, a1, a2); \
417} STMT_END
b45f050a
JF
418
419/*
420 * Like Simple_vFAIL(), but accepts four arguments.
421 */
ccb2c380 422#define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
a28509cc 423 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
424 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
425 (int)offset, RExC_precomp, RExC_precomp + offset); \
426} STMT_END
b45f050a 427
ccb2c380 428#define vWARN(loc,m) STMT_START { \
a28509cc 429 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
430 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \
431 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
432} STMT_END
433
434#define vWARNdep(loc,m) STMT_START { \
a28509cc 435 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
436 Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
437 "%s" REPORT_LOCATION, \
438 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
439} STMT_END
440
441
442#define vWARN2(loc, m, a1) STMT_START { \
a28509cc 443 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
444 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
445 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
446} STMT_END
447
448#define vWARN3(loc, m, a1, a2) STMT_START { \
a28509cc 449 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
450 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
451 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
452} STMT_END
453
454#define vWARN4(loc, m, a1, a2, a3) STMT_START { \
a28509cc 455 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
456 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
457 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
458} STMT_END
459
460#define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
a28509cc 461 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
462 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
463 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
464} STMT_END
9d1d55b5 465
8615cb43 466
cd439c50 467/* Allow for side effects in s */
ccb2c380
MP
468#define REGC(c,s) STMT_START { \
469 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
470} STMT_END
cd439c50 471
fac92740
MJD
472/* Macros for recording node offsets. 20001227 mjd@plover.com
473 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
474 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
475 * Element 0 holds the number n.
07be1b83 476 * Position is 1 indexed.
fac92740
MJD
477 */
478
ccb2c380
MP
479#define Set_Node_Offset_To_R(node,byte) STMT_START { \
480 if (! SIZE_ONLY) { \
481 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
07be1b83 482 __LINE__, (node), (int)(byte))); \
ccb2c380 483 if((node) < 0) { \
551405c4 484 Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
ccb2c380
MP
485 } else { \
486 RExC_offsets[2*(node)-1] = (byte); \
487 } \
488 } \
489} STMT_END
490
491#define Set_Node_Offset(node,byte) \
492 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
493#define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
494
495#define Set_Node_Length_To_R(node,len) STMT_START { \
496 if (! SIZE_ONLY) { \
497 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
551405c4 498 __LINE__, (int)(node), (int)(len))); \
ccb2c380 499 if((node) < 0) { \
551405c4 500 Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
ccb2c380
MP
501 } else { \
502 RExC_offsets[2*(node)] = (len); \
503 } \
504 } \
505} STMT_END
506
507#define Set_Node_Length(node,len) \
508 Set_Node_Length_To_R((node)-RExC_emit_start, len)
509#define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
510#define Set_Node_Cur_Length(node) \
511 Set_Node_Length(node, RExC_parse - parse_start)
fac92740
MJD
512
513/* Get offsets and lengths */
514#define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
515#define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
516
07be1b83
YO
517#define Set_Node_Offset_Length(node,offset,len) STMT_START { \
518 Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
519 Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
520} STMT_END
521
522
523#if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
524#define EXPERIMENTAL_INPLACESCAN
525#endif
526
1de06328 527#define DEBUG_STUDYDATA(data,depth) \
a5ca303d 528DEBUG_OPTIMISE_MORE_r(if(data){ \
1de06328
YO
529 PerlIO_printf(Perl_debug_log, \
530 "%*s"/* Len:%"IVdf"/%"IVdf" */" Pos:%"IVdf"/%"IVdf \
531 " Flags: %"IVdf" Whilem_c: %"IVdf" Lcp: %"IVdf" ", \
532 (int)(depth)*2, "", \
533 (IV)((data)->pos_min), \
534 (IV)((data)->pos_delta), \
535 (IV)((data)->flags), \
536 (IV)((data)->whilem_c), \
537 (IV)((data)->last_closep ? *((data)->last_closep) : -1) \
538 ); \
539 if ((data)->last_found) \
540 PerlIO_printf(Perl_debug_log, \
541 "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
542 " %sFloat: '%s' @ %"IVdf"/%"IVdf"", \
543 SvPVX_const((data)->last_found), \
544 (IV)((data)->last_end), \
545 (IV)((data)->last_start_min), \
546 (IV)((data)->last_start_max), \
547 ((data)->longest && \
548 (data)->longest==&((data)->longest_fixed)) ? "*" : "", \
549 SvPVX_const((data)->longest_fixed), \
550 (IV)((data)->offset_fixed), \
551 ((data)->longest && \
552 (data)->longest==&((data)->longest_float)) ? "*" : "", \
553 SvPVX_const((data)->longest_float), \
554 (IV)((data)->offset_float_min), \
555 (IV)((data)->offset_float_max) \
556 ); \
557 PerlIO_printf(Perl_debug_log,"\n"); \
558});
559
acfe0abc 560static void clear_re(pTHX_ void *r);
4327152a 561
653099ff 562/* Mark that we cannot extend a found fixed substring at this point.
786e8c11 563 Update the longest found anchored substring and the longest found
653099ff
GS
564 floating substrings if needed. */
565
4327152a 566STATIC void
1de06328 567S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *minlenp)
c277df42 568{
e1ec3a88
AL
569 const STRLEN l = CHR_SVLEN(data->last_found);
570 const STRLEN old_l = CHR_SVLEN(*data->longest);
1de06328 571 GET_RE_DEBUG_FLAGS_DECL;
b81d288d 572
c277df42 573 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
6b43b216 574 SvSetMagicSV(*data->longest, data->last_found);
c277df42
IZ
575 if (*data->longest == data->longest_fixed) {
576 data->offset_fixed = l ? data->last_start_min : data->pos_min;
577 if (data->flags & SF_BEFORE_EOL)
b81d288d 578 data->flags
c277df42
IZ
579 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
580 else
581 data->flags &= ~SF_FIX_BEFORE_EOL;
1de06328
YO
582 data->minlen_fixed=minlenp;
583 data->lookbehind_fixed=0;
a0ed51b3
LW
584 }
585 else {
c277df42 586 data->offset_float_min = l ? data->last_start_min : data->pos_min;
b81d288d
AB
587 data->offset_float_max = (l
588 ? data->last_start_max
c277df42 589 : data->pos_min + data->pos_delta);
9051bda5
HS
590 if ((U32)data->offset_float_max > (U32)I32_MAX)
591 data->offset_float_max = I32_MAX;
c277df42 592 if (data->flags & SF_BEFORE_EOL)
b81d288d 593 data->flags
c277df42
IZ
594 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
595 else
596 data->flags &= ~SF_FL_BEFORE_EOL;
1de06328
YO
597 data->minlen_float=minlenp;
598 data->lookbehind_float=0;
c277df42
IZ
599 }
600 }
601 SvCUR_set(data->last_found, 0);
0eda9292 602 {
a28509cc 603 SV * const sv = data->last_found;
097eb12c
AL
604 if (SvUTF8(sv) && SvMAGICAL(sv)) {
605 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
606 if (mg)
607 mg->mg_len = 0;
608 }
0eda9292 609 }
c277df42
IZ
610 data->last_end = -1;
611 data->flags &= ~SF_BEFORE_EOL;
1de06328 612 DEBUG_STUDYDATA(data,0);
c277df42
IZ
613}
614
653099ff
GS
615/* Can match anything (initialization) */
616STATIC void
097eb12c 617S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 618{
653099ff 619 ANYOF_CLASS_ZERO(cl);
f8bef550 620 ANYOF_BITMAP_SETALL(cl);
1aa99e6b 621 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
653099ff
GS
622 if (LOC)
623 cl->flags |= ANYOF_LOCALE;
624}
625
626/* Can match anything (initialization) */
627STATIC int
5f66b61c 628S_cl_is_anything(const struct regnode_charclass_class *cl)
653099ff
GS
629{
630 int value;
631
aaa51d5e 632 for (value = 0; value <= ANYOF_MAX; value += 2)
653099ff
GS
633 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
634 return 1;
1aa99e6b
IH
635 if (!(cl->flags & ANYOF_UNICODE_ALL))
636 return 0;
10edeb5d 637 if (!ANYOF_BITMAP_TESTALLSET((const void*)cl))
f8bef550 638 return 0;
653099ff
GS
639 return 1;
640}
641
642/* Can match anything (initialization) */
643STATIC void
097eb12c 644S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 645{
8ecf7187 646 Zero(cl, 1, struct regnode_charclass_class);
653099ff 647 cl->type = ANYOF;
830247a4 648 cl_anything(pRExC_state, cl);
653099ff
GS
649}
650
651STATIC void
097eb12c 652S_cl_init_zero(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 653{
8ecf7187 654 Zero(cl, 1, struct regnode_charclass_class);
653099ff 655 cl->type = ANYOF;
830247a4 656 cl_anything(pRExC_state, cl);
653099ff
GS
657 if (LOC)
658 cl->flags |= ANYOF_LOCALE;
659}
660
661/* 'And' a given class with another one. Can create false positives */
662/* We assume that cl is not inverted */
663STATIC void
5f66b61c 664S_cl_and(struct regnode_charclass_class *cl,
a28509cc 665 const struct regnode_charclass_class *and_with)
653099ff 666{
653099ff
GS
667 if (!(and_with->flags & ANYOF_CLASS)
668 && !(cl->flags & ANYOF_CLASS)
669 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
670 && !(and_with->flags & ANYOF_FOLD)
671 && !(cl->flags & ANYOF_FOLD)) {
672 int i;
673
674 if (and_with->flags & ANYOF_INVERT)
675 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
676 cl->bitmap[i] &= ~and_with->bitmap[i];
677 else
678 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
679 cl->bitmap[i] &= and_with->bitmap[i];
680 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
681 if (!(and_with->flags & ANYOF_EOS))
682 cl->flags &= ~ANYOF_EOS;
1aa99e6b 683
14ebb1a2
JH
684 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE &&
685 !(and_with->flags & ANYOF_INVERT)) {
1aa99e6b
IH
686 cl->flags &= ~ANYOF_UNICODE_ALL;
687 cl->flags |= ANYOF_UNICODE;
688 ARG_SET(cl, ARG(and_with));
689 }
14ebb1a2
JH
690 if (!(and_with->flags & ANYOF_UNICODE_ALL) &&
691 !(and_with->flags & ANYOF_INVERT))
1aa99e6b 692 cl->flags &= ~ANYOF_UNICODE_ALL;
14ebb1a2
JH
693 if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)) &&
694 !(and_with->flags & ANYOF_INVERT))
1aa99e6b 695 cl->flags &= ~ANYOF_UNICODE;
653099ff
GS
696}
697
698/* 'OR' a given class with another one. Can create false positives */
699/* We assume that cl is not inverted */
700STATIC void
097eb12c 701S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
653099ff 702{
653099ff
GS
703 if (or_with->flags & ANYOF_INVERT) {
704 /* We do not use
705 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
706 * <= (B1 | !B2) | (CL1 | !CL2)
707 * which is wasteful if CL2 is small, but we ignore CL2:
708 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
709 * XXXX Can we handle case-fold? Unclear:
710 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
711 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
712 */
713 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
714 && !(or_with->flags & ANYOF_FOLD)
715 && !(cl->flags & ANYOF_FOLD) ) {
716 int i;
717
718 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
719 cl->bitmap[i] |= ~or_with->bitmap[i];
720 } /* XXXX: logic is complicated otherwise */
721 else {
830247a4 722 cl_anything(pRExC_state, cl);
653099ff
GS
723 }
724 } else {
725 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
726 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
b81d288d 727 && (!(or_with->flags & ANYOF_FOLD)
653099ff
GS
728 || (cl->flags & ANYOF_FOLD)) ) {
729 int i;
730
731 /* OR char bitmap and class bitmap separately */
732 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
733 cl->bitmap[i] |= or_with->bitmap[i];
734 if (or_with->flags & ANYOF_CLASS) {
735 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
736 cl->classflags[i] |= or_with->classflags[i];
737 cl->flags |= ANYOF_CLASS;
738 }
739 }
740 else { /* XXXX: logic is complicated, leave it along for a moment. */
830247a4 741 cl_anything(pRExC_state, cl);
653099ff
GS
742 }
743 }
744 if (or_with->flags & ANYOF_EOS)
745 cl->flags |= ANYOF_EOS;
1aa99e6b
IH
746
747 if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
748 ARG(cl) != ARG(or_with)) {
749 cl->flags |= ANYOF_UNICODE_ALL;
750 cl->flags &= ~ANYOF_UNICODE;
751 }
752 if (or_with->flags & ANYOF_UNICODE_ALL) {
753 cl->flags |= ANYOF_UNICODE_ALL;
754 cl->flags &= ~ANYOF_UNICODE;
755 }
653099ff
GS
756}
757
a3621e74
YO
758#define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
759#define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
760#define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
761#define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
762
3dab1dad
YO
763
764#ifdef DEBUGGING
07be1b83 765/*
3dab1dad
YO
766 dump_trie(trie)
767 dump_trie_interim_list(trie,next_alloc)
768 dump_trie_interim_table(trie,next_alloc)
769
770 These routines dump out a trie in a somewhat readable format.
07be1b83
YO
771 The _interim_ variants are used for debugging the interim
772 tables that are used to generate the final compressed
773 representation which is what dump_trie expects.
774
3dab1dad
YO
775 Part of the reason for their existance is to provide a form
776 of documentation as to how the different representations function.
07be1b83
YO
777
778*/
3dab1dad
YO
779
780/*
781 dump_trie(trie)
782 Dumps the final compressed table form of the trie to Perl_debug_log.
783 Used for debugging make_trie().
784*/
785
786STATIC void
787S_dump_trie(pTHX_ const struct _reg_trie_data *trie,U32 depth)
788{
789 U32 state;
ab3bbdeb
YO
790 SV *sv=sv_newmortal();
791 int colwidth= trie->widecharmap ? 6 : 4;
3dab1dad
YO
792 GET_RE_DEBUG_FLAGS_DECL;
793
ab3bbdeb 794
3dab1dad
YO
795 PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
796 (int)depth * 2 + 2,"",
797 "Match","Base","Ofs" );
798
799 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
be8e71aa 800 SV ** const tmp = av_fetch( trie->revcharmap, state, 0);
3dab1dad 801 if ( tmp ) {
ab3bbdeb
YO
802 PerlIO_printf( Perl_debug_log, "%*s",
803 colwidth,
ddc5bc0f 804 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
805 PL_colors[0], PL_colors[1],
806 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
807 PERL_PV_ESCAPE_FIRSTCHAR
808 )
809 );
3dab1dad
YO
810 }
811 }
812 PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
813 (int)depth * 2 + 2,"");
814
815 for( state = 0 ; state < trie->uniquecharcount ; state++ )
ab3bbdeb 816 PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
3dab1dad
YO
817 PerlIO_printf( Perl_debug_log, "\n");
818
786e8c11 819 for( state = 1 ; state < trie->laststate ; state++ ) {
be8e71aa 820 const U32 base = trie->states[ state ].trans.base;
3dab1dad
YO
821
822 PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
823
824 if ( trie->states[ state ].wordnum ) {
825 PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
826 } else {
827 PerlIO_printf( Perl_debug_log, "%6s", "" );
828 }
829
830 PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
831
832 if ( base ) {
833 U32 ofs = 0;
834
835 while( ( base + ofs < trie->uniquecharcount ) ||
836 ( base + ofs - trie->uniquecharcount < trie->lasttrans
837 && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
838 ofs++;
839
840 PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
841
842 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
843 if ( ( base + ofs >= trie->uniquecharcount ) &&
844 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
845 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
846 {
ab3bbdeb
YO
847 PerlIO_printf( Perl_debug_log, "%*"UVXf,
848 colwidth,
3dab1dad
YO
849 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
850 } else {
ab3bbdeb 851 PerlIO_printf( Perl_debug_log, "%*s",colwidth," ." );
3dab1dad
YO
852 }
853 }
854
855 PerlIO_printf( Perl_debug_log, "]");
856
857 }
858 PerlIO_printf( Perl_debug_log, "\n" );
859 }
860}
861/*
862 dump_trie_interim_list(trie,next_alloc)
863 Dumps a fully constructed but uncompressed trie in list form.
864 List tries normally only are used for construction when the number of
865 possible chars (trie->uniquecharcount) is very high.
866 Used for debugging make_trie().
867*/
868STATIC void
869S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie, U32 next_alloc,U32 depth)
870{
871 U32 state;
ab3bbdeb
YO
872 SV *sv=sv_newmortal();
873 int colwidth= trie->widecharmap ? 6 : 4;
3dab1dad
YO
874 GET_RE_DEBUG_FLAGS_DECL;
875 /* print out the table precompression. */
ab3bbdeb
YO
876 PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
877 (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
878 "------:-----+-----------------\n" );
3dab1dad
YO
879
880 for( state=1 ; state < next_alloc ; state ++ ) {
881 U16 charid;
882
ab3bbdeb 883 PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
3dab1dad
YO
884 (int)depth * 2 + 2,"", (UV)state );
885 if ( ! trie->states[ state ].wordnum ) {
886 PerlIO_printf( Perl_debug_log, "%5s| ","");
887 } else {
888 PerlIO_printf( Perl_debug_log, "W%4x| ",
889 trie->states[ state ].wordnum
890 );
891 }
892 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
be8e71aa 893 SV ** const tmp = av_fetch( trie->revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
ab3bbdeb
YO
894 if ( tmp ) {
895 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
896 colwidth,
ddc5bc0f 897 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
898 PL_colors[0], PL_colors[1],
899 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
900 PERL_PV_ESCAPE_FIRSTCHAR
901 ) ,
3dab1dad
YO
902 TRIE_LIST_ITEM(state,charid).forid,
903 (UV)TRIE_LIST_ITEM(state,charid).newstate
904 );
905 }
ab3bbdeb
YO
906 }
907 PerlIO_printf( Perl_debug_log, "\n");
3dab1dad
YO
908 }
909}
910
911/*
912 dump_trie_interim_table(trie,next_alloc)
913 Dumps a fully constructed but uncompressed trie in table form.
914 This is the normal DFA style state transition table, with a few
915 twists to facilitate compression later.
916 Used for debugging make_trie().
917*/
918STATIC void
919S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie, U32 next_alloc, U32 depth)
920{
921 U32 state;
922 U16 charid;
ab3bbdeb
YO
923 SV *sv=sv_newmortal();
924 int colwidth= trie->widecharmap ? 6 : 4;
3dab1dad
YO
925 GET_RE_DEBUG_FLAGS_DECL;
926
927 /*
928 print out the table precompression so that we can do a visual check
929 that they are identical.
930 */
931
932 PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
933
934 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
be8e71aa 935 SV ** const tmp = av_fetch( trie->revcharmap, charid, 0);
3dab1dad 936 if ( tmp ) {
ab3bbdeb
YO
937 PerlIO_printf( Perl_debug_log, "%*s",
938 colwidth,
ddc5bc0f 939 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
940 PL_colors[0], PL_colors[1],
941 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
942 PERL_PV_ESCAPE_FIRSTCHAR
943 )
944 );
3dab1dad
YO
945 }
946 }
947
948 PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
949
950 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
ab3bbdeb 951 PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
3dab1dad
YO
952 }
953
954 PerlIO_printf( Perl_debug_log, "\n" );
955
956 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
957
958 PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
959 (int)depth * 2 + 2,"",
960 (UV)TRIE_NODENUM( state ) );
961
962 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
ab3bbdeb
YO
963 UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
964 if (v)
965 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
966 else
967 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
3dab1dad
YO
968 }
969 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
970 PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
971 } else {
972 PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
973 trie->states[ TRIE_NODENUM( state ) ].wordnum );
974 }
975 }
07be1b83 976}
3dab1dad
YO
977
978#endif
979
786e8c11
YO
980/* make_trie(startbranch,first,last,tail,word_count,flags,depth)
981 startbranch: the first branch in the whole branch sequence
982 first : start branch of sequence of branch-exact nodes.
983 May be the same as startbranch
984 last : Thing following the last branch.
985 May be the same as tail.
986 tail : item following the branch sequence
987 count : words in the sequence
988 flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/
989 depth : indent depth
3dab1dad 990
786e8c11 991Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
07be1b83 992
786e8c11
YO
993A trie is an N'ary tree where the branches are determined by digital
994decomposition of the key. IE, at the root node you look up the 1st character and
995follow that branch repeat until you find the end of the branches. Nodes can be
996marked as "accepting" meaning they represent a complete word. Eg:
07be1b83 997
786e8c11 998 /he|she|his|hers/
72f13be8 999
786e8c11
YO
1000would convert into the following structure. Numbers represent states, letters
1001following numbers represent valid transitions on the letter from that state, if
1002the number is in square brackets it represents an accepting state, otherwise it
1003will be in parenthesis.
07be1b83 1004
786e8c11
YO
1005 +-h->+-e->[3]-+-r->(8)-+-s->[9]
1006 | |
1007 | (2)
1008 | |
1009 (1) +-i->(6)-+-s->[7]
1010 |
1011 +-s->(3)-+-h->(4)-+-e->[5]
07be1b83 1012
786e8c11
YO
1013 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1014
1015This shows that when matching against the string 'hers' we will begin at state 1
1016read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1017then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1018is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1019single traverse. We store a mapping from accepting to state to which word was
1020matched, and then when we have multiple possibilities we try to complete the
1021rest of the regex in the order in which they occured in the alternation.
1022
1023The only prior NFA like behaviour that would be changed by the TRIE support is
1024the silent ignoring of duplicate alternations which are of the form:
1025
1026 / (DUPE|DUPE) X? (?{ ... }) Y /x
1027
1028Thus EVAL blocks follwing a trie may be called a different number of times with
1029and without the optimisation. With the optimisations dupes will be silently
1030ignored. This inconsistant behaviour of EVAL type nodes is well established as
1031the following demonstrates:
1032
1033 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1034
1035which prints out 'word' three times, but
1036
1037 'words'=~/(word|word|word)(?{ print $1 })S/
1038
1039which doesnt print it out at all. This is due to other optimisations kicking in.
1040
1041Example of what happens on a structural level:
1042
1043The regexp /(ac|ad|ab)+/ will produce the folowing debug output:
1044
1045 1: CURLYM[1] {1,32767}(18)
1046 5: BRANCH(8)
1047 6: EXACT <ac>(16)
1048 8: BRANCH(11)
1049 9: EXACT <ad>(16)
1050 11: BRANCH(14)
1051 12: EXACT <ab>(16)
1052 16: SUCCEED(0)
1053 17: NOTHING(18)
1054 18: END(0)
1055
1056This would be optimizable with startbranch=5, first=5, last=16, tail=16
1057and should turn into:
1058
1059 1: CURLYM[1] {1,32767}(18)
1060 5: TRIE(16)
1061 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1062 <ac>
1063 <ad>
1064 <ab>
1065 16: SUCCEED(0)
1066 17: NOTHING(18)
1067 18: END(0)
1068
1069Cases where tail != last would be like /(?foo|bar)baz/:
1070
1071 1: BRANCH(4)
1072 2: EXACT <foo>(8)
1073 4: BRANCH(7)
1074 5: EXACT <bar>(8)
1075 7: TAIL(8)
1076 8: EXACT <baz>(10)
1077 10: END(0)
1078
1079which would be optimizable with startbranch=1, first=1, last=7, tail=8
1080and would end up looking like:
1081
1082 1: TRIE(8)
1083 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1084 <foo>
1085 <bar>
1086 7: TAIL(8)
1087 8: EXACT <baz>(10)
1088 10: END(0)
1089
1090 d = uvuni_to_utf8_flags(d, uv, 0);
1091
1092is the recommended Unicode-aware way of saying
1093
1094 *(d++) = uv;
1095*/
1096
1097#define TRIE_STORE_REVCHAR \
1098 STMT_START { \
1099 SV *tmp = Perl_newSVpvf_nocontext( "%c", (int)uvc ); \
1100 if (UTF) SvUTF8_on(tmp); \
1101 av_push( TRIE_REVCHARMAP(trie), tmp ); \
1102 } STMT_END
1103
1104#define TRIE_READ_CHAR STMT_START { \
1105 wordlen++; \
1106 if ( UTF ) { \
1107 if ( folder ) { \
1108 if ( foldlen > 0 ) { \
1109 uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags ); \
1110 foldlen -= len; \
1111 scan += len; \
1112 len = 0; \
1113 } else { \
1114 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1115 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
1116 foldlen -= UNISKIP( uvc ); \
1117 scan = foldbuf + UNISKIP( uvc ); \
1118 } \
1119 } else { \
1120 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1121 } \
1122 } else { \
1123 uvc = (U32)*uc; \
1124 len = 1; \
1125 } \
1126} STMT_END
1127
1128
1129
1130#define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
1131 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
1132 TRIE_LIST_LEN( state ) *= 2; \
1133 Renew( trie->states[ state ].trans.list, \
1134 TRIE_LIST_LEN( state ), reg_trie_trans_le ); \
1135 } \
1136 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
1137 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
1138 TRIE_LIST_CUR( state )++; \
1139} STMT_END
07be1b83 1140
786e8c11
YO
1141#define TRIE_LIST_NEW(state) STMT_START { \
1142 Newxz( trie->states[ state ].trans.list, \
1143 4, reg_trie_trans_le ); \
1144 TRIE_LIST_CUR( state ) = 1; \
1145 TRIE_LIST_LEN( state ) = 4; \
1146} STMT_END
07be1b83 1147
786e8c11
YO
1148#define TRIE_HANDLE_WORD(state) STMT_START { \
1149 U16 dupe= trie->states[ state ].wordnum; \
1150 regnode * const noper_next = regnext( noper ); \
1151 \
1152 if (trie->wordlen) \
1153 trie->wordlen[ curword ] = wordlen; \
1154 DEBUG_r({ \
1155 /* store the word for dumping */ \
1156 SV* tmp; \
1157 if (OP(noper) != NOTHING) \
1158 tmp = newSVpvn(STRING(noper), STR_LEN(noper)); \
1159 else \
1160 tmp = newSVpvn( "", 0 ); \
1161 if ( UTF ) SvUTF8_on( tmp ); \
1162 av_push( trie->words, tmp ); \
1163 }); \
1164 \
1165 curword++; \
1166 \
1167 if ( noper_next < tail ) { \
1168 if (!trie->jump) \
1169 Newxz( trie->jump, word_count + 1, U16); \
1170 trie->jump[curword] = (U16)(tail - noper_next); \
1171 if (!jumper) \
1172 jumper = noper_next; \
1173 if (!nextbranch) \
1174 nextbranch= regnext(cur); \
1175 } \
1176 \
1177 if ( dupe ) { \
1178 /* So it's a dupe. This means we need to maintain a */\
1179 /* linked-list from the first to the next. */\
1180 /* we only allocate the nextword buffer when there */\
1181 /* a dupe, so first time we have to do the allocation */\
1182 if (!trie->nextword) \
1183 Newxz( trie->nextword, word_count + 1, U16); \
1184 while ( trie->nextword[dupe] ) \
1185 dupe= trie->nextword[dupe]; \
1186 trie->nextword[dupe]= curword; \
1187 } else { \
1188 /* we haven't inserted this word yet. */ \
1189 trie->states[ state ].wordnum = curword; \
1190 } \
1191} STMT_END
07be1b83 1192
3dab1dad 1193
786e8c11
YO
1194#define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
1195 ( ( base + charid >= ucharcount \
1196 && base + charid < ubound \
1197 && state == trie->trans[ base - ucharcount + charid ].check \
1198 && trie->trans[ base - ucharcount + charid ].next ) \
1199 ? trie->trans[ base - ucharcount + charid ].next \
1200 : ( state==1 ? special : 0 ) \
1201 )
3dab1dad 1202
786e8c11
YO
1203#define MADE_TRIE 1
1204#define MADE_JUMP_TRIE 2
1205#define MADE_EXACT_TRIE 4
3dab1dad 1206
a3621e74 1207STATIC I32
786e8c11 1208S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth)
a3621e74 1209{
27da23d5 1210 dVAR;
a3621e74
YO
1211 /* first pass, loop through and scan words */
1212 reg_trie_data *trie;
1213 regnode *cur;
9f7f3913 1214 const U32 uniflags = UTF8_ALLOW_DEFAULT;
a3621e74
YO
1215 STRLEN len = 0;
1216 UV uvc = 0;
1217 U16 curword = 0;
1218 U32 next_alloc = 0;
786e8c11
YO
1219 regnode *jumper = NULL;
1220 regnode *nextbranch = NULL;
a3621e74 1221 /* we just use folder as a flag in utf8 */
e1ec3a88 1222 const U8 * const folder = ( flags == EXACTF
a3621e74
YO
1223 ? PL_fold
1224 : ( flags == EXACTFL
1225 ? PL_fold_locale
1226 : NULL
1227 )
1228 );
1229
e1ec3a88 1230 const U32 data_slot = add_data( pRExC_state, 1, "t" );
a3621e74 1231 SV *re_trie_maxbuff;
3dab1dad
YO
1232#ifndef DEBUGGING
1233 /* these are only used during construction but are useful during
8e11feef 1234 * debugging so we store them in the struct when debugging.
8e11feef 1235 */
3dab1dad 1236 STRLEN trie_charcount=0;
3dab1dad
YO
1237 AV *trie_revcharmap;
1238#endif
a3621e74 1239 GET_RE_DEBUG_FLAGS_DECL;
72f13be8
YO
1240#ifndef DEBUGGING
1241 PERL_UNUSED_ARG(depth);
1242#endif
a3621e74 1243
a02a5408 1244 Newxz( trie, 1, reg_trie_data );
a3621e74 1245 trie->refcount = 1;
3dab1dad 1246 trie->startstate = 1;
786e8c11 1247 trie->wordcount = word_count;
a3621e74 1248 RExC_rx->data->data[ data_slot ] = (void*)trie;
a02a5408 1249 Newxz( trie->charmap, 256, U16 );
3dab1dad
YO
1250 if (!(UTF && folder))
1251 Newxz( trie->bitmap, ANYOF_BITMAP_SIZE, char );
a3621e74
YO
1252 DEBUG_r({
1253 trie->words = newAV();
a3621e74 1254 });
3dab1dad 1255 TRIE_REVCHARMAP(trie) = newAV();
a3621e74 1256
0111c4fd 1257 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
a3621e74 1258 if (!SvIOK(re_trie_maxbuff)) {
0111c4fd 1259 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
a3621e74 1260 }
3dab1dad
YO
1261 DEBUG_OPTIMISE_r({
1262 PerlIO_printf( Perl_debug_log,
786e8c11 1263 "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
3dab1dad
YO
1264 (int)depth * 2 + 2, "",
1265 REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
786e8c11 1266 REG_NODE_NUM(last), REG_NODE_NUM(tail),
85c3142d 1267 (int)depth);
3dab1dad 1268 });
a3621e74
YO
1269 /* -- First loop and Setup --
1270
1271 We first traverse the branches and scan each word to determine if it
1272 contains widechars, and how many unique chars there are, this is
1273 important as we have to build a table with at least as many columns as we
1274 have unique chars.
1275
1276 We use an array of integers to represent the character codes 0..255
1277 (trie->charmap) and we use a an HV* to store unicode characters. We use the
1278 native representation of the character value as the key and IV's for the
1279 coded index.
1280
1281 *TODO* If we keep track of how many times each character is used we can
1282 remap the columns so that the table compression later on is more
1283 efficient in terms of memory by ensuring most common value is in the
1284 middle and the least common are on the outside. IMO this would be better
1285 than a most to least common mapping as theres a decent chance the most
1286 common letter will share a node with the least common, meaning the node
1287 will not be compressable. With a middle is most common approach the worst
1288 case is when we have the least common nodes twice.
1289
1290 */
1291
a3621e74 1292 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
c445ea15 1293 regnode * const noper = NEXTOPER( cur );
e1ec3a88 1294 const U8 *uc = (U8*)STRING( noper );
a28509cc 1295 const U8 * const e = uc + STR_LEN( noper );
a3621e74
YO
1296 STRLEN foldlen = 0;
1297 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2af232bd 1298 const U8 *scan = (U8*)NULL;
07be1b83 1299 U32 wordlen = 0; /* required init */
3dab1dad 1300 STRLEN chars=0;
a3621e74 1301
3dab1dad
YO
1302 if (OP(noper) == NOTHING) {
1303 trie->minlen= 0;
1304 continue;
1305 }
1306 if (trie->bitmap) {
1307 TRIE_BITMAP_SET(trie,*uc);
1308 if ( folder ) TRIE_BITMAP_SET(trie,folder[ *uc ]);
1309 }
a3621e74 1310 for ( ; uc < e ; uc += len ) {
3dab1dad 1311 TRIE_CHARCOUNT(trie)++;
a3621e74 1312 TRIE_READ_CHAR;
3dab1dad 1313 chars++;
a3621e74
YO
1314 if ( uvc < 256 ) {
1315 if ( !trie->charmap[ uvc ] ) {
1316 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
1317 if ( folder )
1318 trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
3dab1dad 1319 TRIE_STORE_REVCHAR;
a3621e74
YO
1320 }
1321 } else {
1322 SV** svpp;
1323 if ( !trie->widecharmap )
1324 trie->widecharmap = newHV();
1325
1326 svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 1 );
1327
1328 if ( !svpp )
e4584336 1329 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
a3621e74
YO
1330
1331 if ( !SvTRUE( *svpp ) ) {
1332 sv_setiv( *svpp, ++trie->uniquecharcount );
3dab1dad 1333 TRIE_STORE_REVCHAR;
a3621e74
YO
1334 }
1335 }
1336 }
3dab1dad
YO
1337 if( cur == first ) {
1338 trie->minlen=chars;
1339 trie->maxlen=chars;
1340 } else if (chars < trie->minlen) {
1341 trie->minlen=chars;
1342 } else if (chars > trie->maxlen) {
1343 trie->maxlen=chars;
1344 }
1345
a3621e74
YO
1346 } /* end first pass */
1347 DEBUG_TRIE_COMPILE_r(
3dab1dad
YO
1348 PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1349 (int)depth * 2 + 2,"",
85c3142d 1350 ( trie->widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
be8e71aa
YO
1351 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
1352 (int)trie->minlen, (int)trie->maxlen )
a3621e74 1353 );
786e8c11 1354 Newxz( trie->wordlen, word_count, U32 );
a3621e74
YO
1355
1356 /*
1357 We now know what we are dealing with in terms of unique chars and
1358 string sizes so we can calculate how much memory a naive
0111c4fd
RGS
1359 representation using a flat table will take. If it's over a reasonable
1360 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
a3621e74
YO
1361 conservative but potentially much slower representation using an array
1362 of lists.
1363
1364 At the end we convert both representations into the same compressed
1365 form that will be used in regexec.c for matching with. The latter
1366 is a form that cannot be used to construct with but has memory
1367 properties similar to the list form and access properties similar
1368 to the table form making it both suitable for fast searches and
1369 small enough that its feasable to store for the duration of a program.
1370
1371 See the comment in the code where the compressed table is produced
1372 inplace from the flat tabe representation for an explanation of how
1373 the compression works.
1374
1375 */
1376
1377
3dab1dad 1378 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
a3621e74
YO
1379 /*
1380 Second Pass -- Array Of Lists Representation
1381
1382 Each state will be represented by a list of charid:state records
1383 (reg_trie_trans_le) the first such element holds the CUR and LEN
1384 points of the allocated array. (See defines above).
1385
1386 We build the initial structure using the lists, and then convert
1387 it into the compressed table form which allows faster lookups
1388 (but cant be modified once converted).
a3621e74
YO
1389 */
1390
a3621e74
YO
1391 STRLEN transcount = 1;
1392
3dab1dad 1393 Newxz( trie->states, TRIE_CHARCOUNT(trie) + 2, reg_trie_state );
a3621e74
YO
1394 TRIE_LIST_NEW(1);
1395 next_alloc = 2;
1396
1397 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1398
c445ea15
AL
1399 regnode * const noper = NEXTOPER( cur );
1400 U8 *uc = (U8*)STRING( noper );
1401 const U8 * const e = uc + STR_LEN( noper );
1402 U32 state = 1; /* required init */
1403 U16 charid = 0; /* sanity init */
1404 U8 *scan = (U8*)NULL; /* sanity init */
1405 STRLEN foldlen = 0; /* required init */
07be1b83 1406 U32 wordlen = 0; /* required init */
c445ea15
AL
1407 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1408
3dab1dad 1409 if (OP(noper) != NOTHING) {
786e8c11 1410 for ( ; uc < e ; uc += len ) {
c445ea15 1411
786e8c11 1412 TRIE_READ_CHAR;
c445ea15 1413
786e8c11
YO
1414 if ( uvc < 256 ) {
1415 charid = trie->charmap[ uvc ];
c445ea15 1416 } else {
786e8c11
YO
1417 SV** const svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 0);
1418 if ( !svpp ) {
1419 charid = 0;
1420 } else {
1421 charid=(U16)SvIV( *svpp );
1422 }
c445ea15 1423 }
786e8c11
YO
1424 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1425 if ( charid ) {
a3621e74 1426
786e8c11
YO
1427 U16 check;
1428 U32 newstate = 0;
a3621e74 1429
786e8c11
YO
1430 charid--;
1431 if ( !trie->states[ state ].trans.list ) {
1432 TRIE_LIST_NEW( state );
c445ea15 1433 }
786e8c11
YO
1434 for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1435 if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1436 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1437 break;
1438 }
1439 }
1440 if ( ! newstate ) {
1441 newstate = next_alloc++;
1442 TRIE_LIST_PUSH( state, charid, newstate );
1443 transcount++;
1444 }
1445 state = newstate;
1446 } else {
1447 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
c445ea15 1448 }
a28509cc 1449 }
c445ea15 1450 }
3dab1dad 1451 TRIE_HANDLE_WORD(state);
a3621e74
YO
1452
1453 } /* end second pass */
1454
786e8c11 1455 trie->laststate = next_alloc;
a3621e74
YO
1456 Renew( trie->states, next_alloc, reg_trie_state );
1457
3dab1dad
YO
1458 /* and now dump it out before we compress it */
1459 DEBUG_TRIE_COMPILE_MORE_r(
1460 dump_trie_interim_list(trie,next_alloc,depth+1)
a3621e74 1461 );
a3621e74 1462
a02a5408 1463 Newxz( trie->trans, transcount ,reg_trie_trans );
a3621e74
YO
1464 {
1465 U32 state;
a3621e74
YO
1466 U32 tp = 0;
1467 U32 zp = 0;
1468
1469
1470 for( state=1 ; state < next_alloc ; state ++ ) {
1471 U32 base=0;
1472
1473 /*
1474 DEBUG_TRIE_COMPILE_MORE_r(
1475 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1476 );
1477 */
1478
1479 if (trie->states[state].trans.list) {
1480 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1481 U16 maxid=minid;
a28509cc 1482 U16 idx;
a3621e74
YO
1483
1484 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
c445ea15
AL
1485 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1486 if ( forid < minid ) {
1487 minid=forid;
1488 } else if ( forid > maxid ) {
1489 maxid=forid;
1490 }
a3621e74
YO
1491 }
1492 if ( transcount < tp + maxid - minid + 1) {
1493 transcount *= 2;
1494 Renew( trie->trans, transcount, reg_trie_trans );
1495 Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1496 }
1497 base = trie->uniquecharcount + tp - minid;
1498 if ( maxid == minid ) {
1499 U32 set = 0;
1500 for ( ; zp < tp ; zp++ ) {
1501 if ( ! trie->trans[ zp ].next ) {
1502 base = trie->uniquecharcount + zp - minid;
1503 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1504 trie->trans[ zp ].check = state;
1505 set = 1;
1506 break;
1507 }
1508 }
1509 if ( !set ) {
1510 trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1511 trie->trans[ tp ].check = state;
1512 tp++;
1513 zp = tp;
1514 }
1515 } else {
1516 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
c445ea15 1517 const U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
a3621e74
YO
1518 trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1519 trie->trans[ tid ].check = state;
1520 }
1521 tp += ( maxid - minid + 1 );
1522 }
1523 Safefree(trie->states[ state ].trans.list);
1524 }
1525 /*
1526 DEBUG_TRIE_COMPILE_MORE_r(
1527 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1528 );
1529 */
1530 trie->states[ state ].trans.base=base;
1531 }
cc601c31 1532 trie->lasttrans = tp + 1;
a3621e74
YO
1533 }
1534 } else {
1535 /*
1536 Second Pass -- Flat Table Representation.
1537
1538 we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1539 We know that we will need Charcount+1 trans at most to store the data
1540 (one row per char at worst case) So we preallocate both structures
1541 assuming worst case.
1542
1543 We then construct the trie using only the .next slots of the entry
1544 structs.
1545
1546 We use the .check field of the first entry of the node temporarily to
1547 make compression both faster and easier by keeping track of how many non
1548 zero fields are in the node.
1549
1550 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1551 transition.
1552
1553 There are two terms at use here: state as a TRIE_NODEIDX() which is a
1554 number representing the first entry of the node, and state as a
1555 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1556 TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1557 are 2 entrys per node. eg:
1558
1559 A B A B
1560 1. 2 4 1. 3 7
1561 2. 0 3 3. 0 5
1562 3. 0 0 5. 0 0
1563 4. 0 0 7. 0 0
1564
1565 The table is internally in the right hand, idx form. However as we also
1566 have to deal with the states array which is indexed by nodenum we have to
1567 use TRIE_NODENUM() to convert.
1568
1569 */
1570
3dab1dad
YO
1571
1572 Newxz( trie->trans, ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1,
a3621e74 1573 reg_trie_trans );
3dab1dad 1574 Newxz( trie->states, TRIE_CHARCOUNT(trie) + 2, reg_trie_state );
a3621e74
YO
1575 next_alloc = trie->uniquecharcount + 1;
1576
3dab1dad 1577
a3621e74
YO
1578 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1579
c445ea15 1580 regnode * const noper = NEXTOPER( cur );
a28509cc
AL
1581 const U8 *uc = (U8*)STRING( noper );
1582 const U8 * const e = uc + STR_LEN( noper );
a3621e74
YO
1583
1584 U32 state = 1; /* required init */
1585
1586 U16 charid = 0; /* sanity init */
1587 U32 accept_state = 0; /* sanity init */
1588 U8 *scan = (U8*)NULL; /* sanity init */
1589
1590 STRLEN foldlen = 0; /* required init */
07be1b83 1591 U32 wordlen = 0; /* required init */
a3621e74
YO
1592 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1593
3dab1dad 1594 if ( OP(noper) != NOTHING ) {
786e8c11 1595 for ( ; uc < e ; uc += len ) {
a3621e74 1596
786e8c11 1597 TRIE_READ_CHAR;
a3621e74 1598
786e8c11
YO
1599 if ( uvc < 256 ) {
1600 charid = trie->charmap[ uvc ];
1601 } else {
1602 SV* const * const svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 0);
1603 charid = svpp ? (U16)SvIV(*svpp) : 0;
a3621e74 1604 }
786e8c11
YO
1605 if ( charid ) {
1606 charid--;
1607 if ( !trie->trans[ state + charid ].next ) {
1608 trie->trans[ state + charid ].next = next_alloc;
1609 trie->trans[ state ].check++;
1610 next_alloc += trie->uniquecharcount;
1611 }
1612 state = trie->trans[ state + charid ].next;
1613 } else {
1614 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1615 }
1616 /* charid is now 0 if we dont know the char read, or nonzero if we do */
a3621e74 1617 }
a3621e74 1618 }
3dab1dad
YO
1619 accept_state = TRIE_NODENUM( state );
1620 TRIE_HANDLE_WORD(accept_state);
a3621e74
YO
1621
1622 } /* end second pass */
1623
3dab1dad
YO
1624 /* and now dump it out before we compress it */
1625 DEBUG_TRIE_COMPILE_MORE_r(
1626 dump_trie_interim_table(trie,next_alloc,depth+1)
1627 );
a3621e74 1628
a3621e74
YO
1629 {
1630 /*
1631 * Inplace compress the table.*
1632
1633 For sparse data sets the table constructed by the trie algorithm will
1634 be mostly 0/FAIL transitions or to put it another way mostly empty.
1635 (Note that leaf nodes will not contain any transitions.)
1636
1637 This algorithm compresses the tables by eliminating most such
1638 transitions, at the cost of a modest bit of extra work during lookup:
1639
1640 - Each states[] entry contains a .base field which indicates the
1641 index in the state[] array wheres its transition data is stored.
1642
1643 - If .base is 0 there are no valid transitions from that node.
1644
1645 - If .base is nonzero then charid is added to it to find an entry in
1646 the trans array.
1647
1648 -If trans[states[state].base+charid].check!=state then the
1649 transition is taken to be a 0/Fail transition. Thus if there are fail
1650 transitions at the front of the node then the .base offset will point
1651 somewhere inside the previous nodes data (or maybe even into a node
1652 even earlier), but the .check field determines if the transition is
1653 valid.
1654
786e8c11 1655 XXX - wrong maybe?
a3621e74
YO
1656 The following process inplace converts the table to the compressed
1657 table: We first do not compress the root node 1,and mark its all its
1658 .check pointers as 1 and set its .base pointer as 1 as well. This
1659 allows to do a DFA construction from the compressed table later, and
1660 ensures that any .base pointers we calculate later are greater than
1661 0.
1662
1663 - We set 'pos' to indicate the first entry of the second node.
1664
1665 - We then iterate over the columns of the node, finding the first and
1666 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
1667 and set the .check pointers accordingly, and advance pos
1668 appropriately and repreat for the next node. Note that when we copy
1669 the next pointers we have to convert them from the original
1670 NODEIDX form to NODENUM form as the former is not valid post
1671 compression.
1672
1673 - If a node has no transitions used we mark its base as 0 and do not
1674 advance the pos pointer.
1675
1676 - If a node only has one transition we use a second pointer into the
1677 structure to fill in allocated fail transitions from other states.
1678 This pointer is independent of the main pointer and scans forward
1679 looking for null transitions that are allocated to a state. When it
1680 finds one it writes the single transition into the "hole". If the
786e8c11 1681 pointer doesnt find one the single transition is appended as normal.
a3621e74
YO
1682
1683 - Once compressed we can Renew/realloc the structures to release the
1684 excess space.
1685
1686 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
1687 specifically Fig 3.47 and the associated pseudocode.
1688
1689 demq
1690 */
a3b680e6 1691 const U32 laststate = TRIE_NODENUM( next_alloc );
a28509cc 1692 U32 state, charid;
a3621e74 1693 U32 pos = 0, zp=0;
786e8c11 1694 trie->laststate = laststate;
a3621e74
YO
1695
1696 for ( state = 1 ; state < laststate ; state++ ) {
1697 U8 flag = 0;
a28509cc
AL
1698 const U32 stateidx = TRIE_NODEIDX( state );
1699 const U32 o_used = trie->trans[ stateidx ].check;
1700 U32 used = trie->trans[ stateidx ].check;
a3621e74
YO
1701 trie->trans[ stateidx ].check = 0;
1702
1703 for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
1704 if ( flag || trie->trans[ stateidx + charid ].next ) {
1705 if ( trie->trans[ stateidx + charid ].next ) {
1706 if (o_used == 1) {
1707 for ( ; zp < pos ; zp++ ) {
1708 if ( ! trie->trans[ zp ].next ) {
1709 break;
1710 }
1711 }
1712 trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
1713 trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1714 trie->trans[ zp ].check = state;
1715 if ( ++zp > pos ) pos = zp;
1716 break;
1717 }
1718 used--;
1719 }
1720 if ( !flag ) {
1721 flag = 1;
1722 trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
1723 }
1724 trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1725 trie->trans[ pos ].check = state;
1726 pos++;
1727 }
1728 }
1729 }
cc601c31 1730 trie->lasttrans = pos + 1;
a3621e74
YO
1731 Renew( trie->states, laststate + 1, reg_trie_state);
1732 DEBUG_TRIE_COMPILE_MORE_r(
e4584336 1733 PerlIO_printf( Perl_debug_log,
3dab1dad
YO
1734 "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
1735 (int)depth * 2 + 2,"",
1736 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
5d7488b2
AL
1737 (IV)next_alloc,
1738 (IV)pos,
a3621e74
YO
1739 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
1740 );
1741
1742 } /* end table compress */
1743 }
cc601c31
YO
1744 /* resize the trans array to remove unused space */
1745 Renew( trie->trans, trie->lasttrans, reg_trie_trans);
a3621e74 1746
3dab1dad
YO
1747 /* and now dump out the compressed format */
1748 DEBUG_TRIE_COMPILE_r(
1749 dump_trie(trie,depth+1)
1750 );
07be1b83 1751
3dab1dad
YO
1752 { /* Modify the program and insert the new TRIE node*/
1753 regnode *convert;
1754 U8 nodetype =(U8)(flags & 0xFF);
1755 char *str=NULL;
786e8c11 1756
07be1b83 1757#ifdef DEBUGGING
a5ca303d 1758 regnode *optimize;
b57a0404
JH
1759 U32 mjd_offset = 0;
1760 U32 mjd_nodelen = 0;
07be1b83 1761#endif
a3621e74 1762 /*
3dab1dad
YO
1763 This means we convert either the first branch or the first Exact,
1764 depending on whether the thing following (in 'last') is a branch
1765 or not and whther first is the startbranch (ie is it a sub part of
1766 the alternation or is it the whole thing.)
1767 Assuming its a sub part we conver the EXACT otherwise we convert
1768 the whole branch sequence, including the first.
a3621e74 1769 */
3dab1dad
YO
1770 /* Find the node we are going to overwrite */
1771 if ( first == startbranch && OP( last ) != BRANCH ) {
07be1b83 1772 /* whole branch chain */
3dab1dad 1773 convert = first;
07be1b83
YO
1774 DEBUG_r({
1775 const regnode *nop = NEXTOPER( convert );
1776 mjd_offset= Node_Offset((nop));
1777 mjd_nodelen= Node_Length((nop));
1778 });
1779 } else {
1780 /* branch sub-chain */
3dab1dad
YO
1781 convert = NEXTOPER( first );
1782 NEXT_OFF( first ) = (U16)(last - first);
07be1b83
YO
1783 DEBUG_r({
1784 mjd_offset= Node_Offset((convert));
1785 mjd_nodelen= Node_Length((convert));
1786 });
1787 }
1788 DEBUG_OPTIMISE_r(
1789 PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
1790 (int)depth * 2 + 2, "",
786e8c11 1791 (UV)mjd_offset, (UV)mjd_nodelen)
07be1b83 1792 );
a3621e74 1793
3dab1dad
YO
1794 /* But first we check to see if there is a common prefix we can
1795 split out as an EXACT and put in front of the TRIE node. */
1796 trie->startstate= 1;
786e8c11 1797 if ( trie->bitmap && !trie->widecharmap && !trie->jump ) {
3dab1dad
YO
1798 U32 state;
1799 DEBUG_OPTIMISE_r(
8e11feef
RGS
1800 PerlIO_printf(Perl_debug_log, "%*sLaststate:%"UVuf"\n",
1801 (int)depth * 2 + 2, "",
786e8c11 1802 (UV)trie->laststate)
8e11feef 1803 );
786e8c11 1804 for ( state = 1 ; state < trie->laststate-1 ; state++ ) {
a3621e74 1805 U32 ofs = 0;
8e11feef
RGS
1806 I32 idx = -1;
1807 U32 count = 0;
1808 const U32 base = trie->states[ state ].trans.base;
a3621e74 1809
3dab1dad 1810 if ( trie->states[state].wordnum )
8e11feef 1811 count = 1;
a3621e74 1812
8e11feef 1813 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
cc601c31
YO
1814 if ( ( base + ofs >= trie->uniquecharcount ) &&
1815 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
a3621e74
YO
1816 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1817 {
3dab1dad 1818 if ( ++count > 1 ) {
8e11feef 1819 SV **tmp = av_fetch( TRIE_REVCHARMAP(trie), ofs, 0);
07be1b83 1820 const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
8e11feef 1821 if ( state == 1 ) break;
3dab1dad
YO
1822 if ( count == 2 ) {
1823 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
1824 DEBUG_OPTIMISE_r(
8e11feef
RGS
1825 PerlIO_printf(Perl_debug_log,
1826 "%*sNew Start State=%"UVuf" Class: [",
1827 (int)depth * 2 + 2, "",
786e8c11 1828 (UV)state));
be8e71aa
YO
1829 if (idx >= 0) {
1830 SV ** const tmp = av_fetch( TRIE_REVCHARMAP(trie), idx, 0);
1831 const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
8e11feef 1832
3dab1dad 1833 TRIE_BITMAP_SET(trie,*ch);
8e11feef
RGS
1834 if ( folder )
1835 TRIE_BITMAP_SET(trie, folder[ *ch ]);
3dab1dad 1836 DEBUG_OPTIMISE_r(
07be1b83 1837 PerlIO_printf(Perl_debug_log, (char*)ch)
3dab1dad 1838 );
8e11feef
RGS
1839 }
1840 }
1841 TRIE_BITMAP_SET(trie,*ch);
1842 if ( folder )
1843 TRIE_BITMAP_SET(trie,folder[ *ch ]);
1844 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
1845 }
1846 idx = ofs;
1847 }
3dab1dad
YO
1848 }
1849 if ( count == 1 ) {
1850 SV **tmp = av_fetch( TRIE_REVCHARMAP(trie), idx, 0);
8e11feef 1851 const char *ch = SvPV_nolen_const( *tmp );
3dab1dad 1852 DEBUG_OPTIMISE_r(
8e11feef
RGS
1853 PerlIO_printf( Perl_debug_log,
1854 "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
1855 (int)depth * 2 + 2, "",
786e8c11 1856 (UV)state, (UV)idx, ch)
3dab1dad
YO
1857 );
1858 if ( state==1 ) {
1859 OP( convert ) = nodetype;
1860 str=STRING(convert);
1861 STR_LEN(convert)=0;
1862 }
1863 *str++=*ch;
1864 STR_LEN(convert)++;
a3621e74 1865
8e11feef 1866 } else {
f9049ba1 1867#ifdef DEBUGGING
8e11feef
RGS
1868 if (state>1)
1869 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
f9049ba1 1870#endif
8e11feef
RGS
1871 break;
1872 }
1873 }
3dab1dad 1874 if (str) {
8e11feef 1875 regnode *n = convert+NODE_SZ_STR(convert);
07be1b83 1876 NEXT_OFF(convert) = NODE_SZ_STR(convert);
8e11feef 1877 trie->startstate = state;
07be1b83
YO
1878 trie->minlen -= (state - 1);
1879 trie->maxlen -= (state - 1);
1880 DEBUG_r({
1881 regnode *fix = convert;
1882 mjd_nodelen++;
1883 Set_Node_Offset_Length(convert, mjd_offset, state - 1);
1884 while( ++fix < n ) {
1885 Set_Node_Offset_Length(fix, 0, 0);
1886 }
1887 });
8e11feef
RGS
1888 if (trie->maxlen) {
1889 convert = n;
1890 } else {
3dab1dad 1891 NEXT_OFF(convert) = (U16)(tail - convert);
a5ca303d 1892 DEBUG_r(optimize= n);
3dab1dad
YO
1893 }
1894 }
1895 }
a5ca303d
YO
1896 if (!jumper)
1897 jumper = last;
3dab1dad 1898 if ( trie->maxlen ) {
8e11feef
RGS
1899 NEXT_OFF( convert ) = (U16)(tail - convert);
1900 ARG_SET( convert, data_slot );
786e8c11
YO
1901 /* Store the offset to the first unabsorbed branch in
1902 jump[0], which is otherwise unused by the jump logic.
1903 We use this when dumping a trie and during optimisation. */
1904 if (trie->jump)
1905 trie->jump[0] = (U16)(tail - nextbranch);
a5ca303d 1906
786e8c11
YO
1907 /* XXXX */
1908 if ( !trie->states[trie->startstate].wordnum && trie->bitmap &&
1de06328 1909 ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
786e8c11
YO
1910 {
1911 OP( convert ) = TRIEC;
1912 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
1913 Safefree(trie->bitmap);
1914 trie->bitmap= NULL;
1915 } else
1916 OP( convert ) = TRIE;
a3621e74 1917
3dab1dad
YO
1918 /* store the type in the flags */
1919 convert->flags = nodetype;
a5ca303d
YO
1920 DEBUG_r({
1921 optimize = convert
1922 + NODE_STEP_REGNODE
1923 + regarglen[ OP( convert ) ];
1924 });
1925 /* XXX We really should free up the resource in trie now,
1926 as we won't use them - (which resources?) dmq */
3dab1dad 1927 }
a3621e74
YO
1928 /* needed for dumping*/
1929 DEBUG_r({
07be1b83
YO
1930 regnode *opt = convert;
1931 while (++opt<optimize) {
1932 Set_Node_Offset_Length(opt,0,0);
1933 }
786e8c11
YO
1934 /*
1935 Try to clean up some of the debris left after the
1936 optimisation.
a3621e74 1937 */
786e8c11 1938 while( optimize < jumper ) {
07be1b83 1939 mjd_nodelen += Node_Length((optimize));
a3621e74 1940 OP( optimize ) = OPTIMIZED;
07be1b83 1941 Set_Node_Offset_Length(optimize,0,0);
a3621e74
YO
1942 optimize++;
1943 }
07be1b83 1944 Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
a3621e74
YO
1945 });
1946 } /* end node insert */
07be1b83 1947#ifndef DEBUGGING
6e8b4190 1948 SvREFCNT_dec(TRIE_REVCHARMAP(trie));
07be1b83 1949#endif
786e8c11
YO
1950 return trie->jump
1951 ? MADE_JUMP_TRIE
1952 : trie->startstate>1
1953 ? MADE_EXACT_TRIE
1954 : MADE_TRIE;
1955}
1956
1957STATIC void
1958S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode *stclass, U32 depth)
1959{
1960/* The Trie is constructed and compressed now so we can build a fail array now if its needed
1961
1962 This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
1963 "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
1964 ISBN 0-201-10088-6
1965
1966 We find the fail state for each state in the trie, this state is the longest proper
1967 suffix of the current states 'word' that is also a proper prefix of another word in our
1968 trie. State 1 represents the word '' and is the thus the default fail state. This allows
1969 the DFA not to have to restart after its tried and failed a word at a given point, it
1970 simply continues as though it had been matching the other word in the first place.
1971 Consider
1972 'abcdgu'=~/abcdefg|cdgu/
1973 When we get to 'd' we are still matching the first word, we would encounter 'g' which would
1974 fail, which would bring use to the state representing 'd' in the second word where we would
1975 try 'g' and succeed, prodceding to match 'cdgu'.
1976 */
1977 /* add a fail transition */
1978 reg_trie_data *trie=(reg_trie_data *)RExC_rx->data->data[ARG(source)];
1979 U32 *q;
1980 const U32 ucharcount = trie->uniquecharcount;
1981 const U32 numstates = trie->laststate;
1982 const U32 ubound = trie->lasttrans + ucharcount;
1983 U32 q_read = 0;
1984 U32 q_write = 0;
1985 U32 charid;
1986 U32 base = trie->states[ 1 ].trans.base;
1987 U32 *fail;
1988 reg_ac_data *aho;
1989 const U32 data_slot = add_data( pRExC_state, 1, "T" );
1990 GET_RE_DEBUG_FLAGS_DECL;
1991#ifndef DEBUGGING
1992 PERL_UNUSED_ARG(depth);
1993#endif
1994
1995
1996 ARG_SET( stclass, data_slot );
1997 Newxz( aho, 1, reg_ac_data );
1998 RExC_rx->data->data[ data_slot ] = (void*)aho;
1999 aho->trie=trie;
2000 aho->states=(reg_trie_state *)savepvn((const char*)trie->states,
2001 (trie->laststate+1)*sizeof(reg_trie_state));
2002 Newxz( q, numstates, U32);
2003 Newxz( aho->fail, numstates, U32 );
2004 aho->refcount = 1;
2005 fail = aho->fail;
2006 /* initialize fail[0..1] to be 1 so that we always have
2007 a valid final fail state */
2008 fail[ 0 ] = fail[ 1 ] = 1;
2009
2010 for ( charid = 0; charid < ucharcount ; charid++ ) {
2011 const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
2012 if ( newstate ) {
2013 q[ q_write ] = newstate;
2014 /* set to point at the root */
2015 fail[ q[ q_write++ ] ]=1;
2016 }
2017 }
2018 while ( q_read < q_write) {
2019 const U32 cur = q[ q_read++ % numstates ];
2020 base = trie->states[ cur ].trans.base;
2021
2022 for ( charid = 0 ; charid < ucharcount ; charid++ ) {
2023 const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
2024 if (ch_state) {
2025 U32 fail_state = cur;
2026 U32 fail_base;
2027 do {
2028 fail_state = fail[ fail_state ];
2029 fail_base = aho->states[ fail_state ].trans.base;
2030 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
2031
2032 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
2033 fail[ ch_state ] = fail_state;
2034 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
2035 {
2036 aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum;
2037 }
2038 q[ q_write++ % numstates] = ch_state;
2039 }
2040 }
2041 }
2042 /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
2043 when we fail in state 1, this allows us to use the
2044 charclass scan to find a valid start char. This is based on the principle
2045 that theres a good chance the string being searched contains lots of stuff
2046 that cant be a start char.
2047 */
2048 fail[ 0 ] = fail[ 1 ] = 0;
2049 DEBUG_TRIE_COMPILE_r({
2050 PerlIO_printf(Perl_debug_log, "%*sStclass Failtable: 0", (int)(depth * 2), "");
2051 for( q_read=1; q_read<numstates; q_read++ ) {
2052 PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
2053 }
2054 PerlIO_printf(Perl_debug_log, "\n");
2055 });
2056 Safefree(q);
2057 /*RExC_seen |= REG_SEEN_TRIEDFA;*/
a3621e74
YO
2058}
2059
786e8c11 2060
a3621e74 2061/*
5d1c421c
JH
2062 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
2063 * These need to be revisited when a newer toolchain becomes available.
2064 */
2065#if defined(__sparc64__) && defined(__GNUC__)
2066# if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
2067# undef SPARC64_GCC_WORKAROUND
2068# define SPARC64_GCC_WORKAROUND 1
2069# endif
2070#endif
2071
07be1b83
YO
2072#define DEBUG_PEEP(str,scan,depth) \
2073 DEBUG_OPTIMISE_r({ \
2074 SV * const mysv=sv_newmortal(); \
2075 regnode *Next = regnext(scan); \
2076 regprop(RExC_rx, mysv, scan); \
2077 PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s [%d]\n", \
2078 (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
2079 Next ? (REG_NODE_NUM(Next)) : 0 ); \
2080 });
2081
1de06328
YO
2082
2083
2084
2085
07be1b83
YO
2086#define JOIN_EXACT(scan,min,flags) \
2087 if (PL_regkind[OP(scan)] == EXACT) \
2088 join_exact(pRExC_state,(scan),(min),(flags),NULL,depth+1)
2089
be8e71aa 2090STATIC U32
07be1b83
YO
2091S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, I32 *min, U32 flags,regnode *val, U32 depth) {
2092 /* Merge several consecutive EXACTish nodes into one. */
2093 regnode *n = regnext(scan);
2094 U32 stringok = 1;
2095 regnode *next = scan + NODE_SZ_STR(scan);
2096 U32 merged = 0;
2097 U32 stopnow = 0;
2098#ifdef DEBUGGING
2099 regnode *stop = scan;
72f13be8 2100 GET_RE_DEBUG_FLAGS_DECL;
f9049ba1 2101#else
d47053eb
RGS
2102 PERL_UNUSED_ARG(depth);
2103#endif
2104#ifndef EXPERIMENTAL_INPLACESCAN
f9049ba1
SP
2105 PERL_UNUSED_ARG(flags);
2106 PERL_UNUSED_ARG(val);
07be1b83 2107#endif
07be1b83
YO
2108 DEBUG_PEEP("join",scan,depth);
2109
2110 /* Skip NOTHING, merge EXACT*. */
2111 while (n &&
2112 ( PL_regkind[OP(n)] == NOTHING ||
2113 (stringok && (OP(n) == OP(scan))))
2114 && NEXT_OFF(n)
2115 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
2116
2117 if (OP(n) == TAIL || n > next)
2118 stringok = 0;
2119 if (PL_regkind[OP(n)] == NOTHING) {
07be1b83
YO
2120 DEBUG_PEEP("skip:",n,depth);
2121 NEXT_OFF(scan) += NEXT_OFF(n);
2122 next = n + NODE_STEP_REGNODE;
2123#ifdef DEBUGGING
2124 if (stringok)
2125 stop = n;
2126#endif
2127 n = regnext(n);
2128 }
2129 else if (stringok) {
786e8c11 2130 const unsigned int oldl = STR_LEN(scan);
07be1b83
YO
2131 regnode * const nnext = regnext(n);
2132
2133 DEBUG_PEEP("merg",n,depth);
2134
2135 merged++;
2136 if (oldl + STR_LEN(n) > U8_MAX)
2137 break;
2138 NEXT_OFF(scan) += NEXT_OFF(n);
2139 STR_LEN(scan) += STR_LEN(n);
2140 next = n + NODE_SZ_STR(n);
2141 /* Now we can overwrite *n : */
2142 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
2143#ifdef DEBUGGING
2144 stop = next - 1;
2145#endif
2146 n = nnext;
2147 if (stopnow) break;
2148 }
2149
d47053eb
RGS
2150#ifdef EXPERIMENTAL_INPLACESCAN
2151 if (flags && !NEXT_OFF(n)) {
2152 DEBUG_PEEP("atch", val, depth);
2153 if (reg_off_by_arg[OP(n)]) {
2154 ARG_SET(n, val - n);
2155 }
2156 else {
2157 NEXT_OFF(n) = val - n;
2158 }
2159 stopnow = 1;
2160 }
07be1b83
YO
2161#endif
2162 }
2163
2164 if (UTF && ( OP(scan) == EXACTF ) && ( STR_LEN(scan) >= 6 ) ) {
2165 /*
2166 Two problematic code points in Unicode casefolding of EXACT nodes:
2167
2168 U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
2169 U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
2170
2171 which casefold to
2172
2173 Unicode UTF-8
2174
2175 U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
2176 U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
2177
2178 This means that in case-insensitive matching (or "loose matching",
2179 as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
2180 length of the above casefolded versions) can match a target string
2181 of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
2182 This would rather mess up the minimum length computation.
2183
2184 What we'll do is to look for the tail four bytes, and then peek
2185 at the preceding two bytes to see whether we need to decrease
2186 the minimum length by four (six minus two).
2187
2188 Thanks to the design of UTF-8, there cannot be false matches:
2189 A sequence of valid UTF-8 bytes cannot be a subsequence of
2190 another valid sequence of UTF-8 bytes.
2191
2192 */
2193 char * const s0 = STRING(scan), *s, *t;
2194 char * const s1 = s0 + STR_LEN(scan) - 1;
2195 char * const s2 = s1 - 4;
e294cc5d
JH
2196#ifdef EBCDIC /* RD tunifold greek 0390 and 03B0 */
2197 const char t0[] = "\xaf\x49\xaf\x42";
2198#else
07be1b83 2199 const char t0[] = "\xcc\x88\xcc\x81";
e294cc5d 2200#endif
07be1b83
YO
2201 const char * const t1 = t0 + 3;
2202
2203 for (s = s0 + 2;
2204 s < s2 && (t = ninstr(s, s1, t0, t1));
2205 s = t + 4) {
e294cc5d
JH
2206#ifdef EBCDIC
2207 if (((U8)t[-1] == 0x68 && (U8)t[-2] == 0xB4) ||
2208 ((U8)t[-1] == 0x46 && (U8)t[-2] == 0xB5))
2209#else
07be1b83
YO
2210 if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
2211 ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
e294cc5d 2212#endif
07be1b83
YO
2213 *min -= 4;
2214 }
2215 }
2216
2217#ifdef DEBUGGING
2218 /* Allow dumping */
2219 n = scan + NODE_SZ_STR(scan);
2220 while (n <= stop) {
2221 if (PL_regkind[OP(n)] != NOTHING || OP(n) == NOTHING) {
2222 OP(n) = OPTIMIZED;
2223 NEXT_OFF(n) = 0;
2224 }
2225 n++;
2226 }
2227#endif
2228 DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
2229 return stopnow;
2230}
2231
653099ff
GS
2232/* REx optimizer. Converts nodes into quickier variants "in place".
2233 Finds fixed substrings. */
2234
a0288114 2235/* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
c277df42
IZ
2236 to the position after last scanned or to NULL. */
2237
07be1b83
YO
2238
2239
76e3520e 2240STATIC I32
1de06328
YO
2241S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
2242 I32 *minlenp, I32 *deltap,
9a957fbc 2243 regnode *last, scan_data_t *data, U32 flags, U32 depth)
c277df42
IZ
2244 /* scanp: Start here (read-write). */
2245 /* deltap: Write maxlen-minlen here. */
2246 /* last: Stop before this one. */
2247{
97aff369 2248 dVAR;
c277df42
IZ
2249 I32 min = 0, pars = 0, code;
2250 regnode *scan = *scanp, *next;
2251 I32 delta = 0;
2252 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
aca2d497 2253 int is_inf_internal = 0; /* The studied chunk is infinite */
c277df42
IZ
2254 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
2255 scan_data_t data_fake;
653099ff 2256 struct regnode_charclass_class and_with; /* Valid if flags & SCF_DO_STCLASS_OR */
a3621e74 2257 SV *re_trie_maxbuff = NULL;
786e8c11
YO
2258 regnode *first_non_open = scan;
2259
a3621e74
YO
2260
2261 GET_RE_DEBUG_FLAGS_DECL;
13a24bad
YO
2262#ifdef DEBUGGING
2263 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
2264#endif
786e8c11
YO
2265 if ( depth == 0 ) {
2266 while (first_non_open && OP(first_non_open) == OPEN)
2267 first_non_open=regnext(first_non_open);
2268 }
2269
b81d288d 2270
c277df42
IZ
2271 while (scan && OP(scan) != END && scan < last) {
2272 /* Peephole optimizer: */
1de06328 2273 DEBUG_STUDYDATA(data,depth);
07be1b83 2274 DEBUG_PEEP("Peep",scan,depth);
07be1b83 2275 JOIN_EXACT(scan,&min,0);
a3621e74 2276
653099ff
GS
2277 /* Follow the next-chain of the current node and optimize
2278 away all the NOTHINGs from it. */
c277df42 2279 if (OP(scan) != CURLYX) {
a3b680e6 2280 const int max = (reg_off_by_arg[OP(scan)]
048cfca1
GS
2281 ? I32_MAX
2282 /* I32 may be smaller than U16 on CRAYs! */
2283 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
c277df42
IZ
2284 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
2285 int noff;
2286 regnode *n = scan;
b81d288d 2287
c277df42
IZ
2288 /* Skip NOTHING and LONGJMP. */
2289 while ((n = regnext(n))
3dab1dad 2290 && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
c277df42
IZ
2291 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
2292 && off + noff < max)
2293 off += noff;
2294 if (reg_off_by_arg[OP(scan)])
2295 ARG(scan) = off;
b81d288d 2296 else
c277df42
IZ
2297 NEXT_OFF(scan) = off;
2298 }
a3621e74 2299
07be1b83 2300
3dab1dad 2301
653099ff
GS
2302 /* The principal pseudo-switch. Cannot be a switch, since we
2303 look into several different things. */
b81d288d 2304 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
c277df42
IZ
2305 || OP(scan) == IFTHEN || OP(scan) == SUSPEND) {
2306 next = regnext(scan);
2307 code = OP(scan);
a3621e74 2308 /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
b81d288d
AB
2309
2310 if (OP(next) == code || code == IFTHEN || code == SUSPEND) {
786e8c11
YO
2311 /* NOTE - There is similar code to this block below for handling
2312 TRIE nodes on a re-study. If you change stuff here check there
2313 too. */
c277df42 2314 I32 max1 = 0, min1 = I32_MAX, num = 0;
653099ff 2315 struct regnode_charclass_class accum;
d4c19fe8 2316 regnode * const startbranch=scan;
c277df42 2317
653099ff 2318 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
1de06328 2319 scan_commit(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
653099ff 2320 if (flags & SCF_DO_STCLASS)
830247a4 2321 cl_init_zero(pRExC_state, &accum);
a3621e74 2322
c277df42 2323 while (OP(scan) == code) {
830247a4 2324 I32 deltanext, minnext, f = 0, fake;
653099ff 2325 struct regnode_charclass_class this_class;
c277df42
IZ
2326
2327 num++;
2328 data_fake.flags = 0;
b81d288d 2329 if (data) {
2c2d71f5 2330 data_fake.whilem_c = data->whilem_c;
cb434fcc
IZ
2331 data_fake.last_closep = data->last_closep;
2332 }
2333 else
2334 data_fake.last_closep = &fake;
c277df42
IZ
2335 next = regnext(scan);
2336 scan = NEXTOPER(scan);
2337 if (code != BRANCH)
2338 scan = NEXTOPER(scan);
653099ff 2339 if (flags & SCF_DO_STCLASS) {
830247a4 2340 cl_init(pRExC_state, &this_class);
653099ff
GS
2341 data_fake.start_class = &this_class;
2342 f = SCF_DO_STCLASS_AND;
b81d288d 2343 }
e1901655
IZ
2344 if (flags & SCF_WHILEM_VISITED_POS)
2345 f |= SCF_WHILEM_VISITED_POS;
a3621e74 2346
653099ff 2347 /* we suppose the run is continuous, last=next...*/
1de06328 2348 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
a3621e74 2349 next, &data_fake, f,depth+1);
b81d288d 2350 if (min1 > minnext)
c277df42
IZ
2351 min1 = minnext;
2352 if (max1 < minnext + deltanext)
2353 max1 = minnext + deltanext;
2354 if (deltanext == I32_MAX)
aca2d497 2355 is_inf = is_inf_internal = 1;
c277df42
IZ
2356 scan = next;
2357 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
2358 pars++;
3dab1dad
YO
2359 if (data) {
2360 if (data_fake.flags & SF_HAS_EVAL)
07be1b83 2361 data->flags |= SF_HAS_EVAL;
2c2d71f5 2362 data->whilem_c = data_fake.whilem_c;
3dab1dad 2363 }
653099ff 2364 if (flags & SCF_DO_STCLASS)
830247a4 2365 cl_or(pRExC_state, &accum, &this_class);
b81d288d 2366 if (code == SUSPEND)
c277df42
IZ
2367 break;
2368 }
2369 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
2370 min1 = 0;
2371 if (flags & SCF_DO_SUBSTR) {
2372 data->pos_min += min1;
2373 data->pos_delta += max1 - min1;
2374 if (max1 != min1 || is_inf)
2375 data->longest = &(data->longest_float);
2376 }
2377 min += min1;
2378 delta += max1 - min1;
653099ff 2379 if (flags & SCF_DO_STCLASS_OR) {
830247a4 2380 cl_or(pRExC_state, data->start_class, &accum);
653099ff
GS
2381 if (min1) {
2382 cl_and(data->start_class, &and_with);
2383 flags &= ~SCF_DO_STCLASS;
2384 }
2385 }
2386 else if (flags & SCF_DO_STCLASS_AND) {
de0c8cb8
GS
2387 if (min1) {
2388 cl_and(data->start_class, &accum);
653099ff 2389 flags &= ~SCF_DO_STCLASS;
de0c8cb8
GS
2390 }
2391 else {
b81d288d 2392 /* Switch to OR mode: cache the old value of
de0c8cb8
GS
2393 * data->start_class */
2394 StructCopy(data->start_class, &and_with,
2395 struct regnode_charclass_class);
2396 flags &= ~SCF_DO_STCLASS_AND;
2397 StructCopy(&accum, data->start_class,
2398 struct regnode_charclass_class);
2399 flags |= SCF_DO_STCLASS_OR;
2400 data->start_class->flags |= ANYOF_EOS;
2401 }
653099ff 2402 }
a3621e74 2403
786e8c11 2404 if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) {
a3621e74
YO
2405 /* demq.
2406
2407 Assuming this was/is a branch we are dealing with: 'scan' now
2408 points at the item that follows the branch sequence, whatever
2409 it is. We now start at the beginning of the sequence and look
2410 for subsequences of
2411
786e8c11
YO
2412 BRANCH->EXACT=>x1
2413 BRANCH->EXACT=>x2
2414 tail
a3621e74
YO
2415
2416 which would be constructed from a pattern like /A|LIST|OF|WORDS/
2417
2418 If we can find such a subseqence we need to turn the first
2419 element into a trie and then add the subsequent branch exact
2420 strings to the trie.
2421
2422 We have two cases
2423
786e8c11 2424 1. patterns where the whole set of branch can be converted.
a3621e74 2425
786e8c11 2426 2. patterns where only a subset can be converted.
a3621e74
YO
2427
2428 In case 1 we can replace the whole set with a single regop
2429 for the trie. In case 2 we need to keep the start and end
2430 branchs so
2431
2432 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
2433 becomes BRANCH TRIE; BRANCH X;
2434
786e8c11
YO
2435 There is an additional case, that being where there is a
2436 common prefix, which gets split out into an EXACT like node
2437 preceding the TRIE node.
2438
2439 If x(1..n)==tail then we can do a simple trie, if not we make
2440 a "jump" trie, such that when we match the appropriate word
2441 we "jump" to the appopriate tail node. Essentailly we turn
2442 a nested if into a case structure of sorts.
a3621e74
YO
2443
2444 */
786e8c11 2445
3dab1dad 2446 int made=0;
0111c4fd
RGS
2447 if (!re_trie_maxbuff) {
2448 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
2449 if (!SvIOK(re_trie_maxbuff))
2450 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2451 }
786e8c11 2452 if ( SvIV(re_trie_maxbuff)>=0 ) {
a3621e74
YO
2453 regnode *cur;
2454 regnode *first = (regnode *)NULL;
2455 regnode *last = (regnode *)NULL;
2456 regnode *tail = scan;
2457 U8 optype = 0;
2458 U32 count=0;
2459
2460#ifdef DEBUGGING
c445ea15 2461 SV * const mysv = sv_newmortal(); /* for dumping */
a3621e74
YO
2462#endif
2463 /* var tail is used because there may be a TAIL
2464 regop in the way. Ie, the exacts will point to the
2465 thing following the TAIL, but the last branch will
2466 point at the TAIL. So we advance tail. If we
2467 have nested (?:) we may have to move through several
2468 tails.
2469 */
2470
2471 while ( OP( tail ) == TAIL ) {
2472 /* this is the TAIL generated by (?:) */
2473 tail = regnext( tail );
2474 }
2475
3dab1dad 2476
a3621e74 2477 DEBUG_OPTIMISE_r({
32fc9b6a 2478 regprop(RExC_rx, mysv, tail );
3dab1dad
YO
2479 PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
2480 (int)depth * 2 + 2, "",
2481 "Looking for TRIE'able sequences. Tail node is: ",
2482 SvPV_nolen_const( mysv )
a3621e74
YO
2483 );
2484 });
3dab1dad 2485
a3621e74
YO
2486 /*
2487
2488 step through the branches, cur represents each
2489 branch, noper is the first thing to be matched
2490 as part of that branch and noper_next is the
2491 regnext() of that node. if noper is an EXACT
2492 and noper_next is the same as scan (our current
2493 position in the regex) then the EXACT branch is
2494 a possible optimization target. Once we have
2495 two or more consequetive such branches we can
2496 create a trie of the EXACT's contents and stich
2497 it in place. If the sequence represents all of
2498 the branches we eliminate the whole thing and
2499 replace it with a single TRIE. If it is a
2500 subsequence then we need to stitch it in. This
2501 means the first branch has to remain, and needs
2502 to be repointed at the item on the branch chain
2503 following the last branch optimized. This could
2504 be either a BRANCH, in which case the
2505 subsequence is internal, or it could be the
2506 item following the branch sequence in which
2507 case the subsequence is at the end.
2508
2509 */
2510
2511 /* dont use tail as the end marker for this traverse */
2512 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
aec46f14 2513 regnode * const noper = NEXTOPER( cur );
be981c67 2514#if defined(DEBUGGING) || defined(NOJUMPTRIE)
aec46f14 2515 regnode * const noper_next = regnext( noper );
be981c67 2516#endif
a3621e74 2517
a3621e74 2518 DEBUG_OPTIMISE_r({
32fc9b6a 2519 regprop(RExC_rx, mysv, cur);
3dab1dad
YO
2520 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
2521 (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
a3621e74 2522
32fc9b6a 2523 regprop(RExC_rx, mysv, noper);
a3621e74 2524 PerlIO_printf( Perl_debug_log, " -> %s",
cfd0369c 2525 SvPV_nolen_const(mysv));
a3621e74
YO
2526
2527 if ( noper_next ) {
32fc9b6a 2528 regprop(RExC_rx, mysv, noper_next );
a3621e74 2529 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
cfd0369c 2530 SvPV_nolen_const(mysv));
a3621e74 2531 }
3dab1dad
YO
2532 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d)\n",
2533 REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur) );
a3621e74 2534 });
3dab1dad
YO
2535 if ( (((first && optype!=NOTHING) ? OP( noper ) == optype
2536 : PL_regkind[ OP( noper ) ] == EXACT )
2537 || OP(noper) == NOTHING )
786e8c11
YO
2538#ifdef NOJUMPTRIE
2539 && noper_next == tail
2540#endif
2541 && count < U16_MAX)
a3621e74
YO
2542 {
2543 count++;
3dab1dad
YO
2544 if ( !first || optype == NOTHING ) {
2545 if (!first) first = cur;
a3621e74
YO
2546 optype = OP( noper );
2547 } else {
a3621e74 2548 last = cur;
a3621e74
YO
2549 }
2550 } else {
2551 if ( last ) {
786e8c11
YO
2552 make_trie( pRExC_state,
2553 startbranch, first, cur, tail, count,
2554 optype, depth+1 );
a3621e74 2555 }
3dab1dad 2556 if ( PL_regkind[ OP( noper ) ] == EXACT
786e8c11
YO
2557#ifdef NOJUMPTRIE
2558 && noper_next == tail
2559#endif
2560 ){
a3621e74
YO
2561 count = 1;
2562 first = cur;
2563 optype = OP( noper );
2564 } else {
2565 count = 0;
2566 first = NULL;
2567 optype = 0;
2568 }
2569 last = NULL;
2570 }
2571 }
2572 DEBUG_OPTIMISE_r({
32fc9b6a 2573 regprop(RExC_rx, mysv, cur);
a3621e74 2574 PerlIO_printf( Perl_debug_log,
3dab1dad
YO
2575 "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
2576 "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
a3621e74
YO
2577
2578 });
2579 if ( last ) {
786e8c11 2580 made= make_trie( pRExC_state, startbranch, first, scan, tail, count, optype, depth+1 );
3dab1dad 2581#ifdef TRIE_STUDY_OPT
786e8c11
YO
2582 if ( ((made == MADE_EXACT_TRIE &&
2583 startbranch == first)
2584 || ( first_non_open == first )) &&
2585 depth==0 )
2586 flags |= SCF_TRIE_RESTUDY;
3dab1dad 2587#endif
07be1b83 2588 }
a3621e74 2589 }
3dab1dad
YO
2590
2591 } /* do trie */
786e8c11 2592
a0ed51b3 2593 }
a3621e74 2594 else if ( code == BRANCHJ ) { /* single branch is optimized. */
c277df42 2595 scan = NEXTOPER(NEXTOPER(scan));
a3621e74 2596 } else /* single branch is optimized. */
c277df42
IZ
2597 scan = NEXTOPER(scan);
2598 continue;
a0ed51b3
LW
2599 }
2600 else if (OP(scan) == EXACT) {
cd439c50 2601 I32 l = STR_LEN(scan);
c445ea15 2602 UV uc;
a0ed51b3 2603 if (UTF) {
a3b680e6 2604 const U8 * const s = (U8*)STRING(scan);
1aa99e6b 2605 l = utf8_length(s, s + l);
9041c2e3 2606 uc = utf8_to_uvchr(s, NULL);
c445ea15
AL
2607 } else {
2608 uc = *((U8*)STRING(scan));
a0ed51b3
LW
2609 }
2610 min += l;
c277df42 2611 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
c277df42
IZ
2612 /* The code below prefers earlier match for fixed
2613 offset, later match for variable offset. */
2614 if (data->last_end == -1) { /* Update the start info. */
2615 data->last_start_min = data->pos_min;
2616 data->last_start_max = is_inf
b81d288d 2617 ? I32_MAX : data->pos_min + data->pos_delta;
c277df42 2618 }
cd439c50 2619 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
45f47268
NC
2620 if (UTF)
2621 SvUTF8_on(data->last_found);
0eda9292 2622 {
9a957fbc 2623 SV * const sv = data->last_found;
a28509cc 2624 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
0eda9292
JH
2625 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2626 if (mg && mg->mg_len >= 0)
5e43f467
JH
2627 mg->mg_len += utf8_length((U8*)STRING(scan),
2628 (U8*)STRING(scan)+STR_LEN(scan));
0eda9292 2629 }
c277df42
IZ
2630 data->last_end = data->pos_min + l;
2631 data->pos_min += l; /* As in the first entry. */
2632 data->flags &= ~SF_BEFORE_EOL;
2633 }
653099ff
GS
2634 if (flags & SCF_DO_STCLASS_AND) {
2635 /* Check whether it is compatible with what we know already! */
2636 int compat = 1;
2637
1aa99e6b 2638 if (uc >= 0x100 ||
516a5887 2639 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
1aa99e6b 2640 && !ANYOF_BITMAP_TEST(data->start_class, uc)
653099ff 2641 && (!(data->start_class->flags & ANYOF_FOLD)
1aa99e6b 2642 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
516a5887 2643 )
653099ff
GS
2644 compat = 0;
2645 ANYOF_CLASS_ZERO(data->start_class);
2646 ANYOF_BITMAP_ZERO(data->start_class);
2647 if (compat)
1aa99e6b 2648 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff 2649 data->start_class->flags &= ~ANYOF_EOS;
9b877dbb
IH
2650 if (uc < 0x100)
2651 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
653099ff
GS
2652 }
2653 else if (flags & SCF_DO_STCLASS_OR) {
2654 /* false positive possible if the class is case-folded */
1aa99e6b 2655 if (uc < 0x100)
9b877dbb
IH
2656 ANYOF_BITMAP_SET(data->start_class, uc);
2657 else
2658 data->start_class->flags |= ANYOF_UNICODE_ALL;
653099ff
GS
2659 data->start_class->flags &= ~ANYOF_EOS;
2660 cl_and(data->start_class, &and_with);
2661 }
2662 flags &= ~SCF_DO_STCLASS;
a0ed51b3 2663 }
3dab1dad 2664 else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
cd439c50 2665 I32 l = STR_LEN(scan);
1aa99e6b 2666 UV uc = *((U8*)STRING(scan));
653099ff
GS
2667
2668 /* Search for fixed substrings supports EXACT only. */
ecaa9b9c
NC
2669 if (flags & SCF_DO_SUBSTR) {
2670 assert(data);
1de06328 2671 scan_commit(pRExC_state, data, minlenp);
ecaa9b9c 2672 }
a0ed51b3 2673 if (UTF) {
6136c704 2674 const U8 * const s = (U8 *)STRING(scan);
1aa99e6b 2675 l = utf8_length(s, s + l);
9041c2e3 2676 uc = utf8_to_uvchr(s, NULL);
a0ed51b3
LW
2677 }
2678 min += l;
ecaa9b9c 2679 if (flags & SCF_DO_SUBSTR)
a0ed51b3 2680 data->pos_min += l;
653099ff
GS
2681 if (flags & SCF_DO_STCLASS_AND) {
2682 /* Check whether it is compatible with what we know already! */
2683 int compat = 1;
2684
1aa99e6b 2685 if (uc >= 0x100 ||
516a5887 2686 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
1aa99e6b 2687 && !ANYOF_BITMAP_TEST(data->start_class, uc)
516a5887 2688 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
653099ff
GS
2689 compat = 0;
2690 ANYOF_CLASS_ZERO(data->start_class);
2691 ANYOF_BITMAP_ZERO(data->start_class);
2692 if (compat) {
1aa99e6b 2693 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff
GS
2694 data->start_class->flags &= ~ANYOF_EOS;
2695 data->start_class->flags |= ANYOF_FOLD;
2696 if (OP(scan) == EXACTFL)
2697 data->start_class->flags |= ANYOF_LOCALE;
2698 }
2699 }
2700 else if (flags & SCF_DO_STCLASS_OR) {
2701 if (data->start_class->flags & ANYOF_FOLD) {
2702 /* false positive possible if the class is case-folded.
2703 Assume that the locale settings are the same... */
1aa99e6b
IH
2704 if (uc < 0x100)
2705 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff
GS
2706 data->start_class->flags &= ~ANYOF_EOS;
2707 }
2708 cl_and(data->start_class, &and_with);
2709 }
2710 flags &= ~SCF_DO_STCLASS;
a0ed51b3 2711 }
bfed75c6 2712 else if (strchr((const char*)PL_varies,OP(scan))) {
9c5ffd7c 2713 I32 mincount, maxcount, minnext, deltanext, fl = 0;
aa7a4b56 2714 I32 f = flags, pos_before = 0;
d4c19fe8 2715 regnode * const oscan = scan;
653099ff
GS
2716 struct regnode_charclass_class this_class;
2717 struct regnode_charclass_class *oclass = NULL;
727f22e3 2718 I32 next_is_eval = 0;
653099ff 2719
3dab1dad 2720 switch (PL_regkind[OP(scan)]) {
653099ff 2721 case WHILEM: /* End of (?:...)* . */
c277df42
IZ
2722 scan = NEXTOPER(scan);
2723 goto finish;
2724 case PLUS:
653099ff 2725 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
c277df42 2726 next = NEXTOPER(scan);
653099ff 2727 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
b81d288d
AB
2728 mincount = 1;
2729 maxcount = REG_INFTY;
c277df42
IZ
2730 next = regnext(scan);
2731 scan = NEXTOPER(scan);
2732 goto do_curly;
2733 }
2734 }
2735 if (flags & SCF_DO_SUBSTR)
2736 data->pos_min++;
2737 min++;
2738 /* Fall through. */
2739 case STAR:
653099ff
GS
2740 if (flags & SCF_DO_STCLASS) {
2741 mincount = 0;
b81d288d 2742 maxcount = REG_INFTY;
653099ff
GS
2743 next = regnext(scan);
2744 scan = NEXTOPER(scan);
2745 goto do_curly;
2746 }
b81d288d 2747 is_inf = is_inf_internal = 1;
c277df42
IZ
2748 scan = regnext(scan);
2749 if (flags & SCF_DO_SUBSTR) {
1de06328 2750 scan_commit(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */
c277df42
IZ
2751 data->longest = &(data->longest_float);
2752 }
2753 goto optimize_curly_tail;
2754 case CURLY:
b81d288d 2755 mincount = ARG1(scan);
c277df42
IZ
2756 maxcount = ARG2(scan);
2757 next = regnext(scan);
cb434fcc
IZ
2758 if (OP(scan) == CURLYX) {
2759 I32 lp = (data ? *(data->last_closep) : 0);
786e8c11 2760 scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
cb434fcc 2761 }
c277df42 2762 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
727f22e3 2763 next_is_eval = (OP(scan) == EVAL);
c277df42
IZ
2764 do_curly:
2765 if (flags & SCF_DO_SUBSTR) {
1de06328 2766 if (mincount == 0) scan_commit(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */
c277df42
IZ
2767 pos_before = data->pos_min;
2768 }
2769 if (data) {
2770 fl = data->flags;
2771 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
2772 if (is_inf)
2773 data->flags |= SF_IS_INF;
2774 }
653099ff 2775 if (flags & SCF_DO_STCLASS) {
830247a4 2776 cl_init(pRExC_state, &this_class);
653099ff
GS
2777 oclass = data->start_class;
2778 data->start_class = &this_class;
2779 f |= SCF_DO_STCLASS_AND;
2780 f &= ~SCF_DO_STCLASS_OR;
2781 }
e1901655
IZ
2782 /* These are the cases when once a subexpression
2783 fails at a particular position, it cannot succeed
2784 even after backtracking at the enclosing scope.
b81d288d 2785
e1901655
IZ
2786 XXXX what if minimal match and we are at the
2787 initial run of {n,m}? */
2788 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
2789 f &= ~SCF_WHILEM_VISITED_POS;
653099ff 2790
c277df42 2791 /* This will finish on WHILEM, setting scan, or on NULL: */
1de06328 2792 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext, last, data,
a3621e74
YO
2793 (mincount == 0
2794 ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
653099ff
GS
2795
2796 if (flags & SCF_DO_STCLASS)
2797 data->start_class = oclass;
2798 if (mincount == 0 || minnext == 0) {
2799 if (flags & SCF_DO_STCLASS_OR) {
830247a4 2800 cl_or(pRExC_state, data->start_class, &this_class);
653099ff
GS
2801 }
2802 else if (flags & SCF_DO_STCLASS_AND) {
b81d288d 2803 /* Switch to OR mode: cache the old value of
653099ff
GS
2804 * data->start_class */
2805 StructCopy(data->start_class, &and_with,
2806 struct regnode_charclass_class);
2807 flags &= ~SCF_DO_STCLASS_AND;
2808 StructCopy(&this_class, data->start_class,
2809 struct regnode_charclass_class);
2810 flags |= SCF_DO_STCLASS_OR;
2811 data->start_class->flags |= ANYOF_EOS;
2812 }
2813 } else { /* Non-zero len */
2814 if (flags & SCF_DO_STCLASS_OR) {
830247a4 2815 cl_or(pRExC_state, data->start_class, &this_class);
653099ff
GS
2816 cl_and(data->start_class, &and_with);
2817 }
2818 else if (flags & SCF_DO_STCLASS_AND)
2819 cl_and(data->start_class, &this_class);
2820 flags &= ~SCF_DO_STCLASS;
2821 }
c277df42
IZ
2822 if (!scan) /* It was not CURLYX, but CURLY. */
2823 scan = next;
041457d9
DM
2824 if ( /* ? quantifier ok, except for (?{ ... }) */
2825 (next_is_eval || !(mincount == 0 && maxcount == 1))
84037bb0 2826 && (minnext == 0) && (deltanext == 0)
99799961 2827 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
041457d9
DM
2828 && maxcount <= REG_INFTY/3 /* Complement check for big count */
2829 && ckWARN(WARN_REGEXP))
b45f050a 2830 {
830247a4 2831 vWARN(RExC_parse,
b45f050a
JF
2832 "Quantifier unexpected on zero-length expression");
2833 }
2834
c277df42 2835 min += minnext * mincount;
b81d288d 2836 is_inf_internal |= ((maxcount == REG_INFTY
155aba94
GS
2837 && (minnext + deltanext) > 0)
2838 || deltanext == I32_MAX);
aca2d497 2839 is_inf |= is_inf_internal;
c277df42
IZ
2840 delta += (minnext + deltanext) * maxcount - minnext * mincount;
2841
2842 /* Try powerful optimization CURLYX => CURLYN. */
b81d288d 2843 if ( OP(oscan) == CURLYX && data
c277df42
IZ
2844 && data->flags & SF_IN_PAR
2845 && !(data->flags & SF_HAS_EVAL)
2846 && !deltanext && minnext == 1 ) {
2847 /* Try to optimize to CURLYN. */
2848 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
d4c19fe8 2849 regnode * const nxt1 = nxt;
497b47a8
JH
2850#ifdef DEBUGGING
2851 regnode *nxt2;
2852#endif
c277df42
IZ
2853
2854 /* Skip open. */
2855 nxt = regnext(nxt);
bfed75c6 2856 if (!strchr((const char*)PL_simple,OP(nxt))
3dab1dad 2857 && !(PL_regkind[OP(nxt)] == EXACT
b81d288d 2858 && STR_LEN(nxt) == 1))
c277df42 2859 goto nogo;
497b47a8 2860#ifdef DEBUGGING
c277df42 2861 nxt2 = nxt;
497b47a8 2862#endif
c277df42 2863 nxt = regnext(nxt);
b81d288d 2864 if (OP(nxt) != CLOSE)
c277df42
IZ
2865 goto nogo;
2866 /* Now we know that nxt2 is the only contents: */
eb160463 2867 oscan->flags = (U8)ARG(nxt);
c277df42
IZ
2868 OP(oscan) = CURLYN;
2869 OP(nxt1) = NOTHING; /* was OPEN. */
2870#ifdef DEBUGGING
2871 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
2872 NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
2873 NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
2874 OP(nxt) = OPTIMIZED; /* was CLOSE. */
2875 OP(nxt + 1) = OPTIMIZED; /* was count. */
2876 NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
b81d288d 2877#endif
c277df42 2878 }
c277df42
IZ
2879 nogo:
2880
2881 /* Try optimization CURLYX => CURLYM. */
b81d288d 2882 if ( OP(oscan) == CURLYX && data
c277df42 2883 && !(data->flags & SF_HAS_PAR)
c277df42 2884 && !(data->flags & SF_HAS_EVAL)
0e788c72
HS
2885 && !deltanext /* atom is fixed width */
2886 && minnext != 0 /* CURLYM can't handle zero width */
2887 ) {
c277df42
IZ
2888 /* XXXX How to optimize if data == 0? */
2889 /* Optimize to a simpler form. */
2890 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
2891 regnode *nxt2;
2892
2893 OP(oscan) = CURLYM;
2894 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
b81d288d 2895 && (OP(nxt2) != WHILEM))
c277df42
IZ
2896 nxt = nxt2;
2897 OP(nxt2) = SUCCEED; /* Whas WHILEM */
c277df42
IZ
2898 /* Need to optimize away parenths. */
2899 if (data->flags & SF_IN_PAR) {
2900 /* Set the parenth number. */
2901 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
2902
b81d288d 2903 if (OP(nxt) != CLOSE)
b45f050a 2904 FAIL("Panic opt close");
eb160463 2905 oscan->flags = (U8)ARG(nxt);
c277df42
IZ
2906 OP(nxt1) = OPTIMIZED; /* was OPEN. */
2907 OP(nxt) = OPTIMIZED; /* was CLOSE. */
2908#ifdef DEBUGGING
2909 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
2910 OP(nxt + 1) = OPTIMIZED; /* was count. */
2911 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
2912 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
b81d288d 2913#endif
c277df42
IZ
2914#if 0
2915 while ( nxt1 && (OP(nxt1) != WHILEM)) {
2916 regnode *nnxt = regnext(nxt1);
b81d288d 2917
c277df42
IZ
2918 if (nnxt == nxt) {
2919 if (reg_off_by_arg[OP(nxt1)])
2920 ARG_SET(nxt1, nxt2 - nxt1);
2921 else if (nxt2 - nxt1 < U16_MAX)
2922 NEXT_OFF(nxt1) = nxt2 - nxt1;
2923 else
2924 OP(nxt) = NOTHING; /* Cannot beautify */
2925 }
2926 nxt1 = nnxt;
2927 }
2928#endif
2929 /* Optimize again: */
1de06328 2930 study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
a3621e74 2931 NULL, 0,depth+1);
a0ed51b3
LW
2932 }
2933 else
c277df42 2934 oscan->flags = 0;
c277df42 2935 }
e1901655
IZ
2936 else if ((OP(oscan) == CURLYX)
2937 && (flags & SCF_WHILEM_VISITED_POS)
2938 /* See the comment on a similar expression above.
2939 However, this time it not a subexpression
2940 we care about, but the expression itself. */
2941 && (maxcount == REG_INFTY)
2942 && data && ++data->whilem_c < 16) {
2943 /* This stays as CURLYX, we can put the count/of pair. */
2c2d71f5
JH
2944 /* Find WHILEM (as in regexec.c) */
2945 regnode *nxt = oscan + NEXT_OFF(oscan);
2946
2947 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
2948 nxt += ARG(nxt);
eb160463
GS
2949 PREVOPER(nxt)->flags = (U8)(data->whilem_c
2950 | (RExC_whilem_seen << 4)); /* On WHILEM */
2c2d71f5 2951 }
b81d288d 2952 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
c277df42
IZ
2953 pars++;
2954 if (flags & SCF_DO_SUBSTR) {
c445ea15 2955 SV *last_str = NULL;
c277df42
IZ
2956 int counted = mincount != 0;
2957
2958 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
5d1c421c
JH
2959#if defined(SPARC64_GCC_WORKAROUND)
2960 I32 b = 0;
2961 STRLEN l = 0;
cfd0369c 2962 const char *s = NULL;
5d1c421c
JH
2963 I32 old = 0;
2964
2965 if (pos_before >= data->last_start_min)
2966 b = pos_before;
2967 else
2968 b = data->last_start_min;
2969
2970 l = 0;
cfd0369c 2971 s = SvPV_const(data->last_found, l);
5d1c421c
JH
2972 old = b - data->last_start_min;
2973
2974#else
b81d288d 2975 I32 b = pos_before >= data->last_start_min
c277df42
IZ
2976 ? pos_before : data->last_start_min;
2977 STRLEN l;
d4c19fe8 2978 const char * const s = SvPV_const(data->last_found, l);
a0ed51b3 2979 I32 old = b - data->last_start_min;
5d1c421c 2980#endif
a0ed51b3
LW
2981
2982 if (UTF)
2983 old = utf8_hop((U8*)s, old) - (U8*)s;
c277df42 2984
a0ed51b3 2985 l -= old;
c277df42 2986 /* Get the added string: */
79cb57f6 2987 last_str = newSVpvn(s + old, l);
0e933229
IH
2988 if (UTF)
2989 SvUTF8_on(last_str);
c277df42
IZ
2990 if (deltanext == 0 && pos_before == b) {
2991 /* What was added is a constant string */
2992 if (mincount > 1) {
2993 SvGROW(last_str, (mincount * l) + 1);
b81d288d 2994 repeatcpy(SvPVX(last_str) + l,
3f7c398e 2995 SvPVX_const(last_str), l, mincount - 1);
b162af07 2996 SvCUR_set(last_str, SvCUR(last_str) * mincount);
c277df42 2997 /* Add additional parts. */
b81d288d 2998 SvCUR_set(data->last_found,
c277df42
IZ
2999 SvCUR(data->last_found) - l);
3000 sv_catsv(data->last_found, last_str);
0eda9292
JH
3001 {
3002 SV * sv = data->last_found;
3003 MAGIC *mg =
3004 SvUTF8(sv) && SvMAGICAL(sv) ?
3005 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3006 if (mg && mg->mg_len >= 0)
3007 mg->mg_len += CHR_SVLEN(last_str);
3008 }
c277df42
IZ
3009 data->last_end += l * (mincount - 1);
3010 }
2a8d9689
HS
3011 } else {
3012 /* start offset must point into the last copy */
3013 data->last_start_min += minnext * (mincount - 1);
c152dc43
HS
3014 data->last_start_max += is_inf ? I32_MAX
3015 : (maxcount - 1) * (minnext + data->pos_delta);
c277df42
IZ
3016 }
3017 }
3018 /* It is counted once already... */
3019 data->pos_min += minnext * (mincount - counted);
3020 data->pos_delta += - counted * deltanext +
3021 (minnext + deltanext) * maxcount - minnext * mincount;
3022 if (mincount != maxcount) {
653099ff
GS
3023 /* Cannot extend fixed substrings found inside
3024 the group. */
1de06328 3025 scan_commit(pRExC_state,data,minlenp);
c277df42 3026 if (mincount && last_str) {
d4c19fe8
AL
3027 SV * const sv = data->last_found;
3028 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
45f47268
NC
3029 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3030
3031 if (mg)
3032 mg->mg_len = -1;
3033 sv_setsv(sv, last_str);
c277df42 3034 data->last_end = data->pos_min;
b81d288d 3035 data->last_start_min =
a0ed51b3 3036 data->pos_min - CHR_SVLEN(last_str);
b81d288d
AB
3037 data->last_start_max = is_inf
3038 ? I32_MAX
c277df42 3039 : data->pos_min + data->pos_delta
a0ed51b3 3040 - CHR_SVLEN(last_str);
c277df42
IZ
3041 }
3042 data->longest = &(data->longest_float);
3043 }
aca2d497 3044 SvREFCNT_dec(last_str);
c277df42 3045 }
405ff068 3046 if (data && (fl & SF_HAS_EVAL))
c277df42
IZ
3047 data->flags |= SF_HAS_EVAL;
3048 optimize_curly_tail:
c277df42 3049 if (OP(oscan) != CURLYX) {
3dab1dad 3050 while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
c277df42
IZ
3051 && NEXT_OFF(next))
3052 NEXT_OFF(oscan) += NEXT_OFF(next);
3053 }
c277df42 3054 continue;
653099ff 3055 default: /* REF and CLUMP only? */
c277df42 3056 if (flags & SCF_DO_SUBSTR) {
1de06328 3057 scan_commit(pRExC_state,data,minlenp); /* Cannot expect anything... */
c277df42
IZ
3058 data->longest = &(data->longest_float);
3059 }
aca2d497 3060 is_inf = is_inf_internal = 1;
653099ff 3061 if (flags & SCF_DO_STCLASS_OR)
830247a4 3062 cl_anything(pRExC_state, data->start_class);
653099ff 3063 flags &= ~SCF_DO_STCLASS;
c277df42
IZ
3064 break;
3065 }
a0ed51b3 3066 }
bfed75c6 3067 else if (strchr((const char*)PL_simple,OP(scan))) {
9c5ffd7c 3068 int value = 0;
653099ff 3069
c277df42 3070 if (flags & SCF_DO_SUBSTR) {
1de06328 3071 scan_commit(pRExC_state,data,minlenp);
c277df42
IZ
3072 data->pos_min++;
3073 }
3074 min++;
653099ff
GS
3075 if (flags & SCF_DO_STCLASS) {
3076 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
3077
3078 /* Some of the logic below assumes that switching
3079 locale on will only add false positives. */
3dab1dad 3080 switch (PL_regkind[OP(scan)]) {
653099ff 3081 case SANY:
653099ff
GS
3082 default:
3083 do_default:
3084 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
3085 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
830247a4 3086 cl_anything(pRExC_state, data->start_class);
653099ff
GS
3087 break;
3088 case REG_ANY:
3089 if (OP(scan) == SANY)
3090 goto do_default;
3091 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
3092 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
3093 || (data->start_class->flags & ANYOF_CLASS));
830247a4 3094 cl_anything(pRExC_state, data->start_class);
653099ff
GS
3095 }
3096 if (flags & SCF_DO_STCLASS_AND || !value)
3097 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
3098 break;
3099 case ANYOF:
3100 if (flags & SCF_DO_STCLASS_AND)
3101 cl_and(data->start_class,
3102 (struct regnode_charclass_class*)scan);
3103 else
830247a4 3104 cl_or(pRExC_state, data->start_class,
653099ff
GS
3105 (struct regnode_charclass_class*)scan);
3106 break;
3107 case ALNUM:
3108 if (flags & SCF_DO_STCLASS_AND) {
3109 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3110 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3111 for (value = 0; value < 256; value++)
3112 if (!isALNUM(value))
3113 ANYOF_BITMAP_CLEAR(data->start_class, value);
3114 }
3115 }
3116 else {
3117 if (data->start_class->flags & ANYOF_LOCALE)
3118 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3119 else {
3120 for (value = 0; value < 256; value++)
3121 if (isALNUM(value))
b81d288d 3122 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
3123 }
3124 }
3125 break;
3126 case ALNUML:
3127 if (flags & SCF_DO_STCLASS_AND) {
3128 if (data->start_class->flags & ANYOF_LOCALE)
3129 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3130 }
3131 else {
3132 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3133 data->start_class->flags |= ANYOF_LOCALE;
3134 }
3135 break;
3136 case NALNUM:
3137 if (flags & SCF_DO_STCLASS_AND) {
3138 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3139 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
3140 for (value = 0; value < 256; value++)
3141 if (isALNUM(value))
3142 ANYOF_BITMAP_CLEAR(data->start_class, value);
3143 }
3144 }
3145 else {
3146 if (data->start_class->flags & ANYOF_LOCALE)
3147 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3148 else {
3149 for (value = 0; value < 256; value++)
3150 if (!isALNUM(value))
b81d288d 3151 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
3152 }
3153 }
3154 break;
3155 case NALNUML:
3156 if (flags & SCF_DO_STCLASS_AND) {
3157 if (data->start_class->flags & ANYOF_LOCALE)
3158 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
3159 }
3160 else {
3161 data->start_class->flags |= ANYOF_LOCALE;
3162 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3163 }
3164 break;
3165 case SPACE:
3166 if (flags & SCF_DO_STCLASS_AND) {
3167 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3168 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3169 for (value = 0; value < 256; value++)
3170 if (!isSPACE(value))
3171 ANYOF_BITMAP_CLEAR(data->start_class, value);
3172 }
3173 }
3174 else {
3175 if (data->start_class->flags & ANYOF_LOCALE)
3176 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3177 else {
3178 for (value = 0; value < 256; value++)
3179 if (isSPACE(value))
b81d288d 3180 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
3181 }
3182 }
3183 break;
3184 case SPACEL:
3185 if (flags & SCF_DO_STCLASS_AND) {
3186 if (data->start_class->flags & ANYOF_LOCALE)
3187 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3188 }
3189 else {
3190 data->start_class->flags |= ANYOF_LOCALE;
3191 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3192 }
3193 break;
3194 case NSPACE:
3195 if (flags & SCF_DO_STCLASS_AND) {
3196 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3197 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3198 for (value = 0; value < 256; value++)
3199 if (isSPACE(value))
3200 ANYOF_BITMAP_CLEAR(data->start_class, value);
3201 }
3202 }
3203 else {
3204 if (data->start_class->flags & ANYOF_LOCALE)
3205 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3206 else {
3207 for (value = 0; value < 256; value++)
3208 if (!isSPACE(value))
b81d288d 3209 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
3210 }
3211 }
3212 break;
3213 case NSPACEL:
3214 if (flags & SCF_DO_STCLASS_AND) {
3215 if (data->start_class->flags & ANYOF_LOCALE) {
3216 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3217 for (value = 0; value < 256; value++)
3218 if (!isSPACE(value))
3219 ANYOF_BITMAP_CLEAR(data->start_class, value);
3220 }
3221 }
3222 else {
3223 data->start_class->flags |= ANYOF_LOCALE;
3224 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3225 }
3226 break;
3227 case DIGIT:
3228 if (flags & SCF_DO_STCLASS_AND) {
3229 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
3230 for (value = 0; value < 256; value++)
3231 if (!isDIGIT(value))
3232 ANYOF_BITMAP_CLEAR(data->start_class, value);
3233 }
3234 else {
3235 if (data->start_class->flags & ANYOF_LOCALE)
3236 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
3237 else {
3238 for (value = 0; value < 256; value++)
3239 if (isDIGIT(value))
b81d288d 3240 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
3241 }
3242 }
3243 break;
3244 case NDIGIT:
3245 if (flags & SCF_DO_STCLASS_AND) {
3246 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
3247 for (value = 0; value < 256; value++)
3248 if (isDIGIT(value))
3249 ANYOF_BITMAP_CLEAR(data->start_class, value);
3250 }
3251 else {
3252 if (data->start_class->flags & ANYOF_LOCALE)
3253 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
3254 else {
3255 for (value = 0; value < 256; value++)
3256 if (!isDIGIT(value))
b81d288d 3257 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
3258 }
3259 }
3260 break;
3261 }
3262 if (flags & SCF_DO_STCLASS_OR)
3263 cl_and(data->start_class, &and_with);
3264 flags &= ~SCF_DO_STCLASS;
3265 }
a0ed51b3 3266 }
3dab1dad 3267 else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
c277df42
IZ
3268 data->flags |= (OP(scan) == MEOL
3269 ? SF_BEFORE_MEOL
3270 : SF_BEFORE_SEOL);
a0ed51b3 3271 }
3dab1dad 3272 else if ( PL_regkind[OP(scan)] == BRANCHJ
653099ff
GS
3273 /* Lookbehind, or need to calculate parens/evals/stclass: */
3274 && (scan->flags || data || (flags & SCF_DO_STCLASS))
c277df42 3275 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
1de06328
YO
3276 if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3277 || OP(scan) == UNLESSM )
3278 {
3279 /* Negative Lookahead/lookbehind
3280 In this case we can't do fixed string optimisation.
3281 */
3282
3283 I32 deltanext, minnext, fake = 0;
3284 regnode *nscan;
3285 struct regnode_charclass_class intrnl;
3286 int f = 0;
3287
3288 data_fake.flags = 0;
3289 if (data) {
3290 data_fake.whilem_c = data->whilem_c;
3291 data_fake.last_closep = data->last_closep;
a0ed51b3 3292 }
1de06328
YO
3293 else
3294 data_fake.last_closep = &fake;
3295 if ( flags & SCF_DO_STCLASS && !scan->flags
3296 && OP(scan) == IFMATCH ) { /* Lookahead */
3297 cl_init(pRExC_state, &intrnl);
3298 data_fake.start_class = &intrnl;
3299 f |= SCF_DO_STCLASS_AND;
c277df42 3300 }
1de06328
YO
3301 if (flags & SCF_WHILEM_VISITED_POS)
3302 f |= SCF_WHILEM_VISITED_POS;
3303 next = regnext(scan);
3304 nscan = NEXTOPER(NEXTOPER(scan));
3305 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext, last, &data_fake, f,depth+1);
3306 if (scan->flags) {
3307 if (deltanext) {
3308 vFAIL("Variable length lookbehind not implemented");
3309 }
3310 else if (minnext > (I32)U8_MAX) {
3311 vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
3312 }
3313 scan->flags = (U8)minnext;
3314 }
3315 if (data) {
3316 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3317 pars++;
3318 if (data_fake.flags & SF_HAS_EVAL)
3319 data->flags |= SF_HAS_EVAL;
3320 data->whilem_c = data_fake.whilem_c;
3321 }
3322 if (f & SCF_DO_STCLASS_AND) {
3323 const int was = (data->start_class->flags & ANYOF_EOS);
3324
3325 cl_and(data->start_class, &intrnl);
3326 if (was)
3327 data->start_class->flags |= ANYOF_EOS;
3328 }
be8e71aa 3329 }
1de06328
YO
3330#if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3331 else {
3332 /* Positive Lookahead/lookbehind
3333 In this case we can do fixed string optimisation,
3334 but we must be careful about it. Note in the case of
3335 lookbehind the positions will be offset by the minimum
3336 length of the pattern, something we won't know about
3337 until after the recurse.
3338 */
3339 I32 deltanext, fake = 0;
3340 regnode *nscan;
3341 struct regnode_charclass_class intrnl;
3342 int f = 0;
3343 /* We use SAVEFREEPV so that when the full compile
3344 is finished perl will clean up the allocated
3345 minlens when its all done. This was we don't
3346 have to worry about freeing them when we know
3347 they wont be used, which would be a pain.
3348 */
3349 I32 *minnextp;
3350 Newx( minnextp, 1, I32 );
3351 SAVEFREEPV(minnextp);
3352
3353 if (data) {
3354 StructCopy(data, &data_fake, scan_data_t);
3355 if ((flags & SCF_DO_SUBSTR) && data->last_found) {
3356 f |= SCF_DO_SUBSTR;
3357 if (scan->flags)
3358 scan_commit(pRExC_state, &data_fake,minlenp);
3359 data_fake.last_found=newSVsv(data->last_found);
3360 }
3361 }
3362 else
3363 data_fake.last_closep = &fake;
3364 data_fake.flags = 0;
3365 if (is_inf)
3366 data_fake.flags |= SF_IS_INF;
3367 if ( flags & SCF_DO_STCLASS && !scan->flags
3368 && OP(scan) == IFMATCH ) { /* Lookahead */
3369 cl_init(pRExC_state, &intrnl);
3370 data_fake.start_class = &intrnl;
3371 f |= SCF_DO_STCLASS_AND;
3372 }
3373 if (flags & SCF_WHILEM_VISITED_POS)
3374 f |= SCF_WHILEM_VISITED_POS;
3375 next = regnext(scan);
3376 nscan = NEXTOPER(NEXTOPER(scan));
3377
3378 *minnextp = study_chunk(pRExC_state, &nscan, minnextp, &deltanext, last, &data_fake, f,depth+1);
3379 if (scan->flags) {
3380 if (deltanext) {
3381 vFAIL("Variable length lookbehind not implemented");
3382 }
3383 else if (*minnextp > (I32)U8_MAX) {
3384 vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
3385 }
3386 scan->flags = (U8)*minnextp;
3387 }
3388
3389 *minnextp += min;
3390
3391
3392 if (f & SCF_DO_STCLASS_AND) {
3393 const int was = (data->start_class->flags & ANYOF_EOS);
3394
3395 cl_and(data->start_class, &intrnl);
3396 if (was)
3397 data->start_class->flags |= ANYOF_EOS;
3398 }
3399 if (data) {
3400 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3401 pars++;
3402 if (data_fake.flags & SF_HAS_EVAL)
3403 data->flags |= SF_HAS_EVAL;
3404 data->whilem_c = data_fake.whilem_c;
3405 if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
3406 if (RExC_rx->minlen<*minnextp)
3407 RExC_rx->minlen=*minnextp;
3408 scan_commit(pRExC_state, &data_fake, minnextp);
3409 SvREFCNT_dec(data_fake.last_found);
3410
3411 if ( data_fake.minlen_fixed != minlenp )
3412 {
3413 data->offset_fixed= data_fake.offset_fixed;
3414 data->minlen_fixed= data_fake.minlen_fixed;
3415 data->lookbehind_fixed+= scan->flags;
3416 }
3417 if ( data_fake.minlen_float != minlenp )
3418 {
3419 data->minlen_float= data_fake.minlen_float;
3420 data->offset_float_min=data_fake.offset_float_min;
3421 data->offset_float_max=data_fake.offset_float_max;
3422 data->lookbehind_float+= scan->flags;
3423 }
3424 }
3425 }
3426
653099ff 3427
653099ff 3428 }
1de06328 3429#endif
a0ed51b3
LW
3430 }
3431 else if (OP(scan) == OPEN) {
c277df42 3432 pars++;
a0ed51b3 3433 }
cb434fcc 3434 else if (OP(scan) == CLOSE) {
eb160463 3435 if ((I32)ARG(scan) == is_par) {
cb434fcc 3436 next = regnext(scan);
c277df42 3437
cb434fcc
IZ
3438 if ( next && (OP(next) != WHILEM) && next < last)
3439 is_par = 0; /* Disable optimization */
3440 }
3441 if (data)
3442 *(data->last_closep) = ARG(scan);
a0ed51b3
LW
3443 }
3444 else if (OP(scan) == EVAL) {
c277df42
IZ
3445 if (data)
3446 data->flags |= SF_HAS_EVAL;
3447 }
96776eda 3448 else if (OP(scan) == LOGICAL && scan->flags == 2) { /* Embedded follows */
0f5d15d6 3449 if (flags & SCF_DO_SUBSTR) {
1de06328 3450 scan_commit(pRExC_state,data,minlenp);
0f5d15d6
IZ
3451 data->longest = &(data->longest_float);
3452 }
3453 is_inf = is_inf_internal = 1;
653099ff 3454 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
830247a4 3455 cl_anything(pRExC_state, data->start_class);
96776eda 3456 flags &= ~SCF_DO_STCLASS;
0f5d15d6 3457 }
786e8c11
YO
3458#ifdef TRIE_STUDY_OPT
3459#ifdef FULL_TRIE_STUDY
3460 else if (PL_regkind[OP(scan)] == TRIE) {
3461 /* NOTE - There is similar code to this block above for handling
3462 BRANCH nodes on the initial study. If you change stuff here
3463 check there too. */
3464 regnode *tail= regnext(scan);
3465 reg_trie_data *trie = (reg_trie_data*)RExC_rx->data->data[ ARG(scan) ];
3466 I32 max1 = 0, min1 = I32_MAX;
3467 struct regnode_charclass_class accum;
3468
3469 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
1de06328 3470 scan_commit(pRExC_state, data,minlenp); /* Cannot merge strings after this. */
786e8c11
YO
3471 if (flags & SCF_DO_STCLASS)
3472 cl_init_zero(pRExC_state, &accum);
3473
3474 if (!trie->jump) {
3475 min1= trie->minlen;
3476 max1= trie->maxlen;
3477 } else {
3478 const regnode *nextbranch= NULL;
3479 U32 word;
3480
3481 for ( word=1 ; word <= trie->wordcount ; word++)
3482 {
3483 I32 deltanext=0, minnext=0, f = 0, fake;
3484 struct regnode_charclass_class this_class;
3485
3486 data_fake.flags = 0;
3487 if (data) {
3488 data_fake.whilem_c = data->whilem_c;
3489 data_fake.last_closep = data->last_closep;
3490 }
3491 else
3492 data_fake.last_closep = &fake;
3493
3494 if (flags & SCF_DO_STCLASS) {
3495 cl_init(pRExC_state, &this_class);
3496 data_fake.start_class = &this_class;
3497 f = SCF_DO_STCLASS_AND;
3498 }
3499 if (flags & SCF_WHILEM_VISITED_POS)
3500 f |= SCF_WHILEM_VISITED_POS;
3501
3502 if (trie->jump[word]) {
3503 if (!nextbranch)
3504 nextbranch = tail - trie->jump[0];
3505 scan= tail - trie->jump[word];
3506 /* We go from the jump point to the branch that follows
3507 it. Note this means we need the vestigal unused branches
3508 even though they arent otherwise used.
3509 */
1de06328 3510 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
786e8c11
YO
3511 (regnode *)nextbranch, &data_fake, f,depth+1);
3512 }
3513 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
3514 nextbranch= regnext((regnode*)nextbranch);
3515
3516 if (min1 > (I32)(minnext + trie->minlen))
3517 min1 = minnext + trie->minlen;
3518 if (max1 < (I32)(minnext + deltanext + trie->maxlen))
3519 max1 = minnext + deltanext + trie->maxlen;
3520 if (deltanext == I32_MAX)
3521 is_inf = is_inf_internal = 1;
3522
3523 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3524 pars++;
3525
3526 if (data) {
3527 if (data_fake.flags & SF_HAS_EVAL)
3528 data->flags |= SF_HAS_EVAL;
3529 data->whilem_c = data_fake.whilem_c;
3530 }
3531 if (flags & SCF_DO_STCLASS)
3532 cl_or(pRExC_state, &accum, &this_class);
3533 }
3534 }
3535 if (flags & SCF_DO_SUBSTR) {
3536 data->pos_min += min1;
3537 data->pos_delta += max1 - min1;
3538 if (max1 != min1 || is_inf)
3539 data->longest = &(data->longest_float);
3540 }
3541 min += min1;
3542 delta += max1 - min1;
3543 if (flags & SCF_DO_STCLASS_OR) {
3544 cl_or(pRExC_state, data->start_class, &accum);
3545 if (min1) {
3546 cl_and(data->start_class, &and_with);
3547 flags &= ~SCF_DO_STCLASS;
3548 }
3549 }
3550 else if (flags & SCF_DO_STCLASS_AND) {
3551 if (min1) {
3552 cl_and(data->start_class, &accum);
3553 flags &= ~SCF_DO_STCLASS;
3554 }
3555 else {
3556 /* Switch to OR mode: cache the old value of
3557 * data->start_class */
3558 StructCopy(data->start_class, &and_with,
3559 struct regnode_charclass_class);
3560 flags &= ~SCF_DO_STCLASS_AND;
3561 StructCopy(&accum, data->start_class,
3562 struct regnode_charclass_class);
3563 flags |= SCF_DO_STCLASS_OR;
3564 data->start_class->flags |= ANYOF_EOS;
3565 }
3566 }
3567 scan= tail;
3568 continue;
3569 }
3570#else
3571 else if (PL_regkind[OP(scan)] == TRIE) {
3572 reg_trie_data *trie = (reg_trie_data*)RExC_rx->data->data[ ARG(scan) ];
3573 U8*bang=NULL;
3574
3575 min += trie->minlen;
3576 delta += (trie->maxlen - trie->minlen);
3577 flags &= ~SCF_DO_STCLASS; /* xxx */
3578 if (flags & SCF_DO_SUBSTR) {
1de06328 3579 scan_commit(pRExC_state,data,minlenp); /* Cannot expect anything... */
786e8c11
YO
3580 data->pos_min += trie->minlen;
3581 data->pos_delta += (trie->maxlen - trie->minlen);
3582 if (trie->maxlen != trie->minlen)
3583 data->longest = &(data->longest_float);
3584 }
3585 if (trie->jump) /* no more substrings -- for now /grr*/
3586 flags &= ~SCF_DO_SUBSTR;
3587 }
3588#endif /* old or new */
3589#endif /* TRIE_STUDY_OPT */
c277df42
IZ
3590 /* Else: zero-length, ignore. */
3591 scan = regnext(scan);
3592 }
3593
3594 finish:
3595 *scanp = scan;
aca2d497 3596 *deltap = is_inf_internal ? I32_MAX : delta;
b81d288d 3597 if (flags & SCF_DO_SUBSTR && is_inf)
c277df42 3598 data->pos_delta = I32_MAX - data->pos_min;
786e8c11 3599 if (is_par > (I32)U8_MAX)
c277df42
IZ
3600 is_par = 0;
3601 if (is_par && pars==1 && data) {
3602 data->flags |= SF_IN_PAR;
3603 data->flags &= ~SF_HAS_PAR;
a0ed51b3
LW
3604 }
3605 else if (pars && data) {
c277df42
IZ
3606 data->flags |= SF_HAS_PAR;
3607 data->flags &= ~SF_IN_PAR;
3608 }
653099ff
GS
3609 if (flags & SCF_DO_STCLASS_OR)
3610 cl_and(data->start_class, &and_with);
786e8c11
YO
3611 if (flags & SCF_TRIE_RESTUDY)
3612 data->flags |= SCF_TRIE_RESTUDY;
1de06328
YO
3613
3614 DEBUG_STUDYDATA(data,depth);
3615
c277df42
IZ
3616 return min;
3617}
3618
76e3520e 3619STATIC I32
5f66b61c 3620S_add_data(RExC_state_t *pRExC_state, I32 n, const char *s)
c277df42 3621{
830247a4 3622 if (RExC_rx->data) {
b81d288d
AB
3623 Renewc(RExC_rx->data,
3624 sizeof(*RExC_rx->data) + sizeof(void*) * (RExC_rx->data->count + n - 1),
c277df42 3625 char, struct reg_data);
830247a4
IZ
3626 Renew(RExC_rx->data->what, RExC_rx->data->count + n, U8);
3627 RExC_rx->data->count += n;
a0ed51b3
LW
3628 }
3629 else {
a02a5408 3630 Newxc(RExC_rx->data, sizeof(*RExC_rx->data) + sizeof(void*) * (n - 1),
c277df42 3631 char, struct reg_data);
a02a5408 3632 Newx(RExC_rx->data->what, n, U8);
830247a4 3633 RExC_rx->data->count = n;
c277df42 3634 }
830247a4
IZ
3635 Copy(s, RExC_rx->data->what + RExC_rx->data->count - n, n, U8);
3636 return RExC_rx->data->count - n;
c277df42
IZ
3637}
3638
76234dfb 3639#ifndef PERL_IN_XSUB_RE
d88dccdf 3640void
864dbfa3 3641Perl_reginitcolors(pTHX)
d88dccdf 3642{
97aff369 3643 dVAR;
1df70142 3644 const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
d88dccdf 3645 if (s) {
1df70142
AL
3646 char *t = savepv(s);
3647 int i = 0;
3648 PL_colors[0] = t;
d88dccdf 3649 while (++i < 6) {
1df70142
AL
3650 t = strchr(t, '\t');
3651 if (t) {
3652 *t = '\0';
3653 PL_colors[i] = ++t;
d88dccdf
IZ
3654 }
3655 else
1df70142 3656 PL_colors[i] = t = (char *)"";
d88dccdf
IZ
3657 }
3658 } else {
1df70142 3659 int i = 0;
b81d288d 3660 while (i < 6)
06b5626a 3661 PL_colors[i++] = (char *)"";
d88dccdf
IZ
3662 }
3663 PL_colorset = 1;
3664}
76234dfb 3665#endif
8615cb43 3666
07be1b83 3667
786e8c11
YO
3668#ifdef TRIE_STUDY_OPT
3669#define CHECK_RESTUDY_GOTO \
3670 if ( \
3671 (data.flags & SCF_TRIE_RESTUDY) \
3672 && ! restudied++ \
3673 ) goto reStudy
3674#else
3675#define CHECK_RESTUDY_GOTO
3676#endif
f9f4320a 3677
a687059c 3678/*
e50aee73 3679 - pregcomp - compile a regular expression into internal code
a687059c
LW
3680 *
3681 * We can't allocate space until we know how big the compiled form will be,
3682 * but we can't compile it (and thus know how big it is) until we've got a
3683 * place to put the code. So we cheat: we compile it twice, once with code
3684 * generation turned off and size counting turned on, and once "for real".
3685 * This also means that we don't allocate space until we are sure that the
3686 * thing really will compile successfully, and we never have to move the
3687 * code and thus invalidate pointers into it. (Note that it has to be in
3688 * one piece because free() must be able to free it all.) [NB: not true in perl]
3689 *
3690 * Beware that the optimization-preparation code in here knows about some
3691 * of the structure of the compiled regexp. [I'll say.]
3692 */
f9f4320a
YO
3693#ifndef PERL_IN_XSUB_RE
3694#define CORE_ONLY_BLOCK(c) {c}{
3695#define RE_ENGINE_PTR &PL_core_reg_engine
3696#else
3697#define CORE_ONLY_BLOCK(c) {
3698extern const struct regexp_engine my_reg_engine;
3699#define RE_ENGINE_PTR &my_reg_engine
3700#endif
3701#define END_BLOCK }
3702
a687059c 3703regexp *
864dbfa3 3704Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
a687059c 3705{
97aff369 3706 dVAR;
f9f4320a
YO
3707 GET_RE_DEBUG_FLAGS_DECL;
3708 DEBUG_r(if (!PL_colorset) reginitcolors());
3709 CORE_ONLY_BLOCK(
3710 /* Dispatch a request to compile a regexp to correct
3711 regexp engine. */
3712 HV * const table = GvHV(PL_hintgv);
3713 if (table) {
3714 SV **ptr= hv_fetchs(table, "regcomp", FALSE);
3715 if (ptr && SvIOK(*ptr)) {
3716 const regexp_engine *eng=INT2PTR(regexp_engine*,SvIV(*ptr));
3717 DEBUG_COMPILE_r({
8d8756e7 3718 PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
f9f4320a
YO
3719 SvIV(*ptr));
3720 });
f2f78491 3721 return CALLREGCOMP_ENG(eng, exp, xend, pm);
f9f4320a
YO
3722 }
3723 })
a0d0e21e 3724 register regexp *r;
c277df42 3725 regnode *scan;
c277df42 3726 regnode *first;
a0d0e21e 3727 I32 flags;
a0d0e21e
LW
3728 I32 minlen = 0;
3729 I32 sawplus = 0;
3730 I32 sawopen = 0;
2c2d71f5 3731 scan_data_t data;
830247a4 3732 RExC_state_t RExC_state;
be8e71aa 3733 RExC_state_t * const pRExC_state = &RExC_state;
07be1b83
YO
3734#ifdef TRIE_STUDY_OPT
3735 int restudied= 0;
3736 RExC_state_t copyRExC_state;
3737#endif
a0d0e21e 3738 if (exp == NULL)
c277df42 3739 FAIL("NULL regexp argument");
a0d0e21e 3740
a5961de5 3741 RExC_utf8 = pm->op_pmdynflags & PMdf_CMP_UTF8;
a0ed51b3 3742
5cfc7842 3743 RExC_precomp = exp;
a3621e74 3744 DEBUG_COMPILE_r({
ab3bbdeb
YO
3745 SV *dsv= sv_newmortal();
3746 RE_PV_QUOTED_DECL(s, RExC_utf8,
3747 dsv, RExC_precomp, (xend - exp), 60);
3748 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
3749 PL_colors[4],PL_colors[5],s);
a5961de5 3750 });
e2509266 3751 RExC_flags = pm->op_pmflags;
830247a4 3752 RExC_sawback = 0;
bbce6d69 3753
830247a4
IZ
3754 RExC_seen = 0;
3755 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
3756 RExC_seen_evals = 0;
3757 RExC_extralen = 0;
c277df42 3758
bbce6d69 3759 /* First pass: determine size, legality. */
830247a4 3760 RExC_parse = exp;
fac92740 3761 RExC_start = exp;
830247a4
IZ
3762 RExC_end = xend;
3763 RExC_naughty = 0;
3764 RExC_npar = 1;
3765 RExC_size = 0L;
3766 RExC_emit = &PL_regdummy;
3767 RExC_whilem_seen = 0;
fc8cd66c
YO
3768 RExC_charnames = NULL;
3769
85ddcde9
JH
3770#if 0 /* REGC() is (currently) a NOP at the first pass.
3771 * Clever compilers notice this and complain. --jhi */
830247a4 3772 REGC((U8)REG_MAGIC, (char*)RExC_emit);
85ddcde9 3773#endif
3dab1dad
YO
3774 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n"));
3775 if (reg(pRExC_state, 0, &flags,1) == NULL) {
c445ea15 3776 RExC_precomp = NULL;
a0d0e21e
LW
3777 return(NULL);
3778 }
3dab1dad
YO
3779 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "Required "));
3780 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "size %"IVdf" nodes ", (IV)RExC_size));
3781 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nStarting second pass (creation)\n"));
07be1b83
YO
3782 DEBUG_PARSE_r({
3783 RExC_lastnum=0;
3784 RExC_lastparse=NULL;
3785 });
c277df42 3786
07be1b83 3787
c277df42
IZ
3788 /* Small enough for pointer-storage convention?
3789 If extralen==0, this means that we will not need long jumps. */
830247a4
IZ
3790 if (RExC_size >= 0x10000L && RExC_extralen)
3791 RExC_size += RExC_extralen;
c277df42 3792 else
830247a4
IZ
3793 RExC_extralen = 0;
3794 if (RExC_whilem_seen > 15)
3795 RExC_whilem_seen = 15;
a0d0e21e 3796
f9f4320a
YO
3797 /* Allocate space and zero-initialize. Note, the two step process
3798 of zeroing when in debug mode, thus anything assigned has to
3799 happen after that */
a02a5408 3800 Newxc(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode),
c277df42 3801 char, regexp);
a0d0e21e 3802 if (r == NULL)
b45f050a 3803 FAIL("Regexp out of space");
0f79a09d
GS
3804#ifdef DEBUGGING
3805 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
830247a4 3806 Zero(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode), char);
0f79a09d 3807#endif
f9f4320a
YO
3808 /* initialization begins here */
3809 r->engine= RE_ENGINE_PTR;
c277df42 3810 r->refcnt = 1;
bbce6d69 3811 r->prelen = xend - exp;
5cfc7842 3812 r->precomp = savepvn(RExC_precomp, r->prelen);
cf93c79d 3813 r->subbeg = NULL;
f8c7b90f 3814#ifdef PERL_OLD_COPY_ON_WRITE
c445ea15 3815 r->saved_copy = NULL;
ed252734 3816#endif
cf93c79d 3817 r->reganch = pm->op_pmflags & PMf_COMPILETIME;
830247a4 3818 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
38d1b06f 3819 r->lastparen = 0; /* mg.c reads this. */
4327152a
IZ
3820
3821 r->substrs = 0; /* Useful during FAIL. */
3822 r->startp = 0; /* Useful during FAIL. */
3823 r->endp = 0; /* Useful during FAIL. */
3824
a02a5408 3825 Newxz(r->offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
fac92740 3826 if (r->offsets) {
2af232bd 3827 r->offsets[0] = RExC_size;
fac92740 3828 }
a3621e74 3829 DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
2af232bd
SS
3830 "%s %"UVuf" bytes for offset annotations.\n",
3831 r->offsets ? "Got" : "Couldn't get",
392fbf5d 3832 (UV)((2*RExC_size+1) * sizeof(U32))));
fac92740 3833
830247a4 3834 RExC_rx = r;
bbce6d69 3835
3836 /* Second pass: emit code. */
e2509266 3837 RExC_flags = pm->op_pmflags; /* don't let top level (?i) bleed */
830247a4
IZ
3838 RExC_parse = exp;
3839 RExC_end = xend;
3840 RExC_naughty = 0;
3841 RExC_npar = 1;
fac92740 3842 RExC_emit_start = r->program;
830247a4 3843 RExC_emit = r->program;
2cd61cdb 3844 /* Store the count of eval-groups for security checks: */
786e8c11 3845 RExC_emit->next_off = (RExC_seen_evals > (I32)U16_MAX) ? U16_MAX : (U16)RExC_seen_evals;
830247a4 3846 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
c277df42 3847 r->data = 0;
3dab1dad 3848 if (reg(pRExC_state, 0, &flags,1) == NULL)
a0d0e21e 3849 return(NULL);
07be1b83
YO
3850 /* XXXX To minimize changes to RE engine we always allocate
3851 3-units-long substrs field. */
3852 Newx(r->substrs, 1, struct reg_substr_data);
a0d0e21e 3853
07be1b83 3854reStudy:
1de06328 3855 r->minlen = minlen = sawplus = sawopen = 0;
07be1b83
YO
3856 Zero(r->substrs, 1, struct reg_substr_data);
3857 StructCopy(&zero_scan_data, &data, scan_data_t);
a3621e74 3858
07be1b83
YO
3859#ifdef TRIE_STUDY_OPT
3860 if ( restudied ) {
3861 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
3862 RExC_state=copyRExC_state;
1de06328 3863 if (data.last_found) {
07be1b83 3864 SvREFCNT_dec(data.longest_fixed);
07be1b83 3865 SvREFCNT_dec(data.longest_float);
07be1b83 3866 SvREFCNT_dec(data.last_found);
1de06328 3867 }
07be1b83
YO
3868 } else {
3869 copyRExC_state=RExC_state;
3870 }
3871#endif
fc8cd66c 3872
a0d0e21e 3873 /* Dig out information for optimizations. */
cf93c79d 3874 r->reganch = pm->op_pmflags & PMf_COMPILETIME; /* Again? */
e2509266 3875 pm->op_pmflags = RExC_flags;
a0ed51b3 3876 if (UTF)
5ff6fc6d 3877 r->reganch |= ROPT_UTF8; /* Unicode in it? */
c277df42 3878 r->regstclass = NULL;
830247a4 3879 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
a0ed51b3 3880 r->reganch |= ROPT_NAUGHTY;
c277df42 3881 scan = r->program + 1; /* First BRANCH. */
2779dcf1 3882
1de06328
YO
3883 /* testing for BRANCH here tells us whether there is "must appear"
3884 data in the pattern. If there is then we can use it for optimisations */
c277df42 3885 if (OP(scan) != BRANCH) { /* Only one top-level choice. */
c277df42 3886 I32 fake;
c5254dd6 3887 STRLEN longest_float_length, longest_fixed_length;
07be1b83 3888 struct regnode_charclass_class ch_class; /* pointed to by data */
653099ff 3889 int stclass_flag;
07be1b83 3890 I32 last_close = 0; /* pointed to by data */
a0d0e21e
LW
3891
3892 first = scan;
c277df42 3893 /* Skip introductions and multiplicators >= 1. */
a0d0e21e 3894 while ((OP(first) == OPEN && (sawopen = 1)) ||
653099ff 3895 /* An OR of *one* alternative - should not happen now. */
a0d0e21e 3896 (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) ||
07be1b83
YO
3897 /* for now we can't handle lookbehind IFMATCH*/
3898 (OP(first) == IFMATCH && !first->flags) ||
a0d0e21e
LW
3899 (OP(first) == PLUS) ||
3900 (OP(first) == MINMOD) ||
653099ff 3901 /* An {n,m} with n>0 */
07be1b83
YO
3902 (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) )
3903 {
786e8c11 3904
a0d0e21e
LW
3905 if (OP(first) == PLUS)
3906 sawplus = 1;
3907 else
3dab1dad 3908 first += regarglen[OP(first)];
07be1b83
YO
3909 if (OP(first) == IFMATCH) {
3910 first = NEXTOPER(first);
3911 first += EXTRA_STEP_2ARGS;
7c167cea 3912 } else /* XXX possible optimisation for /(?=)/ */
07be1b83 3913 first = NEXTOPER(first);
a687059c
LW
3914 }
3915
a0d0e21e
LW
3916 /* Starting-point info. */
3917 again:
786e8c11 3918 DEBUG_PEEP("first:",first,0);
07be1b83 3919 /* Ignore EXACT as we deal with it later. */
3dab1dad 3920 if (PL_regkind[OP(first)] == EXACT) {
1aa99e6b 3921 if (OP(first) == EXACT)
6f207bd3 3922 NOOP; /* Empty, get anchored substr later. */
1aa99e6b 3923 else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
b3c9acc1
IZ
3924 r->regstclass = first;
3925 }
07be1b83 3926#ifdef TRIE_STCLASS
786e8c11 3927 else if (PL_regkind[OP(first)] == TRIE &&
07be1b83
YO
3928 ((reg_trie_data *)r->data->data[ ARG(first) ])->minlen>0)
3929 {
786e8c11 3930 regnode *trie_op;
07be1b83 3931 /* this can happen only on restudy */
786e8c11
YO
3932 if ( OP(first) == TRIE ) {
3933 struct regnode_1 *trieop;
3934 Newxz(trieop,1,struct regnode_1);
3935 StructCopy(first,trieop,struct regnode_1);
3936 trie_op=(regnode *)trieop;
3937 } else {
3938 struct regnode_charclass *trieop;
3939 Newxz(trieop,1,struct regnode_charclass);
3940 StructCopy(first,trieop,struct regnode_charclass);
3941 trie_op=(regnode *)trieop;
3942 }
1de06328 3943 OP(trie_op)+=2;
786e8c11
YO
3944 make_trie_failtable(pRExC_state, (regnode *)first, trie_op, 0);
3945 r->regstclass = trie_op;
07be1b83
YO
3946 }
3947#endif
bfed75c6 3948 else if (strchr((const char*)PL_simple,OP(first)))
a0d0e21e 3949 r->regstclass = first;
3dab1dad
YO
3950 else if (PL_regkind[OP(first)] == BOUND ||
3951 PL_regkind[OP(first)] == NBOUND)
a0d0e21e 3952 r->regstclass = first;
3dab1dad 3953 else if (PL_regkind[OP(first)] == BOL) {
cad2e5aa
JH
3954 r->reganch |= (OP(first) == MBOL
3955 ? ROPT_ANCH_MBOL
3956 : (OP(first) == SBOL
3957 ? ROPT_ANCH_SBOL
3958 : ROPT_ANCH_BOL));
a0d0e21e 3959 first = NEXTOPER(first);
774d564b 3960 goto again;
3961 }
3962 else if (OP(first) == GPOS) {
3963 r->reganch |= ROPT_ANCH_GPOS;
3964 first = NEXTOPER(first);
3965 goto again;
a0d0e21e 3966 }
e09294f4 3967 else if (!sawopen && (OP(first) == STAR &&
3dab1dad 3968 PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
a0d0e21e
LW
3969 !(r->reganch & ROPT_ANCH) )
3970 {
3971 /* turn .* into ^.* with an implied $*=1 */
1df70142
AL
3972 const int type =
3973 (OP(NEXTOPER(first)) == REG_ANY)
3974 ? ROPT_ANCH_MBOL
3975 : ROPT_ANCH_SBOL;
cad2e5aa 3976 r->reganch |= type | ROPT_IMPLICIT;
a0d0e21e 3977 first = NEXTOPER(first);
774d564b 3978 goto again;
a0d0e21e 3979 }
b81d288d 3980 if (sawplus && (!sawopen || !RExC_sawback)
830247a4 3981 && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
cad2e5aa
JH
3982 /* x+ must match at the 1st pos of run of x's */
3983 r->reganch |= ROPT_SKIP;
a0d0e21e 3984
c277df42 3985 /* Scan is after the zeroth branch, first is atomic matcher. */
be8e71aa
YO
3986#ifdef TRIE_STUDY_OPT
3987 DEBUG_COMPILE_r(
3988 if (!restudied)
3989 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
3990 (IV)(first - scan + 1))
3991 );
3992#else
3993 DEBUG_COMPILE_r(
3994 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
3995 (IV)(first - scan + 1))
3996 );
3997#endif
3998
3999
a0d0e21e
LW
4000 /*
4001 * If there's something expensive in the r.e., find the
4002 * longest literal string that must appear and make it the
4003 * regmust. Resolve ties in favor of later strings, since
4004 * the regstart check works with the beginning of the r.e.
4005 * and avoiding duplication strengthens checking. Not a
4006 * strong reason, but sufficient in the absence of others.
4007 * [Now we resolve ties in favor of the earlier string if
c277df42 4008 * it happens that c_offset_min has been invalidated, since the
a0d0e21e
LW
4009 * earlier string may buy us something the later one won't.]
4010 */
a0d0e21e 4011 minlen = 0;
a687059c 4012
396482e1
GA
4013 data.longest_fixed = newSVpvs("");
4014 data.longest_float = newSVpvs("");
4015 data.last_found = newSVpvs("");
c277df42
IZ
4016 data.longest = &(data.longest_fixed);
4017 first = scan;
653099ff 4018 if (!r->regstclass) {
830247a4 4019 cl_init(pRExC_state, &ch_class);
653099ff
GS
4020 data.start_class = &ch_class;
4021 stclass_flag = SCF_DO_STCLASS_AND;
4022 } else /* XXXX Check for BOUND? */
4023 stclass_flag = 0;
cb434fcc 4024 data.last_closep = &last_close;
653099ff 4025
1de06328 4026 minlen = study_chunk(pRExC_state, &first, &minlen, &fake, scan + RExC_size, /* Up to end */
a3621e74 4027 &data, SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
07be1b83 4028
07be1b83 4029
786e8c11
YO
4030 CHECK_RESTUDY_GOTO;
4031
4032
830247a4 4033 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
b81d288d 4034 && data.last_start_min == 0 && data.last_end > 0
830247a4
IZ
4035 && !RExC_seen_zerolen
4036 && (!(RExC_seen & REG_SEEN_GPOS) || (r->reganch & ROPT_ANCH_GPOS)))
c277df42 4037 r->reganch |= ROPT_CHECK_ALL;
1de06328 4038 scan_commit(pRExC_state, &data,&minlen);
c277df42
IZ
4039 SvREFCNT_dec(data.last_found);
4040
1de06328
YO
4041 /* Note that code very similar to this but for anchored string
4042 follows immediately below, changes may need to be made to both.
4043 Be careful.
4044 */
a0ed51b3 4045 longest_float_length = CHR_SVLEN(data.longest_float);
c5254dd6 4046 if (longest_float_length
c277df42
IZ
4047 || (data.flags & SF_FL_BEFORE_EOL
4048 && (!(data.flags & SF_FL_BEFORE_MEOL)
1de06328
YO
4049 || (RExC_flags & PMf_MULTILINE))))
4050 {
1182767e 4051 I32 t,ml;
cf93c79d 4052
1de06328 4053 if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
aca2d497
IZ
4054 && data.offset_fixed == data.offset_float_min
4055 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
4056 goto remove_float; /* As in (a)+. */
4057
1de06328
YO
4058 /* copy the information about the longest float from the reg_scan_data
4059 over to the program. */
33b8afdf
JH
4060 if (SvUTF8(data.longest_float)) {
4061 r->float_utf8 = data.longest_float;
c445ea15 4062 r->float_substr = NULL;
33b8afdf
JH
4063 } else {
4064 r->float_substr = data.longest_float;
c445ea15 4065 r->float_utf8 = NULL;
33b8afdf 4066 }
1de06328
YO
4067 /* float_end_shift is how many chars that must be matched that
4068 follow this item. We calculate it ahead of time as once the
4069 lookbehind offset is added in we lose the ability to correctly
4070 calculate it.*/
4071 ml = data.minlen_float ? *(data.minlen_float)
1182767e 4072 : (I32)longest_float_length;
1de06328
YO
4073 r->float_end_shift = ml - data.offset_float_min
4074 - longest_float_length + (SvTAIL(data.longest_float) != 0)
4075 + data.lookbehind_float;
4076 r->float_min_offset = data.offset_float_min - data.lookbehind_float;
c277df42 4077 r->float_max_offset = data.offset_float_max;
1182767e 4078 if (data.offset_float_max < I32_MAX) /* Don't offset infinity */
1de06328
YO
4079 r->float_max_offset -= data.lookbehind_float;
4080
cf93c79d
IZ
4081 t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
4082 && (!(data.flags & SF_FL_BEFORE_MEOL)
e2509266 4083 || (RExC_flags & PMf_MULTILINE)));
33b8afdf 4084 fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
a0ed51b3
LW
4085 }
4086 else {
aca2d497 4087 remove_float:
c445ea15 4088 r->float_substr = r->float_utf8 = NULL;
c277df42 4089 SvREFCNT_dec(data.longest_float);
c5254dd6 4090 longest_float_length = 0;
a0d0e21e 4091 }
c277df42 4092
1de06328
YO
4093 /* Note that code very similar to this but for floating string
4094 is immediately above, changes may need to be made to both.
4095 Be careful.
4096 */
a0ed51b3 4097 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
c5254dd6 4098 if (longest_fixed_length
c277df42
IZ
4099 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
4100 && (!(data.flags & SF_FIX_BEFORE_MEOL)
1de06328
YO
4101 || (RExC_flags & PMf_MULTILINE))))
4102 {
1182767e 4103 I32 t,ml;
cf93c79d 4104
1de06328
YO
4105 /* copy the information about the longest fixed
4106 from the reg_scan_data over to the program. */
33b8afdf
JH
4107 if (SvUTF8(data.longest_fixed)) {
4108 r->anchored_utf8 = data.longest_fixed;
c445ea15 4109 r->anchored_substr = NULL;
33b8afdf
JH
4110 } else {
4111 r->anchored_substr = data.longest_fixed;
c445ea15 4112 r->anchored_utf8 = NULL;
33b8afdf 4113 }
1de06328
YO
4114 /* fixed_end_shift is how many chars that must be matched that
4115 follow this item. We calculate it ahead of time as once the
4116 lookbehind offset is added in we lose the ability to correctly
4117 calculate it.*/
4118 ml = data.minlen_fixed ? *(data.minlen_fixed)
1182767e 4119 : (I32)longest_fixed_length;
1de06328
YO
4120 r->anchored_end_shift = ml - data.offset_fixed
4121 - longest_fixed_length + (SvTAIL(data.longest_fixed) != 0)
4122 + data.lookbehind_fixed;
4123 r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
4124
cf93c79d
IZ
4125 t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
4126 && (!(data.flags & SF_FIX_BEFORE_MEOL)
e2509266 4127 || (RExC_flags & PMf_MULTILINE)));
33b8afdf 4128 fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
a0ed51b3
LW
4129 }
4130 else {
c445ea15 4131 r->anchored_substr = r->anchored_utf8 = NULL;
c277df42 4132 SvREFCNT_dec(data.longest_fixed);
c5254dd6 4133 longest_fixed_length = 0;
a0d0e21e 4134 }
b81d288d 4135 if (r->regstclass
ffc61ed2 4136 && (OP(r->regstclass) == REG_ANY || OP(r->regstclass) == SANY))
653099ff 4137 r->regstclass = NULL;
33b8afdf
JH
4138 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
4139 && stclass_flag
653099ff 4140 && !(data.start_class->flags & ANYOF_EOS)
eb160463
GS
4141 && !cl_is_anything(data.start_class))
4142 {
1df70142 4143 const I32 n = add_data(pRExC_state, 1, "f");
653099ff 4144
a02a5408 4145 Newx(RExC_rx->data->data[n], 1,
653099ff
GS
4146 struct regnode_charclass_class);
4147 StructCopy(data.start_class,
830247a4 4148 (struct regnode_charclass_class*)RExC_rx->data->data[n],
653099ff 4149 struct regnode_charclass_class);
830247a4 4150 r->regstclass = (regnode*)RExC_rx->data->data[n];
653099ff 4151 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
a3621e74 4152 DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
32fc9b6a 4153 regprop(r, sv, (regnode*)data.start_class);
9c5ffd7c 4154 PerlIO_printf(Perl_debug_log,
a0288114 4155 "synthetic stclass \"%s\".\n",
3f7c398e 4156 SvPVX_const(sv));});
653099ff 4157 }
c277df42
IZ
4158
4159 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
c5254dd6 4160 if (longest_fixed_length > longest_float_length) {
1de06328 4161 r->check_end_shift = r->anchored_end_shift;
c277df42 4162 r->check_substr = r->anchored_substr;
33b8afdf 4163 r->check_utf8 = r->anchored_utf8;
c277df42
IZ
4164 r->check_offset_min = r->check_offset_max = r->anchored_offset;
4165 if (r->reganch & ROPT_ANCH_SINGLE)
4166 r->reganch |= ROPT_NOSCAN;
a0ed51b3
LW
4167 }
4168 else {
1de06328 4169 r->check_end_shift = r->float_end_shift;
c277df42 4170 r->check_substr = r->float_substr;
33b8afdf 4171 r->check_utf8 = r->float_utf8;
1de06328
YO
4172 r->check_offset_min = r->float_min_offset;
4173 r->check_offset_max = r->float_max_offset;
a0d0e21e 4174 }
30382c73
IZ
4175 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
4176 This should be changed ASAP! */
33b8afdf 4177 if ((r->check_substr || r->check_utf8) && !(r->reganch & ROPT_ANCH_GPOS)) {
cad2e5aa 4178 r->reganch |= RE_USE_INTUIT;
33b8afdf 4179 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
cad2e5aa
JH
4180 r->reganch |= RE_INTUIT_TAIL;
4181 }
1de06328
YO
4182 /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
4183 if ( (STRLEN)minlen < longest_float_length )
4184 minlen= longest_float_length;
4185 if ( (STRLEN)minlen < longest_fixed_length )
4186 minlen= longest_fixed_length;
4187 */
a0ed51b3
LW
4188 }
4189 else {
c277df42
IZ
4190 /* Several toplevels. Best we can is to set minlen. */
4191 I32 fake;
653099ff 4192 struct regnode_charclass_class ch_class;
cb434fcc 4193 I32 last_close = 0;
c277df42 4194
a3621e74 4195 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "\n"));
07be1b83 4196
c277df42 4197 scan = r->program + 1;
830247a4 4198 cl_init(pRExC_state, &ch_class);
653099ff 4199 data.start_class = &ch_class;
cb434fcc 4200 data.last_closep = &last_close;
07be1b83 4201
1de06328 4202 minlen = study_chunk(pRExC_state, &scan, &minlen, &fake, scan + RExC_size,
07be1b83
YO
4203 &data, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
4204
786e8c11 4205 CHECK_RESTUDY_GOTO;
07be1b83 4206
33b8afdf 4207 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
c445ea15 4208 = r->float_substr = r->float_utf8 = NULL;
653099ff 4209 if (!(data.start_class->flags & ANYOF_EOS)
eb160463
GS
4210 && !cl_is_anything(data.start_class))
4211 {
1df70142 4212 const I32 n = add_data(pRExC_state, 1, "f");
653099ff 4213
a02a5408 4214 Newx(RExC_rx->data->data[n], 1,
653099ff
GS
4215 struct regnode_charclass_class);
4216 StructCopy(data.start_class,
830247a4 4217 (struct regnode_charclass_class*)RExC_rx->data->data[n],
653099ff 4218 struct regnode_charclass_class);
830247a4 4219 r->regstclass = (regnode*)RExC_rx->data->data[n];
653099ff 4220 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
a3621e74 4221 DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
32fc9b6a 4222 regprop(r, sv, (regnode*)data.start_class);
9c5ffd7c 4223 PerlIO_printf(Perl_debug_log,
a0288114 4224 "synthetic stclass \"%s\".\n",
3f7c398e 4225 SvPVX_const(sv));});
653099ff 4226 }
a0d0e21e
LW
4227 }
4228
1de06328
YO
4229 /* Guard against an embedded (?=) or (?<=) with a longer minlen than
4230 the "real" pattern. */
4231 if (r->minlen < minlen)
4232 r->minlen = minlen;
4233
b81d288d 4234 if (RExC_seen & REG_SEEN_GPOS)
c277df42 4235 r->reganch |= ROPT_GPOS_SEEN;
830247a4 4236 if (RExC_seen & REG_SEEN_LOOKBEHIND)
c277df42 4237 r->reganch |= ROPT_LOOKBEHIND_SEEN;
830247a4 4238 if (RExC_seen & REG_SEEN_EVAL)
ce862d02 4239 r->reganch |= ROPT_EVAL_SEEN;
f33976b4
DB
4240 if (RExC_seen & REG_SEEN_CANY)
4241 r->reganch |= ROPT_CANY_SEEN;
a02a5408
JC
4242 Newxz(r->startp, RExC_npar, I32);
4243 Newxz(r->endp, RExC_npar, I32);
f9f4320a
YO
4244
4245
fc8cd66c
YO
4246 if (RExC_charnames)
4247 SvREFCNT_dec((SV*)(RExC_charnames));
07be1b83 4248
f2278c82 4249 DEBUG_r( RX_DEBUG_on(r) );
be8e71aa
YO
4250 DEBUG_DUMP_r({
4251 PerlIO_printf(Perl_debug_log,"Final program:\n");
3dab1dad
YO
4252 regdump(r);
4253 });
8e9a8a48
YO
4254 DEBUG_OFFSETS_r(if (r->offsets) {
4255 const U32 len = r->offsets[0];
4256 U32 i;
4257 GET_RE_DEBUG_FLAGS_DECL;
4258 PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)r->offsets[0]);
4259 for (i = 1; i <= len; i++) {
4260 if (r->offsets[i*2-1] || r->offsets[i*2])
4261 PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
786e8c11 4262 (UV)i, (UV)r->offsets[i*2-1], (UV)r->offsets[i*2]);
8e9a8a48
YO
4263 }
4264 PerlIO_printf(Perl_debug_log, "\n");
4265 });
a0d0e21e 4266 return(r);
f9f4320a 4267 END_BLOCK
a687059c
LW
4268}
4269
f9f4320a
YO
4270#undef CORE_ONLY_BLOCK
4271#undef END_BLOCK
4272#undef RE_ENGINE_PTR
3dab1dad
YO
4273
4274#define DEBUG_PARSE_MSG(funcname) DEBUG_PARSE_r({ \
4275 int rem=(int)(RExC_end - RExC_parse); \
4276 int cut; \
4277 int num; \
4278 int iscut=0; \
4279 if (rem>10) { \
4280 rem=10; \
4281 iscut=1; \
4282 } \
4283 cut=10-rem; \
4284 if (RExC_lastparse!=RExC_parse) \
4285 PerlIO_printf(Perl_debug_log," >%.*s%-*s", \
4286 rem, RExC_parse, \
4287 cut + 4, \
4288 iscut ? "..." : "<" \
4289 ); \
4290 else \
4291 PerlIO_printf(Perl_debug_log,"%16s",""); \
4292 \
4293 if (SIZE_ONLY) \
4294 num=RExC_size; \
4295 else \
4296 num=REG_NODE_NUM(RExC_emit); \
4297 if (RExC_lastnum!=num) \
be8e71aa 4298 PerlIO_printf(Perl_debug_log,"|%4d",num); \
3dab1dad 4299 else \
be8e71aa
YO
4300 PerlIO_printf(Perl_debug_log,"|%4s",""); \
4301 PerlIO_printf(Perl_debug_log,"|%*s%-4s", \
4302 (int)((depth*2)), "", \
3dab1dad
YO
4303 (funcname) \
4304 ); \
4305 RExC_lastnum=num; \
4306 RExC_lastparse=RExC_parse; \
4307})
4308
07be1b83
YO
4309
4310
3dab1dad
YO
4311#define DEBUG_PARSE(funcname) DEBUG_PARSE_r({ \
4312 DEBUG_PARSE_MSG((funcname)); \
4313 PerlIO_printf(Perl_debug_log,"%4s","\n"); \
4314})
a687059c
LW
4315/*
4316 - reg - regular expression, i.e. main body or parenthesized thing
4317 *
4318 * Caller must absorb opening parenthesis.
4319 *
4320 * Combining parenthesis handling with the base level of regular expression
4321 * is a trifle forced, but the need to tie the tails of the branches to what
4322 * follows makes it hard to avoid.
4323 */
07be1b83
YO
4324#define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
4325#ifdef DEBUGGING
4326#define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
4327#else
4328#define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
4329#endif
3dab1dad 4330
76e3520e 4331STATIC regnode *
3dab1dad 4332S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
c277df42 4333 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
a687059c 4334{
27da23d5 4335 dVAR;
c277df42
IZ
4336 register regnode *ret; /* Will be the head of the group. */
4337 register regnode *br;
4338 register regnode *lastbr;
cbbf8932 4339 register regnode *ender = NULL;
a0d0e21e 4340 register I32 parno = 0;
cbbf8932
AL
4341 I32 flags;
4342 const I32 oregflags = RExC_flags;
6136c704
AL
4343 bool have_branch = 0;
4344 bool is_open = 0;
9d1d55b5
JP
4345
4346 /* for (?g), (?gc), and (?o) warnings; warning
4347 about (?c) will warn about (?g) -- japhy */
4348
6136c704
AL
4349#define WASTED_O 0x01
4350#define WASTED_G 0x02
4351#define WASTED_C 0x04
4352#define WASTED_GC (0x02|0x04)
cbbf8932 4353 I32 wastedflags = 0x00;
9d1d55b5 4354
fac92740 4355 char * parse_start = RExC_parse; /* MJD */
a28509cc 4356 char * const oregcomp_parse = RExC_parse;
a0d0e21e 4357
3dab1dad
YO
4358 GET_RE_DEBUG_FLAGS_DECL;
4359 DEBUG_PARSE("reg ");
4360
4361
821b33a5 4362 *flagp = 0; /* Tentatively. */
a0d0e21e 4363
9d1d55b5 4364
a0d0e21e
LW
4365 /* Make an OPEN node, if parenthesized. */
4366 if (paren) {
fac92740 4367 if (*RExC_parse == '?') { /* (?...) */
2b36a5a0
JH
4368 U32 posflags = 0, negflags = 0;
4369 U32 *flagsp = &posflags;
6136c704 4370 bool is_logical = 0;
a28509cc 4371 const char * const seqstart = RExC_parse;
ca9dfc88 4372
830247a4
IZ
4373 RExC_parse++;
4374 paren = *RExC_parse++;
c277df42 4375 ret = NULL; /* For look-ahead/behind. */
a0d0e21e 4376 switch (paren) {
fac92740 4377 case '<': /* (?<...) */
830247a4 4378 RExC_seen |= REG_SEEN_LOOKBEHIND;
b81d288d 4379 if (*RExC_parse == '!')
c277df42 4380 paren = ',';
b81d288d 4381 if (*RExC_parse != '=' && *RExC_parse != '!')
c277df42 4382 goto unknown;
830247a4 4383 RExC_parse++;
fac92740
MJD
4384 case '=': /* (?=...) */
4385 case '!': /* (?!...) */
830247a4 4386 RExC_seen_zerolen++;
fac92740
MJD
4387 case ':': /* (?:...) */
4388 case '>': /* (?>...) */
a0d0e21e 4389 break;
fac92740
MJD
4390 case '$': /* (?$...) */
4391 case '@': /* (?@...) */
8615cb43 4392 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
a0d0e21e 4393 break;
fac92740 4394 case '#': /* (?#...) */
830247a4
IZ
4395 while (*RExC_parse && *RExC_parse != ')')
4396 RExC_parse++;
4397 if (*RExC_parse != ')')
c277df42 4398 FAIL("Sequence (?#... not terminated");
830247a4 4399 nextchar(pRExC_state);
a0d0e21e
LW
4400 *flagp = TRYAGAIN;
4401 return NULL;
fac92740 4402 case 'p': /* (?p...) */
9014280d 4403 if (SIZE_ONLY && ckWARN2(WARN_DEPRECATED, WARN_REGEXP))
f9373011 4404 vWARNdep(RExC_parse, "(?p{}) is deprecated - use (??{})");
8c8ad484 4405 /* FALL THROUGH*/
fac92740 4406 case '?': /* (??...) */
6136c704 4407 is_logical = 1;
438a3801
YST
4408 if (*RExC_parse != '{')
4409 goto unknown;
830247a4 4410 paren = *RExC_parse++;
0f5d15d6 4411 /* FALL THROUGH */
fac92740 4412 case '{': /* (?{...}) */
c277df42 4413 {
c277df42
IZ
4414 I32 count = 1, n = 0;
4415 char c;
830247a4 4416 char *s = RExC_parse;
c277df42 4417
830247a4
IZ
4418 RExC_seen_zerolen++;
4419 RExC_seen |= REG_SEEN_EVAL;
4420 while (count && (c = *RExC_parse)) {
6136c704
AL
4421 if (c == '\\') {
4422 if (RExC_parse[1])
4423 RExC_parse++;
4424 }
b81d288d 4425 else if (c == '{')
c277df42 4426 count++;
b81d288d 4427 else if (c == '}')
c277df42 4428 count--;
830247a4 4429 RExC_parse++;
c277df42 4430 }
6136c704 4431 if (*RExC_parse != ')') {
b81d288d 4432 RExC_parse = s;
b45f050a
JF
4433 vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
4434 }
c277df42 4435 if (!SIZE_ONLY) {
f3548bdc 4436 PAD *pad;
6136c704
AL
4437 OP_4tree *sop, *rop;
4438 SV * const sv = newSVpvn(s, RExC_parse - 1 - s);
c277df42 4439
569233ed
SB
4440 ENTER;
4441 Perl_save_re_context(aTHX);
f3548bdc 4442 rop = sv_compile_2op(sv, &sop, "re", &pad);
9b978d73
DM
4443 sop->op_private |= OPpREFCOUNTED;
4444 /* re_dup will OpREFCNT_inc */
4445 OpREFCNT_set(sop, 1);
569233ed 4446 LEAVE;
c277df42 4447
830247a4
IZ
4448 n = add_data(pRExC_state, 3, "nop");
4449 RExC_rx->data->data[n] = (void*)rop;
4450 RExC_rx->data->data[n+1] = (void*)sop;
f3548bdc 4451 RExC_rx->data->data[n+2] = (void*)pad;
c277df42 4452 SvREFCNT_dec(sv);
a0ed51b3 4453 }
e24b16f9 4454 else { /* First pass */
830247a4 4455 if (PL_reginterp_cnt < ++RExC_seen_evals
923e4eb5 4456 && IN_PERL_RUNTIME)
2cd61cdb
IZ
4457 /* No compiled RE interpolated, has runtime
4458 components ===> unsafe. */
4459 FAIL("Eval-group not allowed at runtime, use re 'eval'");
5b61d3f7 4460 if (PL_tainting && PL_tainted)
cc6b7395 4461 FAIL("Eval-group in insecure regular expression");
54df2634 4462#if PERL_VERSION > 8
923e4eb5 4463 if (IN_PERL_COMPILETIME)
b5c19bd7 4464 PL_cv_has_eval = 1;
54df2634 4465#endif
c277df42 4466 }
b5c19bd7 4467
830247a4 4468 nextchar(pRExC_state);
6136c704 4469 if (is_logical) {
830247a4 4470 ret = reg_node(pRExC_state, LOGICAL);
0f5d15d6
IZ
4471 if (!SIZE_ONLY)
4472 ret->flags = 2;
3dab1dad 4473 REGTAIL(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
fac92740 4474 /* deal with the length of this later - MJD */
0f5d15d6
IZ
4475 return ret;
4476 }
ccb2c380
MP
4477 ret = reganode(pRExC_state, EVAL, n);
4478 Set_Node_Length(ret, RExC_parse - parse_start + 1);
4479 Set_Node_Offset(ret, parse_start);
4480 return ret;
c277df42 4481 }
fac92740 4482 case '(': /* (?(?{...})...) and (?(?=...)...) */
c277df42 4483 {
fac92740 4484 if (RExC_parse[0] == '?') { /* (?(?...)) */
b81d288d
AB
4485 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
4486 || RExC_parse[1] == '<'
830247a4 4487 || RExC_parse[1] == '{') { /* Lookahead or eval. */
c277df42
IZ
4488 I32 flag;
4489
830247a4 4490 ret = reg_node(pRExC_state, LOGICAL);
0f5d15d6
IZ
4491 if (!SIZE_ONLY)
4492 ret->flags = 1;
3dab1dad 4493 REGTAIL(pRExC_state, ret, reg(pRExC_state, 1, &flag,depth+1));
c277df42 4494 goto insert_if;
b81d288d 4495 }
a0ed51b3 4496 }
830247a4 4497 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
fac92740 4498 /* (?(1)...) */
6136c704 4499 char c;
830247a4 4500 parno = atoi(RExC_parse++);
c277df42 4501
830247a4
IZ
4502 while (isDIGIT(*RExC_parse))
4503 RExC_parse++;
fac92740 4504 ret = reganode(pRExC_state, GROUPP, parno);
2af232bd 4505
830247a4 4506 if ((c = *nextchar(pRExC_state)) != ')')
b45f050a 4507 vFAIL("Switch condition not recognized");
c277df42 4508 insert_if:
3dab1dad
YO
4509 REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
4510 br = regbranch(pRExC_state, &flags, 1,depth+1);
c277df42 4511 if (br == NULL)
830247a4 4512 br = reganode(pRExC_state, LONGJMP, 0);
c277df42 4513 else
3dab1dad 4514 REGTAIL(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
830247a4 4515 c = *nextchar(pRExC_state);
d1b80229
IZ
4516 if (flags&HASWIDTH)
4517 *flagp |= HASWIDTH;
c277df42 4518 if (c == '|') {
830247a4 4519 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
3dab1dad
YO
4520 regbranch(pRExC_state, &flags, 1,depth+1);
4521 REGTAIL(pRExC_state, ret, lastbr);
d1b80229
IZ
4522 if (flags&HASWIDTH)
4523 *flagp |= HASWIDTH;
830247a4 4524 c = *nextchar(pRExC_state);
a0ed51b3
LW
4525 }
4526 else
c277df42
IZ
4527 lastbr = NULL;
4528 if (c != ')')
8615cb43 4529 vFAIL("Switch (?(condition)... contains too many branches");
830247a4 4530 ender = reg_node(pRExC_state, TAIL);
3dab1dad 4531 REGTAIL(pRExC_state, br, ender);
c277df42 4532 if (lastbr) {
3dab1dad
YO
4533 REGTAIL(pRExC_state, lastbr, ender);
4534 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
a0ed51b3
LW
4535 }
4536 else
3dab1dad 4537 REGTAIL(pRExC_state, ret, ender);
c277df42 4538 return ret;
a0ed51b3
LW
4539 }
4540 else {
830247a4 4541 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
c277df42
IZ
4542 }
4543 }
1b1626e4 4544 case 0:
830247a4 4545 RExC_parse--; /* for vFAIL to print correctly */
8615cb43 4546 vFAIL("Sequence (? incomplete");
1b1626e4 4547 break;
a0d0e21e 4548 default:
830247a4 4549 --RExC_parse;
fac92740 4550 parse_flags: /* (?i) */
830247a4 4551 while (*RExC_parse && strchr("iogcmsx", *RExC_parse)) {
9d1d55b5
JP
4552 /* (?g), (?gc) and (?o) are useless here
4553 and must be globally applied -- japhy */
4554
4555 if (*RExC_parse == 'o' || *RExC_parse == 'g') {
4556 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6136c704 4557 const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G;
9d1d55b5
JP
4558 if (! (wastedflags & wflagbit) ) {
4559 wastedflags |= wflagbit;
4560 vWARN5(
4561 RExC_parse + 1,
4562 "Useless (%s%c) - %suse /%c modifier",
4563 flagsp == &negflags ? "?-" : "?",
4564 *RExC_parse,
4565 flagsp == &negflags ? "don't " : "",
4566 *RExC_parse
4567 );
4568 }
4569 }
4570 }
4571 else if (*RExC_parse == 'c') {
4572 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6136c704
AL
4573 if (! (wastedflags & WASTED_C) ) {
4574 wastedflags |= WASTED_GC;
9d1d55b5
JP
4575 vWARN3(
4576 RExC_parse + 1,
4577 "Useless (%sc) - %suse /gc modifier",
4578 flagsp == &negflags ? "?-" : "?",
4579 flagsp == &negflags ? "don't " : ""
4580 );
4581 }
4582 }
4583 }
4584 else { pmflag(flagsp, *RExC_parse); }
4585
830247a4 4586 ++RExC_parse;
ca9dfc88 4587 }
830247a4 4588 if (*RExC_parse == '-') {
ca9dfc88 4589 flagsp = &negflags;
9d1d55b5 4590 wastedflags = 0; /* reset so (?g-c) warns twice */
830247a4 4591 ++RExC_parse;
ca9dfc88 4592 goto parse_flags;
48c036b1 4593 }
e2509266
JH
4594 RExC_flags |= posflags;
4595 RExC_flags &= ~negflags;
830247a4
IZ
4596 if (*RExC_parse == ':') {
4597 RExC_parse++;
ca9dfc88
IZ
4598 paren = ':';
4599 break;
4600 }
c277df42 4601 unknown:
830247a4
IZ
4602 if (*RExC_parse != ')') {
4603 RExC_parse++;
4604 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
b45f050a 4605 }
830247a4 4606 nextchar(pRExC_state);
a0d0e21e
LW
4607 *flagp = TRYAGAIN;
4608 return NULL;
4609 }
4610 }
fac92740 4611 else { /* (...) */
830247a4
IZ
4612 parno = RExC_npar;
4613 RExC_npar++;
4614 ret = reganode(pRExC_state, OPEN, parno);
fac92740
MJD
4615 Set_Node_Length(ret, 1); /* MJD */
4616 Set_Node_Offset(ret, RExC_parse); /* MJD */
6136c704 4617 is_open = 1;
a0d0e21e 4618 }
a0ed51b3 4619 }
fac92740 4620 else /* ! paren */
a0d0e21e
LW
4621 ret = NULL;
4622
4623 /* Pick up the branches, linking them together. */
fac92740 4624 parse_start = RExC_parse; /* MJD */
3dab1dad 4625 br = regbranch(pRExC_state, &flags, 1,depth+1);
fac92740 4626 /* branch_len = (paren != 0); */
2af232bd 4627
a0d0e21e
LW
4628 if (br == NULL)
4629 return(NULL);
830247a4
IZ
4630 if (*RExC_parse == '|') {
4631 if (!SIZE_ONLY && RExC_extralen) {
4632 reginsert(pRExC_state, BRANCHJ, br);
a0ed51b3 4633 }
fac92740 4634 else { /* MJD */
830247a4 4635 reginsert(pRExC_state, BRANCH, br);
fac92740
MJD
4636 Set_Node_Length(br, paren != 0);
4637 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
4638 }
c277df42
IZ
4639 have_branch = 1;
4640 if (SIZE_ONLY)
830247a4 4641 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
a0ed51b3
LW
4642 }
4643 else if (paren == ':') {
c277df42
IZ
4644 *flagp |= flags&SIMPLE;
4645 }
6136c704 4646 if (is_open) { /* Starts with OPEN. */
3dab1dad 4647 REGTAIL(pRExC_state, ret, br); /* OPEN -> first. */
a0ed51b3
LW
4648 }
4649 else if (paren != '?') /* Not Conditional */
a0d0e21e 4650 ret = br;
32a0ca98 4651 *flagp |= flags & (SPSTART | HASWIDTH);
c277df42 4652 lastbr = br;
830247a4
IZ
4653 while (*RExC_parse == '|') {
4654 if (!SIZE_ONLY && RExC_extralen) {
4655 ender = reganode(pRExC_state, LONGJMP,0);
3dab1dad 4656 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
c277df42
IZ
4657 }
4658 if (SIZE_ONLY)
830247a4
IZ
4659 RExC_extralen += 2; /* Account for LONGJMP. */
4660 nextchar(pRExC_state);
3dab1dad 4661 br = regbranch(pRExC_state, &flags, 0, depth+1);
2af232bd 4662
a687059c 4663 if (br == NULL)
a0d0e21e 4664 return(NULL);
3dab1dad 4665 REGTAIL(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
c277df42 4666 lastbr = br;
821b33a5
IZ
4667 if (flags&HASWIDTH)
4668 *flagp |= HASWIDTH;
a687059c 4669 *flagp |= flags&SPSTART;
a0d0e21e
LW
4670 }
4671
c277df42
IZ
4672 if (have_branch || paren != ':') {
4673 /* Make a closing node, and hook it on the end. */
4674 switch (paren) {
4675 case ':':
830247a4 4676 ender = reg_node(pRExC_state, TAIL);
c277df42
IZ
4677 break;
4678 case 1:
830247a4 4679 ender = reganode(pRExC_state, CLOSE, parno);
fac92740
MJD
4680 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
4681 Set_Node_Length(ender,1); /* MJD */
c277df42
IZ
4682 break;
4683 case '<':
c277df42
IZ
4684 case ',':
4685 case '=':
4686 case '!':
c277df42 4687 *flagp &= ~HASWIDTH;
821b33a5
IZ
4688 /* FALL THROUGH */
4689 case '>':
830247a4 4690 ender = reg_node(pRExC_state, SUCCEED);
c277df42
IZ
4691 break;
4692 case 0:
830247a4 4693 ender = reg_node(pRExC_state, END);
c277df42
IZ
4694 break;
4695 }
07be1b83 4696 REGTAIL_STUDY(pRExC_state, lastbr, ender);
a0d0e21e 4697
9674d46a 4698 if (have_branch && !SIZE_ONLY) {
c277df42 4699 /* Hook the tails of the branches to the closing node. */
9674d46a
AL
4700 for (br = ret; br; br = regnext(br)) {
4701 const U8 op = PL_regkind[OP(br)];
4702 if (op == BRANCH) {
07be1b83 4703 REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
9674d46a
AL
4704 }
4705 else if (op == BRANCHJ) {
07be1b83 4706 REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
9674d46a 4707 }
c277df42
IZ
4708 }
4709 }
a0d0e21e 4710 }
c277df42
IZ
4711
4712 {
e1ec3a88
AL
4713 const char *p;
4714 static const char parens[] = "=!<,>";
c277df42
IZ
4715
4716 if (paren && (p = strchr(parens, paren))) {
eb160463 4717 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
c277df42
IZ
4718 int flag = (p - parens) > 1;
4719
4720 if (paren == '>')
4721 node = SUSPEND, flag = 0;
830247a4 4722 reginsert(pRExC_state, node,ret);
45948336
EP
4723 Set_Node_Cur_Length(ret);
4724 Set_Node_Offset(ret, parse_start + 1);
c277df42 4725 ret->flags = flag;
07be1b83 4726 REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
c277df42 4727 }
a0d0e21e
LW
4728 }
4729
4730 /* Check for proper termination. */
ce3e6498 4731 if (paren) {
e2509266 4732 RExC_flags = oregflags;
830247a4
IZ
4733 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
4734 RExC_parse = oregcomp_parse;
380a0633 4735 vFAIL("Unmatched (");
ce3e6498 4736 }
a0ed51b3 4737 }
830247a4
IZ
4738 else if (!paren && RExC_parse < RExC_end) {
4739 if (*RExC_parse == ')') {
4740 RExC_parse++;
380a0633 4741 vFAIL("Unmatched )");
a0ed51b3
LW
4742 }
4743 else
b45f050a 4744 FAIL("Junk on end of regexp"); /* "Can't happen". */
a0d0e21e
LW
4745 /* NOTREACHED */
4746 }
a687059c 4747
a0d0e21e 4748 return(ret);
a687059c
LW
4749}
4750
4751/*
4752 - regbranch - one alternative of an | operator
4753 *
4754 * Implements the concatenation operator.
4755 */
76e3520e 4756STATIC regnode *
3dab1dad 4757S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
a687059c 4758{
97aff369 4759 dVAR;
c277df42
IZ
4760 register regnode *ret;
4761 register regnode *chain = NULL;
4762 register regnode *latest;
4763 I32 flags = 0, c = 0;
3dab1dad
YO
4764 GET_RE_DEBUG_FLAGS_DECL;
4765 DEBUG_PARSE("brnc");
b81d288d 4766 if (first)
c277df42
IZ
4767 ret = NULL;
4768 else {
b81d288d 4769 if (!SIZE_ONLY && RExC_extralen)
830247a4 4770 ret = reganode(pRExC_state, BRANCHJ,0);
fac92740 4771 else {
830247a4 4772 ret = reg_node(pRExC_state, BRANCH);
fac92740
MJD
4773 Set_Node_Length(ret, 1);
4774 }
c277df42
IZ
4775 }
4776
b81d288d 4777 if (!first && SIZE_ONLY)
830247a4 4778 RExC_extralen += 1; /* BRANCHJ */
b81d288d 4779
c277df42 4780 *flagp = WORST; /* Tentatively. */
a0d0e21e 4781
830247a4
IZ
4782 RExC_parse--;
4783 nextchar(pRExC_state);
4784 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
a0d0e21e 4785 flags &= ~TRYAGAIN;
3dab1dad 4786 latest = regpiece(pRExC_state, &flags,depth+1);
a0d0e21e
LW
4787 if (latest == NULL) {
4788 if (flags & TRYAGAIN)
4789 continue;
4790 return(NULL);
a0ed51b3
LW
4791 }
4792 else if (ret == NULL)
c277df42 4793 ret = latest;
a0d0e21e 4794 *flagp |= flags&HASWIDTH;
c277df42 4795 if (chain == NULL) /* First piece. */
a0d0e21e
LW
4796 *flagp |= flags&SPSTART;
4797 else {
830247a4 4798 RExC_naughty++;
3dab1dad 4799 REGTAIL(pRExC_state, chain, latest);
a687059c 4800 }
a0d0e21e 4801 chain = latest;
c277df42
IZ
4802 c++;
4803 }
4804 if (chain == NULL) { /* Loop ran zero times. */
830247a4 4805 chain = reg_node(pRExC_state, NOTHING);
c277df42
IZ
4806 if (ret == NULL)
4807 ret = chain;
4808 }
4809 if (c == 1) {
4810 *flagp |= flags&SIMPLE;
a0d0e21e 4811 }
a687059c 4812
d4c19fe8 4813 return ret;
a687059c
LW
4814}
4815
4816/*
4817 - regpiece - something followed by possible [*+?]
4818 *
4819 * Note that the branching code sequences used for ? and the general cases
4820 * of * and + are somewhat optimized: they use the same NOTHING node as
4821 * both the endmarker for their branch list and the body of the last branch.
4822 * It might seem that this node could be dispensed with entirely, but the
4823 * endmarker role is not redundant.
4824 */
76e3520e 4825STATIC regnode *
3dab1dad 4826S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
a687059c 4827{
97aff369 4828 dVAR;
c277df42 4829 register regnode *ret;
a0d0e21e
LW
4830 register char op;
4831 register char *next;
4832 I32 flags;
1df70142 4833 const char * const origparse = RExC_parse;
a0d0e21e 4834 I32 min;
c277df42 4835 I32 max = REG_INFTY;
fac92740 4836 char *parse_start;
10edeb5d 4837 const char *maxpos = NULL;
3dab1dad
YO
4838 GET_RE_DEBUG_FLAGS_DECL;
4839 DEBUG_PARSE("piec");
a0d0e21e 4840
3dab1dad 4841 ret = regatom(pRExC_state, &flags,depth+1);
a0d0e21e
LW
4842 if (ret == NULL) {
4843 if (flags & TRYAGAIN)
4844 *flagp |= TRYAGAIN;
4845 return(NULL);
4846 }
4847
830247a4 4848 op = *RExC_parse;
a0d0e21e 4849
830247a4 4850 if (op == '{' && regcurly(RExC_parse)) {
10edeb5d 4851 maxpos = NULL;
fac92740 4852 parse_start = RExC_parse; /* MJD */
830247a4 4853 next = RExC_parse + 1;
a0d0e21e
LW
4854 while (isDIGIT(*next) || *next == ',') {
4855 if (*next == ',') {
4856 if (maxpos)
4857 break;
4858 else
4859 maxpos = next;
a687059c 4860 }
a0d0e21e
LW
4861 next++;
4862 }
4863 if (*next == '}') { /* got one */
4864 if (!maxpos)
4865 maxpos = next;
830247a4
IZ
4866 RExC_parse++;
4867 min = atoi(RExC_parse);
a0d0e21e
LW
4868 if (*maxpos == ',')
4869 maxpos++;
4870 else
830247a4 4871 maxpos = RExC_parse;
a0d0e21e
LW
4872 max = atoi(maxpos);
4873 if (!max && *maxpos != '0')
c277df42
IZ
4874 max = REG_INFTY; /* meaning "infinity" */
4875 else if (max >= REG_INFTY)
8615cb43 4876 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
830247a4
IZ
4877 RExC_parse = next;
4878 nextchar(pRExC_state);
a0d0e21e
LW
4879
4880 do_curly:
4881 if ((flags&SIMPLE)) {
830247a4
IZ
4882 RExC_naughty += 2 + RExC_naughty / 2;
4883 reginsert(pRExC_state, CURLY, ret);
fac92740
MJD
4884 Set_Node_Offset(ret, parse_start+1); /* MJD */
4885 Set_Node_Cur_Length(ret);
a0d0e21e
LW
4886 }
4887 else {
3dab1dad 4888 regnode * const w = reg_node(pRExC_state, WHILEM);
2c2d71f5
JH
4889
4890 w->flags = 0;
3dab1dad 4891 REGTAIL(pRExC_state, ret, w);
830247a4
IZ
4892 if (!SIZE_ONLY && RExC_extralen) {
4893 reginsert(pRExC_state, LONGJMP,ret);
4894 reginsert(pRExC_state, NOTHING,ret);
c277df42
IZ
4895 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
4896 }
830247a4 4897 reginsert(pRExC_state, CURLYX,ret);
fac92740
MJD
4898 /* MJD hk */
4899 Set_Node_Offset(ret, parse_start+1);
2af232bd 4900 Set_Node_Length(ret,
fac92740 4901 op == '{' ? (RExC_parse - parse_start) : 1);
2af232bd 4902
830247a4 4903 if (!SIZE_ONLY && RExC_extralen)
c277df42 4904 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
3dab1dad 4905 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
c277df42 4906 if (SIZE_ONLY)
830247a4
IZ
4907 RExC_whilem_seen++, RExC_extralen += 3;
4908 RExC_naughty += 4 + RExC_naughty; /* compound interest */
a0d0e21e 4909 }
c277df42 4910 ret->flags = 0;
a0d0e21e
LW
4911
4912 if (min > 0)
821b33a5
IZ
4913 *flagp = WORST;
4914 if (max > 0)
4915 *flagp |= HASWIDTH;
a0d0e21e 4916 if (max && max < min)
8615cb43 4917 vFAIL("Can't do {n,m} with n > m");
c277df42 4918 if (!SIZE_ONLY) {
eb160463
GS
4919 ARG1_SET(ret, (U16)min);
4920 ARG2_SET(ret, (U16)max);
a687059c 4921 }
a687059c 4922
a0d0e21e 4923 goto nest_check;
a687059c 4924 }
a0d0e21e 4925 }
a687059c 4926
a0d0e21e
LW
4927 if (!ISMULT1(op)) {
4928 *flagp = flags;
a687059c 4929 return(ret);
a0d0e21e 4930 }
bb20fd44 4931
c277df42 4932#if 0 /* Now runtime fix should be reliable. */
b45f050a
JF
4933
4934 /* if this is reinstated, don't forget to put this back into perldiag:
4935
4936 =item Regexp *+ operand could be empty at {#} in regex m/%s/
4937
4938 (F) The part of the regexp subject to either the * or + quantifier
4939 could match an empty string. The {#} shows in the regular
4940 expression about where the problem was discovered.
4941
4942 */
4943
bb20fd44 4944 if (!(flags&HASWIDTH) && op != '?')
b45f050a 4945 vFAIL("Regexp *+ operand could be empty");
b81d288d 4946#endif
bb20fd44 4947
fac92740 4948 parse_start = RExC_parse;
830247a4 4949 nextchar(pRExC_state);
a0d0e21e 4950
821b33a5 4951 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
a0d0e21e
LW
4952
4953 if (op == '*' && (flags&SIMPLE)) {
830247a4 4954 reginsert(pRExC_state, STAR, ret);
c277df42 4955 ret->flags = 0;
830247a4 4956 RExC_naughty += 4;
a0d0e21e
LW
4957 }
4958 else if (op == '*') {
4959 min = 0;
4960 goto do_curly;
a0ed51b3
LW
4961 }
4962 else if (op == '+' && (flags&SIMPLE)) {
830247a4 4963 reginsert(pRExC_state, PLUS, ret);
c277df42 4964 ret->flags = 0;
830247a4 4965 RExC_naughty += 3;
a0d0e21e
LW
4966 }
4967 else if (op == '+') {
4968 min = 1;
4969 goto do_curly;
a0ed51b3
LW
4970 }
4971 else if (op == '?') {
a0d0e21e
LW
4972 min = 0; max = 1;
4973 goto do_curly;
4974 }
4975 nest_check:
041457d9 4976 if (!SIZE_ONLY && !(flags&HASWIDTH) && max > REG_INFTY/3 && ckWARN(WARN_REGEXP)) {
830247a4 4977 vWARN3(RExC_parse,
b45f050a 4978 "%.*s matches null string many times",
afd78fd5 4979 (int)(RExC_parse >= origparse ? RExC_parse - origparse : 0),
b45f050a 4980 origparse);
a0d0e21e
LW
4981 }
4982
830247a4
IZ
4983 if (*RExC_parse == '?') {
4984 nextchar(pRExC_state);
4985 reginsert(pRExC_state, MINMOD, ret);
3dab1dad 4986 REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
a0d0e21e 4987 }
830247a4
IZ
4988 if (ISMULT2(RExC_parse)) {
4989 RExC_parse++;
b45f050a
JF
4990 vFAIL("Nested quantifiers");
4991 }
a0d0e21e
LW
4992
4993 return(ret);
a687059c
LW
4994}
4995
fc8cd66c
YO
4996
4997/* reg_namedseq(pRExC_state,UVp)
4998
4999 This is expected to be called by a parser routine that has
5000 recognized'\N' and needs to handle the rest. RExC_parse is
5001 expected to point at the first char following the N at the time
5002 of the call.
5003
5004 If valuep is non-null then it is assumed that we are parsing inside
5005 of a charclass definition and the first codepoint in the resolved
5006 string is returned via *valuep and the routine will return NULL.
5007 In this mode if a multichar string is returned from the charnames
5008 handler a warning will be issued, and only the first char in the
5009 sequence will be examined. If the string returned is zero length
5010 then the value of *valuep is undefined and NON-NULL will
5011 be returned to indicate failure. (This will NOT be a valid pointer
5012 to a regnode.)
5013
5014 If value is null then it is assumed that we are parsing normal text
5015 and inserts a new EXACT node into the program containing the resolved
5016 string and returns a pointer to the new node. If the string is
5017 zerolength a NOTHING node is emitted.
5018
5019 On success RExC_parse is set to the char following the endbrace.
5020 Parsing failures will generate a fatal errorvia vFAIL(...)
5021
5022 NOTE: We cache all results from the charnames handler locally in
5023 the RExC_charnames hash (created on first use) to prevent a charnames
5024 handler from playing silly-buggers and returning a short string and
5025 then a long string for a given pattern. Since the regexp program
5026 size is calculated during an initial parse this would result
5027 in a buffer overrun so we cache to prevent the charname result from
5028 changing during the course of the parse.
5029
5030 */
5031STATIC regnode *
5032S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep)
5033{
5034 char * name; /* start of the content of the name */
5035 char * endbrace; /* endbrace following the name */
5036 SV *sv_str = NULL;
5037 SV *sv_name = NULL;
5038 STRLEN len; /* this has various purposes throughout the code */
5039 bool cached = 0; /* if this is true then we shouldn't refcount dev sv_str */
5040 regnode *ret = NULL;
5041
5042 if (*RExC_parse != '{') {
5043 vFAIL("Missing braces on \\N{}");
5044 }
5045 name = RExC_parse+1;
5046 endbrace = strchr(RExC_parse, '}');
5047 if ( ! endbrace ) {
5048 RExC_parse++;
5049 vFAIL("Missing right brace on \\N{}");
5050 }
5051 RExC_parse = endbrace + 1;
5052
5053
5054 /* RExC_parse points at the beginning brace,
5055 endbrace points at the last */
5056 if ( name[0]=='U' && name[1]=='+' ) {
5057 /* its a "unicode hex" notation {U+89AB} */
5058 I32 fl = PERL_SCAN_ALLOW_UNDERSCORES
5059 | PERL_SCAN_DISALLOW_PREFIX
5060 | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
5061 UV cp;
196f1508 5062 len = (STRLEN)(endbrace - name - 2);
fc8cd66c 5063 cp = grok_hex(name + 2, &len, &fl, NULL);
196f1508 5064 if ( len != (STRLEN)(endbrace - name - 2) ) {
fc8cd66c
YO
5065 cp = 0xFFFD;
5066 }
5067 if (cp > 0xff)
5068 RExC_utf8 = 1;
5069 if ( valuep ) {
5070 *valuep = cp;
5071 return NULL;
5072 }
5073 sv_str= Perl_newSVpvf_nocontext("%c",(int)cp);
5074 } else {
5075 /* fetch the charnames handler for this scope */
5076 HV * const table = GvHV(PL_hintgv);
5077 SV **cvp= table ?
5078 hv_fetchs(table, "charnames", FALSE) :
5079 NULL;
5080 SV *cv= cvp ? *cvp : NULL;
5081 HE *he_str;
5082 int count;
5083 /* create an SV with the name as argument */
5084 sv_name = newSVpvn(name, endbrace - name);
5085
5086 if (!table || !(PL_hints & HINT_LOCALIZE_HH)) {
5087 vFAIL2("Constant(\\N{%s}) unknown: "
5088 "(possibly a missing \"use charnames ...\")",
5089 SvPVX(sv_name));
5090 }
5091 if (!cvp || !SvOK(*cvp)) { /* when $^H{charnames} = undef; */
5092 vFAIL2("Constant(\\N{%s}): "
5093 "$^H{charnames} is not defined",SvPVX(sv_name));
5094 }
5095
5096
5097
5098 if (!RExC_charnames) {
5099 /* make sure our cache is allocated */
5100 RExC_charnames = newHV();
5101 }
5102 /* see if we have looked this one up before */
5103 he_str = hv_fetch_ent( RExC_charnames, sv_name, 0, 0 );
5104 if ( he_str ) {
5105 sv_str = HeVAL(he_str);
5106 cached = 1;
5107 } else {
5108 dSP ;
5109
5110 ENTER ;
5111 SAVETMPS ;
5112 PUSHMARK(SP) ;
5113
5114 XPUSHs(sv_name);
5115
5116 PUTBACK ;
5117
5118 count= call_sv(cv, G_SCALAR);
5119
5120 if (count == 1) { /* XXXX is this right? dmq */
5121 sv_str = POPs;
5122 SvREFCNT_inc_simple_void(sv_str);
5123 }
5124
5125 SPAGAIN ;
5126 PUTBACK ;
5127 FREETMPS ;
5128 LEAVE ;
5129
5130 if ( !sv_str || !SvOK(sv_str) ) {
5131 vFAIL2("Constant(\\N{%s}): Call to &{$^H{charnames}} "
5132 "did not return a defined value",SvPVX(sv_name));
5133 }
5134 if (hv_store_ent( RExC_charnames, sv_name, sv_str, 0))
5135 cached = 1;
5136 }
5137 }
5138 if (valuep) {
5139 char *p = SvPV(sv_str, len);
5140 if (len) {
5141 STRLEN numlen = 1;
5142 if ( SvUTF8(sv_str) ) {
196f1508 5143 *valuep = utf8_to_uvchr((U8*)p, &numlen);
fc8cd66c
YO
5144 if (*valuep > 0x7F)
5145 RExC_utf8 = 1;
5146 /* XXXX
5147 We have to turn on utf8 for high bit chars otherwise
5148 we get failures with
5149
5150 "ss" =~ /[\N{LATIN SMALL LETTER SHARP S}]/i
5151 "SS" =~ /[\N{LATIN SMALL LETTER SHARP S}]/i
5152
5153 This is different from what \x{} would do with the same
5154 codepoint, where the condition is > 0xFF.
5155 - dmq
5156 */
5157
5158
5159 } else {
5160 *valuep = (UV)*p;
5161 /* warn if we havent used the whole string? */
5162 }
5163 if (numlen<len && SIZE_ONLY && ckWARN(WARN_REGEXP)) {
5164 vWARN2(RExC_parse,
5165 "Ignoring excess chars from \\N{%s} in character class",
5166 SvPVX(sv_name)
5167 );
5168 }
5169 } else if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
5170 vWARN2(RExC_parse,
5171 "Ignoring zero length \\N{%s} in character class",
5172 SvPVX(sv_name)
5173 );
5174 }
5175 if (sv_name)
5176 SvREFCNT_dec(sv_name);
5177 if (!cached)
5178 SvREFCNT_dec(sv_str);
5179 return len ? NULL : (regnode *)&len;
5180 } else if(SvCUR(sv_str)) {
5181
5182 char *s;
5183 char *p, *pend;
5184 STRLEN charlen = 1;
5185 char * parse_start = name-3; /* needed for the offsets */
5186 GET_RE_DEBUG_FLAGS_DECL; /* needed for the offsets */
5187
5188 ret = reg_node(pRExC_state,
5189 (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
5190 s= STRING(ret);
5191
5192 if ( RExC_utf8 && !SvUTF8(sv_str) ) {
5193 sv_utf8_upgrade(sv_str);
5194 } else if ( !RExC_utf8 && SvUTF8(sv_str) ) {
5195 RExC_utf8= 1;
5196 }
5197
5198 p = SvPV(sv_str, len);
5199 pend = p + len;
5200 /* len is the length written, charlen is the size the char read */
5201 for ( len = 0; p < pend; p += charlen ) {
5202 if (UTF) {
196f1508 5203 UV uvc = utf8_to_uvchr((U8*)p, &charlen);
fc8cd66c
YO
5204 if (FOLD) {
5205 STRLEN foldlen,numlen;
5206 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
5207 uvc = toFOLD_uni(uvc, tmpbuf, &foldlen);
5208 /* Emit all the Unicode characters. */
5209
5210 for (foldbuf = tmpbuf;
5211 foldlen;
5212 foldlen -= numlen)
5213 {
5214 uvc = utf8_to_uvchr(foldbuf, &numlen);
5215 if (numlen > 0) {
5216 const STRLEN unilen = reguni(pRExC_state, uvc, s);
5217 s += unilen;
5218 len += unilen;
5219 /* In EBCDIC the numlen
5220 * and unilen can differ. */
5221 foldbuf += numlen;
5222 if (numlen >= foldlen)
5223 break;
5224 }
5225 else
5226 break; /* "Can't happen." */
5227 }
5228 } else {
5229 const STRLEN unilen = reguni(pRExC_state, uvc, s);
5230 if (unilen > 0) {
5231 s += unilen;
5232 len += unilen;
5233 }
5234 }
5235 } else {
5236 len++;
5237 REGC(*p, s++);
5238 }
5239 }
5240 if (SIZE_ONLY) {
5241 RExC_size += STR_SZ(len);
5242 } else {
5243 STR_LEN(ret) = len;
5244 RExC_emit += STR_SZ(len);
5245 }
5246 Set_Node_Cur_Length(ret); /* MJD */
5247 RExC_parse--;
5248 nextchar(pRExC_state);
5249 } else {
5250 ret = reg_node(pRExC_state,NOTHING);
5251 }
5252 if (!cached) {
5253 SvREFCNT_dec(sv_str);
5254 }
5255 if (sv_name) {
5256 SvREFCNT_dec(sv_name);
5257 }
5258 return ret;
5259
5260}
5261
5262
5263
a687059c
LW
5264/*
5265 - regatom - the lowest level
5266 *
5267 * Optimization: gobbles an entire sequence of ordinary characters so that
5268 * it can turn them into a single node, which is smaller to store and
5269 * faster to run. Backslashed characters are exceptions, each becoming a
5270 * separate node; the code is simpler that way and it's not worth fixing.
5271 *
7f6f358c
YO
5272 * [Yes, it is worth fixing, some scripts can run twice the speed.]
5273 * [It looks like its ok, as in S_study_chunk we merge adjacent EXACT nodes]
5274 */
76e3520e 5275STATIC regnode *
3dab1dad 5276S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
a687059c 5277{
97aff369 5278 dVAR;
cbbf8932 5279 register regnode *ret = NULL;
a0d0e21e 5280 I32 flags;
45948336 5281 char *parse_start = RExC_parse;
3dab1dad
YO
5282 GET_RE_DEBUG_FLAGS_DECL;
5283 DEBUG_PARSE("atom");
a0d0e21e
LW
5284 *flagp = WORST; /* Tentatively. */
5285
5286tryagain:
830247a4 5287 switch (*RExC_parse) {
a0d0e21e 5288 case '^':
830247a4
IZ
5289 RExC_seen_zerolen++;
5290 nextchar(pRExC_state);
e2509266 5291 if (RExC_flags & PMf_MULTILINE)
830247a4 5292 ret = reg_node(pRExC_state, MBOL);
e2509266 5293 else if (RExC_flags & PMf_SINGLELINE)
830247a4 5294 ret = reg_node(pRExC_state, SBOL);
a0d0e21e 5295 else
830247a4 5296 ret = reg_node(pRExC_state, BOL);
fac92740 5297 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
5298 break;
5299 case '$':
830247a4 5300 nextchar(pRExC_state);
b81d288d 5301 if (*RExC_parse)
830247a4 5302 RExC_seen_zerolen++;
e2509266 5303 if (RExC_flags & PMf_MULTILINE)
830247a4 5304 ret = reg_node(pRExC_state, MEOL);
e2509266 5305 else if (RExC_flags & PMf_SINGLELINE)
830247a4 5306 ret = reg_node(pRExC_state, SEOL);
a0d0e21e 5307 else
830247a4 5308 ret = reg_node(pRExC_state, EOL);
fac92740 5309 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
5310 break;
5311 case '.':
830247a4 5312 nextchar(pRExC_state);
e2509266 5313 if (RExC_flags & PMf_SINGLELINE)
ffc61ed2
JH
5314 ret = reg_node(pRExC_state, SANY);
5315 else
5316 ret = reg_node(pRExC_state, REG_ANY);
5317 *flagp |= HASWIDTH|SIMPLE;
830247a4 5318 RExC_naughty++;
fac92740 5319 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
5320 break;
5321 case '[':
b45f050a 5322 {
3dab1dad
YO
5323 char * const oregcomp_parse = ++RExC_parse;
5324 ret = regclass(pRExC_state,depth+1);
830247a4
IZ
5325 if (*RExC_parse != ']') {
5326 RExC_parse = oregcomp_parse;
b45f050a
JF
5327 vFAIL("Unmatched [");
5328 }
830247a4 5329 nextchar(pRExC_state);
a0d0e21e 5330 *flagp |= HASWIDTH|SIMPLE;
fac92740 5331 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
a0d0e21e 5332 break;
b45f050a 5333 }
a0d0e21e 5334 case '(':
830247a4 5335 nextchar(pRExC_state);
3dab1dad 5336 ret = reg(pRExC_state, 1, &flags,depth+1);
a0d0e21e 5337 if (ret == NULL) {
bf93d4cc 5338 if (flags & TRYAGAIN) {
830247a4 5339 if (RExC_parse == RExC_end) {
bf93d4cc
GS
5340 /* Make parent create an empty node if needed. */
5341 *flagp |= TRYAGAIN;
5342 return(NULL);
5343 }
a0d0e21e 5344 goto tryagain;
bf93d4cc 5345 }
a0d0e21e
LW
5346 return(NULL);
5347 }
c277df42 5348 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE);
a0d0e21e
LW
5349 break;
5350 case '|':
5351 case ')':
5352 if (flags & TRYAGAIN) {
5353 *flagp |= TRYAGAIN;
5354 return NULL;
5355 }
b45f050a 5356 vFAIL("Internal urp");
a0d0e21e
LW
5357 /* Supposed to be caught earlier. */
5358 break;
85afd4ae 5359 case '{':
830247a4
IZ
5360 if (!regcurly(RExC_parse)) {
5361 RExC_parse++;
85afd4ae
CS
5362 goto defchar;
5363 }
5364 /* FALL THROUGH */
a0d0e21e
LW
5365 case '?':
5366 case '+':
5367 case '*':
830247a4 5368 RExC_parse++;
b45f050a 5369 vFAIL("Quantifier follows nothing");
a0d0e21e
LW
5370 break;
5371 case '\\':
830247a4 5372 switch (*++RExC_parse) {
a0d0e21e 5373 case 'A':
830247a4
IZ
5374 RExC_seen_zerolen++;
5375 ret = reg_node(pRExC_state, SBOL);
a0d0e21e 5376 *flagp |= SIMPLE;
830247a4 5377 nextchar(pRExC_state);
fac92740 5378 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
5379 break;
5380 case 'G':
830247a4
IZ
5381 ret = reg_node(pRExC_state, GPOS);
5382 RExC_seen |= REG_SEEN_GPOS;
a0d0e21e 5383 *flagp |= SIMPLE;
830247a4 5384 nextchar(pRExC_state);
fac92740 5385 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
5386 break;
5387 case 'Z':
830247a4 5388 ret = reg_node(pRExC_state, SEOL);
a0d0e21e 5389 *flagp |= SIMPLE;
a1917ab9 5390 RExC_seen_zerolen++; /* Do not optimize RE away */
830247a4 5391 nextchar(pRExC_state);
a0d0e21e 5392 break;
b85d18e9 5393 case 'z':
830247a4 5394 ret = reg_node(pRExC_state, EOS);
b85d18e9 5395 *flagp |= SIMPLE;
830247a4
IZ
5396 RExC_seen_zerolen++; /* Do not optimize RE away */
5397 nextchar(pRExC_state);
fac92740 5398 Set_Node_Length(ret, 2); /* MJD */
b85d18e9 5399 break;
4a2d328f 5400 case 'C':
f33976b4
DB
5401 ret = reg_node(pRExC_state, CANY);
5402 RExC_seen |= REG_SEEN_CANY;
a0ed51b3 5403 *flagp |= HASWIDTH|SIMPLE;
830247a4 5404 nextchar(pRExC_state);
fac92740 5405 Set_Node_Length(ret, 2); /* MJD */
a0ed51b3
LW
5406 break;
5407 case 'X':
830247a4 5408 ret = reg_node(pRExC_state, CLUMP);
a0ed51b3 5409 *flagp |= HASWIDTH;
830247a4 5410 nextchar(pRExC_state);
fac92740 5411 Set_Node_Length(ret, 2); /* MJD */
a0ed51b3 5412 break;
a0d0e21e 5413 case 'w':
eb160463 5414 ret = reg_node(pRExC_state, (U8)(LOC ? ALNUML : ALNUM));
a0d0e21e 5415 *flagp |= HASWIDTH|SIMPLE;
830247a4 5416 nextchar(pRExC_state);
fac92740 5417 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
5418 break;
5419 case 'W':
eb160463 5420 ret = reg_node(pRExC_state, (U8)(LOC ? NALNUML : NALNUM));
a0d0e21e 5421 *flagp |= HASWIDTH|SIMPLE;
830247a4 5422 nextchar(pRExC_state);
fac92740 5423 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
5424 break;
5425 case 'b':
830247a4
IZ
5426 RExC_seen_zerolen++;
5427 RExC_seen |= REG_SEEN_LOOKBEHIND;
eb160463 5428 ret = reg_node(pRExC_state, (U8)(LOC ? BOUNDL : BOUND));
a0d0e21e 5429 *flagp |= SIMPLE;
830247a4 5430 nextchar(pRExC_state);
fac92740 5431 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
5432 break;
5433 case 'B':
830247a4
IZ
5434 RExC_seen_zerolen++;
5435 RExC_seen |= REG_SEEN_LOOKBEHIND;
eb160463 5436 ret = reg_node(pRExC_state, (U8)(LOC ? NBOUNDL : NBOUND));
a0d0e21e 5437 *flagp |= SIMPLE;
830247a4 5438 nextchar(pRExC_state);
fac92740 5439 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
5440 break;
5441 case 's':
eb160463 5442 ret = reg_node(pRExC_state, (U8)(LOC ? SPACEL : SPACE));
a0d0e21e 5443 *flagp |= HASWIDTH|SIMPLE;
830247a4 5444 nextchar(pRExC_state);
fac92740 5445 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
5446 break;
5447 case 'S':
eb160463 5448 ret = reg_node(pRExC_state, (U8)(LOC ? NSPACEL : NSPACE));
a0d0e21e 5449 *flagp |= HASWIDTH|SIMPLE;
830247a4 5450 nextchar(pRExC_state);
fac92740 5451 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
5452 break;
5453 case 'd':
ffc61ed2 5454 ret = reg_node(pRExC_state, DIGIT);
a0d0e21e 5455 *flagp |= HASWIDTH|SIMPLE;
830247a4 5456 nextchar(pRExC_state);
fac92740 5457 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
5458 break;
5459 case 'D':
ffc61ed2 5460 ret = reg_node(pRExC_state, NDIGIT);
a0d0e21e 5461 *flagp |= HASWIDTH|SIMPLE;
830247a4 5462 nextchar(pRExC_state);
fac92740 5463 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e 5464 break;
a14b48bc
LW
5465 case 'p':
5466 case 'P':
3568d838 5467 {
3dab1dad 5468 char* const oldregxend = RExC_end;
ccb2c380 5469 char* parse_start = RExC_parse - 2;
a14b48bc 5470
830247a4 5471 if (RExC_parse[1] == '{') {
3568d838 5472 /* a lovely hack--pretend we saw [\pX] instead */
830247a4
IZ
5473 RExC_end = strchr(RExC_parse, '}');
5474 if (!RExC_end) {
3dab1dad 5475 const U8 c = (U8)*RExC_parse;
830247a4
IZ
5476 RExC_parse += 2;
5477 RExC_end = oldregxend;
0da60cf5 5478 vFAIL2("Missing right brace on \\%c{}", c);
b45f050a 5479 }
830247a4 5480 RExC_end++;
a14b48bc 5481 }
af6f566e 5482 else {
830247a4 5483 RExC_end = RExC_parse + 2;
af6f566e
HS
5484 if (RExC_end > oldregxend)
5485 RExC_end = oldregxend;
5486 }
830247a4 5487 RExC_parse--;
a14b48bc 5488
3dab1dad 5489 ret = regclass(pRExC_state,depth+1);
a14b48bc 5490
830247a4
IZ
5491 RExC_end = oldregxend;
5492 RExC_parse--;
ccb2c380
MP
5493
5494 Set_Node_Offset(ret, parse_start + 2);
5495 Set_Node_Cur_Length(ret);
830247a4 5496 nextchar(pRExC_state);
a14b48bc
LW
5497 *flagp |= HASWIDTH|SIMPLE;
5498 }
5499 break;
fc8cd66c
YO
5500 case 'N':
5501 /* Handle \N{NAME} here and not below because it can be
5502 multicharacter. join_exact() will join them up later on.
5503 Also this makes sure that things like /\N{BLAH}+/ and
5504 \N{BLAH} being multi char Just Happen. dmq*/
5505 ++RExC_parse;
5506 ret= reg_namedseq(pRExC_state, NULL);
5507 break;
a0d0e21e
LW
5508 case 'n':
5509 case 'r':
5510 case 't':
5511 case 'f':
5512 case 'e':
5513 case 'a':
5514 case 'x':
5515 case 'c':
5516 case '0':
5517 goto defchar;
5518 case '1': case '2': case '3': case '4':
5519 case '5': case '6': case '7': case '8': case '9':
5520 {
1df70142 5521 const I32 num = atoi(RExC_parse);
a0d0e21e 5522
830247a4 5523 if (num > 9 && num >= RExC_npar)
a0d0e21e
LW
5524 goto defchar;
5525 else {
3dab1dad 5526 char * const parse_start = RExC_parse - 1; /* MJD */
830247a4
IZ
5527 while (isDIGIT(*RExC_parse))
5528 RExC_parse++;
b45f050a 5529
eb160463 5530 if (!SIZE_ONLY && num > (I32)RExC_rx->nparens)
9baa0206 5531 vFAIL("Reference to nonexistent group");
830247a4 5532 RExC_sawback = 1;
eb160463
GS
5533 ret = reganode(pRExC_state,
5534 (U8)(FOLD ? (LOC ? REFFL : REFF) : REF),
5535 num);
a0d0e21e 5536 *flagp |= HASWIDTH;
2af232bd 5537
fac92740 5538 /* override incorrect value set in reganode MJD */
2af232bd 5539 Set_Node_Offset(ret, parse_start+1);
fac92740 5540 Set_Node_Cur_Length(ret); /* MJD */
830247a4
IZ
5541 RExC_parse--;
5542 nextchar(pRExC_state);
a0d0e21e
LW
5543 }
5544 }
5545 break;
5546 case '\0':
830247a4 5547 if (RExC_parse >= RExC_end)
b45f050a 5548 FAIL("Trailing \\");
a0d0e21e
LW
5549 /* FALL THROUGH */
5550 default:
a0288114 5551 /* Do not generate "unrecognized" warnings here, we fall
c9f97d15 5552 back into the quick-grab loop below */
45948336 5553 parse_start--;
a0d0e21e
LW
5554 goto defchar;
5555 }
5556 break;
4633a7c4
LW
5557
5558 case '#':
e2509266 5559 if (RExC_flags & PMf_EXTENDED) {
3dab1dad
YO
5560 while (RExC_parse < RExC_end && *RExC_parse != '\n')
5561 RExC_parse++;
830247a4 5562 if (RExC_parse < RExC_end)
4633a7c4
LW
5563 goto tryagain;
5564 }
5565 /* FALL THROUGH */
5566
a0d0e21e 5567 default: {
ba210ebe 5568 register STRLEN len;
58ae7d3f 5569 register UV ender;
a0d0e21e 5570 register char *p;
3dab1dad 5571 char *s;
80aecb99 5572 STRLEN foldlen;
89ebb4a3 5573 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
f06dbbb7
JH
5574
5575 parse_start = RExC_parse - 1;
a0d0e21e 5576
830247a4 5577 RExC_parse++;
a0d0e21e
LW
5578
5579 defchar:
58ae7d3f 5580 ender = 0;
eb160463
GS
5581 ret = reg_node(pRExC_state,
5582 (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
cd439c50 5583 s = STRING(ret);
830247a4
IZ
5584 for (len = 0, p = RExC_parse - 1;
5585 len < 127 && p < RExC_end;
a0d0e21e
LW
5586 len++)
5587 {
3dab1dad 5588 char * const oldp = p;
5b5a24f7 5589
e2509266 5590 if (RExC_flags & PMf_EXTENDED)
830247a4 5591 p = regwhite(p, RExC_end);
a0d0e21e
LW
5592 switch (*p) {
5593 case '^':
5594 case '$':
5595 case '.':
5596 case '[':
5597 case '(':
5598 case ')':
5599 case '|':
5600 goto loopdone;
5601 case '\\':
5602 switch (*++p) {
5603 case 'A':
1ed8eac0
JF
5604 case 'C':
5605 case 'X':
a0d0e21e
LW
5606 case 'G':
5607 case 'Z':
b85d18e9 5608 case 'z':
a0d0e21e
LW
5609 case 'w':
5610 case 'W':
5611 case 'b':
5612 case 'B':
5613 case 's':
5614 case 'S':
5615 case 'd':
5616 case 'D':
a14b48bc
LW
5617 case 'p':
5618 case 'P':
fc8cd66c 5619 case 'N':
a0d0e21e
LW
5620 --p;
5621 goto loopdone;
5622 case 'n':
5623 ender = '\n';
5624 p++;
a687059c 5625 break;
a0d0e21e
LW
5626 case 'r':
5627 ender = '\r';
5628 p++;
a687059c 5629 break;
a0d0e21e
LW
5630 case 't':
5631 ender = '\t';
5632 p++;
a687059c 5633 break;
a0d0e21e
LW
5634 case 'f':
5635 ender = '\f';
5636 p++;
a687059c 5637 break;
a0d0e21e 5638 case 'e':
c7f1f016 5639 ender = ASCII_TO_NATIVE('\033');
a0d0e21e 5640 p++;
a687059c 5641 break;
a0d0e21e 5642 case 'a':
c7f1f016 5643 ender = ASCII_TO_NATIVE('\007');
a0d0e21e 5644 p++;
a687059c 5645 break;
a0d0e21e 5646 case 'x':
a0ed51b3 5647 if (*++p == '{') {
1df70142 5648 char* const e = strchr(p, '}');
b81d288d 5649
b45f050a 5650 if (!e) {
830247a4 5651 RExC_parse = p + 1;
b45f050a
JF
5652 vFAIL("Missing right brace on \\x{}");
5653 }
de5f0749 5654 else {
a4c04bdc
NC
5655 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
5656 | PERL_SCAN_DISALLOW_PREFIX;
1df70142 5657 STRLEN numlen = e - p - 1;
53305cf1 5658 ender = grok_hex(p + 1, &numlen, &flags, NULL);
aaa80028
JH
5659 if (ender > 0xff)
5660 RExC_utf8 = 1;
a0ed51b3
LW
5661 p = e + 1;
5662 }
a0ed51b3
LW
5663 }
5664 else {
a4c04bdc 5665 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
1df70142 5666 STRLEN numlen = 2;
53305cf1 5667 ender = grok_hex(p, &numlen, &flags, NULL);
a0ed51b3
LW
5668 p += numlen;
5669 }
a687059c 5670 break;
a0d0e21e
LW
5671 case 'c':
5672 p++;
bbce6d69 5673 ender = UCHARAT(p++);
5674 ender = toCTRL(ender);
a687059c 5675 break;
a0d0e21e
LW
5676 case '0': case '1': case '2': case '3':case '4':
5677 case '5': case '6': case '7': case '8':case '9':
5678 if (*p == '0' ||
830247a4 5679 (isDIGIT(p[1]) && atoi(p) >= RExC_npar) ) {
53305cf1 5680 I32 flags = 0;
1df70142 5681 STRLEN numlen = 3;
53305cf1 5682 ender = grok_oct(p, &numlen, &flags, NULL);
a0d0e21e
LW
5683 p += numlen;
5684 }
5685 else {
5686 --p;
5687 goto loopdone;
a687059c
LW
5688 }
5689 break;
a0d0e21e 5690 case '\0':
830247a4 5691 if (p >= RExC_end)
b45f050a 5692 FAIL("Trailing \\");
a687059c 5693 /* FALL THROUGH */
a0d0e21e 5694 default:
041457d9 5695 if (!SIZE_ONLY&& isALPHA(*p) && ckWARN(WARN_REGEXP))
4193bef7 5696 vWARN2(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
a0ed51b3 5697 goto normal_default;
a0d0e21e
LW
5698 }
5699 break;
a687059c 5700 default:
a0ed51b3 5701 normal_default:
fd400ab9 5702 if (UTF8_IS_START(*p) && UTF) {
1df70142 5703 STRLEN numlen;
5e12f4fb 5704 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
9f7f3913 5705 &numlen, UTF8_ALLOW_DEFAULT);
a0ed51b3
LW
5706 p += numlen;
5707 }
5708 else
5709 ender = *p++;
a0d0e21e 5710 break;
a687059c 5711 }
e2509266 5712 if (RExC_flags & PMf_EXTENDED)
830247a4 5713 p = regwhite(p, RExC_end);
60a8b682
JH
5714 if (UTF && FOLD) {
5715 /* Prime the casefolded buffer. */
ac7e0132 5716 ender = toFOLD_uni(ender, tmpbuf, &foldlen);
60a8b682 5717 }
a0d0e21e
LW
5718 if (ISMULT2(p)) { /* Back off on ?+*. */
5719 if (len)
5720 p = oldp;
16ea2a2e 5721 else if (UTF) {
80aecb99 5722 if (FOLD) {
60a8b682 5723 /* Emit all the Unicode characters. */
1df70142 5724 STRLEN numlen;
80aecb99
JH
5725 for (foldbuf = tmpbuf;
5726 foldlen;
5727 foldlen -= numlen) {
5728 ender = utf8_to_uvchr(foldbuf, &numlen);
9dc45d57 5729 if (numlen > 0) {
71207a34 5730 const STRLEN unilen = reguni(pRExC_state, ender, s);
0ebc6274
JH
5731 s += unilen;
5732 len += unilen;
5733 /* In EBCDIC the numlen
5734 * and unilen can differ. */
9dc45d57 5735 foldbuf += numlen;
47654450
JH
5736 if (numlen >= foldlen)
5737 break;
9dc45d57
JH
5738 }
5739 else
5740 break; /* "Can't happen." */
80aecb99
JH
5741 }
5742 }
5743 else {
71207a34 5744 const STRLEN unilen = reguni(pRExC_state, ender, s);
9ede7db1 5745 if (unilen > 0) {
0ebc6274
JH
5746 s += unilen;
5747 len += unilen;
9dc45d57 5748 }
80aecb99 5749 }
a0ed51b3 5750 }
a0d0e21e
LW
5751 else {
5752 len++;
eb160463 5753 REGC((char)ender, s++);
a0d0e21e
LW
5754 }
5755 break;
a687059c 5756 }
16ea2a2e 5757 if (UTF) {
80aecb99 5758 if (FOLD) {
60a8b682 5759 /* Emit all the Unicode characters. */
1df70142 5760 STRLEN numlen;
80aecb99
JH
5761 for (foldbuf = tmpbuf;
5762 foldlen;
5763 foldlen -= numlen) {
5764 ender = utf8_to_uvchr(foldbuf, &numlen);
9dc45d57 5765 if (numlen > 0) {
71207a34 5766 const STRLEN unilen = reguni(pRExC_state, ender, s);
0ebc6274
JH
5767 len += unilen;
5768 s += unilen;
5769 /* In EBCDIC the numlen
5770 * and unilen can differ. */
9dc45d57 5771 foldbuf += numlen;
47654450
JH
5772 if (numlen >= foldlen)
5773 break;
9dc45d57
JH
5774 }
5775 else
5776 break;
80aecb99
JH
5777 }
5778 }
5779 else {
71207a34 5780 const STRLEN unilen = reguni(pRExC_state, ender, s);
9ede7db1 5781 if (unilen > 0) {
0ebc6274
JH
5782 s += unilen;
5783 len += unilen;
9dc45d57 5784 }
80aecb99
JH
5785 }
5786 len--;
a0ed51b3
LW
5787 }
5788 else
eb160463 5789 REGC((char)ender, s++);
a0d0e21e
LW
5790 }
5791 loopdone:
830247a4 5792 RExC_parse = p - 1;
fac92740 5793 Set_Node_Cur_Length(ret); /* MJD */
830247a4 5794 nextchar(pRExC_state);
793db0cb
JH
5795 {
5796 /* len is STRLEN which is unsigned, need to copy to signed */
5797 IV iv = len;
5798 if (iv < 0)
5799 vFAIL("Internal disaster");
5800 }
a0d0e21e
LW
5801 if (len > 0)
5802 *flagp |= HASWIDTH;
090f7165 5803 if (len == 1 && UNI_IS_INVARIANT(ender))
a0d0e21e 5804 *flagp |= SIMPLE;
3dab1dad 5805
cd439c50 5806 if (SIZE_ONLY)
830247a4 5807 RExC_size += STR_SZ(len);
3dab1dad
YO
5808 else {
5809 STR_LEN(ret) = len;
830247a4 5810 RExC_emit += STR_SZ(len);
07be1b83 5811 }
3dab1dad 5812 }
a0d0e21e
LW
5813 break;
5814 }
a687059c 5815
60a8b682
JH
5816 /* If the encoding pragma is in effect recode the text of
5817 * any EXACT-kind nodes. */
fc8cd66c 5818 if (ret && PL_encoding && PL_regkind[OP(ret)] == EXACT) {
3dab1dad
YO
5819 const STRLEN oldlen = STR_LEN(ret);
5820 SV * const sv = sv_2mortal(newSVpvn(STRING(ret), oldlen));
d0063567
DK
5821
5822 if (RExC_utf8)
5823 SvUTF8_on(sv);
5824 if (sv_utf8_downgrade(sv, TRUE)) {
1df70142
AL
5825 const char * const s = sv_recode_to_utf8(sv, PL_encoding);
5826 const STRLEN newlen = SvCUR(sv);
d0063567
DK
5827
5828 if (SvUTF8(sv))
5829 RExC_utf8 = 1;
5830 if (!SIZE_ONLY) {
a3621e74
YO
5831 GET_RE_DEBUG_FLAGS_DECL;
5832 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "recode %*s to %*s\n",
d0063567
DK
5833 (int)oldlen, STRING(ret),
5834 (int)newlen, s));
5835 Copy(s, STRING(ret), newlen, char);
5836 STR_LEN(ret) += newlen - oldlen;
5837 RExC_emit += STR_SZ(newlen) - STR_SZ(oldlen);
5838 } else
5839 RExC_size += STR_SZ(newlen) - STR_SZ(oldlen);
5840 }
a72c7584
JH
5841 }
5842
a0d0e21e 5843 return(ret);
a687059c
LW
5844}
5845
873ef191 5846STATIC char *
5f66b61c 5847S_regwhite(char *p, const char *e)
5b5a24f7
CS
5848{
5849 while (p < e) {
5850 if (isSPACE(*p))
5851 ++p;
5852 else if (*p == '#') {
5853 do {
5854 p++;
5855 } while (p < e && *p != '\n');
5856 }
5857 else
5858 break;
5859 }
5860 return p;
5861}
5862
b8c5462f
JH
5863/* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
5864 Character classes ([:foo:]) can also be negated ([:^foo:]).
5865 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
5866 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
beeb77fc 5867 but trigger failures because they are currently unimplemented. */
9a86a77b
JH
5868
5869#define POSIXCC_DONE(c) ((c) == ':')
5870#define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
5871#define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
5872
b8c5462f 5873STATIC I32
830247a4 5874S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
620e46c5 5875{
97aff369 5876 dVAR;
936ed897 5877 I32 namedclass = OOB_NAMEDCLASS;
620e46c5 5878
830247a4 5879 if (value == '[' && RExC_parse + 1 < RExC_end &&
620e46c5 5880 /* I smell either [: or [= or [. -- POSIX has been here, right? */
9a86a77b 5881 POSIXCC(UCHARAT(RExC_parse))) {
1df70142 5882 const char c = UCHARAT(RExC_parse);
097eb12c 5883 char* const s = RExC_parse++;
b81d288d 5884
9a86a77b 5885 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
830247a4
IZ
5886 RExC_parse++;
5887 if (RExC_parse == RExC_end)
620e46c5 5888 /* Grandfather lone [:, [=, [. */
830247a4 5889 RExC_parse = s;
620e46c5 5890 else {
3dab1dad 5891 const char* const t = RExC_parse++; /* skip over the c */
80916619
NC
5892 assert(*t == c);
5893
9a86a77b 5894 if (UCHARAT(RExC_parse) == ']') {
3dab1dad 5895 const char *posixcc = s + 1;
830247a4 5896 RExC_parse++; /* skip over the ending ] */
3dab1dad 5897
b8c5462f 5898 if (*s == ':') {
1df70142
AL
5899 const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
5900 const I32 skip = t - posixcc;
80916619
NC
5901
5902 /* Initially switch on the length of the name. */
5903 switch (skip) {
5904 case 4:
3dab1dad
YO
5905 if (memEQ(posixcc, "word", 4)) /* this is not POSIX, this is the Perl \w */
5906 namedclass = complement ? ANYOF_NALNUM : ANYOF_ALNUM;
cc4319de 5907 break;
80916619
NC
5908 case 5:
5909 /* Names all of length 5. */
5910 /* alnum alpha ascii blank cntrl digit graph lower
5911 print punct space upper */
5912 /* Offset 4 gives the best switch position. */
5913 switch (posixcc[4]) {
5914 case 'a':
3dab1dad
YO
5915 if (memEQ(posixcc, "alph", 4)) /* alpha */
5916 namedclass = complement ? ANYOF_NALPHA : ANYOF_ALPHA;
80916619
NC
5917 break;
5918 case 'e':
3dab1dad
YO
5919 if (memEQ(posixcc, "spac", 4)) /* space */
5920 namedclass = complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
80916619
NC
5921 break;
5922 case 'h':
3dab1dad
YO
5923 if (memEQ(posixcc, "grap", 4)) /* graph */
5924 namedclass = complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
80916619
NC
5925 break;
5926 case 'i':
3dab1dad
YO
5927 if (memEQ(posixcc, "asci", 4)) /* ascii */
5928 namedclass = complement ? ANYOF_NASCII : ANYOF_ASCII;
80916619
NC
5929 break;
5930 case 'k':
3dab1dad
YO
5931 if (memEQ(posixcc, "blan", 4)) /* blank */
5932 namedclass = complement ? ANYOF_NBLANK : ANYOF_BLANK;
80916619
NC
5933 break;
5934 case 'l':
3dab1dad
YO
5935 if (memEQ(posixcc, "cntr", 4)) /* cntrl */
5936 namedclass = complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
80916619
NC
5937 break;
5938 case 'm':
3dab1dad
YO
5939 if (memEQ(posixcc, "alnu", 4)) /* alnum */
5940 namedclass = complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
80916619
NC
5941 break;
5942 case 'r':
3dab1dad
YO
5943 if (memEQ(posixcc, "lowe", 4)) /* lower */
5944 namedclass = complement ? ANYOF_NLOWER : ANYOF_LOWER;
5945 else if (memEQ(posixcc, "uppe", 4)) /* upper */
5946 namedclass = complement ? ANYOF_NUPPER : ANYOF_UPPER;
80916619
NC
5947 break;
5948 case 't':
3dab1dad
YO
5949 if (memEQ(posixcc, "digi", 4)) /* digit */
5950 namedclass = complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
5951 else if (memEQ(posixcc, "prin", 4)) /* print */
5952 namedclass = complement ? ANYOF_NPRINT : ANYOF_PRINT;
5953 else if (memEQ(posixcc, "punc", 4)) /* punct */
5954 namedclass = complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
80916619 5955 break;
b8c5462f
JH
5956 }
5957 break;
80916619 5958 case 6:
3dab1dad
YO
5959 if (memEQ(posixcc, "xdigit", 6))
5960 namedclass = complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
b8c5462f
JH
5961 break;
5962 }
80916619
NC
5963
5964 if (namedclass == OOB_NAMEDCLASS)
b45f050a
JF
5965 Simple_vFAIL3("POSIX class [:%.*s:] unknown",
5966 t - s - 1, s + 1);
80916619
NC
5967 assert (posixcc[skip] == ':');
5968 assert (posixcc[skip+1] == ']');
b45f050a 5969 } else if (!SIZE_ONLY) {
b8c5462f 5970 /* [[=foo=]] and [[.foo.]] are still future. */
b45f050a 5971
830247a4 5972 /* adjust RExC_parse so the warning shows after
b45f050a 5973 the class closes */
9a86a77b 5974 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
830247a4 5975 RExC_parse++;
b45f050a
JF
5976 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
5977 }
b8c5462f
JH
5978 } else {
5979 /* Maternal grandfather:
5980 * "[:" ending in ":" but not in ":]" */
830247a4 5981 RExC_parse = s;
767d463e 5982 }
620e46c5
JH
5983 }
5984 }
5985
b8c5462f
JH
5986 return namedclass;
5987}
5988
5989STATIC void
830247a4 5990S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
b8c5462f 5991{
97aff369 5992 dVAR;
3dab1dad 5993 if (POSIXCC(UCHARAT(RExC_parse))) {
1df70142
AL
5994 const char *s = RExC_parse;
5995 const char c = *s++;
b8c5462f 5996
3dab1dad 5997 while (isALNUM(*s))
b8c5462f
JH
5998 s++;
5999 if (*s && c == *s && s[1] == ']') {
cd84f5b2
RGS
6000 if (ckWARN(WARN_REGEXP))
6001 vWARN3(s+2,
6002 "POSIX syntax [%c %c] belongs inside character classes",
6003 c, c);
b45f050a
JF
6004
6005 /* [[=foo=]] and [[.foo.]] are still future. */
9a86a77b 6006 if (POSIXCC_NOTYET(c)) {
830247a4 6007 /* adjust RExC_parse so the error shows after
b45f050a 6008 the class closes */
9a86a77b 6009 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
3dab1dad 6010 NOOP;
b45f050a
JF
6011 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
6012 }
b8c5462f
JH
6013 }
6014 }
620e46c5
JH
6015}
6016
7f6f358c
YO
6017
6018/*
6019 parse a class specification and produce either an ANYOF node that
6020 matches the pattern. If the pattern matches a single char only and
6021 that char is < 256 then we produce an EXACT node instead.
6022*/
76e3520e 6023STATIC regnode *
3dab1dad 6024S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth)
a687059c 6025{
97aff369 6026 dVAR;
9ef43ace 6027 register UV value = 0;
9a86a77b 6028 register UV nextvalue;
3568d838 6029 register IV prevvalue = OOB_UNICODE;
ffc61ed2 6030 register IV range = 0;
c277df42 6031 register regnode *ret;
ba210ebe 6032 STRLEN numlen;
ffc61ed2 6033 IV namedclass;
cbbf8932 6034 char *rangebegin = NULL;
936ed897 6035 bool need_class = 0;
c445ea15 6036 SV *listsv = NULL;
ffc61ed2 6037 UV n;
9e55ce06 6038 bool optimize_invert = TRUE;
cbbf8932 6039 AV* unicode_alternate = NULL;
1b2d223b
JH
6040#ifdef EBCDIC
6041 UV literal_endpoint = 0;
6042#endif
7f6f358c 6043 UV stored = 0; /* number of chars stored in the class */
ffc61ed2 6044
3dab1dad 6045 regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
7f6f358c 6046 case we need to change the emitted regop to an EXACT. */
07be1b83 6047 const char * orig_parse = RExC_parse;
72f13be8 6048 GET_RE_DEBUG_FLAGS_DECL;
76e84362
SH
6049#ifndef DEBUGGING
6050 PERL_UNUSED_ARG(depth);
6051#endif
72f13be8 6052
3dab1dad 6053 DEBUG_PARSE("clas");
7f6f358c
YO
6054
6055 /* Assume we are going to generate an ANYOF node. */
ffc61ed2
JH
6056 ret = reganode(pRExC_state, ANYOF, 0);
6057
6058 if (!SIZE_ONLY)
6059 ANYOF_FLAGS(ret) = 0;
6060
9a86a77b 6061 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
ffc61ed2
JH
6062 RExC_naughty++;
6063 RExC_parse++;
6064 if (!SIZE_ONLY)
6065 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
6066 }
a0d0e21e 6067
73060fc4 6068 if (SIZE_ONLY) {
830247a4 6069 RExC_size += ANYOF_SKIP;
73060fc4
JH
6070 listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
6071 }
936ed897 6072 else {
830247a4 6073 RExC_emit += ANYOF_SKIP;
936ed897
IZ
6074 if (FOLD)
6075 ANYOF_FLAGS(ret) |= ANYOF_FOLD;
6076 if (LOC)
6077 ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
ffc61ed2 6078 ANYOF_BITMAP_ZERO(ret);
396482e1 6079 listsv = newSVpvs("# comment\n");
a0d0e21e 6080 }
b8c5462f 6081
9a86a77b
JH
6082 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
6083
b938889d 6084 if (!SIZE_ONLY && POSIXCC(nextvalue))
830247a4 6085 checkposixcc(pRExC_state);
b8c5462f 6086
f064b6ad
HS
6087 /* allow 1st char to be ] (allowing it to be - is dealt with later) */
6088 if (UCHARAT(RExC_parse) == ']')
6089 goto charclassloop;
ffc61ed2 6090
fc8cd66c 6091parseit:
9a86a77b 6092 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
ffc61ed2
JH
6093
6094 charclassloop:
6095
6096 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
6097
73b437c8 6098 if (!range)
830247a4 6099 rangebegin = RExC_parse;
ffc61ed2 6100 if (UTF) {
5e12f4fb 6101 value = utf8n_to_uvchr((U8*)RExC_parse,
3568d838 6102 RExC_end - RExC_parse,
9f7f3913 6103 &numlen, UTF8_ALLOW_DEFAULT);
ffc61ed2
JH
6104 RExC_parse += numlen;
6105 }
6106 else
6107 value = UCHARAT(RExC_parse++);
7f6f358c 6108
9a86a77b
JH
6109 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
6110 if (value == '[' && POSIXCC(nextvalue))
830247a4 6111 namedclass = regpposixcc(pRExC_state, value);
620e46c5 6112 else if (value == '\\') {
ffc61ed2 6113 if (UTF) {
5e12f4fb 6114 value = utf8n_to_uvchr((U8*)RExC_parse,
ffc61ed2 6115 RExC_end - RExC_parse,
9f7f3913 6116 &numlen, UTF8_ALLOW_DEFAULT);
ffc61ed2
JH
6117 RExC_parse += numlen;
6118 }
6119 else
6120 value = UCHARAT(RExC_parse++);
470c3474 6121 /* Some compilers cannot handle switching on 64-bit integer
ffc61ed2 6122 * values, therefore value cannot be an UV. Yes, this will
e2962f66
JH
6123 * be a problem later if we want switch on Unicode.
6124 * A similar issue a little bit later when switching on
6125 * namedclass. --jhi */
ffc61ed2 6126 switch ((I32)value) {
b8c5462f
JH
6127 case 'w': namedclass = ANYOF_ALNUM; break;
6128 case 'W': namedclass = ANYOF_NALNUM; break;
6129 case 's': namedclass = ANYOF_SPACE; break;
6130 case 'S': namedclass = ANYOF_NSPACE; break;
6131 case 'd': namedclass = ANYOF_DIGIT; break;
6132 case 'D': namedclass = ANYOF_NDIGIT; break;
fc8cd66c
YO
6133 case 'N': /* Handle \N{NAME} in class */
6134 {
6135 /* We only pay attention to the first char of
6136 multichar strings being returned. I kinda wonder
6137 if this makes sense as it does change the behaviour
6138 from earlier versions, OTOH that behaviour was broken
6139 as well. */
6140 UV v; /* value is register so we cant & it /grrr */
6141 if (reg_namedseq(pRExC_state, &v)) {
6142 goto parseit;
6143 }
6144 value= v;
6145 }
6146 break;
ffc61ed2
JH
6147 case 'p':
6148 case 'P':
3dab1dad
YO
6149 {
6150 char *e;
af6f566e 6151 if (RExC_parse >= RExC_end)
2a4859cd 6152 vFAIL2("Empty \\%c{}", (U8)value);
ffc61ed2 6153 if (*RExC_parse == '{') {
1df70142 6154 const U8 c = (U8)value;
ffc61ed2
JH
6155 e = strchr(RExC_parse++, '}');
6156 if (!e)
0da60cf5 6157 vFAIL2("Missing right brace on \\%c{}", c);
ab13f0c7
JH
6158 while (isSPACE(UCHARAT(RExC_parse)))
6159 RExC_parse++;
6160 if (e == RExC_parse)
0da60cf5 6161 vFAIL2("Empty \\%c{}", c);
ffc61ed2 6162 n = e - RExC_parse;
ab13f0c7
JH
6163 while (isSPACE(UCHARAT(RExC_parse + n - 1)))
6164 n--;
ffc61ed2
JH
6165 }
6166 else {
6167 e = RExC_parse;
6168 n = 1;
6169 }
6170 if (!SIZE_ONLY) {
ab13f0c7
JH
6171 if (UCHARAT(RExC_parse) == '^') {
6172 RExC_parse++;
6173 n--;
6174 value = value == 'p' ? 'P' : 'p'; /* toggle */
6175 while (isSPACE(UCHARAT(RExC_parse))) {
6176 RExC_parse++;
6177 n--;
6178 }
6179 }
097eb12c
AL
6180 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%.*s\n",
6181 (value=='p' ? '+' : '!'), (int)n, RExC_parse);
ffc61ed2
JH
6182 }
6183 RExC_parse = e + 1;
6184 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
f81125e2 6185 namedclass = ANYOF_MAX; /* no official name, but it's named */
3dab1dad 6186 }
f81125e2 6187 break;
b8c5462f
JH
6188 case 'n': value = '\n'; break;
6189 case 'r': value = '\r'; break;
6190 case 't': value = '\t'; break;
6191 case 'f': value = '\f'; break;
6192 case 'b': value = '\b'; break;
c7f1f016
NIS
6193 case 'e': value = ASCII_TO_NATIVE('\033');break;
6194 case 'a': value = ASCII_TO_NATIVE('\007');break;
b8c5462f 6195 case 'x':
ffc61ed2 6196 if (*RExC_parse == '{') {
a4c04bdc
NC
6197 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
6198 | PERL_SCAN_DISALLOW_PREFIX;
3dab1dad 6199 char * const e = strchr(RExC_parse++, '}');
b81d288d 6200 if (!e)
ffc61ed2 6201 vFAIL("Missing right brace on \\x{}");
53305cf1
NC
6202
6203 numlen = e - RExC_parse;
6204 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
ffc61ed2
JH
6205 RExC_parse = e + 1;
6206 }
6207 else {
a4c04bdc 6208 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
53305cf1
NC
6209 numlen = 2;
6210 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
ffc61ed2
JH
6211 RExC_parse += numlen;
6212 }
b8c5462f
JH
6213 break;
6214 case 'c':
830247a4 6215 value = UCHARAT(RExC_parse++);
b8c5462f
JH
6216 value = toCTRL(value);
6217 break;
6218 case '0': case '1': case '2': case '3': case '4':
6219 case '5': case '6': case '7': case '8': case '9':
53305cf1
NC
6220 {
6221 I32 flags = 0;
6222 numlen = 3;
6223 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
830247a4 6224 RExC_parse += numlen;
b8c5462f 6225 break;
53305cf1 6226 }
1028017a 6227 default:
041457d9 6228 if (!SIZE_ONLY && isALPHA(value) && ckWARN(WARN_REGEXP))
ffc61ed2
JH
6229 vWARN2(RExC_parse,
6230 "Unrecognized escape \\%c in character class passed through",
6231 (int)value);
1028017a 6232 break;
b8c5462f 6233 }
ffc61ed2 6234 } /* end of \blah */
1b2d223b
JH
6235#ifdef EBCDIC
6236 else
6237 literal_endpoint++;
6238#endif
ffc61ed2
JH
6239
6240 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
6241
6242 if (!SIZE_ONLY && !need_class)
936ed897 6243 ANYOF_CLASS_ZERO(ret);
ffc61ed2 6244
936ed897 6245 need_class = 1;
ffc61ed2
JH
6246
6247 /* a bad range like a-\d, a-[:digit:] ? */
6248 if (range) {
73b437c8 6249 if (!SIZE_ONLY) {
afd78fd5 6250 if (ckWARN(WARN_REGEXP)) {
097eb12c 6251 const int w =
afd78fd5
JH
6252 RExC_parse >= rangebegin ?
6253 RExC_parse - rangebegin : 0;
830247a4 6254 vWARN4(RExC_parse,
b45f050a 6255 "False [] range \"%*.*s\"",
097eb12c 6256 w, w, rangebegin);
afd78fd5 6257 }
3568d838
JH
6258 if (prevvalue < 256) {
6259 ANYOF_BITMAP_SET(ret, prevvalue);
ffc61ed2
JH
6260 ANYOF_BITMAP_SET(ret, '-');
6261 }
6262 else {
6263 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
6264 Perl_sv_catpvf(aTHX_ listsv,
3568d838 6265 "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
ffc61ed2 6266 }
b8c5462f 6267 }
ffc61ed2
JH
6268
6269 range = 0; /* this was not a true range */
73b437c8 6270 }
ffc61ed2 6271
73b437c8 6272 if (!SIZE_ONLY) {
c49a72a9
NC
6273 const char *what = NULL;
6274 char yesno = 0;
6275
3568d838
JH
6276 if (namedclass > OOB_NAMEDCLASS)
6277 optimize_invert = FALSE;
e2962f66
JH
6278 /* Possible truncation here but in some 64-bit environments
6279 * the compiler gets heartburn about switch on 64-bit values.
6280 * A similar issue a little earlier when switching on value.
98f323fa 6281 * --jhi */
e2962f66 6282 switch ((I32)namedclass) {
73b437c8
JH
6283 case ANYOF_ALNUM:
6284 if (LOC)
936ed897 6285 ANYOF_CLASS_SET(ret, ANYOF_ALNUM);
73b437c8
JH
6286 else {
6287 for (value = 0; value < 256; value++)
6288 if (isALNUM(value))
936ed897 6289 ANYOF_BITMAP_SET(ret, value);
73b437c8 6290 }
c49a72a9
NC
6291 yesno = '+';
6292 what = "Word";
73b437c8
JH
6293 break;
6294 case ANYOF_NALNUM:
6295 if (LOC)
936ed897 6296 ANYOF_CLASS_SET(ret, ANYOF_NALNUM);
73b437c8
JH
6297 else {
6298 for (value = 0; value < 256; value++)
6299 if (!isALNUM(value))
936ed897 6300 ANYOF_BITMAP_SET(ret, value);
73b437c8 6301 }
c49a72a9
NC
6302 yesno = '!';
6303 what = "Word";
73b437c8 6304 break;
ffc61ed2 6305 case ANYOF_ALNUMC:
73b437c8 6306 if (LOC)
ffc61ed2 6307 ANYOF_CLASS_SET(ret, ANYOF_ALNUMC);
73b437c8
JH
6308 else {
6309 for (value = 0; value < 256; value++)
ffc61ed2 6310 if (isALNUMC(value))
936ed897 6311 ANYOF_BITMAP_SET(ret, value);
73b437c8 6312 }
c49a72a9
NC
6313 yesno = '+';
6314 what = "Alnum";
73b437c8
JH
6315 break;
6316 case ANYOF_NALNUMC:
6317 if (LOC)
936ed897 6318 ANYOF_CLASS_SET(ret, ANYOF_NALNUMC);
73b437c8
JH
6319 else {
6320 for (value = 0; value < 256; value++)
6321 if (!isALNUMC(value))
936ed897 6322 ANYOF_BITMAP_SET(ret, value);
73b437c8 6323 }
c49a72a9
NC
6324 yesno = '!';
6325 what = "Alnum";
73b437c8
JH
6326 break;
6327 case ANYOF_ALPHA:
6328 if (LOC)
936ed897 6329 ANYOF_CLASS_SET(ret, ANYOF_ALPHA);
73b437c8
JH
6330 else {
6331 for (value = 0; value < 256; value++)
6332 if (isALPHA(value))
936ed897 6333 ANYOF_BITMAP_SET(ret, value);
73b437c8 6334 }
c49a72a9
NC
6335 yesno = '+';
6336 what = "Alpha";
73b437c8
JH
6337 break;
6338 case ANYOF_NALPHA:
6339 if (LOC)
936ed897 6340 ANYOF_CLASS_SET(ret, ANYOF_NALPHA);
73b437c8
JH
6341 else {
6342 for (value = 0; value < 256; value++)
6343 if (!isALPHA(value))
936ed897 6344 ANYOF_BITMAP_SET(ret, value);
73b437c8 6345 }
c49a72a9
NC
6346 yesno = '!';
6347 what = "Alpha";
73b437c8
JH
6348 break;
6349 case ANYOF_ASCII:
6350 if (LOC)
936ed897 6351 ANYOF_CLASS_SET(ret, ANYOF_ASCII);
73b437c8 6352 else {
c7f1f016 6353#ifndef EBCDIC
1ba5c669
JH
6354 for (value = 0; value < 128; value++)
6355 ANYOF_BITMAP_SET(ret, value);
6356#else /* EBCDIC */
ffbc6a93 6357 for (value = 0; value < 256; value++) {
3a3c4447
JH
6358 if (isASCII(value))
6359 ANYOF_BITMAP_SET(ret, value);
ffbc6a93 6360 }
1ba5c669 6361#endif /* EBCDIC */
73b437c8 6362 }
c49a72a9
NC
6363 yesno = '+';
6364 what = "ASCII";
73b437c8
JH
6365 break;
6366 case ANYOF_NASCII:
6367 if (LOC)
936ed897 6368 ANYOF_CLASS_SET(ret, ANYOF_NASCII);
73b437c8 6369 else {
c7f1f016 6370#ifndef EBCDIC
1ba5c669
JH
6371 for (value = 128; value < 256; value++)
6372 ANYOF_BITMAP_SET(ret, value);
6373#else /* EBCDIC */
ffbc6a93 6374 for (value = 0; value < 256; value++) {
3a3c4447
JH
6375 if (!isASCII(value))
6376 ANYOF_BITMAP_SET(ret, value);
ffbc6a93 6377 }
1ba5c669 6378#endif /* EBCDIC */
73b437c8 6379 }
c49a72a9
NC
6380 yesno = '!';
6381 what = "ASCII";
73b437c8 6382 break;
aaa51d5e
JF
6383 case ANYOF_BLANK:
6384 if (LOC)
6385 ANYOF_CLASS_SET(ret, ANYOF_BLANK);
6386 else {
6387 for (value = 0; value < 256; value++)
6388 if (isBLANK(value))
6389 ANYOF_BITMAP_SET(ret, value);
6390 }
c49a72a9
NC
6391 yesno = '+';
6392 what = "Blank";
aaa51d5e
JF
6393 break;
6394 case ANYOF_NBLANK:
6395 if (LOC)
6396 ANYOF_CLASS_SET(ret, ANYOF_NBLANK);
6397 else {
6398 for (value = 0; value < 256; value++)
6399 if (!isBLANK(value))
6400 ANYOF_BITMAP_SET(ret, value);
6401 }
c49a72a9
NC
6402 yesno = '!';
6403 what = "Blank";
aaa51d5e 6404 break;
73b437c8
JH
6405 case ANYOF_CNTRL:
6406 if (LOC)
936ed897 6407 ANYOF_CLASS_SET(ret, ANYOF_CNTRL);
73b437c8
JH
6408 else {
6409 for (value = 0; value < 256; value++)
6410 if (isCNTRL(value))
936ed897 6411 ANYOF_BITMAP_SET(ret, value);
73b437c8 6412 }
c49a72a9
NC
6413 yesno = '+';
6414 what = "Cntrl";
73b437c8
JH
6415 break;
6416 case ANYOF_NCNTRL:
6417 if (LOC)
936ed897 6418 ANYOF_CLASS_SET(ret, ANYOF_NCNTRL);
73b437c8
JH
6419 else {
6420 for (value = 0; value < 256; value++)
6421 if (!isCNTRL(value))
936ed897 6422 ANYOF_BITMAP_SET(ret, value);
73b437c8 6423 }
c49a72a9
NC
6424 yesno = '!';
6425 what = "Cntrl";
ffc61ed2
JH
6426 break;
6427 case ANYOF_DIGIT:
6428 if (LOC)
6429 ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
6430 else {
6431 /* consecutive digits assumed */
6432 for (value = '0'; value <= '9'; value++)
6433 ANYOF_BITMAP_SET(ret, value);
6434 }
c49a72a9
NC
6435 yesno = '+';
6436 what = "Digit";
ffc61ed2
JH
6437 break;
6438 case ANYOF_NDIGIT:
6439 if (LOC)
6440 ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
6441 else {
6442 /* consecutive digits assumed */
6443 for (value = 0; value < '0'; value++)
6444 ANYOF_BITMAP_SET(ret, value);
6445 for (value = '9' + 1; value < 256; value++)
6446 ANYOF_BITMAP_SET(ret, value);
6447 }
c49a72a9
NC
6448 yesno = '!';
6449 what = "Digit";
73b437c8
JH
6450 break;
6451 case ANYOF_GRAPH:
6452 if (LOC)
936ed897 6453 ANYOF_CLASS_SET(ret, ANYOF_GRAPH);
73b437c8
JH
6454 else {
6455 for (value = 0; value < 256; value++)
6456 if (isGRAPH(value))
936ed897 6457 ANYOF_BITMAP_SET(ret, value);
73b437c8 6458 }
c49a72a9
NC
6459 yesno = '+';
6460 what = "Graph";
73b437c8
JH
6461 break;
6462 case ANYOF_NGRAPH:
6463 if (LOC)
936ed897 6464 ANYOF_CLASS_SET(ret, ANYOF_NGRAPH);
73b437c8
JH
6465 else {
6466 for (value = 0; value < 256; value++)
6467 if (!isGRAPH(value))
936ed897 6468 ANYOF_BITMAP_SET(ret, value);
73b437c8 6469 }
c49a72a9
NC
6470 yesno = '!';
6471 what = "Graph";
73b437c8
JH
6472 break;
6473 case ANYOF_LOWER:
6474 if (LOC)
936ed897 6475 ANYOF_CLASS_SET(ret, ANYOF_LOWER);
73b437c8
JH
6476 else {
6477 for (value = 0; value < 256; value++)
6478 if (isLOWER(value))
936ed897 6479 ANYOF_BITMAP_SET(ret, value);
73b437c8 6480 }
c49a72a9
NC
6481 yesno = '+';
6482 what = "Lower";
73b437c8
JH
6483 break;
6484 case ANYOF_NLOWER:
6485 if (LOC)
936ed897 6486 ANYOF_CLASS_SET(ret, ANYOF_NLOWER);
73b437c8
JH
6487 else {
6488 for (value = 0; value < 256; value++)
6489 if (!isLOWER(value))
936ed897 6490 ANYOF_BITMAP_SET(ret, value);
73b437c8 6491 }
c49a72a9
NC
6492 yesno = '!';
6493 what = "Lower";
73b437c8
JH
6494 break;
6495 case ANYOF_PRINT:
6496 if (LOC)
936ed897 6497 ANYOF_CLASS_SET(ret, ANYOF_PRINT);
73b437c8
JH
6498 else {
6499 for (value = 0; value < 256; value++)
6500 if (isPRINT(value))
936ed897 6501 ANYOF_BITMAP_SET(ret, value);
73b437c8 6502 }
c49a72a9
NC
6503 yesno = '+';
6504 what = "Print";
73b437c8
JH
6505 break;
6506 case ANYOF_NPRINT:
6507 if (LOC)
936ed897 6508 ANYOF_CLASS_SET(ret, ANYOF_NPRINT);
73b437c8
JH
6509 else {
6510 for (value = 0; value < 256; value++)
6511 if (!isPRINT(value))
936ed897 6512 ANYOF_BITMAP_SET(ret, value);
73b437c8 6513 }
c49a72a9
NC
6514 yesno = '!';
6515 what = "Print";
73b437c8 6516 break;
aaa51d5e
JF
6517 case ANYOF_PSXSPC:
6518 if (LOC)
6519 ANYOF_CLASS_SET(ret, ANYOF_PSXSPC);
6520 else {
6521 for (value = 0; value < 256; value++)
6522 if (isPSXSPC(value))
6523 ANYOF_BITMAP_SET(ret, value);
6524 }
c49a72a9
NC
6525 yesno = '+';
6526 what = "Space";
aaa51d5e
JF
6527 break;
6528 case ANYOF_NPSXSPC:
6529 if (LOC)
6530 ANYOF_CLASS_SET(ret, ANYOF_NPSXSPC);
6531 else {
6532 for (value = 0; value < 256; value++)
6533 if (!isPSXSPC(value))
6534 ANYOF_BITMAP_SET(ret, value);
6535 }
c49a72a9
NC
6536 yesno = '!';
6537 what = "Space";
aaa51d5e 6538 break;
73b437c8
JH
6539 case ANYOF_PUNCT:
6540 if (LOC)
936ed897 6541 ANYOF_CLASS_SET(ret, ANYOF_PUNCT);
73b437c8
JH
6542 else {
6543 for (value = 0; value < 256; value++)
6544 if (isPUNCT(value))
936ed897 6545 ANYOF_BITMAP_SET(ret, value);
73b437c8 6546 }
c49a72a9
NC
6547 yesno = '+';
6548 what = "Punct";
73b437c8
JH
6549 break;
6550 case ANYOF_NPUNCT:
6551 if (LOC)
936ed897 6552 ANYOF_CLASS_SET(ret, ANYOF_NPUNCT);
73b437c8
JH
6553 else {
6554 for (value = 0; value < 256; value++)
6555 if (!isPUNCT(value))
936ed897 6556 ANYOF_BITMAP_SET(ret, value);
73b437c8 6557 }
c49a72a9
NC
6558 yesno = '!';
6559 what = "Punct";
ffc61ed2
JH
6560 break;
6561 case ANYOF_SPACE:
6562 if (LOC)
6563 ANYOF_CLASS_SET(ret, ANYOF_SPACE);
6564 else {
6565 for (value = 0; value < 256; value++)
6566 if (isSPACE(value))
6567 ANYOF_BITMAP_SET(ret, value);
6568 }
c49a72a9
NC
6569 yesno = '+';
6570 what = "SpacePerl";
ffc61ed2
JH
6571 break;
6572 case ANYOF_NSPACE:
6573 if (LOC)
6574 ANYOF_CLASS_SET(ret, ANYOF_NSPACE);
6575 else {
6576 for (value = 0; value < 256; value++)
6577 if (!isSPACE(value))
6578 ANYOF_BITMAP_SET(ret, value);
6579 }
c49a72a9
NC
6580 yesno = '!';
6581 what = "SpacePerl";
73b437c8
JH
6582 break;
6583 case ANYOF_UPPER:
6584 if (LOC)
936ed897 6585 ANYOF_CLASS_SET(ret, ANYOF_UPPER);
73b437c8
JH
6586 else {
6587 for (value = 0; value < 256; value++)
6588 if (isUPPER(value))
936ed897 6589 ANYOF_BITMAP_SET(ret, value);
73b437c8 6590 }
c49a72a9
NC
6591 yesno = '+';
6592 what = "Upper";
73b437c8
JH
6593 break;
6594 case ANYOF_NUPPER:
6595 if (LOC)
936ed897 6596 ANYOF_CLASS_SET(ret, ANYOF_NUPPER);
73b437c8
JH
6597 else {
6598 for (value = 0; value < 256; value++)
6599 if (!isUPPER(value))
936ed897 6600 ANYOF_BITMAP_SET(ret, value);
73b437c8 6601 }
c49a72a9
NC
6602 yesno = '!';
6603 what = "Upper";
73b437c8
JH
6604 break;
6605 case ANYOF_XDIGIT:
6606 if (LOC)
936ed897 6607 ANYOF_CLASS_SET(ret, ANYOF_XDIGIT);
73b437c8
JH
6608 else {
6609 for (value = 0; value < 256; value++)
6610 if (isXDIGIT(value))
936ed897 6611 ANYOF_BITMAP_SET(ret, value);
73b437c8 6612 }
c49a72a9
NC
6613 yesno = '+';
6614 what = "XDigit";
73b437c8
JH
6615 break;
6616 case ANYOF_NXDIGIT:
6617 if (LOC)
936ed897 6618 ANYOF_CLASS_SET(ret, ANYOF_NXDIGIT);
73b437c8
JH
6619 else {
6620 for (value = 0; value < 256; value++)
6621 if (!isXDIGIT(value))
936ed897 6622 ANYOF_BITMAP_SET(ret, value);
73b437c8 6623 }
c49a72a9
NC
6624 yesno = '!';
6625 what = "XDigit";
73b437c8 6626 break;
f81125e2
JP
6627 case ANYOF_MAX:
6628 /* this is to handle \p and \P */
6629 break;
73b437c8 6630 default:
b45f050a 6631 vFAIL("Invalid [::] class");
73b437c8 6632 break;
b8c5462f 6633 }
c49a72a9
NC
6634 if (what) {
6635 /* Strings such as "+utf8::isWord\n" */
6636 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::Is%s\n", yesno, what);
6637 }
b8c5462f 6638 if (LOC)
936ed897 6639 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
73b437c8 6640 continue;
a0d0e21e 6641 }
ffc61ed2
JH
6642 } /* end of namedclass \blah */
6643
a0d0e21e 6644 if (range) {
eb160463 6645 if (prevvalue > (IV)value) /* b-a */ {
d4c19fe8
AL
6646 const int w = RExC_parse - rangebegin;
6647 Simple_vFAIL4("Invalid [] range \"%*.*s\"", w, w, rangebegin);
3568d838 6648 range = 0; /* not a valid range */
73b437c8 6649 }
a0d0e21e
LW
6650 }
6651 else {
3568d838 6652 prevvalue = value; /* save the beginning of the range */
830247a4
IZ
6653 if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
6654 RExC_parse[1] != ']') {
6655 RExC_parse++;
ffc61ed2
JH
6656
6657 /* a bad range like \w-, [:word:]- ? */
6658 if (namedclass > OOB_NAMEDCLASS) {
afd78fd5 6659 if (ckWARN(WARN_REGEXP)) {
d4c19fe8 6660 const int w =
afd78fd5
JH
6661 RExC_parse >= rangebegin ?
6662 RExC_parse - rangebegin : 0;
830247a4 6663 vWARN4(RExC_parse,
b45f050a 6664 "False [] range \"%*.*s\"",
097eb12c 6665 w, w, rangebegin);
afd78fd5 6666 }
73b437c8 6667 if (!SIZE_ONLY)
936ed897 6668 ANYOF_BITMAP_SET(ret, '-');
73b437c8 6669 } else
ffc61ed2
JH
6670 range = 1; /* yeah, it's a range! */
6671 continue; /* but do it the next time */
a0d0e21e 6672 }
a687059c 6673 }
ffc61ed2 6674
93733859 6675 /* now is the next time */
07be1b83 6676 /*stored += (value - prevvalue + 1);*/
ae5c130c 6677 if (!SIZE_ONLY) {
3568d838 6678 if (prevvalue < 256) {
1df70142 6679 const IV ceilvalue = value < 256 ? value : 255;
3dab1dad 6680 IV i;
3568d838 6681#ifdef EBCDIC
1b2d223b
JH
6682 /* In EBCDIC [\x89-\x91] should include
6683 * the \x8e but [i-j] should not. */
6684 if (literal_endpoint == 2 &&
6685 ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
6686 (isUPPER(prevvalue) && isUPPER(ceilvalue))))
ffc61ed2 6687 {
3568d838
JH
6688 if (isLOWER(prevvalue)) {
6689 for (i = prevvalue; i <= ceilvalue; i++)
ffc61ed2
JH
6690 if (isLOWER(i))
6691 ANYOF_BITMAP_SET(ret, i);
6692 } else {
3568d838 6693 for (i = prevvalue; i <= ceilvalue; i++)
ffc61ed2
JH
6694 if (isUPPER(i))
6695 ANYOF_BITMAP_SET(ret, i);
6696 }
8ada0baa 6697 }
ffc61ed2 6698 else
8ada0baa 6699#endif
07be1b83
YO
6700 for (i = prevvalue; i <= ceilvalue; i++) {
6701 if (!ANYOF_BITMAP_TEST(ret,i)) {
6702 stored++;
6703 ANYOF_BITMAP_SET(ret, i);
6704 }
6705 }
3568d838 6706 }
a5961de5 6707 if (value > 255 || UTF) {
1df70142
AL
6708 const UV prevnatvalue = NATIVE_TO_UNI(prevvalue);
6709 const UV natvalue = NATIVE_TO_UNI(value);
07be1b83 6710 stored+=2; /* can't optimize this class */
ffc61ed2 6711 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
b08decb7 6712 if (prevnatvalue < natvalue) { /* what about > ? */
ffc61ed2 6713 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
b08decb7
JH
6714 prevnatvalue, natvalue);
6715 }
6716 else if (prevnatvalue == natvalue) {
6717 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue);
09091399 6718 if (FOLD) {
89ebb4a3 6719 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
254ba52a 6720 STRLEN foldlen;
1df70142 6721 const UV f = to_uni_fold(natvalue, foldbuf, &foldlen);
254ba52a 6722
e294cc5d
JH
6723#ifdef EBCDIC /* RD t/uni/fold ff and 6b */
6724 if (RExC_precomp[0] == ':' &&
6725 RExC_precomp[1] == '[' &&
6726 (f == 0xDF || f == 0x92)) {
6727 f = NATIVE_TO_UNI(f);
6728 }
6729#endif
c840d2a2
JH
6730 /* If folding and foldable and a single
6731 * character, insert also the folded version
6732 * to the charclass. */
9e55ce06 6733 if (f != value) {
e294cc5d
JH
6734#ifdef EBCDIC /* RD tunifold ligatures s,t fb05, fb06 */
6735 if ((RExC_precomp[0] == ':' &&
6736 RExC_precomp[1] == '[' &&
6737 (f == 0xA2 &&
6738 (value == 0xFB05 || value == 0xFB06))) ?
6739 foldlen == ((STRLEN)UNISKIP(f) - 1) :
6740 foldlen == (STRLEN)UNISKIP(f) )
6741#else
eb160463 6742 if (foldlen == (STRLEN)UNISKIP(f))
e294cc5d 6743#endif
9e55ce06
JH
6744 Perl_sv_catpvf(aTHX_ listsv,
6745 "%04"UVxf"\n", f);
6746 else {
6747 /* Any multicharacter foldings
6748 * require the following transform:
6749 * [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst)
6750 * where E folds into "pq" and F folds
6751 * into "rst", all other characters
6752 * fold to single characters. We save
6753 * away these multicharacter foldings,
6754 * to be later saved as part of the
6755 * additional "s" data. */
6756 SV *sv;
6757
6758 if (!unicode_alternate)
6759 unicode_alternate = newAV();
6760 sv = newSVpvn((char*)foldbuf, foldlen);
6761 SvUTF8_on(sv);
6762 av_push(unicode_alternate, sv);
6763 }
6764 }
254ba52a 6765
60a8b682
JH
6766 /* If folding and the value is one of the Greek
6767 * sigmas insert a few more sigmas to make the
6768 * folding rules of the sigmas to work right.
6769 * Note that not all the possible combinations
6770 * are handled here: some of them are handled
9e55ce06
JH
6771 * by the standard folding rules, and some of
6772 * them (literal or EXACTF cases) are handled
6773 * during runtime in regexec.c:S_find_byclass(). */
09091399
JH
6774 if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) {
6775 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 6776 (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA);
09091399 6777 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 6778 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
09091399
JH
6779 }
6780 else if (value == UNICODE_GREEK_CAPITAL_LETTER_SIGMA)
6781 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 6782 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
09091399
JH
6783 }
6784 }
ffc61ed2 6785 }
1b2d223b
JH
6786#ifdef EBCDIC
6787 literal_endpoint = 0;
6788#endif
8ada0baa 6789 }
ffc61ed2
JH
6790
6791 range = 0; /* this range (if it was one) is done now */
a0d0e21e 6792 }
ffc61ed2 6793
936ed897 6794 if (need_class) {
4f66b38d 6795 ANYOF_FLAGS(ret) |= ANYOF_LARGE;
936ed897 6796 if (SIZE_ONLY)
830247a4 6797 RExC_size += ANYOF_CLASS_ADD_SKIP;
936ed897 6798 else
830247a4 6799 RExC_emit += ANYOF_CLASS_ADD_SKIP;
936ed897 6800 }
ffc61ed2 6801
7f6f358c
YO
6802
6803 if (SIZE_ONLY)
6804 return ret;
6805 /****** !SIZE_ONLY AFTER HERE *********/
6806
6807 if( stored == 1 && value < 256
6808 && !( ANYOF_FLAGS(ret) & ( ANYOF_FLAGS_ALL ^ ANYOF_FOLD ) )
6809 ) {
6810 /* optimize single char class to an EXACT node
6811 but *only* when its not a UTF/high char */
07be1b83
YO
6812 const char * cur_parse= RExC_parse;
6813 RExC_emit = (regnode *)orig_emit;
6814 RExC_parse = (char *)orig_parse;
7f6f358c
YO
6815 ret = reg_node(pRExC_state,
6816 (U8)((ANYOF_FLAGS(ret) & ANYOF_FOLD) ? EXACTF : EXACT));
07be1b83 6817 RExC_parse = (char *)cur_parse;
7f6f358c
YO
6818 *STRING(ret)= (char)value;
6819 STR_LEN(ret)= 1;
6820 RExC_emit += STR_SZ(1);
6821 return ret;
6822 }
ae5c130c 6823 /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
7f6f358c 6824 if ( /* If the only flag is folding (plus possibly inversion). */
516a5887
JH
6825 ((ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD)
6826 ) {
a0ed51b3 6827 for (value = 0; value < 256; ++value) {
936ed897 6828 if (ANYOF_BITMAP_TEST(ret, value)) {
eb160463 6829 UV fold = PL_fold[value];
ffc61ed2
JH
6830
6831 if (fold != value)
6832 ANYOF_BITMAP_SET(ret, fold);
ae5c130c
GS
6833 }
6834 }
936ed897 6835 ANYOF_FLAGS(ret) &= ~ANYOF_FOLD;
ae5c130c 6836 }
ffc61ed2 6837
ae5c130c 6838 /* optimize inverted simple patterns (e.g. [^a-z]) */
7f6f358c 6839 if (optimize_invert &&
ffc61ed2
JH
6840 /* If the only flag is inversion. */
6841 (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
b8c5462f 6842 for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
936ed897 6843 ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL;
1aa99e6b 6844 ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
ae5c130c 6845 }
7f6f358c 6846 {
097eb12c 6847 AV * const av = newAV();
ffc61ed2 6848 SV *rv;
9e55ce06 6849 /* The 0th element stores the character class description
6a0407ee 6850 * in its textual form: used later (regexec.c:Perl_regclass_swash())
9e55ce06
JH
6851 * to initialize the appropriate swash (which gets stored in
6852 * the 1st element), and also useful for dumping the regnode.
6853 * The 2nd element stores the multicharacter foldings,
6a0407ee 6854 * used later (regexec.c:S_reginclass()). */
ffc61ed2
JH
6855 av_store(av, 0, listsv);
6856 av_store(av, 1, NULL);
9e55ce06 6857 av_store(av, 2, (SV*)unicode_alternate);
ffc61ed2 6858 rv = newRV_noinc((SV*)av);
19860706 6859 n = add_data(pRExC_state, 1, "s");
830247a4 6860 RExC_rx->data->data[n] = (void*)rv;
ffc61ed2 6861 ARG_SET(ret, n);
a0ed51b3 6862 }
a0ed51b3
LW
6863 return ret;
6864}
6865
76e3520e 6866STATIC char*
830247a4 6867S_nextchar(pTHX_ RExC_state_t *pRExC_state)
a0d0e21e 6868{
097eb12c 6869 char* const retval = RExC_parse++;
a0d0e21e 6870
4633a7c4 6871 for (;;) {
830247a4
IZ
6872 if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
6873 RExC_parse[2] == '#') {
e994fd66
AE
6874 while (*RExC_parse != ')') {
6875 if (RExC_parse == RExC_end)
6876 FAIL("Sequence (?#... not terminated");
830247a4 6877 RExC_parse++;
e994fd66 6878 }
830247a4 6879 RExC_parse++;
4633a7c4
LW
6880 continue;
6881 }
e2509266 6882 if (RExC_flags & PMf_EXTENDED) {
830247a4
IZ
6883 if (isSPACE(*RExC_parse)) {
6884 RExC_parse++;
748a9306
LW
6885 continue;
6886 }
830247a4 6887 else if (*RExC_parse == '#') {
e994fd66
AE
6888 while (RExC_parse < RExC_end)
6889 if (*RExC_parse++ == '\n') break;
748a9306
LW
6890 continue;
6891 }
748a9306 6892 }
4633a7c4 6893 return retval;
a0d0e21e 6894 }
a687059c
LW
6895}
6896
6897/*
c277df42 6898- reg_node - emit a node
a0d0e21e 6899*/
76e3520e 6900STATIC regnode * /* Location. */
830247a4 6901S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
a687059c 6902{
97aff369 6903 dVAR;
c277df42 6904 register regnode *ptr;
504618e9 6905 regnode * const ret = RExC_emit;
07be1b83 6906 GET_RE_DEBUG_FLAGS_DECL;
a687059c 6907
c277df42 6908 if (SIZE_ONLY) {
830247a4
IZ
6909 SIZE_ALIGN(RExC_size);
6910 RExC_size += 1;
a0d0e21e
LW
6911 return(ret);
6912 }
c277df42 6913 NODE_ALIGN_FILL(ret);
a0d0e21e 6914 ptr = ret;
c277df42 6915 FILL_ADVANCE_NODE(ptr, op);
fac92740 6916 if (RExC_offsets) { /* MJD */
07be1b83 6917 MJD_OFFSET_DEBUG(("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
fac92740
MJD
6918 "reg_node", __LINE__,
6919 reg_name[op],
07be1b83
YO
6920 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
6921 ? "Overwriting end of array!\n" : "OK",
6922 (UV)(RExC_emit - RExC_emit_start),
6923 (UV)(RExC_parse - RExC_start),
6924 (UV)RExC_offsets[0]));
ccb2c380 6925 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
fac92740 6926 }
07be1b83 6927
830247a4 6928 RExC_emit = ptr;
a687059c 6929
a0d0e21e 6930 return(ret);
a687059c
LW
6931}
6932
6933/*
a0d0e21e
LW
6934- reganode - emit a node with an argument
6935*/
76e3520e 6936STATIC regnode * /* Location. */
830247a4 6937S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
fe14fcc3 6938{
97aff369 6939 dVAR;
c277df42 6940 register regnode *ptr;
504618e9 6941 regnode * const ret = RExC_emit;
07be1b83 6942 GET_RE_DEBUG_FLAGS_DECL;
fe14fcc3 6943
c277df42 6944 if (SIZE_ONLY) {
830247a4
IZ
6945 SIZE_ALIGN(RExC_size);
6946 RExC_size += 2;
a0d0e21e
LW
6947 return(ret);
6948 }
fe14fcc3 6949
c277df42 6950 NODE_ALIGN_FILL(ret);
a0d0e21e 6951 ptr = ret;
c277df42 6952 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
fac92740 6953 if (RExC_offsets) { /* MJD */
07be1b83 6954 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
fac92740 6955 "reganode",
ccb2c380
MP
6956 __LINE__,
6957 reg_name[op],
07be1b83 6958 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ?
fac92740 6959 "Overwriting end of array!\n" : "OK",
07be1b83
YO
6960 (UV)(RExC_emit - RExC_emit_start),
6961 (UV)(RExC_parse - RExC_start),
6962 (UV)RExC_offsets[0]));
ccb2c380 6963 Set_Cur_Node_Offset;
fac92740
MJD
6964 }
6965
830247a4 6966 RExC_emit = ptr;
fe14fcc3 6967
a0d0e21e 6968 return(ret);
fe14fcc3
LW
6969}
6970
6971/*
cd439c50 6972- reguni - emit (if appropriate) a Unicode character
a0ed51b3 6973*/
71207a34
AL
6974STATIC STRLEN
6975S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
a0ed51b3 6976{
97aff369 6977 dVAR;
71207a34 6978 return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
a0ed51b3
LW
6979}
6980
6981/*
a0d0e21e
LW
6982- reginsert - insert an operator in front of already-emitted operand
6983*
6984* Means relocating the operand.
6985*/
76e3520e 6986STATIC void
830247a4 6987S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd)
a687059c 6988{
97aff369 6989 dVAR;
c277df42
IZ
6990 register regnode *src;
6991 register regnode *dst;
6992 register regnode *place;
504618e9 6993 const int offset = regarglen[(U8)op];
07be1b83 6994 GET_RE_DEBUG_FLAGS_DECL;
22c35a8c 6995/* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
c277df42
IZ
6996
6997 if (SIZE_ONLY) {
830247a4 6998 RExC_size += NODE_STEP_REGNODE + offset;
a0d0e21e
LW
6999 return;
7000 }
a687059c 7001
830247a4
IZ
7002 src = RExC_emit;
7003 RExC_emit += NODE_STEP_REGNODE + offset;
7004 dst = RExC_emit;
fac92740 7005 while (src > opnd) {
c277df42 7006 StructCopy(--src, --dst, regnode);
fac92740 7007 if (RExC_offsets) { /* MJD 20010112 */
07be1b83 7008 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
fac92740 7009 "reg_insert",
ccb2c380
MP
7010 __LINE__,
7011 reg_name[op],
07be1b83
YO
7012 (UV)(dst - RExC_emit_start) > RExC_offsets[0]
7013 ? "Overwriting end of array!\n" : "OK",
7014 (UV)(src - RExC_emit_start),
7015 (UV)(dst - RExC_emit_start),
7016 (UV)RExC_offsets[0]));
ccb2c380
MP
7017 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
7018 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
fac92740
MJD
7019 }
7020 }
7021
a0d0e21e
LW
7022
7023 place = opnd; /* Op node, where operand used to be. */
fac92740 7024 if (RExC_offsets) { /* MJD */
07be1b83 7025 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
fac92740 7026 "reginsert",
ccb2c380
MP
7027 __LINE__,
7028 reg_name[op],
07be1b83 7029 (UV)(place - RExC_emit_start) > RExC_offsets[0]
fac92740 7030 ? "Overwriting end of array!\n" : "OK",
07be1b83
YO
7031 (UV)(place - RExC_emit_start),
7032 (UV)(RExC_parse - RExC_start),
786e8c11 7033 (UV)RExC_offsets[0]));
ccb2c380 7034 Set_Node_Offset(place, RExC_parse);
45948336 7035 Set_Node_Length(place, 1);
fac92740 7036 }
c277df42
IZ
7037 src = NEXTOPER(place);
7038 FILL_ADVANCE_NODE(place, op);
7039 Zero(src, offset, regnode);
a687059c
LW
7040}
7041
7042/*
c277df42 7043- regtail - set the next-pointer at the end of a node chain of p to val.
3dab1dad 7044- SEE ALSO: regtail_study
a0d0e21e 7045*/
097eb12c 7046/* TODO: All three parms should be const */
76e3520e 7047STATIC void
3dab1dad 7048S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
a687059c 7049{
97aff369 7050 dVAR;
c277df42 7051 register regnode *scan;
72f13be8 7052 GET_RE_DEBUG_FLAGS_DECL;
f9049ba1
SP
7053#ifndef DEBUGGING
7054 PERL_UNUSED_ARG(depth);
7055#endif
a0d0e21e 7056
c277df42 7057 if (SIZE_ONLY)
a0d0e21e
LW
7058 return;
7059
7060 /* Find last node. */
7061 scan = p;
7062 for (;;) {
504618e9 7063 regnode * const temp = regnext(scan);
3dab1dad
YO
7064 DEBUG_PARSE_r({
7065 SV * const mysv=sv_newmortal();
7066 DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
7067 regprop(RExC_rx, mysv, scan);
7068 PerlIO_printf(Perl_debug_log, "~ %s (%d)\n",
7069 SvPV_nolen_const(mysv), REG_NODE_NUM(scan));
7070 });
7071 if (temp == NULL)
7072 break;
7073 scan = temp;
7074 }
7075
7076 if (reg_off_by_arg[OP(scan)]) {
7077 ARG_SET(scan, val - scan);
7078 }
7079 else {
7080 NEXT_OFF(scan) = val - scan;
7081 }
7082}
7083
07be1b83 7084#ifdef DEBUGGING
3dab1dad
YO
7085/*
7086- regtail_study - set the next-pointer at the end of a node chain of p to val.
7087- Look for optimizable sequences at the same time.
7088- currently only looks for EXACT chains.
07be1b83
YO
7089
7090This is expermental code. The idea is to use this routine to perform
7091in place optimizations on branches and groups as they are constructed,
7092with the long term intention of removing optimization from study_chunk so
7093that it is purely analytical.
7094
7095Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
7096to control which is which.
7097
3dab1dad
YO
7098*/
7099/* TODO: All four parms should be const */
07be1b83 7100
3dab1dad
YO
7101STATIC U8
7102S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
7103{
7104 dVAR;
7105 register regnode *scan;
07be1b83
YO
7106 U8 exact = PSEUDO;
7107#ifdef EXPERIMENTAL_INPLACESCAN
7108 I32 min = 0;
7109#endif
7110
3dab1dad
YO
7111 GET_RE_DEBUG_FLAGS_DECL;
7112
07be1b83 7113
3dab1dad
YO
7114 if (SIZE_ONLY)
7115 return exact;
7116
7117 /* Find last node. */
7118
7119 scan = p;
7120 for (;;) {
7121 regnode * const temp = regnext(scan);
07be1b83
YO
7122#ifdef EXPERIMENTAL_INPLACESCAN
7123 if (PL_regkind[OP(scan)] == EXACT)
7124 if (join_exact(pRExC_state,scan,&min,1,val,depth+1))
7125 return EXACT;
7126#endif
3dab1dad
YO
7127 if ( exact ) {
7128 switch (OP(scan)) {
7129 case EXACT:
7130 case EXACTF:
7131 case EXACTFL:
7132 if( exact == PSEUDO )
7133 exact= OP(scan);
07be1b83
YO
7134 else if ( exact != OP(scan) )
7135 exact= 0;
3dab1dad
YO
7136 case NOTHING:
7137 break;
7138 default:
7139 exact= 0;
7140 }
7141 }
7142 DEBUG_PARSE_r({
7143 SV * const mysv=sv_newmortal();
7144 DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
7145 regprop(RExC_rx, mysv, scan);
7146 PerlIO_printf(Perl_debug_log, "~ %s (%s) (%d)\n",
7147 SvPV_nolen_const(mysv),
7148 reg_name[exact],
7149 REG_NODE_NUM(scan));
7150 });
a0d0e21e
LW
7151 if (temp == NULL)
7152 break;
7153 scan = temp;
7154 }
07be1b83
YO
7155 DEBUG_PARSE_r({
7156 SV * const mysv_val=sv_newmortal();
7157 DEBUG_PARSE_MSG("");
7158 regprop(RExC_rx, mysv_val, val);
7159 PerlIO_printf(Perl_debug_log, "~ attach to %s (%d) offset to %d\n",
7160 SvPV_nolen_const(mysv_val),
7161 REG_NODE_NUM(val),
7162 val - scan
7163 );
7164 });
c277df42
IZ
7165 if (reg_off_by_arg[OP(scan)]) {
7166 ARG_SET(scan, val - scan);
a0ed51b3
LW
7167 }
7168 else {
c277df42
IZ
7169 NEXT_OFF(scan) = val - scan;
7170 }
3dab1dad
YO
7171
7172 return exact;
a687059c 7173}
07be1b83 7174#endif
a687059c
LW
7175
7176/*
a687059c
LW
7177 - regcurly - a little FSA that accepts {\d+,?\d*}
7178 */
79072805 7179STATIC I32
5f66b61c 7180S_regcurly(register const char *s)
a687059c
LW
7181{
7182 if (*s++ != '{')
7183 return FALSE;
f0fcb552 7184 if (!isDIGIT(*s))
a687059c 7185 return FALSE;
f0fcb552 7186 while (isDIGIT(*s))
a687059c
LW
7187 s++;
7188 if (*s == ',')
7189 s++;
f0fcb552 7190 while (isDIGIT(*s))
a687059c
LW
7191 s++;
7192 if (*s != '}')
7193 return FALSE;
7194 return TRUE;
7195}
7196
a687059c
LW
7197
7198/*
fd181c75 7199 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
a687059c
LW
7200 */
7201void
097eb12c 7202Perl_regdump(pTHX_ const regexp *r)
a687059c 7203{
35ff7856 7204#ifdef DEBUGGING
97aff369 7205 dVAR;
c445ea15 7206 SV * const sv = sv_newmortal();
ab3bbdeb 7207 SV *dsv= sv_newmortal();
a687059c 7208
786e8c11 7209 (void)dumpuntil(r, r->program, r->program + 1, NULL, NULL, sv, 0, 0);
a0d0e21e
LW
7210
7211 /* Header fields of interest. */
ab3bbdeb
YO
7212 if (r->anchored_substr) {
7213 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
7214 RE_SV_DUMPLEN(r->anchored_substr), 30);
7b0972df 7215 PerlIO_printf(Perl_debug_log,
ab3bbdeb
YO
7216 "anchored %s%s at %"IVdf" ",
7217 s, RE_SV_TAIL(r->anchored_substr),
7b0972df 7218 (IV)r->anchored_offset);
ab3bbdeb
YO
7219 } else if (r->anchored_utf8) {
7220 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
7221 RE_SV_DUMPLEN(r->anchored_utf8), 30);
33b8afdf 7222 PerlIO_printf(Perl_debug_log,
ab3bbdeb
YO
7223 "anchored utf8 %s%s at %"IVdf" ",
7224 s, RE_SV_TAIL(r->anchored_utf8),
33b8afdf 7225 (IV)r->anchored_offset);
ab3bbdeb
YO
7226 }
7227 if (r->float_substr) {
7228 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
7229 RE_SV_DUMPLEN(r->float_substr), 30);
7b0972df 7230 PerlIO_printf(Perl_debug_log,
ab3bbdeb
YO
7231 "floating %s%s at %"IVdf"..%"UVuf" ",
7232 s, RE_SV_TAIL(r->float_substr),
7b0972df 7233 (IV)r->float_min_offset, (UV)r->float_max_offset);
ab3bbdeb
YO
7234 } else if (r->float_utf8) {
7235 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
7236 RE_SV_DUMPLEN(r->float_utf8), 30);
33b8afdf 7237 PerlIO_printf(Perl_debug_log,
ab3bbdeb
YO
7238 "floating utf8 %s%s at %"IVdf"..%"UVuf" ",
7239 s, RE_SV_TAIL(r->float_utf8),
33b8afdf 7240 (IV)r->float_min_offset, (UV)r->float_max_offset);
ab3bbdeb 7241 }
33b8afdf 7242 if (r->check_substr || r->check_utf8)
b81d288d 7243 PerlIO_printf(Perl_debug_log,
10edeb5d
JH
7244 (const char *)
7245 (r->check_substr == r->float_substr
7246 && r->check_utf8 == r->float_utf8
7247 ? "(checking floating" : "(checking anchored"));
c277df42
IZ
7248 if (r->reganch & ROPT_NOSCAN)
7249 PerlIO_printf(Perl_debug_log, " noscan");
7250 if (r->reganch & ROPT_CHECK_ALL)
7251 PerlIO_printf(Perl_debug_log, " isall");
33b8afdf 7252 if (r->check_substr || r->check_utf8)
c277df42
IZ
7253 PerlIO_printf(Perl_debug_log, ") ");
7254
46fc3d4c 7255 if (r->regstclass) {
32fc9b6a 7256 regprop(r, sv, r->regstclass);
1de06328 7257 PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
46fc3d4c 7258 }
774d564b 7259 if (r->reganch & ROPT_ANCH) {
7260 PerlIO_printf(Perl_debug_log, "anchored");
7261 if (r->reganch & ROPT_ANCH_BOL)
7262 PerlIO_printf(Perl_debug_log, "(BOL)");
c277df42
IZ
7263 if (r->reganch & ROPT_ANCH_MBOL)
7264 PerlIO_printf(Perl_debug_log, "(MBOL)");
cad2e5aa
JH
7265 if (r->reganch & ROPT_ANCH_SBOL)
7266 PerlIO_printf(Perl_debug_log, "(SBOL)");
774d564b 7267 if (r->reganch & ROPT_ANCH_GPOS)
7268 PerlIO_printf(Perl_debug_log, "(GPOS)");
7269 PerlIO_putc(Perl_debug_log, ' ');
7270 }
c277df42
IZ
7271 if (r->reganch & ROPT_GPOS_SEEN)
7272 PerlIO_printf(Perl_debug_log, "GPOS ");
a0d0e21e 7273 if (r->reganch & ROPT_SKIP)
760ac839 7274 PerlIO_printf(Perl_debug_log, "plus ");
a0d0e21e 7275 if (r->reganch & ROPT_IMPLICIT)
760ac839 7276 PerlIO_printf(Perl_debug_log, "implicit ");
760ac839 7277 PerlIO_printf(Perl_debug_log, "minlen %ld ", (long) r->minlen);
ce862d02
IZ
7278 if (r->reganch & ROPT_EVAL_SEEN)
7279 PerlIO_printf(Perl_debug_log, "with eval ");
760ac839 7280 PerlIO_printf(Perl_debug_log, "\n");
65e66c80 7281#else
96a5add6 7282 PERL_UNUSED_CONTEXT;
65e66c80 7283 PERL_UNUSED_ARG(r);
17c3b450 7284#endif /* DEBUGGING */
a687059c
LW
7285}
7286
7287/*
a0d0e21e
LW
7288- regprop - printable representation of opcode
7289*/
46fc3d4c 7290void
32fc9b6a 7291Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
a687059c 7292{
35ff7856 7293#ifdef DEBUGGING
97aff369 7294 dVAR;
9b155405 7295 register int k;
1de06328 7296 GET_RE_DEBUG_FLAGS_DECL;
a0d0e21e 7297
54dc92de 7298 sv_setpvn(sv, "", 0);
03363afd 7299 if (OP(o) > REGNODE_MAX) /* regnode.type is unsigned */
830247a4
IZ
7300 /* It would be nice to FAIL() here, but this may be called from
7301 regexec.c, and it would be hard to supply pRExC_state. */
a5ca303d 7302 Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(o), (int)REGNODE_MAX);
bfed75c6 7303 sv_catpv(sv, reg_name[OP(o)]); /* Take off const! */
9b155405 7304
3dab1dad 7305 k = PL_regkind[OP(o)];
9b155405 7306
2a782b5b 7307 if (k == EXACT) {
396482e1 7308 SV * const dsv = sv_2mortal(newSVpvs(""));
ab3bbdeb
YO
7309 /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
7310 * is a crude hack but it may be the best for now since
7311 * we have no flag "this EXACTish node was UTF-8"
7312 * --jhi */
7313 const char * const s =
ddc5bc0f 7314 pv_pretty(dsv, STRING(o), STR_LEN(o), 60,
ab3bbdeb
YO
7315 PL_colors[0], PL_colors[1],
7316 PERL_PV_ESCAPE_UNI_DETECT |
7317 PERL_PV_PRETTY_ELIPSES |
7318 PERL_PV_PRETTY_LTGT
7319 );
7320 Perl_sv_catpvf(aTHX_ sv, " %s", s );
bb263b4e 7321 } else if (k == TRIE) {
3dab1dad 7322 /* print the details of the trie in dumpuntil instead, as
4f639d21 7323 * prog->data isn't available here */
1de06328
YO
7324 const char op = OP(o);
7325 const I32 n = ARG(o);
7326 const reg_ac_data * const ac = IS_TRIE_AC(op) ?
7327 (reg_ac_data *)prog->data->data[n] :
7328 NULL;
7329 const reg_trie_data * const trie = !IS_TRIE_AC(op) ?
7330 (reg_trie_data*)prog->data->data[n] :
7331 ac->trie;
7332
7333 Perl_sv_catpvf(aTHX_ sv, "-%s",reg_name[o->flags]);
7334 DEBUG_TRIE_COMPILE_r(
7335 Perl_sv_catpvf(aTHX_ sv,
7336 "<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
7337 (UV)trie->startstate,
7338 (IV)trie->laststate-1,
7339 (UV)trie->wordcount,
7340 (UV)trie->minlen,
7341 (UV)trie->maxlen,
7342 (UV)TRIE_CHARCOUNT(trie),
7343 (UV)trie->uniquecharcount
7344 )
7345 );
7346 if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
7347 int i;
7348 int rangestart = -1;
f46cb337 7349 U8* bitmap = IS_ANYOF_TRIE(op) ? (U8*)ANYOF_BITMAP(o) : (U8*)TRIE_BITMAP(trie);
1de06328
YO
7350 Perl_sv_catpvf(aTHX_ sv, "[");
7351 for (i = 0; i <= 256; i++) {
7352 if (i < 256 && BITMAP_TEST(bitmap,i)) {
7353 if (rangestart == -1)
7354 rangestart = i;
7355 } else if (rangestart != -1) {
7356 if (i <= rangestart + 3)
7357 for (; rangestart < i; rangestart++)
7358 put_byte(sv, rangestart);
7359 else {
7360 put_byte(sv, rangestart);
7361 sv_catpvs(sv, "-");
7362 put_byte(sv, i - 1);
7363 }
7364 rangestart = -1;
7365 }
7366 }
7367 Perl_sv_catpvf(aTHX_ sv, "]");
7368 }
7369
a3621e74 7370 } else if (k == CURLY) {
cb434fcc 7371 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
cea2e8a9
GS
7372 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
7373 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
a0d0e21e 7374 }
2c2d71f5
JH
7375 else if (k == WHILEM && o->flags) /* Ordinal/of */
7376 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
9b155405 7377 else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP )
894356b3 7378 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
9b155405 7379 else if (k == LOGICAL)
04ebc1ab 7380 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
653099ff
GS
7381 else if (k == ANYOF) {
7382 int i, rangestart = -1;
2d03de9c 7383 const U8 flags = ANYOF_FLAGS(o);
0bd48802
AL
7384
7385 /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
7386 static const char * const anyofs[] = {
653099ff
GS
7387 "\\w",
7388 "\\W",
7389 "\\s",
7390 "\\S",
7391 "\\d",
7392 "\\D",
7393 "[:alnum:]",
7394 "[:^alnum:]",
7395 "[:alpha:]",
7396 "[:^alpha:]",
7397 "[:ascii:]",
7398 "[:^ascii:]",
7399 "[:ctrl:]",
7400 "[:^ctrl:]",
7401 "[:graph:]",
7402 "[:^graph:]",
7403 "[:lower:]",
7404 "[:^lower:]",
7405 "[:print:]",
7406 "[:^print:]",
7407 "[:punct:]",
7408 "[:^punct:]",
7409 "[:upper:]",
aaa51d5e 7410 "[:^upper:]",
653099ff 7411 "[:xdigit:]",
aaa51d5e
JF
7412 "[:^xdigit:]",
7413 "[:space:]",
7414 "[:^space:]",
7415 "[:blank:]",
7416 "[:^blank:]"
653099ff
GS
7417 };
7418
19860706 7419 if (flags & ANYOF_LOCALE)
396482e1 7420 sv_catpvs(sv, "{loc}");
19860706 7421 if (flags & ANYOF_FOLD)
396482e1 7422 sv_catpvs(sv, "{i}");
653099ff 7423 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
19860706 7424 if (flags & ANYOF_INVERT)
396482e1 7425 sv_catpvs(sv, "^");
ffc61ed2
JH
7426 for (i = 0; i <= 256; i++) {
7427 if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
7428 if (rangestart == -1)
7429 rangestart = i;
7430 } else if (rangestart != -1) {
7431 if (i <= rangestart + 3)
7432 for (; rangestart < i; rangestart++)
653099ff 7433 put_byte(sv, rangestart);
ffc61ed2
JH
7434 else {
7435 put_byte(sv, rangestart);
396482e1 7436 sv_catpvs(sv, "-");
ffc61ed2 7437 put_byte(sv, i - 1);
653099ff 7438 }
ffc61ed2 7439 rangestart = -1;
653099ff 7440 }
847a199f 7441 }
ffc61ed2
JH
7442
7443 if (o->flags & ANYOF_CLASS)
bb7a0f54 7444 for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++)
ffc61ed2
JH
7445 if (ANYOF_CLASS_TEST(o,i))
7446 sv_catpv(sv, anyofs[i]);
7447
7448 if (flags & ANYOF_UNICODE)
396482e1 7449 sv_catpvs(sv, "{unicode}");
1aa99e6b 7450 else if (flags & ANYOF_UNICODE_ALL)
396482e1 7451 sv_catpvs(sv, "{unicode_all}");
ffc61ed2
JH
7452
7453 {
7454 SV *lv;
32fc9b6a 7455 SV * const sw = regclass_swash(prog, o, FALSE, &lv, 0);
b81d288d 7456
ffc61ed2
JH
7457 if (lv) {
7458 if (sw) {
89ebb4a3 7459 U8 s[UTF8_MAXBYTES_CASE+1];
b81d288d 7460
ffc61ed2 7461 for (i = 0; i <= 256; i++) { /* just the first 256 */
1df70142 7462 uvchr_to_utf8(s, i);
ffc61ed2 7463
3568d838 7464 if (i < 256 && swash_fetch(sw, s, TRUE)) {
ffc61ed2
JH
7465 if (rangestart == -1)
7466 rangestart = i;
7467 } else if (rangestart != -1) {
ffc61ed2
JH
7468 if (i <= rangestart + 3)
7469 for (; rangestart < i; rangestart++) {
2d03de9c
AL
7470 const U8 * const e = uvchr_to_utf8(s,rangestart);
7471 U8 *p;
7472 for(p = s; p < e; p++)
ffc61ed2
JH
7473 put_byte(sv, *p);
7474 }
7475 else {
2d03de9c
AL
7476 const U8 *e = uvchr_to_utf8(s,rangestart);
7477 U8 *p;
7478 for (p = s; p < e; p++)
ffc61ed2 7479 put_byte(sv, *p);
396482e1 7480 sv_catpvs(sv, "-");
2d03de9c
AL
7481 e = uvchr_to_utf8(s, i-1);
7482 for (p = s; p < e; p++)
1df70142 7483 put_byte(sv, *p);
ffc61ed2
JH
7484 }
7485 rangestart = -1;
7486 }
19860706 7487 }
ffc61ed2 7488
396482e1 7489 sv_catpvs(sv, "..."); /* et cetera */
19860706 7490 }
fde631ed 7491
ffc61ed2 7492 {
2e0de35c 7493 char *s = savesvpv(lv);
c445ea15 7494 char * const origs = s;
b81d288d 7495
3dab1dad
YO
7496 while (*s && *s != '\n')
7497 s++;
b81d288d 7498
ffc61ed2 7499 if (*s == '\n') {
2d03de9c 7500 const char * const t = ++s;
ffc61ed2
JH
7501
7502 while (*s) {
7503 if (*s == '\n')
7504 *s = ' ';
7505 s++;
7506 }
7507 if (s[-1] == ' ')
7508 s[-1] = 0;
7509
7510 sv_catpv(sv, t);
fde631ed 7511 }
b81d288d 7512
ffc61ed2 7513 Safefree(origs);
fde631ed
JH
7514 }
7515 }
653099ff 7516 }
ffc61ed2 7517
653099ff
GS
7518 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
7519 }
9b155405 7520 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
07be1b83 7521 Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
65e66c80 7522#else
96a5add6 7523 PERL_UNUSED_CONTEXT;
65e66c80
SP
7524 PERL_UNUSED_ARG(sv);
7525 PERL_UNUSED_ARG(o);
f9049ba1 7526 PERL_UNUSED_ARG(prog);
17c3b450 7527#endif /* DEBUGGING */
35ff7856 7528}
a687059c 7529
cad2e5aa
JH
7530SV *
7531Perl_re_intuit_string(pTHX_ regexp *prog)
7532{ /* Assume that RE_INTUIT is set */
97aff369 7533 dVAR;
a3621e74 7534 GET_RE_DEBUG_FLAGS_DECL;
96a5add6
AL
7535 PERL_UNUSED_CONTEXT;
7536
a3621e74 7537 DEBUG_COMPILE_r(
cfd0369c 7538 {
2d03de9c 7539 const char * const s = SvPV_nolen_const(prog->check_substr
cfd0369c 7540 ? prog->check_substr : prog->check_utf8);
cad2e5aa
JH
7541
7542 if (!PL_colorset) reginitcolors();
7543 PerlIO_printf(Perl_debug_log,
a0288114 7544 "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
33b8afdf
JH
7545 PL_colors[4],
7546 prog->check_substr ? "" : "utf8 ",
7547 PL_colors[5],PL_colors[0],
cad2e5aa
JH
7548 s,
7549 PL_colors[1],
7550 (strlen(s) > 60 ? "..." : ""));
7551 } );
7552
33b8afdf 7553 return prog->check_substr ? prog->check_substr : prog->check_utf8;
cad2e5aa
JH
7554}
7555
84da74a7
YO
7556/*
7557 pregfree - free a regexp
7558
7559 See regdupe below if you change anything here.
7560*/
7561
2b69d0c2 7562void
864dbfa3 7563Perl_pregfree(pTHX_ struct regexp *r)
a687059c 7564{
27da23d5 7565 dVAR;
0df25f3d 7566
fc32ee4a 7567 GET_RE_DEBUG_FLAGS_DECL;
a3621e74 7568
7821416a
IZ
7569 if (!r || (--r->refcnt > 0))
7570 return;
ab3bbdeb 7571 DEBUG_COMPILE_r({
0df25f3d
YO
7572 if (!PL_colorset)
7573 reginitcolors();
ab3bbdeb
YO
7574 if (RX_DEBUG(r)){
7575 SV *dsv= sv_newmortal();
7576 RE_PV_QUOTED_DECL(s, (r->reganch & ROPT_UTF8),
7577 dsv, r->precomp, r->prelen, 60);
7578 PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
7579 PL_colors[4],PL_colors[5],s);
7580 }
9e55ce06 7581 });
cad2e5aa 7582
43c5f42d
NC
7583 /* gcov results gave these as non-null 100% of the time, so there's no
7584 optimisation in checking them before calling Safefree */
7585 Safefree(r->precomp);
7586 Safefree(r->offsets); /* 20010421 MJD */
ed252734 7587 RX_MATCH_COPY_FREE(r);
f8c7b90f 7588#ifdef PERL_OLD_COPY_ON_WRITE
ed252734
NC
7589 if (r->saved_copy)
7590 SvREFCNT_dec(r->saved_copy);
7591#endif
a193d654
GS
7592 if (r->substrs) {
7593 if (r->anchored_substr)
7594 SvREFCNT_dec(r->anchored_substr);
33b8afdf
JH
7595 if (r->anchored_utf8)
7596 SvREFCNT_dec(r->anchored_utf8);
a193d654
GS
7597 if (r->float_substr)
7598 SvREFCNT_dec(r->float_substr);
33b8afdf
JH
7599 if (r->float_utf8)
7600 SvREFCNT_dec(r->float_utf8);
2779dcf1 7601 Safefree(r->substrs);
a193d654 7602 }
c277df42
IZ
7603 if (r->data) {
7604 int n = r->data->count;
f3548bdc
DM
7605 PAD* new_comppad = NULL;
7606 PAD* old_comppad;
4026c95a 7607 PADOFFSET refcnt;
dfad63ad 7608
c277df42 7609 while (--n >= 0) {
261faec3 7610 /* If you add a ->what type here, update the comment in regcomp.h */
c277df42
IZ
7611 switch (r->data->what[n]) {
7612 case 's':
7613 SvREFCNT_dec((SV*)r->data->data[n]);
7614 break;
653099ff
GS
7615 case 'f':
7616 Safefree(r->data->data[n]);
7617 break;
dfad63ad
HS
7618 case 'p':
7619 new_comppad = (AV*)r->data->data[n];
7620 break;
c277df42 7621 case 'o':
dfad63ad 7622 if (new_comppad == NULL)
cea2e8a9 7623 Perl_croak(aTHX_ "panic: pregfree comppad");
f3548bdc
DM
7624 PAD_SAVE_LOCAL(old_comppad,
7625 /* Watch out for global destruction's random ordering. */
c445ea15 7626 (SvTYPE(new_comppad) == SVt_PVAV) ? new_comppad : NULL
f3548bdc 7627 );
b34c0dd4 7628 OP_REFCNT_LOCK;
4026c95a
SH
7629 refcnt = OpREFCNT_dec((OP_4tree*)r->data->data[n]);
7630 OP_REFCNT_UNLOCK;
7631 if (!refcnt)
9b978d73 7632 op_free((OP_4tree*)r->data->data[n]);
9b978d73 7633
f3548bdc 7634 PAD_RESTORE_LOCAL(old_comppad);
dfad63ad
HS
7635 SvREFCNT_dec((SV*)new_comppad);
7636 new_comppad = NULL;
c277df42
IZ
7637 break;
7638 case 'n':
9e55ce06 7639 break;
07be1b83 7640 case 'T':
be8e71aa
YO
7641 { /* Aho Corasick add-on structure for a trie node.
7642 Used in stclass optimization only */
07be1b83
YO
7643 U32 refcount;
7644 reg_ac_data *aho=(reg_ac_data*)r->data->data[n];
7645 OP_REFCNT_LOCK;
7646 refcount = --aho->refcount;
7647 OP_REFCNT_UNLOCK;
7648 if ( !refcount ) {
7649 Safefree(aho->states);
7650 Safefree(aho->fail);
7651 aho->trie=NULL; /* not necessary to free this as it is
7652 handled by the 't' case */
7653 Safefree(r->data->data[n]); /* do this last!!!! */
be8e71aa 7654 Safefree(r->regstclass);
07be1b83
YO
7655 }
7656 }
7657 break;
a3621e74 7658 case 't':
07be1b83 7659 {
be8e71aa 7660 /* trie structure. */
07be1b83
YO
7661 U32 refcount;
7662 reg_trie_data *trie=(reg_trie_data*)r->data->data[n];
7663 OP_REFCNT_LOCK;
7664 refcount = --trie->refcount;
7665 OP_REFCNT_UNLOCK;
7666 if ( !refcount ) {
7667 Safefree(trie->charmap);
7668 if (trie->widecharmap)
7669 SvREFCNT_dec((SV*)trie->widecharmap);
7670 Safefree(trie->states);
7671 Safefree(trie->trans);
7672 if (trie->bitmap)
7673 Safefree(trie->bitmap);
7674 if (trie->wordlen)
7675 Safefree(trie->wordlen);
786e8c11
YO
7676 if (trie->jump)
7677 Safefree(trie->jump);
7678 if (trie->nextword)
7679 Safefree(trie->nextword);
a3621e74 7680#ifdef DEBUGGING
be8e71aa
YO
7681 if (RX_DEBUG(r)) {
7682 if (trie->words)
7683 SvREFCNT_dec((SV*)trie->words);
7684 if (trie->revcharmap)
7685 SvREFCNT_dec((SV*)trie->revcharmap);
7686 }
a3621e74 7687#endif
07be1b83 7688 Safefree(r->data->data[n]); /* do this last!!!! */
a3621e74 7689 }
07be1b83
YO
7690 }
7691 break;
c277df42 7692 default:
830247a4 7693 Perl_croak(aTHX_ "panic: regfree data code '%c'", r->data->what[n]);
c277df42
IZ
7694 }
7695 }
7696 Safefree(r->data->what);
7697 Safefree(r->data);
a0d0e21e
LW
7698 }
7699 Safefree(r->startp);
7700 Safefree(r->endp);
7701 Safefree(r);
a687059c 7702}
c277df42 7703
84da74a7
YO
7704#define sv_dup_inc(s,t) SvREFCNT_inc(sv_dup(s,t))
7705#define av_dup_inc(s,t) (AV*)SvREFCNT_inc(sv_dup((SV*)s,t))
7706#define SAVEPVN(p,n) ((p) ? savepvn(p,n) : NULL)
7707
7708/*
7709 regdupe - duplicate a regexp.
7710
7711 This routine is called by sv.c's re_dup and is expected to clone a
7712 given regexp structure. It is a no-op when not under USE_ITHREADS.
7713 (Originally this *was* re_dup() for change history see sv.c)
7714
7715 See pregfree() above if you change anything here.
7716*/
a3c0e9ca 7717#if defined(USE_ITHREADS)
84da74a7
YO
7718regexp *
7719Perl_regdupe(pTHX_ const regexp *r, CLONE_PARAMS *param)
7720{
84da74a7
YO
7721 dVAR;
7722 REGEXP *ret;
7723 int i, len, npar;
7724 struct reg_substr_datum *s;
7725
7726 if (!r)
7727 return (REGEXP *)NULL;
7728
7729 if ((ret = (REGEXP *)ptr_table_fetch(PL_ptr_table, r)))
7730 return ret;
7731
7732 len = r->offsets[0];
7733 npar = r->nparens+1;
7734
7735 Newxc(ret, sizeof(regexp) + (len+1)*sizeof(regnode), char, regexp);
7736 Copy(r->program, ret->program, len+1, regnode);
7737
7738 Newx(ret->startp, npar, I32);
7739 Copy(r->startp, ret->startp, npar, I32);
7740 Newx(ret->endp, npar, I32);
7741 Copy(r->startp, ret->startp, npar, I32);
7742
7743 Newx(ret->substrs, 1, struct reg_substr_data);
7744 for (s = ret->substrs->data, i = 0; i < 3; i++, s++) {
7745 s->min_offset = r->substrs->data[i].min_offset;
7746 s->max_offset = r->substrs->data[i].max_offset;
7747 s->end_shift = r->substrs->data[i].end_shift;
7748 s->substr = sv_dup_inc(r->substrs->data[i].substr, param);
7749 s->utf8_substr = sv_dup_inc(r->substrs->data[i].utf8_substr, param);
7750 }
7751
7752 ret->regstclass = NULL;
7753 if (r->data) {
7754 struct reg_data *d;
7755 const int count = r->data->count;
7756 int i;
7757
7758 Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
7759 char, struct reg_data);
7760 Newx(d->what, count, U8);
7761
7762 d->count = count;
7763 for (i = 0; i < count; i++) {
7764 d->what[i] = r->data->what[i];
7765 switch (d->what[i]) {
7766 /* legal options are one of: sfpont
7767 see also regcomp.h and pregfree() */
7768 case 's':
7769 d->data[i] = sv_dup_inc((SV *)r->data->data[i], param);
7770 break;
7771 case 'p':
7772 d->data[i] = av_dup_inc((AV *)r->data->data[i], param);
7773 break;
7774 case 'f':
7775 /* This is cheating. */
7776 Newx(d->data[i], 1, struct regnode_charclass_class);
7777 StructCopy(r->data->data[i], d->data[i],
7778 struct regnode_charclass_class);
7779 ret->regstclass = (regnode*)d->data[i];
7780 break;
7781 case 'o':
7782 /* Compiled op trees are readonly, and can thus be
7783 shared without duplication. */
7784 OP_REFCNT_LOCK;
7785 d->data[i] = (void*)OpREFCNT_inc((OP*)r->data->data[i]);
7786 OP_REFCNT_UNLOCK;
7787 break;
7788 case 'n':
7789 d->data[i] = r->data->data[i];
7790 break;
7791 case 't':
7792 d->data[i] = r->data->data[i];
7793 OP_REFCNT_LOCK;
7794 ((reg_trie_data*)d->data[i])->refcount++;
7795 OP_REFCNT_UNLOCK;
7796 break;
7797 case 'T':
7798 d->data[i] = r->data->data[i];
7799 OP_REFCNT_LOCK;
7800 ((reg_ac_data*)d->data[i])->refcount++;
7801 OP_REFCNT_UNLOCK;
7802 /* Trie stclasses are readonly and can thus be shared
7803 * without duplication. We free the stclass in pregfree
7804 * when the corresponding reg_ac_data struct is freed.
7805 */
7806 ret->regstclass= r->regstclass;
7807 break;
7808 default:
7809 Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'", r->data->what[i]);
7810 }
7811 }
7812
7813 ret->data = d;
7814 }
7815 else
7816 ret->data = NULL;
7817
7818 Newx(ret->offsets, 2*len+1, U32);
7819 Copy(r->offsets, ret->offsets, 2*len+1, U32);
7820
7821 ret->precomp = SAVEPVN(r->precomp, r->prelen);
7822 ret->refcnt = r->refcnt;
7823 ret->minlen = r->minlen;
7824 ret->prelen = r->prelen;
7825 ret->nparens = r->nparens;
7826 ret->lastparen = r->lastparen;
7827 ret->lastcloseparen = r->lastcloseparen;
7828 ret->reganch = r->reganch;
7829
7830 ret->sublen = r->sublen;
7831
f9f4320a
YO
7832 ret->engine = r->engine;
7833
84da74a7
YO
7834 if (RX_MATCH_COPIED(ret))
7835 ret->subbeg = SAVEPVN(r->subbeg, r->sublen);
7836 else
7837 ret->subbeg = NULL;
7838#ifdef PERL_OLD_COPY_ON_WRITE
7839 ret->saved_copy = NULL;
7840#endif
7841
7842 ptr_table_store(PL_ptr_table, r, ret);
7843 return ret;
84da74a7 7844}
a3c0e9ca 7845#endif
84da74a7 7846
76234dfb 7847#ifndef PERL_IN_XSUB_RE
c277df42
IZ
7848/*
7849 - regnext - dig the "next" pointer out of a node
c277df42
IZ
7850 */
7851regnode *
864dbfa3 7852Perl_regnext(pTHX_ register regnode *p)
c277df42 7853{
97aff369 7854 dVAR;
c277df42
IZ
7855 register I32 offset;
7856
3280af22 7857 if (p == &PL_regdummy)
c277df42
IZ
7858 return(NULL);
7859
7860 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
7861 if (offset == 0)
7862 return(NULL);
7863
c277df42 7864 return(p+offset);
c277df42 7865}
76234dfb 7866#endif
c277df42 7867
01f988be 7868STATIC void
cea2e8a9 7869S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
c277df42
IZ
7870{
7871 va_list args;
7872 STRLEN l1 = strlen(pat1);
7873 STRLEN l2 = strlen(pat2);
7874 char buf[512];
06bf62c7 7875 SV *msv;
73d840c0 7876 const char *message;
c277df42
IZ
7877
7878 if (l1 > 510)
7879 l1 = 510;
7880 if (l1 + l2 > 510)
7881 l2 = 510 - l1;
7882 Copy(pat1, buf, l1 , char);
7883 Copy(pat2, buf + l1, l2 , char);
3b818b81
GS
7884 buf[l1 + l2] = '\n';
7885 buf[l1 + l2 + 1] = '\0';
8736538c
AS
7886#ifdef I_STDARG
7887 /* ANSI variant takes additional second argument */
c277df42 7888 va_start(args, pat2);
8736538c
AS
7889#else
7890 va_start(args);
7891#endif
5a844595 7892 msv = vmess(buf, &args);
c277df42 7893 va_end(args);
cfd0369c 7894 message = SvPV_const(msv,l1);
c277df42
IZ
7895 if (l1 > 512)
7896 l1 = 512;
7897 Copy(message, buf, l1 , char);
197cf9b9 7898 buf[l1-1] = '\0'; /* Overwrite \n */
cea2e8a9 7899 Perl_croak(aTHX_ "%s", buf);
c277df42 7900}
a0ed51b3
LW
7901
7902/* XXX Here's a total kludge. But we need to re-enter for swash routines. */
7903
76234dfb 7904#ifndef PERL_IN_XSUB_RE
a0ed51b3 7905void
864dbfa3 7906Perl_save_re_context(pTHX)
b81d288d 7907{
97aff369 7908 dVAR;
1ade1aa1
NC
7909
7910 struct re_save_state *state;
7911
7912 SAVEVPTR(PL_curcop);
7913 SSGROW(SAVESTACK_ALLOC_FOR_RE_SAVE_STATE + 1);
7914
7915 state = (struct re_save_state *)(PL_savestack + PL_savestack_ix);
7916 PL_savestack_ix += SAVESTACK_ALLOC_FOR_RE_SAVE_STATE;
7917 SSPUSHINT(SAVEt_RE_STATE);
7918
46ab3289 7919 Copy(&PL_reg_state, state, 1, struct re_save_state);
1ade1aa1 7920
a0ed51b3 7921 PL_reg_start_tmp = 0;
a0ed51b3 7922 PL_reg_start_tmpl = 0;
c445ea15 7923 PL_reg_oldsaved = NULL;
a5db57d6 7924 PL_reg_oldsavedlen = 0;
a5db57d6 7925 PL_reg_maxiter = 0;
a5db57d6 7926 PL_reg_leftiter = 0;
c445ea15 7927 PL_reg_poscache = NULL;
a5db57d6 7928 PL_reg_poscache_size = 0;
1ade1aa1
NC
7929#ifdef PERL_OLD_COPY_ON_WRITE
7930 PL_nrs = NULL;
7931#endif
ada6e8a9 7932
c445ea15
AL
7933 /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
7934 if (PL_curpm) {
7935 const REGEXP * const rx = PM_GETRE(PL_curpm);
7936 if (rx) {
1df70142 7937 U32 i;
ada6e8a9 7938 for (i = 1; i <= rx->nparens; i++) {
1df70142 7939 char digits[TYPE_CHARS(long)];
d9fad198 7940 const STRLEN len = my_snprintf(digits, sizeof(digits), "%lu", (long)i);
49f27e4b
NC
7941 GV *const *const gvp
7942 = (GV**)hv_fetch(PL_defstash, digits, len, 0);
7943
b37c2d43
AL
7944 if (gvp) {
7945 GV * const gv = *gvp;
7946 if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
7947 save_scalar(gv);
49f27e4b 7948 }
ada6e8a9
AMS
7949 }
7950 }
7951 }
a0ed51b3 7952}
76234dfb 7953#endif
51371543 7954
51371543 7955static void
acfe0abc 7956clear_re(pTHX_ void *r)
51371543 7957{
97aff369 7958 dVAR;
51371543
GS
7959 ReREFCNT_dec((regexp *)r);
7960}
ffbc6a93 7961
a28509cc
AL
7962#ifdef DEBUGGING
7963
7964STATIC void
7965S_put_byte(pTHX_ SV *sv, int c)
7966{
7967 if (isCNTRL(c) || c == 255 || !isPRINT(c))
7968 Perl_sv_catpvf(aTHX_ sv, "\\%o", c);
7969 else if (c == '-' || c == ']' || c == '\\' || c == '^')
7970 Perl_sv_catpvf(aTHX_ sv, "\\%c", c);
7971 else
7972 Perl_sv_catpvf(aTHX_ sv, "%c", c);
7973}
7974
786e8c11 7975
3dab1dad
YO
7976#define CLEAR_OPTSTART \
7977 if (optstart) STMT_START { \
07be1b83 7978 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, " (%d nodes)\n", node - optstart)); \
3dab1dad
YO
7979 optstart=NULL; \
7980 } STMT_END
7981
786e8c11 7982#define DUMPUNTIL(b,e) CLEAR_OPTSTART; node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
3dab1dad 7983
b5a2f8d8
NC
7984STATIC const regnode *
7985S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
786e8c11
YO
7986 const regnode *last, const regnode *plast,
7987 SV* sv, I32 indent, U32 depth)
a28509cc 7988{
97aff369 7989 dVAR;
786e8c11 7990 register U8 op = PSEUDO; /* Arbitrary non-END op. */
b5a2f8d8 7991 register const regnode *next;
3dab1dad
YO
7992 const regnode *optstart= NULL;
7993 GET_RE_DEBUG_FLAGS_DECL;
a28509cc 7994
786e8c11
YO
7995#ifdef DEBUG_DUMPUNTIL
7996 PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
7997 last ? last-start : 0,plast ? plast-start : 0);
7998#endif
7999
8000 if (plast && plast < last)
8001 last= plast;
8002
8003 while (PL_regkind[op] != END && (!last || node < last)) {
a28509cc
AL
8004 /* While that wasn't END last time... */
8005
8006 NODE_ALIGN(node);
8007 op = OP(node);
8008 if (op == CLOSE)
786e8c11 8009 indent--;
b5a2f8d8 8010 next = regnext((regnode *)node);
07be1b83 8011
a28509cc 8012 /* Where, what. */
8e11feef 8013 if (OP(node) == OPTIMIZED) {
e68ec53f 8014 if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
8e11feef 8015 optstart = node;
3dab1dad 8016 else
8e11feef 8017 goto after_print;
3dab1dad
YO
8018 } else
8019 CLEAR_OPTSTART;
07be1b83 8020
32fc9b6a 8021 regprop(r, sv, node);
a28509cc 8022 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
786e8c11 8023 (int)(2*indent + 1), "", SvPVX_const(sv));
3dab1dad
YO
8024
8025 if (OP(node) != OPTIMIZED) {
8e11feef
RGS
8026 if (next == NULL) /* Next ptr. */
8027 PerlIO_printf(Perl_debug_log, "(0)");
786e8c11
YO
8028 else if (PL_regkind[(U8)op] == BRANCH && PL_regkind[OP(next)] != BRANCH )
8029 PerlIO_printf(Perl_debug_log, "(FAIL)");
8e11feef
RGS
8030 else
8031 PerlIO_printf(Perl_debug_log, "(%"IVdf")", (IV)(next - start));
786e8c11 8032
1de06328 8033 /*if (PL_regkind[(U8)op] != TRIE)*/
786e8c11 8034 (void)PerlIO_putc(Perl_debug_log, '\n');
3dab1dad
YO
8035 }
8036
a28509cc
AL
8037 after_print:
8038 if (PL_regkind[(U8)op] == BRANCHJ) {
be8e71aa
YO
8039 assert(next);
8040 {
8041 register const regnode *nnode = (OP(next) == LONGJMP
b5a2f8d8
NC
8042 ? regnext((regnode *)next)
8043 : next);
be8e71aa
YO
8044 if (last && nnode > last)
8045 nnode = last;
786e8c11 8046 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
be8e71aa 8047 }
a28509cc
AL
8048 }
8049 else if (PL_regkind[(U8)op] == BRANCH) {
be8e71aa 8050 assert(next);
786e8c11 8051 DUMPUNTIL(NEXTOPER(node), next);
a28509cc
AL
8052 }
8053 else if ( PL_regkind[(U8)op] == TRIE ) {
1de06328 8054 const char op = OP(node);
a28509cc 8055 const I32 n = ARG(node);
1de06328
YO
8056 const reg_ac_data * const ac = op>=AHOCORASICK ?
8057 (reg_ac_data *)r->data->data[n] :
8058 NULL;
8059 const reg_trie_data * const trie = op<AHOCORASICK ?
8060 (reg_trie_data*)r->data->data[n] :
8061 ac->trie;
786e8c11 8062 const regnode *nextbranch= NULL;
a28509cc 8063 I32 word_idx;
1de06328 8064 sv_setpvn(sv, "", 0);
786e8c11 8065 for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
097eb12c 8066 SV ** const elem_ptr = av_fetch(trie->words,word_idx,0);
786e8c11
YO
8067
8068 PerlIO_printf(Perl_debug_log, "%*s%s ",
8069 (int)(2*(indent+3)), "",
8070 elem_ptr ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr), SvCUR(*elem_ptr), 60,
ab3bbdeb
YO
8071 PL_colors[0], PL_colors[1],
8072 (SvUTF8(*elem_ptr) ? PERL_PV_ESCAPE_UNI : 0) |
8073 PERL_PV_PRETTY_ELIPSES |
8074 PERL_PV_PRETTY_LTGT
786e8c11
YO
8075 )
8076 : "???"
8077 );
8078 if (trie->jump) {
8079 U16 dist= trie->jump[word_idx+1];
8080 PerlIO_printf(Perl_debug_log, "(%u)\n",(next - dist) - start);
8081 if (dist) {
8082 if (!nextbranch)
8083 nextbranch= next - trie->jump[0];
8084 DUMPUNTIL(next - dist, nextbranch);
8085 }
8086 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
8087 nextbranch= regnext((regnode *)nextbranch);
8088 } else {
8089 PerlIO_printf(Perl_debug_log, "\n");
a28509cc 8090 }
786e8c11
YO
8091 }
8092 if (last && next > last)
8093 node= last;
8094 else
8095 node= next;
a28509cc 8096 }
786e8c11
YO
8097 else if ( op == CURLY ) { /* "next" might be very big: optimizer */
8098 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
8099 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
a28509cc
AL
8100 }
8101 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
be8e71aa 8102 assert(next);
786e8c11 8103 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
a28509cc
AL
8104 }
8105 else if ( op == PLUS || op == STAR) {
786e8c11 8106 DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
a28509cc
AL
8107 }
8108 else if (op == ANYOF) {
8109 /* arglen 1 + class block */
8110 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_LARGE)
8111 ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
8112 node = NEXTOPER(node);
8113 }
8114 else if (PL_regkind[(U8)op] == EXACT) {
8115 /* Literal string, where present. */
8116 node += NODE_SZ_STR(node) - 1;
8117 node = NEXTOPER(node);
8118 }
8119 else {
8120 node = NEXTOPER(node);
8121 node += regarglen[(U8)op];
8122 }
8123 if (op == CURLYX || op == OPEN)
786e8c11 8124 indent++;
a28509cc 8125 else if (op == WHILEM)
786e8c11 8126 indent--;
a28509cc 8127 }
3dab1dad 8128 CLEAR_OPTSTART;
786e8c11
YO
8129#ifdef DEBUG_DUMPUNTIL
8130 PerlIO_printf(Perl_debug_log, "--- %d\n",indent);
8131#endif
1de06328 8132 return node;
a28509cc
AL
8133}
8134
8135#endif /* DEBUGGING */
8136
241d1a3b
NC
8137/*
8138 * Local variables:
8139 * c-indentation-style: bsd
8140 * c-basic-offset: 4
8141 * indent-tabs-mode: t
8142 * End:
8143 *
37442d52
RGS
8144 * ex: set ts=8 sts=4 sw=4 noet:
8145 */