This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
[perl #40369] File::Find mishandles non-dangling symlinks
[perl5.git] / regcomp.c
CommitLineData
a0d0e21e
LW
1/* regcomp.c
2 */
3
4/*
5 * "A fair jaw-cracker dwarf-language must be." --Samwise Gamgee
6 */
7
61296642
DM
8/* This file contains functions for compiling a regular expression. See
9 * also regexec.c which funnily enough, contains functions for executing
166f8a29 10 * a regular expression.
e4a054ea
DM
11 *
12 * This file is also copied at build time to ext/re/re_comp.c, where
13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14 * This causes the main functions to be compiled under new names and with
15 * debugging support added, which makes "use re 'debug'" work.
166f8a29
DM
16 */
17
a687059c
LW
18/* NOTE: this is derived from Henry Spencer's regexp code, and should not
19 * confused with the original package (see point 3 below). Thanks, Henry!
20 */
21
22/* Additional note: this code is very heavily munged from Henry's version
23 * in places. In some spots I've traded clarity for efficiency, so don't
24 * blame Henry for some of the lack of readability.
25 */
26
e50aee73
AD
27/* The names of the functions have been changed from regcomp and
28 * regexec to pregcomp and pregexec in order to avoid conflicts
29 * with the POSIX routines of the same names.
30*/
31
b9d5759e 32#ifdef PERL_EXT_RE_BUILD
54df2634 33#include "re_top.h"
b81d288d 34#endif
56953603 35
a687059c 36/*
e50aee73 37 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
38 *
39 * Copyright (c) 1986 by University of Toronto.
40 * Written by Henry Spencer. Not derived from licensed software.
41 *
42 * Permission is granted to anyone to use this software for any
43 * purpose on any computer system, and to redistribute it freely,
44 * subject to the following restrictions:
45 *
46 * 1. The author is not responsible for the consequences of use of
47 * this software, no matter how awful, even if they arise
48 * from defects in it.
49 *
50 * 2. The origin of this software must not be misrepresented, either
51 * by explicit claim or by omission.
52 *
53 * 3. Altered versions must be plainly marked as such, and must not
54 * be misrepresented as being the original software.
55 *
56 *
57 **** Alterations to Henry's code are...
58 ****
4bb101f2 59 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
b94e2f88 60 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, by Larry Wall and others
a687059c 61 ****
9ef589d8
LW
62 **** You may distribute under the terms of either the GNU General Public
63 **** License or the Artistic License, as specified in the README file.
64
a687059c
LW
65 *
66 * Beware that some of this code is subtly aware of the way operator
67 * precedence is structured in regular expressions. Serious changes in
68 * regular-expression syntax might require a total rethink.
69 */
70#include "EXTERN.h"
864dbfa3 71#define PERL_IN_REGCOMP_C
a687059c 72#include "perl.h"
d06ea78c 73
acfe0abc 74#ifndef PERL_IN_XSUB_RE
d06ea78c
GS
75# include "INTERN.h"
76#endif
c277df42
IZ
77
78#define REG_COMP_C
54df2634
NC
79#ifdef PERL_IN_XSUB_RE
80# include "re_comp.h"
81#else
82# include "regcomp.h"
83#endif
a687059c 84
d4cce5f1 85#ifdef op
11343788 86#undef op
d4cce5f1 87#endif /* op */
11343788 88
fe14fcc3 89#ifdef MSDOS
7e4e8c89 90# if defined(BUGGY_MSC6)
fe14fcc3 91 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
7e4e8c89 92# pragma optimize("a",off)
fe14fcc3 93 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
7e4e8c89
NC
94# pragma optimize("w",on )
95# endif /* BUGGY_MSC6 */
fe14fcc3
LW
96#endif /* MSDOS */
97
a687059c
LW
98#ifndef STATIC
99#define STATIC static
100#endif
101
830247a4 102typedef struct RExC_state_t {
e2509266 103 U32 flags; /* are we folding, multilining? */
830247a4
IZ
104 char *precomp; /* uncompiled string. */
105 regexp *rx;
fac92740 106 char *start; /* Start of input for compile */
830247a4
IZ
107 char *end; /* End of input for compile */
108 char *parse; /* Input-scan pointer. */
109 I32 whilem_seen; /* number of WHILEM in this expr */
fac92740 110 regnode *emit_start; /* Start of emitted-code area */
ffc61ed2 111 regnode *emit; /* Code-emit pointer; &regdummy = don't = compiling */
830247a4
IZ
112 I32 naughty; /* How bad is this pattern? */
113 I32 sawback; /* Did we see \1, ...? */
114 U32 seen;
115 I32 size; /* Code size. */
116 I32 npar; /* () count. */
117 I32 extralen;
118 I32 seen_zerolen;
119 I32 seen_evals;
1aa99e6b 120 I32 utf8;
fc8cd66c 121 HV *charnames; /* cache of named sequences */
830247a4
IZ
122#if ADD_TO_REGEXEC
123 char *starttry; /* -Dr: where regtry was called. */
124#define RExC_starttry (pRExC_state->starttry)
125#endif
3dab1dad 126#ifdef DEBUGGING
be8e71aa 127 const char *lastparse;
3dab1dad
YO
128 I32 lastnum;
129#define RExC_lastparse (pRExC_state->lastparse)
130#define RExC_lastnum (pRExC_state->lastnum)
131#endif
830247a4
IZ
132} RExC_state_t;
133
e2509266 134#define RExC_flags (pRExC_state->flags)
830247a4
IZ
135#define RExC_precomp (pRExC_state->precomp)
136#define RExC_rx (pRExC_state->rx)
fac92740 137#define RExC_start (pRExC_state->start)
830247a4
IZ
138#define RExC_end (pRExC_state->end)
139#define RExC_parse (pRExC_state->parse)
140#define RExC_whilem_seen (pRExC_state->whilem_seen)
fac92740 141#define RExC_offsets (pRExC_state->rx->offsets) /* I am not like the others */
830247a4 142#define RExC_emit (pRExC_state->emit)
fac92740 143#define RExC_emit_start (pRExC_state->emit_start)
830247a4
IZ
144#define RExC_naughty (pRExC_state->naughty)
145#define RExC_sawback (pRExC_state->sawback)
146#define RExC_seen (pRExC_state->seen)
147#define RExC_size (pRExC_state->size)
148#define RExC_npar (pRExC_state->npar)
149#define RExC_extralen (pRExC_state->extralen)
150#define RExC_seen_zerolen (pRExC_state->seen_zerolen)
151#define RExC_seen_evals (pRExC_state->seen_evals)
1aa99e6b 152#define RExC_utf8 (pRExC_state->utf8)
fc8cd66c 153#define RExC_charnames (pRExC_state->charnames)
830247a4 154
a687059c
LW
155#define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
156#define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
157 ((*s) == '{' && regcurly(s)))
a687059c 158
35c8bce7
LW
159#ifdef SPSTART
160#undef SPSTART /* dratted cpp namespace... */
161#endif
a687059c
LW
162/*
163 * Flags to be passed up and down.
164 */
a687059c 165#define WORST 0 /* Worst case. */
821b33a5 166#define HASWIDTH 0x1 /* Known to match non-null strings. */
a0d0e21e
LW
167#define SIMPLE 0x2 /* Simple enough to be STAR/PLUS operand. */
168#define SPSTART 0x4 /* Starts with * or +. */
169#define TRYAGAIN 0x8 /* Weeded out a declaration. */
a687059c 170
3dab1dad
YO
171#define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
172
07be1b83
YO
173/* whether trie related optimizations are enabled */
174#if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
175#define TRIE_STUDY_OPT
786e8c11 176#define FULL_TRIE_STUDY
07be1b83
YO
177#define TRIE_STCLASS
178#endif
1de06328
YO
179
180
181/* About scan_data_t.
182
183 During optimisation we recurse through the regexp program performing
184 various inplace (keyhole style) optimisations. In addition study_chunk
185 and scan_commit populate this data structure with information about
186 what strings MUST appear in the pattern. We look for the longest
187 string that must appear for at a fixed location, and we look for the
188 longest string that may appear at a floating location. So for instance
189 in the pattern:
190
191 /FOO[xX]A.*B[xX]BAR/
192
193 Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
194 strings (because they follow a .* construct). study_chunk will identify
195 both FOO and BAR as being the longest fixed and floating strings respectively.
196
197 The strings can be composites, for instance
198
199 /(f)(o)(o)/
200
201 will result in a composite fixed substring 'foo'.
202
203 For each string some basic information is maintained:
204
205 - offset or min_offset
206 This is the position the string must appear at, or not before.
207 It also implicitly (when combined with minlenp) tells us how many
208 character must match before the string we are searching.
209 Likewise when combined with minlenp and the length of the string
210 tells us how many characters must appear after the string we have
211 found.
212
213 - max_offset
214 Only used for floating strings. This is the rightmost point that
215 the string can appear at. Ifset to I32 max it indicates that the
216 string can occur infinitely far to the right.
217
218 - minlenp
219 A pointer to the minimum length of the pattern that the string
220 was found inside. This is important as in the case of positive
221 lookahead or positive lookbehind we can have multiple patterns
222 involved. Consider
223
224 /(?=FOO).*F/
225
226 The minimum length of the pattern overall is 3, the minimum length
227 of the lookahead part is 3, but the minimum length of the part that
228 will actually match is 1. So 'FOO's minimum length is 3, but the
229 minimum length for the F is 1. This is important as the minimum length
230 is used to determine offsets in front of and behind the string being
231 looked for. Since strings can be composites this is the length of the
232 pattern at the time it was commited with a scan_commit. Note that
233 the length is calculated by study_chunk, so that the minimum lengths
234 are not known until the full pattern has been compiled, thus the
235 pointer to the value.
236
237 - lookbehind
238
239 In the case of lookbehind the string being searched for can be
240 offset past the start point of the final matching string.
241 If this value was just blithely removed from the min_offset it would
242 invalidate some of the calculations for how many chars must match
243 before or after (as they are derived from min_offset and minlen and
244 the length of the string being searched for).
245 When the final pattern is compiled and the data is moved from the
246 scan_data_t structure into the regexp structure the information
247 about lookbehind is factored in, with the information that would
248 have been lost precalculated in the end_shift field for the
249 associated string.
250
251 The fields pos_min and pos_delta are used to store the minimum offset
252 and the delta to the maximum offset at the current point in the pattern.
253
254*/
2c2d71f5
JH
255
256typedef struct scan_data_t {
1de06328
YO
257 /*I32 len_min; unused */
258 /*I32 len_delta; unused */
2c2d71f5
JH
259 I32 pos_min;
260 I32 pos_delta;
261 SV *last_found;
1de06328 262 I32 last_end; /* min value, <0 unless valid. */
2c2d71f5
JH
263 I32 last_start_min;
264 I32 last_start_max;
1de06328
YO
265 SV **longest; /* Either &l_fixed, or &l_float. */
266 SV *longest_fixed; /* longest fixed string found in pattern */
267 I32 offset_fixed; /* offset where it starts */
268 I32 *minlen_fixed; /* pointer to the minlen relevent to the string */
269 I32 lookbehind_fixed; /* is the position of the string modfied by LB */
270 SV *longest_float; /* longest floating string found in pattern */
271 I32 offset_float_min; /* earliest point in string it can appear */
272 I32 offset_float_max; /* latest point in string it can appear */
273 I32 *minlen_float; /* pointer to the minlen relevent to the string */
274 I32 lookbehind_float; /* is the position of the string modified by LB */
2c2d71f5
JH
275 I32 flags;
276 I32 whilem_c;
cb434fcc 277 I32 *last_closep;
653099ff 278 struct regnode_charclass_class *start_class;
2c2d71f5
JH
279} scan_data_t;
280
a687059c 281/*
e50aee73 282 * Forward declarations for pregcomp()'s friends.
a687059c 283 */
a0d0e21e 284
27da23d5 285static const scan_data_t zero_scan_data =
1de06328 286 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
c277df42
IZ
287
288#define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
07be1b83
YO
289#define SF_BEFORE_SEOL 0x0001
290#define SF_BEFORE_MEOL 0x0002
c277df42
IZ
291#define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
292#define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
293
09b7f37c
CB
294#ifdef NO_UNARY_PLUS
295# define SF_FIX_SHIFT_EOL (0+2)
296# define SF_FL_SHIFT_EOL (0+4)
297#else
298# define SF_FIX_SHIFT_EOL (+2)
299# define SF_FL_SHIFT_EOL (+4)
300#endif
c277df42
IZ
301
302#define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
303#define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
304
305#define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
306#define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
07be1b83
YO
307#define SF_IS_INF 0x0040
308#define SF_HAS_PAR 0x0080
309#define SF_IN_PAR 0x0100
310#define SF_HAS_EVAL 0x0200
311#define SCF_DO_SUBSTR 0x0400
653099ff
GS
312#define SCF_DO_STCLASS_AND 0x0800
313#define SCF_DO_STCLASS_OR 0x1000
314#define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
e1901655 315#define SCF_WHILEM_VISITED_POS 0x2000
c277df42 316
786e8c11
YO
317#define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */
318
07be1b83 319
eb160463 320#define UTF (RExC_utf8 != 0)
e2509266
JH
321#define LOC ((RExC_flags & PMf_LOCALE) != 0)
322#define FOLD ((RExC_flags & PMf_FOLD) != 0)
a0ed51b3 323
ffc61ed2 324#define OOB_UNICODE 12345678
93733859 325#define OOB_NAMEDCLASS -1
b8c5462f 326
a0ed51b3
LW
327#define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
328#define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
329
8615cb43 330
b45f050a
JF
331/* length of regex to show in messages that don't mark a position within */
332#define RegexLengthToShowInErrorMessages 127
333
334/*
335 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
336 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
337 * op/pragma/warn/regcomp.
338 */
7253e4e3
RK
339#define MARKER1 "<-- HERE" /* marker as it appears in the description */
340#define MARKER2 " <-- HERE " /* marker as it appears within the regex */
b81d288d 341
7253e4e3 342#define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
b45f050a
JF
343
344/*
345 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
346 * arg. Show regex, up to a maximum length. If it's too long, chop and add
347 * "...".
348 */
ccb2c380 349#define FAIL(msg) STMT_START { \
bfed75c6 350 const char *ellipses = ""; \
ccb2c380
MP
351 IV len = RExC_end - RExC_precomp; \
352 \
353 if (!SIZE_ONLY) \
354 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
355 if (len > RegexLengthToShowInErrorMessages) { \
356 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
357 len = RegexLengthToShowInErrorMessages - 10; \
358 ellipses = "..."; \
359 } \
360 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
361 msg, (int)len, RExC_precomp, ellipses); \
362} STMT_END
8615cb43 363
b45f050a 364/*
b45f050a
JF
365 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
366 */
ccb2c380 367#define Simple_vFAIL(m) STMT_START { \
a28509cc 368 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
369 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
370 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
371} STMT_END
b45f050a
JF
372
373/*
374 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
375 */
ccb2c380
MP
376#define vFAIL(m) STMT_START { \
377 if (!SIZE_ONLY) \
378 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
379 Simple_vFAIL(m); \
380} STMT_END
b45f050a
JF
381
382/*
383 * Like Simple_vFAIL(), but accepts two arguments.
384 */
ccb2c380 385#define Simple_vFAIL2(m,a1) STMT_START { \
a28509cc 386 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
387 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
388 (int)offset, RExC_precomp, RExC_precomp + offset); \
389} STMT_END
b45f050a
JF
390
391/*
392 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
393 */
ccb2c380
MP
394#define vFAIL2(m,a1) STMT_START { \
395 if (!SIZE_ONLY) \
396 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
397 Simple_vFAIL2(m, a1); \
398} STMT_END
b45f050a
JF
399
400
401/*
402 * Like Simple_vFAIL(), but accepts three arguments.
403 */
ccb2c380 404#define Simple_vFAIL3(m, a1, a2) STMT_START { \
a28509cc 405 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
406 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
407 (int)offset, RExC_precomp, RExC_precomp + offset); \
408} STMT_END
b45f050a
JF
409
410/*
411 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
412 */
ccb2c380
MP
413#define vFAIL3(m,a1,a2) STMT_START { \
414 if (!SIZE_ONLY) \
415 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
416 Simple_vFAIL3(m, a1, a2); \
417} STMT_END
b45f050a
JF
418
419/*
420 * Like Simple_vFAIL(), but accepts four arguments.
421 */
ccb2c380 422#define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
a28509cc 423 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
424 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
425 (int)offset, RExC_precomp, RExC_precomp + offset); \
426} STMT_END
b45f050a 427
ccb2c380 428#define vWARN(loc,m) STMT_START { \
a28509cc 429 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
430 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \
431 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
432} STMT_END
433
434#define vWARNdep(loc,m) STMT_START { \
a28509cc 435 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
436 Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
437 "%s" REPORT_LOCATION, \
438 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
439} STMT_END
440
441
442#define vWARN2(loc, m, a1) STMT_START { \
a28509cc 443 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
444 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
445 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
446} STMT_END
447
448#define vWARN3(loc, m, a1, a2) STMT_START { \
a28509cc 449 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
450 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
451 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
452} STMT_END
453
454#define vWARN4(loc, m, a1, a2, a3) STMT_START { \
a28509cc 455 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
456 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
457 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
458} STMT_END
459
460#define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
a28509cc 461 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
462 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
463 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
464} STMT_END
9d1d55b5 465
8615cb43 466
cd439c50 467/* Allow for side effects in s */
ccb2c380
MP
468#define REGC(c,s) STMT_START { \
469 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
470} STMT_END
cd439c50 471
fac92740
MJD
472/* Macros for recording node offsets. 20001227 mjd@plover.com
473 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
474 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
475 * Element 0 holds the number n.
07be1b83 476 * Position is 1 indexed.
fac92740
MJD
477 */
478
ccb2c380
MP
479#define Set_Node_Offset_To_R(node,byte) STMT_START { \
480 if (! SIZE_ONLY) { \
481 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
07be1b83 482 __LINE__, (node), (int)(byte))); \
ccb2c380 483 if((node) < 0) { \
551405c4 484 Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
ccb2c380
MP
485 } else { \
486 RExC_offsets[2*(node)-1] = (byte); \
487 } \
488 } \
489} STMT_END
490
491#define Set_Node_Offset(node,byte) \
492 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
493#define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
494
495#define Set_Node_Length_To_R(node,len) STMT_START { \
496 if (! SIZE_ONLY) { \
497 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
551405c4 498 __LINE__, (int)(node), (int)(len))); \
ccb2c380 499 if((node) < 0) { \
551405c4 500 Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
ccb2c380
MP
501 } else { \
502 RExC_offsets[2*(node)] = (len); \
503 } \
504 } \
505} STMT_END
506
507#define Set_Node_Length(node,len) \
508 Set_Node_Length_To_R((node)-RExC_emit_start, len)
509#define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
510#define Set_Node_Cur_Length(node) \
511 Set_Node_Length(node, RExC_parse - parse_start)
fac92740
MJD
512
513/* Get offsets and lengths */
514#define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
515#define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
516
07be1b83
YO
517#define Set_Node_Offset_Length(node,offset,len) STMT_START { \
518 Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
519 Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
520} STMT_END
521
522
523#if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
524#define EXPERIMENTAL_INPLACESCAN
525#endif
526
1de06328
YO
527#define DEBUG_STUDYDATA(data,depth) \
528DEBUG_OPTIMISE_r(if(data){ \
529 PerlIO_printf(Perl_debug_log, \
530 "%*s"/* Len:%"IVdf"/%"IVdf" */" Pos:%"IVdf"/%"IVdf \
531 " Flags: %"IVdf" Whilem_c: %"IVdf" Lcp: %"IVdf" ", \
532 (int)(depth)*2, "", \
533 (IV)((data)->pos_min), \
534 (IV)((data)->pos_delta), \
535 (IV)((data)->flags), \
536 (IV)((data)->whilem_c), \
537 (IV)((data)->last_closep ? *((data)->last_closep) : -1) \
538 ); \
539 if ((data)->last_found) \
540 PerlIO_printf(Perl_debug_log, \
541 "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
542 " %sFloat: '%s' @ %"IVdf"/%"IVdf"", \
543 SvPVX_const((data)->last_found), \
544 (IV)((data)->last_end), \
545 (IV)((data)->last_start_min), \
546 (IV)((data)->last_start_max), \
547 ((data)->longest && \
548 (data)->longest==&((data)->longest_fixed)) ? "*" : "", \
549 SvPVX_const((data)->longest_fixed), \
550 (IV)((data)->offset_fixed), \
551 ((data)->longest && \
552 (data)->longest==&((data)->longest_float)) ? "*" : "", \
553 SvPVX_const((data)->longest_float), \
554 (IV)((data)->offset_float_min), \
555 (IV)((data)->offset_float_max) \
556 ); \
557 PerlIO_printf(Perl_debug_log,"\n"); \
558});
559
acfe0abc 560static void clear_re(pTHX_ void *r);
4327152a 561
653099ff 562/* Mark that we cannot extend a found fixed substring at this point.
786e8c11 563 Update the longest found anchored substring and the longest found
653099ff
GS
564 floating substrings if needed. */
565
4327152a 566STATIC void
1de06328 567S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *minlenp)
c277df42 568{
e1ec3a88
AL
569 const STRLEN l = CHR_SVLEN(data->last_found);
570 const STRLEN old_l = CHR_SVLEN(*data->longest);
1de06328 571 GET_RE_DEBUG_FLAGS_DECL;
b81d288d 572
c277df42 573 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
6b43b216 574 SvSetMagicSV(*data->longest, data->last_found);
c277df42
IZ
575 if (*data->longest == data->longest_fixed) {
576 data->offset_fixed = l ? data->last_start_min : data->pos_min;
577 if (data->flags & SF_BEFORE_EOL)
b81d288d 578 data->flags
c277df42
IZ
579 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
580 else
581 data->flags &= ~SF_FIX_BEFORE_EOL;
1de06328
YO
582 data->minlen_fixed=minlenp;
583 data->lookbehind_fixed=0;
a0ed51b3
LW
584 }
585 else {
c277df42 586 data->offset_float_min = l ? data->last_start_min : data->pos_min;
b81d288d
AB
587 data->offset_float_max = (l
588 ? data->last_start_max
c277df42 589 : data->pos_min + data->pos_delta);
9051bda5
HS
590 if ((U32)data->offset_float_max > (U32)I32_MAX)
591 data->offset_float_max = I32_MAX;
c277df42 592 if (data->flags & SF_BEFORE_EOL)
b81d288d 593 data->flags
c277df42
IZ
594 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
595 else
596 data->flags &= ~SF_FL_BEFORE_EOL;
1de06328
YO
597 data->minlen_float=minlenp;
598 data->lookbehind_float=0;
c277df42
IZ
599 }
600 }
601 SvCUR_set(data->last_found, 0);
0eda9292 602 {
a28509cc 603 SV * const sv = data->last_found;
097eb12c
AL
604 if (SvUTF8(sv) && SvMAGICAL(sv)) {
605 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
606 if (mg)
607 mg->mg_len = 0;
608 }
0eda9292 609 }
c277df42
IZ
610 data->last_end = -1;
611 data->flags &= ~SF_BEFORE_EOL;
1de06328 612 DEBUG_STUDYDATA(data,0);
c277df42
IZ
613}
614
653099ff
GS
615/* Can match anything (initialization) */
616STATIC void
097eb12c 617S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 618{
653099ff 619 ANYOF_CLASS_ZERO(cl);
f8bef550 620 ANYOF_BITMAP_SETALL(cl);
1aa99e6b 621 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
653099ff
GS
622 if (LOC)
623 cl->flags |= ANYOF_LOCALE;
624}
625
626/* Can match anything (initialization) */
627STATIC int
5f66b61c 628S_cl_is_anything(const struct regnode_charclass_class *cl)
653099ff
GS
629{
630 int value;
631
aaa51d5e 632 for (value = 0; value <= ANYOF_MAX; value += 2)
653099ff
GS
633 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
634 return 1;
1aa99e6b
IH
635 if (!(cl->flags & ANYOF_UNICODE_ALL))
636 return 0;
10edeb5d 637 if (!ANYOF_BITMAP_TESTALLSET((const void*)cl))
f8bef550 638 return 0;
653099ff
GS
639 return 1;
640}
641
642/* Can match anything (initialization) */
643STATIC void
097eb12c 644S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 645{
8ecf7187 646 Zero(cl, 1, struct regnode_charclass_class);
653099ff 647 cl->type = ANYOF;
830247a4 648 cl_anything(pRExC_state, cl);
653099ff
GS
649}
650
651STATIC void
097eb12c 652S_cl_init_zero(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 653{
8ecf7187 654 Zero(cl, 1, struct regnode_charclass_class);
653099ff 655 cl->type = ANYOF;
830247a4 656 cl_anything(pRExC_state, cl);
653099ff
GS
657 if (LOC)
658 cl->flags |= ANYOF_LOCALE;
659}
660
661/* 'And' a given class with another one. Can create false positives */
662/* We assume that cl is not inverted */
663STATIC void
5f66b61c 664S_cl_and(struct regnode_charclass_class *cl,
a28509cc 665 const struct regnode_charclass_class *and_with)
653099ff 666{
653099ff
GS
667 if (!(and_with->flags & ANYOF_CLASS)
668 && !(cl->flags & ANYOF_CLASS)
669 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
670 && !(and_with->flags & ANYOF_FOLD)
671 && !(cl->flags & ANYOF_FOLD)) {
672 int i;
673
674 if (and_with->flags & ANYOF_INVERT)
675 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
676 cl->bitmap[i] &= ~and_with->bitmap[i];
677 else
678 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
679 cl->bitmap[i] &= and_with->bitmap[i];
680 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
681 if (!(and_with->flags & ANYOF_EOS))
682 cl->flags &= ~ANYOF_EOS;
1aa99e6b 683
14ebb1a2
JH
684 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE &&
685 !(and_with->flags & ANYOF_INVERT)) {
1aa99e6b
IH
686 cl->flags &= ~ANYOF_UNICODE_ALL;
687 cl->flags |= ANYOF_UNICODE;
688 ARG_SET(cl, ARG(and_with));
689 }
14ebb1a2
JH
690 if (!(and_with->flags & ANYOF_UNICODE_ALL) &&
691 !(and_with->flags & ANYOF_INVERT))
1aa99e6b 692 cl->flags &= ~ANYOF_UNICODE_ALL;
14ebb1a2
JH
693 if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)) &&
694 !(and_with->flags & ANYOF_INVERT))
1aa99e6b 695 cl->flags &= ~ANYOF_UNICODE;
653099ff
GS
696}
697
698/* 'OR' a given class with another one. Can create false positives */
699/* We assume that cl is not inverted */
700STATIC void
097eb12c 701S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
653099ff 702{
653099ff
GS
703 if (or_with->flags & ANYOF_INVERT) {
704 /* We do not use
705 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
706 * <= (B1 | !B2) | (CL1 | !CL2)
707 * which is wasteful if CL2 is small, but we ignore CL2:
708 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
709 * XXXX Can we handle case-fold? Unclear:
710 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
711 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
712 */
713 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
714 && !(or_with->flags & ANYOF_FOLD)
715 && !(cl->flags & ANYOF_FOLD) ) {
716 int i;
717
718 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
719 cl->bitmap[i] |= ~or_with->bitmap[i];
720 } /* XXXX: logic is complicated otherwise */
721 else {
830247a4 722 cl_anything(pRExC_state, cl);
653099ff
GS
723 }
724 } else {
725 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
726 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
b81d288d 727 && (!(or_with->flags & ANYOF_FOLD)
653099ff
GS
728 || (cl->flags & ANYOF_FOLD)) ) {
729 int i;
730
731 /* OR char bitmap and class bitmap separately */
732 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
733 cl->bitmap[i] |= or_with->bitmap[i];
734 if (or_with->flags & ANYOF_CLASS) {
735 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
736 cl->classflags[i] |= or_with->classflags[i];
737 cl->flags |= ANYOF_CLASS;
738 }
739 }
740 else { /* XXXX: logic is complicated, leave it along for a moment. */
830247a4 741 cl_anything(pRExC_state, cl);
653099ff
GS
742 }
743 }
744 if (or_with->flags & ANYOF_EOS)
745 cl->flags |= ANYOF_EOS;
1aa99e6b
IH
746
747 if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
748 ARG(cl) != ARG(or_with)) {
749 cl->flags |= ANYOF_UNICODE_ALL;
750 cl->flags &= ~ANYOF_UNICODE;
751 }
752 if (or_with->flags & ANYOF_UNICODE_ALL) {
753 cl->flags |= ANYOF_UNICODE_ALL;
754 cl->flags &= ~ANYOF_UNICODE;
755 }
653099ff
GS
756}
757
a3621e74
YO
758#define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
759#define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
760#define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
761#define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
762
3dab1dad
YO
763
764#ifdef DEBUGGING
07be1b83 765/*
3dab1dad
YO
766 dump_trie(trie)
767 dump_trie_interim_list(trie,next_alloc)
768 dump_trie_interim_table(trie,next_alloc)
769
770 These routines dump out a trie in a somewhat readable format.
07be1b83
YO
771 The _interim_ variants are used for debugging the interim
772 tables that are used to generate the final compressed
773 representation which is what dump_trie expects.
774
3dab1dad
YO
775 Part of the reason for their existance is to provide a form
776 of documentation as to how the different representations function.
07be1b83
YO
777
778*/
3dab1dad
YO
779
780/*
781 dump_trie(trie)
782 Dumps the final compressed table form of the trie to Perl_debug_log.
783 Used for debugging make_trie().
784*/
785
786STATIC void
787S_dump_trie(pTHX_ const struct _reg_trie_data *trie,U32 depth)
788{
789 U32 state;
ab3bbdeb
YO
790 SV *sv=sv_newmortal();
791 int colwidth= trie->widecharmap ? 6 : 4;
3dab1dad
YO
792 GET_RE_DEBUG_FLAGS_DECL;
793
ab3bbdeb 794
3dab1dad
YO
795 PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
796 (int)depth * 2 + 2,"",
797 "Match","Base","Ofs" );
798
799 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
be8e71aa 800 SV ** const tmp = av_fetch( trie->revcharmap, state, 0);
3dab1dad 801 if ( tmp ) {
ab3bbdeb
YO
802 PerlIO_printf( Perl_debug_log, "%*s",
803 colwidth,
ddc5bc0f 804 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
805 PL_colors[0], PL_colors[1],
806 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
807 PERL_PV_ESCAPE_FIRSTCHAR
808 )
809 );
3dab1dad
YO
810 }
811 }
812 PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
813 (int)depth * 2 + 2,"");
814
815 for( state = 0 ; state < trie->uniquecharcount ; state++ )
ab3bbdeb 816 PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
3dab1dad
YO
817 PerlIO_printf( Perl_debug_log, "\n");
818
786e8c11 819 for( state = 1 ; state < trie->laststate ; state++ ) {
be8e71aa 820 const U32 base = trie->states[ state ].trans.base;
3dab1dad
YO
821
822 PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
823
824 if ( trie->states[ state ].wordnum ) {
825 PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
826 } else {
827 PerlIO_printf( Perl_debug_log, "%6s", "" );
828 }
829
830 PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
831
832 if ( base ) {
833 U32 ofs = 0;
834
835 while( ( base + ofs < trie->uniquecharcount ) ||
836 ( base + ofs - trie->uniquecharcount < trie->lasttrans
837 && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
838 ofs++;
839
840 PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
841
842 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
843 if ( ( base + ofs >= trie->uniquecharcount ) &&
844 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
845 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
846 {
ab3bbdeb
YO
847 PerlIO_printf( Perl_debug_log, "%*"UVXf,
848 colwidth,
3dab1dad
YO
849 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
850 } else {
ab3bbdeb 851 PerlIO_printf( Perl_debug_log, "%*s",colwidth," ." );
3dab1dad
YO
852 }
853 }
854
855 PerlIO_printf( Perl_debug_log, "]");
856
857 }
858 PerlIO_printf( Perl_debug_log, "\n" );
859 }
860}
861/*
862 dump_trie_interim_list(trie,next_alloc)
863 Dumps a fully constructed but uncompressed trie in list form.
864 List tries normally only are used for construction when the number of
865 possible chars (trie->uniquecharcount) is very high.
866 Used for debugging make_trie().
867*/
868STATIC void
869S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie, U32 next_alloc,U32 depth)
870{
871 U32 state;
ab3bbdeb
YO
872 SV *sv=sv_newmortal();
873 int colwidth= trie->widecharmap ? 6 : 4;
3dab1dad
YO
874 GET_RE_DEBUG_FLAGS_DECL;
875 /* print out the table precompression. */
ab3bbdeb
YO
876 PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
877 (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
878 "------:-----+-----------------\n" );
3dab1dad
YO
879
880 for( state=1 ; state < next_alloc ; state ++ ) {
881 U16 charid;
882
ab3bbdeb 883 PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
3dab1dad
YO
884 (int)depth * 2 + 2,"", (UV)state );
885 if ( ! trie->states[ state ].wordnum ) {
886 PerlIO_printf( Perl_debug_log, "%5s| ","");
887 } else {
888 PerlIO_printf( Perl_debug_log, "W%4x| ",
889 trie->states[ state ].wordnum
890 );
891 }
892 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
be8e71aa 893 SV ** const tmp = av_fetch( trie->revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
ab3bbdeb
YO
894 if ( tmp ) {
895 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
896 colwidth,
ddc5bc0f 897 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
898 PL_colors[0], PL_colors[1],
899 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
900 PERL_PV_ESCAPE_FIRSTCHAR
901 ) ,
3dab1dad
YO
902 TRIE_LIST_ITEM(state,charid).forid,
903 (UV)TRIE_LIST_ITEM(state,charid).newstate
904 );
905 }
ab3bbdeb
YO
906 }
907 PerlIO_printf( Perl_debug_log, "\n");
3dab1dad
YO
908 }
909}
910
911/*
912 dump_trie_interim_table(trie,next_alloc)
913 Dumps a fully constructed but uncompressed trie in table form.
914 This is the normal DFA style state transition table, with a few
915 twists to facilitate compression later.
916 Used for debugging make_trie().
917*/
918STATIC void
919S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie, U32 next_alloc, U32 depth)
920{
921 U32 state;
922 U16 charid;
ab3bbdeb
YO
923 SV *sv=sv_newmortal();
924 int colwidth= trie->widecharmap ? 6 : 4;
3dab1dad
YO
925 GET_RE_DEBUG_FLAGS_DECL;
926
927 /*
928 print out the table precompression so that we can do a visual check
929 that they are identical.
930 */
931
932 PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
933
934 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
be8e71aa 935 SV ** const tmp = av_fetch( trie->revcharmap, charid, 0);
3dab1dad 936 if ( tmp ) {
ab3bbdeb
YO
937 PerlIO_printf( Perl_debug_log, "%*s",
938 colwidth,
ddc5bc0f 939 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
940 PL_colors[0], PL_colors[1],
941 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
942 PERL_PV_ESCAPE_FIRSTCHAR
943 )
944 );
3dab1dad
YO
945 }
946 }
947
948 PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
949
950 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
ab3bbdeb 951 PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
3dab1dad
YO
952 }
953
954 PerlIO_printf( Perl_debug_log, "\n" );
955
956 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
957
958 PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
959 (int)depth * 2 + 2,"",
960 (UV)TRIE_NODENUM( state ) );
961
962 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
ab3bbdeb
YO
963 UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
964 if (v)
965 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
966 else
967 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
3dab1dad
YO
968 }
969 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
970 PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
971 } else {
972 PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
973 trie->states[ TRIE_NODENUM( state ) ].wordnum );
974 }
975 }
07be1b83 976}
3dab1dad
YO
977
978#endif
979
786e8c11
YO
980/* make_trie(startbranch,first,last,tail,word_count,flags,depth)
981 startbranch: the first branch in the whole branch sequence
982 first : start branch of sequence of branch-exact nodes.
983 May be the same as startbranch
984 last : Thing following the last branch.
985 May be the same as tail.
986 tail : item following the branch sequence
987 count : words in the sequence
988 flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/
989 depth : indent depth
3dab1dad 990
786e8c11 991Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
07be1b83 992
786e8c11
YO
993A trie is an N'ary tree where the branches are determined by digital
994decomposition of the key. IE, at the root node you look up the 1st character and
995follow that branch repeat until you find the end of the branches. Nodes can be
996marked as "accepting" meaning they represent a complete word. Eg:
07be1b83 997
786e8c11 998 /he|she|his|hers/
72f13be8 999
786e8c11
YO
1000would convert into the following structure. Numbers represent states, letters
1001following numbers represent valid transitions on the letter from that state, if
1002the number is in square brackets it represents an accepting state, otherwise it
1003will be in parenthesis.
07be1b83 1004
786e8c11
YO
1005 +-h->+-e->[3]-+-r->(8)-+-s->[9]
1006 | |
1007 | (2)
1008 | |
1009 (1) +-i->(6)-+-s->[7]
1010 |
1011 +-s->(3)-+-h->(4)-+-e->[5]
07be1b83 1012
786e8c11
YO
1013 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1014
1015This shows that when matching against the string 'hers' we will begin at state 1
1016read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1017then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1018is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1019single traverse. We store a mapping from accepting to state to which word was
1020matched, and then when we have multiple possibilities we try to complete the
1021rest of the regex in the order in which they occured in the alternation.
1022
1023The only prior NFA like behaviour that would be changed by the TRIE support is
1024the silent ignoring of duplicate alternations which are of the form:
1025
1026 / (DUPE|DUPE) X? (?{ ... }) Y /x
1027
1028Thus EVAL blocks follwing a trie may be called a different number of times with
1029and without the optimisation. With the optimisations dupes will be silently
1030ignored. This inconsistant behaviour of EVAL type nodes is well established as
1031the following demonstrates:
1032
1033 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1034
1035which prints out 'word' three times, but
1036
1037 'words'=~/(word|word|word)(?{ print $1 })S/
1038
1039which doesnt print it out at all. This is due to other optimisations kicking in.
1040
1041Example of what happens on a structural level:
1042
1043The regexp /(ac|ad|ab)+/ will produce the folowing debug output:
1044
1045 1: CURLYM[1] {1,32767}(18)
1046 5: BRANCH(8)
1047 6: EXACT <ac>(16)
1048 8: BRANCH(11)
1049 9: EXACT <ad>(16)
1050 11: BRANCH(14)
1051 12: EXACT <ab>(16)
1052 16: SUCCEED(0)
1053 17: NOTHING(18)
1054 18: END(0)
1055
1056This would be optimizable with startbranch=5, first=5, last=16, tail=16
1057and should turn into:
1058
1059 1: CURLYM[1] {1,32767}(18)
1060 5: TRIE(16)
1061 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1062 <ac>
1063 <ad>
1064 <ab>
1065 16: SUCCEED(0)
1066 17: NOTHING(18)
1067 18: END(0)
1068
1069Cases where tail != last would be like /(?foo|bar)baz/:
1070
1071 1: BRANCH(4)
1072 2: EXACT <foo>(8)
1073 4: BRANCH(7)
1074 5: EXACT <bar>(8)
1075 7: TAIL(8)
1076 8: EXACT <baz>(10)
1077 10: END(0)
1078
1079which would be optimizable with startbranch=1, first=1, last=7, tail=8
1080and would end up looking like:
1081
1082 1: TRIE(8)
1083 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1084 <foo>
1085 <bar>
1086 7: TAIL(8)
1087 8: EXACT <baz>(10)
1088 10: END(0)
1089
1090 d = uvuni_to_utf8_flags(d, uv, 0);
1091
1092is the recommended Unicode-aware way of saying
1093
1094 *(d++) = uv;
1095*/
1096
1097#define TRIE_STORE_REVCHAR \
1098 STMT_START { \
1099 SV *tmp = Perl_newSVpvf_nocontext( "%c", (int)uvc ); \
1100 if (UTF) SvUTF8_on(tmp); \
1101 av_push( TRIE_REVCHARMAP(trie), tmp ); \
1102 } STMT_END
1103
1104#define TRIE_READ_CHAR STMT_START { \
1105 wordlen++; \
1106 if ( UTF ) { \
1107 if ( folder ) { \
1108 if ( foldlen > 0 ) { \
1109 uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags ); \
1110 foldlen -= len; \
1111 scan += len; \
1112 len = 0; \
1113 } else { \
1114 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1115 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
1116 foldlen -= UNISKIP( uvc ); \
1117 scan = foldbuf + UNISKIP( uvc ); \
1118 } \
1119 } else { \
1120 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1121 } \
1122 } else { \
1123 uvc = (U32)*uc; \
1124 len = 1; \
1125 } \
1126} STMT_END
1127
1128
1129
1130#define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
1131 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
1132 TRIE_LIST_LEN( state ) *= 2; \
1133 Renew( trie->states[ state ].trans.list, \
1134 TRIE_LIST_LEN( state ), reg_trie_trans_le ); \
1135 } \
1136 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
1137 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
1138 TRIE_LIST_CUR( state )++; \
1139} STMT_END
07be1b83 1140
786e8c11
YO
1141#define TRIE_LIST_NEW(state) STMT_START { \
1142 Newxz( trie->states[ state ].trans.list, \
1143 4, reg_trie_trans_le ); \
1144 TRIE_LIST_CUR( state ) = 1; \
1145 TRIE_LIST_LEN( state ) = 4; \
1146} STMT_END
07be1b83 1147
786e8c11
YO
1148#define TRIE_HANDLE_WORD(state) STMT_START { \
1149 U16 dupe= trie->states[ state ].wordnum; \
1150 regnode * const noper_next = regnext( noper ); \
1151 \
1152 if (trie->wordlen) \
1153 trie->wordlen[ curword ] = wordlen; \
1154 DEBUG_r({ \
1155 /* store the word for dumping */ \
1156 SV* tmp; \
1157 if (OP(noper) != NOTHING) \
1158 tmp = newSVpvn(STRING(noper), STR_LEN(noper)); \
1159 else \
1160 tmp = newSVpvn( "", 0 ); \
1161 if ( UTF ) SvUTF8_on( tmp ); \
1162 av_push( trie->words, tmp ); \
1163 }); \
1164 \
1165 curword++; \
1166 \
1167 if ( noper_next < tail ) { \
1168 if (!trie->jump) \
1169 Newxz( trie->jump, word_count + 1, U16); \
1170 trie->jump[curword] = (U16)(tail - noper_next); \
1171 if (!jumper) \
1172 jumper = noper_next; \
1173 if (!nextbranch) \
1174 nextbranch= regnext(cur); \
1175 } \
1176 \
1177 if ( dupe ) { \
1178 /* So it's a dupe. This means we need to maintain a */\
1179 /* linked-list from the first to the next. */\
1180 /* we only allocate the nextword buffer when there */\
1181 /* a dupe, so first time we have to do the allocation */\
1182 if (!trie->nextword) \
1183 Newxz( trie->nextword, word_count + 1, U16); \
1184 while ( trie->nextword[dupe] ) \
1185 dupe= trie->nextword[dupe]; \
1186 trie->nextword[dupe]= curword; \
1187 } else { \
1188 /* we haven't inserted this word yet. */ \
1189 trie->states[ state ].wordnum = curword; \
1190 } \
1191} STMT_END
07be1b83 1192
3dab1dad 1193
786e8c11
YO
1194#define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
1195 ( ( base + charid >= ucharcount \
1196 && base + charid < ubound \
1197 && state == trie->trans[ base - ucharcount + charid ].check \
1198 && trie->trans[ base - ucharcount + charid ].next ) \
1199 ? trie->trans[ base - ucharcount + charid ].next \
1200 : ( state==1 ? special : 0 ) \
1201 )
3dab1dad 1202
786e8c11
YO
1203#define MADE_TRIE 1
1204#define MADE_JUMP_TRIE 2
1205#define MADE_EXACT_TRIE 4
3dab1dad 1206
a3621e74 1207STATIC I32
786e8c11 1208S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth)
a3621e74 1209{
27da23d5 1210 dVAR;
a3621e74
YO
1211 /* first pass, loop through and scan words */
1212 reg_trie_data *trie;
1213 regnode *cur;
9f7f3913 1214 const U32 uniflags = UTF8_ALLOW_DEFAULT;
a3621e74
YO
1215 STRLEN len = 0;
1216 UV uvc = 0;
1217 U16 curword = 0;
1218 U32 next_alloc = 0;
786e8c11
YO
1219 regnode *jumper = NULL;
1220 regnode *nextbranch = NULL;
a3621e74 1221 /* we just use folder as a flag in utf8 */
e1ec3a88 1222 const U8 * const folder = ( flags == EXACTF
a3621e74
YO
1223 ? PL_fold
1224 : ( flags == EXACTFL
1225 ? PL_fold_locale
1226 : NULL
1227 )
1228 );
1229
e1ec3a88 1230 const U32 data_slot = add_data( pRExC_state, 1, "t" );
a3621e74 1231 SV *re_trie_maxbuff;
3dab1dad
YO
1232#ifndef DEBUGGING
1233 /* these are only used during construction but are useful during
8e11feef 1234 * debugging so we store them in the struct when debugging.
8e11feef 1235 */
3dab1dad 1236 STRLEN trie_charcount=0;
3dab1dad
YO
1237 AV *trie_revcharmap;
1238#endif
a3621e74 1239 GET_RE_DEBUG_FLAGS_DECL;
72f13be8
YO
1240#ifndef DEBUGGING
1241 PERL_UNUSED_ARG(depth);
1242#endif
a3621e74 1243
a02a5408 1244 Newxz( trie, 1, reg_trie_data );
a3621e74 1245 trie->refcount = 1;
3dab1dad 1246 trie->startstate = 1;
786e8c11 1247 trie->wordcount = word_count;
a3621e74 1248 RExC_rx->data->data[ data_slot ] = (void*)trie;
a02a5408 1249 Newxz( trie->charmap, 256, U16 );
3dab1dad
YO
1250 if (!(UTF && folder))
1251 Newxz( trie->bitmap, ANYOF_BITMAP_SIZE, char );
a3621e74
YO
1252 DEBUG_r({
1253 trie->words = newAV();
a3621e74 1254 });
3dab1dad 1255 TRIE_REVCHARMAP(trie) = newAV();
a3621e74 1256
0111c4fd 1257 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
a3621e74 1258 if (!SvIOK(re_trie_maxbuff)) {
0111c4fd 1259 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
a3621e74 1260 }
3dab1dad
YO
1261 DEBUG_OPTIMISE_r({
1262 PerlIO_printf( Perl_debug_log,
786e8c11 1263 "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
3dab1dad
YO
1264 (int)depth * 2 + 2, "",
1265 REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
786e8c11 1266 REG_NODE_NUM(last), REG_NODE_NUM(tail),
85c3142d 1267 (int)depth);
3dab1dad 1268 });
a3621e74
YO
1269 /* -- First loop and Setup --
1270
1271 We first traverse the branches and scan each word to determine if it
1272 contains widechars, and how many unique chars there are, this is
1273 important as we have to build a table with at least as many columns as we
1274 have unique chars.
1275
1276 We use an array of integers to represent the character codes 0..255
1277 (trie->charmap) and we use a an HV* to store unicode characters. We use the
1278 native representation of the character value as the key and IV's for the
1279 coded index.
1280
1281 *TODO* If we keep track of how many times each character is used we can
1282 remap the columns so that the table compression later on is more
1283 efficient in terms of memory by ensuring most common value is in the
1284 middle and the least common are on the outside. IMO this would be better
1285 than a most to least common mapping as theres a decent chance the most
1286 common letter will share a node with the least common, meaning the node
1287 will not be compressable. With a middle is most common approach the worst
1288 case is when we have the least common nodes twice.
1289
1290 */
1291
a3621e74 1292 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
c445ea15 1293 regnode * const noper = NEXTOPER( cur );
e1ec3a88 1294 const U8 *uc = (U8*)STRING( noper );
a28509cc 1295 const U8 * const e = uc + STR_LEN( noper );
a3621e74
YO
1296 STRLEN foldlen = 0;
1297 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2af232bd 1298 const U8 *scan = (U8*)NULL;
07be1b83 1299 U32 wordlen = 0; /* required init */
3dab1dad 1300 STRLEN chars=0;
a3621e74 1301
3dab1dad
YO
1302 if (OP(noper) == NOTHING) {
1303 trie->minlen= 0;
1304 continue;
1305 }
1306 if (trie->bitmap) {
1307 TRIE_BITMAP_SET(trie,*uc);
1308 if ( folder ) TRIE_BITMAP_SET(trie,folder[ *uc ]);
1309 }
a3621e74 1310 for ( ; uc < e ; uc += len ) {
3dab1dad 1311 TRIE_CHARCOUNT(trie)++;
a3621e74 1312 TRIE_READ_CHAR;
3dab1dad 1313 chars++;
a3621e74
YO
1314 if ( uvc < 256 ) {
1315 if ( !trie->charmap[ uvc ] ) {
1316 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
1317 if ( folder )
1318 trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
3dab1dad 1319 TRIE_STORE_REVCHAR;
a3621e74
YO
1320 }
1321 } else {
1322 SV** svpp;
1323 if ( !trie->widecharmap )
1324 trie->widecharmap = newHV();
1325
1326 svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 1 );
1327
1328 if ( !svpp )
e4584336 1329 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
a3621e74
YO
1330
1331 if ( !SvTRUE( *svpp ) ) {
1332 sv_setiv( *svpp, ++trie->uniquecharcount );
3dab1dad 1333 TRIE_STORE_REVCHAR;
a3621e74
YO
1334 }
1335 }
1336 }
3dab1dad
YO
1337 if( cur == first ) {
1338 trie->minlen=chars;
1339 trie->maxlen=chars;
1340 } else if (chars < trie->minlen) {
1341 trie->minlen=chars;
1342 } else if (chars > trie->maxlen) {
1343 trie->maxlen=chars;
1344 }
1345
a3621e74
YO
1346 } /* end first pass */
1347 DEBUG_TRIE_COMPILE_r(
3dab1dad
YO
1348 PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1349 (int)depth * 2 + 2,"",
85c3142d 1350 ( trie->widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
be8e71aa
YO
1351 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
1352 (int)trie->minlen, (int)trie->maxlen )
a3621e74 1353 );
786e8c11 1354 Newxz( trie->wordlen, word_count, U32 );
a3621e74
YO
1355
1356 /*
1357 We now know what we are dealing with in terms of unique chars and
1358 string sizes so we can calculate how much memory a naive
0111c4fd
RGS
1359 representation using a flat table will take. If it's over a reasonable
1360 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
a3621e74
YO
1361 conservative but potentially much slower representation using an array
1362 of lists.
1363
1364 At the end we convert both representations into the same compressed
1365 form that will be used in regexec.c for matching with. The latter
1366 is a form that cannot be used to construct with but has memory
1367 properties similar to the list form and access properties similar
1368 to the table form making it both suitable for fast searches and
1369 small enough that its feasable to store for the duration of a program.
1370
1371 See the comment in the code where the compressed table is produced
1372 inplace from the flat tabe representation for an explanation of how
1373 the compression works.
1374
1375 */
1376
1377
3dab1dad 1378 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
a3621e74
YO
1379 /*
1380 Second Pass -- Array Of Lists Representation
1381
1382 Each state will be represented by a list of charid:state records
1383 (reg_trie_trans_le) the first such element holds the CUR and LEN
1384 points of the allocated array. (See defines above).
1385
1386 We build the initial structure using the lists, and then convert
1387 it into the compressed table form which allows faster lookups
1388 (but cant be modified once converted).
a3621e74
YO
1389 */
1390
a3621e74
YO
1391 STRLEN transcount = 1;
1392
3dab1dad 1393 Newxz( trie->states, TRIE_CHARCOUNT(trie) + 2, reg_trie_state );
a3621e74
YO
1394 TRIE_LIST_NEW(1);
1395 next_alloc = 2;
1396
1397 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1398
c445ea15
AL
1399 regnode * const noper = NEXTOPER( cur );
1400 U8 *uc = (U8*)STRING( noper );
1401 const U8 * const e = uc + STR_LEN( noper );
1402 U32 state = 1; /* required init */
1403 U16 charid = 0; /* sanity init */
1404 U8 *scan = (U8*)NULL; /* sanity init */
1405 STRLEN foldlen = 0; /* required init */
07be1b83 1406 U32 wordlen = 0; /* required init */
c445ea15
AL
1407 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1408
3dab1dad 1409 if (OP(noper) != NOTHING) {
786e8c11 1410 for ( ; uc < e ; uc += len ) {
c445ea15 1411
786e8c11 1412 TRIE_READ_CHAR;
c445ea15 1413
786e8c11
YO
1414 if ( uvc < 256 ) {
1415 charid = trie->charmap[ uvc ];
c445ea15 1416 } else {
786e8c11
YO
1417 SV** const svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 0);
1418 if ( !svpp ) {
1419 charid = 0;
1420 } else {
1421 charid=(U16)SvIV( *svpp );
1422 }
c445ea15 1423 }
786e8c11
YO
1424 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1425 if ( charid ) {
a3621e74 1426
786e8c11
YO
1427 U16 check;
1428 U32 newstate = 0;
a3621e74 1429
786e8c11
YO
1430 charid--;
1431 if ( !trie->states[ state ].trans.list ) {
1432 TRIE_LIST_NEW( state );
c445ea15 1433 }
786e8c11
YO
1434 for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1435 if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1436 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1437 break;
1438 }
1439 }
1440 if ( ! newstate ) {
1441 newstate = next_alloc++;
1442 TRIE_LIST_PUSH( state, charid, newstate );
1443 transcount++;
1444 }
1445 state = newstate;
1446 } else {
1447 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
c445ea15 1448 }
a28509cc 1449 }
c445ea15 1450 }
3dab1dad 1451 TRIE_HANDLE_WORD(state);
a3621e74
YO
1452
1453 } /* end second pass */
1454
786e8c11 1455 trie->laststate = next_alloc;
a3621e74
YO
1456 Renew( trie->states, next_alloc, reg_trie_state );
1457
3dab1dad
YO
1458 /* and now dump it out before we compress it */
1459 DEBUG_TRIE_COMPILE_MORE_r(
1460 dump_trie_interim_list(trie,next_alloc,depth+1)
a3621e74 1461 );
a3621e74 1462
a02a5408 1463 Newxz( trie->trans, transcount ,reg_trie_trans );
a3621e74
YO
1464 {
1465 U32 state;
a3621e74
YO
1466 U32 tp = 0;
1467 U32 zp = 0;
1468
1469
1470 for( state=1 ; state < next_alloc ; state ++ ) {
1471 U32 base=0;
1472
1473 /*
1474 DEBUG_TRIE_COMPILE_MORE_r(
1475 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1476 );
1477 */
1478
1479 if (trie->states[state].trans.list) {
1480 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1481 U16 maxid=minid;
a28509cc 1482 U16 idx;
a3621e74
YO
1483
1484 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
c445ea15
AL
1485 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1486 if ( forid < minid ) {
1487 minid=forid;
1488 } else if ( forid > maxid ) {
1489 maxid=forid;
1490 }
a3621e74
YO
1491 }
1492 if ( transcount < tp + maxid - minid + 1) {
1493 transcount *= 2;
1494 Renew( trie->trans, transcount, reg_trie_trans );
1495 Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1496 }
1497 base = trie->uniquecharcount + tp - minid;
1498 if ( maxid == minid ) {
1499 U32 set = 0;
1500 for ( ; zp < tp ; zp++ ) {
1501 if ( ! trie->trans[ zp ].next ) {
1502 base = trie->uniquecharcount + zp - minid;
1503 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1504 trie->trans[ zp ].check = state;
1505 set = 1;
1506 break;
1507 }
1508 }
1509 if ( !set ) {
1510 trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1511 trie->trans[ tp ].check = state;
1512 tp++;
1513 zp = tp;
1514 }
1515 } else {
1516 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
c445ea15 1517 const U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
a3621e74
YO
1518 trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1519 trie->trans[ tid ].check = state;
1520 }
1521 tp += ( maxid - minid + 1 );
1522 }
1523 Safefree(trie->states[ state ].trans.list);
1524 }
1525 /*
1526 DEBUG_TRIE_COMPILE_MORE_r(
1527 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1528 );
1529 */
1530 trie->states[ state ].trans.base=base;
1531 }
cc601c31 1532 trie->lasttrans = tp + 1;
a3621e74
YO
1533 }
1534 } else {
1535 /*
1536 Second Pass -- Flat Table Representation.
1537
1538 we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1539 We know that we will need Charcount+1 trans at most to store the data
1540 (one row per char at worst case) So we preallocate both structures
1541 assuming worst case.
1542
1543 We then construct the trie using only the .next slots of the entry
1544 structs.
1545
1546 We use the .check field of the first entry of the node temporarily to
1547 make compression both faster and easier by keeping track of how many non
1548 zero fields are in the node.
1549
1550 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1551 transition.
1552
1553 There are two terms at use here: state as a TRIE_NODEIDX() which is a
1554 number representing the first entry of the node, and state as a
1555 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1556 TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1557 are 2 entrys per node. eg:
1558
1559 A B A B
1560 1. 2 4 1. 3 7
1561 2. 0 3 3. 0 5
1562 3. 0 0 5. 0 0
1563 4. 0 0 7. 0 0
1564
1565 The table is internally in the right hand, idx form. However as we also
1566 have to deal with the states array which is indexed by nodenum we have to
1567 use TRIE_NODENUM() to convert.
1568
1569 */
1570
3dab1dad
YO
1571
1572 Newxz( trie->trans, ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1,
a3621e74 1573 reg_trie_trans );
3dab1dad 1574 Newxz( trie->states, TRIE_CHARCOUNT(trie) + 2, reg_trie_state );
a3621e74
YO
1575 next_alloc = trie->uniquecharcount + 1;
1576
3dab1dad 1577
a3621e74
YO
1578 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1579
c445ea15 1580 regnode * const noper = NEXTOPER( cur );
a28509cc
AL
1581 const U8 *uc = (U8*)STRING( noper );
1582 const U8 * const e = uc + STR_LEN( noper );
a3621e74
YO
1583
1584 U32 state = 1; /* required init */
1585
1586 U16 charid = 0; /* sanity init */
1587 U32 accept_state = 0; /* sanity init */
1588 U8 *scan = (U8*)NULL; /* sanity init */
1589
1590 STRLEN foldlen = 0; /* required init */
07be1b83 1591 U32 wordlen = 0; /* required init */
a3621e74
YO
1592 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1593
3dab1dad 1594 if ( OP(noper) != NOTHING ) {
786e8c11 1595 for ( ; uc < e ; uc += len ) {
a3621e74 1596
786e8c11 1597 TRIE_READ_CHAR;
a3621e74 1598
786e8c11
YO
1599 if ( uvc < 256 ) {
1600 charid = trie->charmap[ uvc ];
1601 } else {
1602 SV* const * const svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 0);
1603 charid = svpp ? (U16)SvIV(*svpp) : 0;
a3621e74 1604 }
786e8c11
YO
1605 if ( charid ) {
1606 charid--;
1607 if ( !trie->trans[ state + charid ].next ) {
1608 trie->trans[ state + charid ].next = next_alloc;
1609 trie->trans[ state ].check++;
1610 next_alloc += trie->uniquecharcount;
1611 }
1612 state = trie->trans[ state + charid ].next;
1613 } else {
1614 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1615 }
1616 /* charid is now 0 if we dont know the char read, or nonzero if we do */
a3621e74 1617 }
a3621e74 1618 }
3dab1dad
YO
1619 accept_state = TRIE_NODENUM( state );
1620 TRIE_HANDLE_WORD(accept_state);
a3621e74
YO
1621
1622 } /* end second pass */
1623
3dab1dad
YO
1624 /* and now dump it out before we compress it */
1625 DEBUG_TRIE_COMPILE_MORE_r(
1626 dump_trie_interim_table(trie,next_alloc,depth+1)
1627 );
a3621e74 1628
a3621e74
YO
1629 {
1630 /*
1631 * Inplace compress the table.*
1632
1633 For sparse data sets the table constructed by the trie algorithm will
1634 be mostly 0/FAIL transitions or to put it another way mostly empty.
1635 (Note that leaf nodes will not contain any transitions.)
1636
1637 This algorithm compresses the tables by eliminating most such
1638 transitions, at the cost of a modest bit of extra work during lookup:
1639
1640 - Each states[] entry contains a .base field which indicates the
1641 index in the state[] array wheres its transition data is stored.
1642
1643 - If .base is 0 there are no valid transitions from that node.
1644
1645 - If .base is nonzero then charid is added to it to find an entry in
1646 the trans array.
1647
1648 -If trans[states[state].base+charid].check!=state then the
1649 transition is taken to be a 0/Fail transition. Thus if there are fail
1650 transitions at the front of the node then the .base offset will point
1651 somewhere inside the previous nodes data (or maybe even into a node
1652 even earlier), but the .check field determines if the transition is
1653 valid.
1654
786e8c11 1655 XXX - wrong maybe?
a3621e74
YO
1656 The following process inplace converts the table to the compressed
1657 table: We first do not compress the root node 1,and mark its all its
1658 .check pointers as 1 and set its .base pointer as 1 as well. This
1659 allows to do a DFA construction from the compressed table later, and
1660 ensures that any .base pointers we calculate later are greater than
1661 0.
1662
1663 - We set 'pos' to indicate the first entry of the second node.
1664
1665 - We then iterate over the columns of the node, finding the first and
1666 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
1667 and set the .check pointers accordingly, and advance pos
1668 appropriately and repreat for the next node. Note that when we copy
1669 the next pointers we have to convert them from the original
1670 NODEIDX form to NODENUM form as the former is not valid post
1671 compression.
1672
1673 - If a node has no transitions used we mark its base as 0 and do not
1674 advance the pos pointer.
1675
1676 - If a node only has one transition we use a second pointer into the
1677 structure to fill in allocated fail transitions from other states.
1678 This pointer is independent of the main pointer and scans forward
1679 looking for null transitions that are allocated to a state. When it
1680 finds one it writes the single transition into the "hole". If the
786e8c11 1681 pointer doesnt find one the single transition is appended as normal.
a3621e74
YO
1682
1683 - Once compressed we can Renew/realloc the structures to release the
1684 excess space.
1685
1686 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
1687 specifically Fig 3.47 and the associated pseudocode.
1688
1689 demq
1690 */
a3b680e6 1691 const U32 laststate = TRIE_NODENUM( next_alloc );
a28509cc 1692 U32 state, charid;
a3621e74 1693 U32 pos = 0, zp=0;
786e8c11 1694 trie->laststate = laststate;
a3621e74
YO
1695
1696 for ( state = 1 ; state < laststate ; state++ ) {
1697 U8 flag = 0;
a28509cc
AL
1698 const U32 stateidx = TRIE_NODEIDX( state );
1699 const U32 o_used = trie->trans[ stateidx ].check;
1700 U32 used = trie->trans[ stateidx ].check;
a3621e74
YO
1701 trie->trans[ stateidx ].check = 0;
1702
1703 for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
1704 if ( flag || trie->trans[ stateidx + charid ].next ) {
1705 if ( trie->trans[ stateidx + charid ].next ) {
1706 if (o_used == 1) {
1707 for ( ; zp < pos ; zp++ ) {
1708 if ( ! trie->trans[ zp ].next ) {
1709 break;
1710 }
1711 }
1712 trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
1713 trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1714 trie->trans[ zp ].check = state;
1715 if ( ++zp > pos ) pos = zp;
1716 break;
1717 }
1718 used--;
1719 }
1720 if ( !flag ) {
1721 flag = 1;
1722 trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
1723 }
1724 trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1725 trie->trans[ pos ].check = state;
1726 pos++;
1727 }
1728 }
1729 }
cc601c31 1730 trie->lasttrans = pos + 1;
a3621e74
YO
1731 Renew( trie->states, laststate + 1, reg_trie_state);
1732 DEBUG_TRIE_COMPILE_MORE_r(
e4584336 1733 PerlIO_printf( Perl_debug_log,
3dab1dad
YO
1734 "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
1735 (int)depth * 2 + 2,"",
1736 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
5d7488b2
AL
1737 (IV)next_alloc,
1738 (IV)pos,
a3621e74
YO
1739 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
1740 );
1741
1742 } /* end table compress */
1743 }
cc601c31
YO
1744 /* resize the trans array to remove unused space */
1745 Renew( trie->trans, trie->lasttrans, reg_trie_trans);
a3621e74 1746
3dab1dad
YO
1747 /* and now dump out the compressed format */
1748 DEBUG_TRIE_COMPILE_r(
1749 dump_trie(trie,depth+1)
1750 );
07be1b83 1751
3dab1dad
YO
1752 { /* Modify the program and insert the new TRIE node*/
1753 regnode *convert;
1754 U8 nodetype =(U8)(flags & 0xFF);
1755 char *str=NULL;
786e8c11 1756
07be1b83 1757#ifdef DEBUGGING
786e8c11 1758
07be1b83
YO
1759 U32 mjd_offset;
1760 U32 mjd_nodelen;
1761#endif
a3621e74 1762 /*
3dab1dad
YO
1763 This means we convert either the first branch or the first Exact,
1764 depending on whether the thing following (in 'last') is a branch
1765 or not and whther first is the startbranch (ie is it a sub part of
1766 the alternation or is it the whole thing.)
1767 Assuming its a sub part we conver the EXACT otherwise we convert
1768 the whole branch sequence, including the first.
a3621e74 1769 */
3dab1dad
YO
1770 /* Find the node we are going to overwrite */
1771 if ( first == startbranch && OP( last ) != BRANCH ) {
07be1b83 1772 /* whole branch chain */
3dab1dad 1773 convert = first;
07be1b83
YO
1774 DEBUG_r({
1775 const regnode *nop = NEXTOPER( convert );
1776 mjd_offset= Node_Offset((nop));
1777 mjd_nodelen= Node_Length((nop));
1778 });
1779 } else {
1780 /* branch sub-chain */
3dab1dad
YO
1781 convert = NEXTOPER( first );
1782 NEXT_OFF( first ) = (U16)(last - first);
07be1b83
YO
1783 DEBUG_r({
1784 mjd_offset= Node_Offset((convert));
1785 mjd_nodelen= Node_Length((convert));
1786 });
1787 }
1788 DEBUG_OPTIMISE_r(
1789 PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
1790 (int)depth * 2 + 2, "",
786e8c11 1791 (UV)mjd_offset, (UV)mjd_nodelen)
07be1b83 1792 );
a3621e74 1793
3dab1dad
YO
1794 /* But first we check to see if there is a common prefix we can
1795 split out as an EXACT and put in front of the TRIE node. */
1796 trie->startstate= 1;
786e8c11 1797 if ( trie->bitmap && !trie->widecharmap && !trie->jump ) {
3dab1dad
YO
1798 U32 state;
1799 DEBUG_OPTIMISE_r(
8e11feef
RGS
1800 PerlIO_printf(Perl_debug_log, "%*sLaststate:%"UVuf"\n",
1801 (int)depth * 2 + 2, "",
786e8c11 1802 (UV)trie->laststate)
8e11feef 1803 );
786e8c11 1804 for ( state = 1 ; state < trie->laststate-1 ; state++ ) {
a3621e74 1805 U32 ofs = 0;
8e11feef
RGS
1806 I32 idx = -1;
1807 U32 count = 0;
1808 const U32 base = trie->states[ state ].trans.base;
a3621e74 1809
3dab1dad 1810 if ( trie->states[state].wordnum )
8e11feef 1811 count = 1;
a3621e74 1812
8e11feef 1813 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
cc601c31
YO
1814 if ( ( base + ofs >= trie->uniquecharcount ) &&
1815 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
a3621e74
YO
1816 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1817 {
3dab1dad 1818 if ( ++count > 1 ) {
8e11feef 1819 SV **tmp = av_fetch( TRIE_REVCHARMAP(trie), ofs, 0);
07be1b83 1820 const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
8e11feef 1821 if ( state == 1 ) break;
3dab1dad
YO
1822 if ( count == 2 ) {
1823 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
1824 DEBUG_OPTIMISE_r(
8e11feef
RGS
1825 PerlIO_printf(Perl_debug_log,
1826 "%*sNew Start State=%"UVuf" Class: [",
1827 (int)depth * 2 + 2, "",
786e8c11 1828 (UV)state));
be8e71aa
YO
1829 if (idx >= 0) {
1830 SV ** const tmp = av_fetch( TRIE_REVCHARMAP(trie), idx, 0);
1831 const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
8e11feef 1832
3dab1dad 1833 TRIE_BITMAP_SET(trie,*ch);
8e11feef
RGS
1834 if ( folder )
1835 TRIE_BITMAP_SET(trie, folder[ *ch ]);
3dab1dad 1836 DEBUG_OPTIMISE_r(
07be1b83 1837 PerlIO_printf(Perl_debug_log, (char*)ch)
3dab1dad 1838 );
8e11feef
RGS
1839 }
1840 }
1841 TRIE_BITMAP_SET(trie,*ch);
1842 if ( folder )
1843 TRIE_BITMAP_SET(trie,folder[ *ch ]);
1844 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
1845 }
1846 idx = ofs;
1847 }
3dab1dad
YO
1848 }
1849 if ( count == 1 ) {
1850 SV **tmp = av_fetch( TRIE_REVCHARMAP(trie), idx, 0);
8e11feef 1851 const char *ch = SvPV_nolen_const( *tmp );
3dab1dad 1852 DEBUG_OPTIMISE_r(
8e11feef
RGS
1853 PerlIO_printf( Perl_debug_log,
1854 "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
1855 (int)depth * 2 + 2, "",
786e8c11 1856 (UV)state, (UV)idx, ch)
3dab1dad
YO
1857 );
1858 if ( state==1 ) {
1859 OP( convert ) = nodetype;
1860 str=STRING(convert);
1861 STR_LEN(convert)=0;
1862 }
1863 *str++=*ch;
1864 STR_LEN(convert)++;
a3621e74 1865
8e11feef 1866 } else {
f9049ba1 1867#ifdef DEBUGGING
8e11feef
RGS
1868 if (state>1)
1869 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
f9049ba1 1870#endif
8e11feef
RGS
1871 break;
1872 }
1873 }
3dab1dad 1874 if (str) {
8e11feef 1875 regnode *n = convert+NODE_SZ_STR(convert);
07be1b83 1876 NEXT_OFF(convert) = NODE_SZ_STR(convert);
8e11feef 1877 trie->startstate = state;
07be1b83
YO
1878 trie->minlen -= (state - 1);
1879 trie->maxlen -= (state - 1);
1880 DEBUG_r({
1881 regnode *fix = convert;
1882 mjd_nodelen++;
1883 Set_Node_Offset_Length(convert, mjd_offset, state - 1);
1884 while( ++fix < n ) {
1885 Set_Node_Offset_Length(fix, 0, 0);
1886 }
1887 });
8e11feef
RGS
1888 if (trie->maxlen) {
1889 convert = n;
1890 } else {
3dab1dad
YO
1891 NEXT_OFF(convert) = (U16)(tail - convert);
1892 }
1893 }
1894 }
1895 if ( trie->maxlen ) {
8e11feef
RGS
1896 NEXT_OFF( convert ) = (U16)(tail - convert);
1897 ARG_SET( convert, data_slot );
786e8c11
YO
1898 /* Store the offset to the first unabsorbed branch in
1899 jump[0], which is otherwise unused by the jump logic.
1900 We use this when dumping a trie and during optimisation. */
1901 if (trie->jump)
1902 trie->jump[0] = (U16)(tail - nextbranch);
1903 if (!jumper)
85c3142d 1904 jumper = last;
786e8c11
YO
1905 /* XXXX */
1906 if ( !trie->states[trie->startstate].wordnum && trie->bitmap &&
1de06328 1907 ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
786e8c11
YO
1908 {
1909 OP( convert ) = TRIEC;
1910 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
1911 Safefree(trie->bitmap);
1912 trie->bitmap= NULL;
1913 } else
1914 OP( convert ) = TRIE;
a3621e74 1915
3dab1dad
YO
1916 /* store the type in the flags */
1917 convert->flags = nodetype;
1918 /* XXX We really should free up the resource in trie now, as we wont use them */
1919 }
a3621e74
YO
1920 /* needed for dumping*/
1921 DEBUG_r({
786e8c11
YO
1922 regnode *optimize = convert
1923 + NODE_STEP_REGNODE
1924 + regarglen[ OP( convert ) ];
07be1b83
YO
1925 regnode *opt = convert;
1926 while (++opt<optimize) {
1927 Set_Node_Offset_Length(opt,0,0);
1928 }
786e8c11
YO
1929 /*
1930 Try to clean up some of the debris left after the
1931 optimisation.
a3621e74 1932 */
786e8c11 1933 while( optimize < jumper ) {
07be1b83 1934 mjd_nodelen += Node_Length((optimize));
a3621e74 1935 OP( optimize ) = OPTIMIZED;
07be1b83 1936 Set_Node_Offset_Length(optimize,0,0);
a3621e74
YO
1937 optimize++;
1938 }
07be1b83 1939 Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
a3621e74
YO
1940 });
1941 } /* end node insert */
07be1b83 1942#ifndef DEBUGGING
6e8b4190 1943 SvREFCNT_dec(TRIE_REVCHARMAP(trie));
07be1b83 1944#endif
786e8c11
YO
1945 return trie->jump
1946 ? MADE_JUMP_TRIE
1947 : trie->startstate>1
1948 ? MADE_EXACT_TRIE
1949 : MADE_TRIE;
1950}
1951
1952STATIC void
1953S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode *stclass, U32 depth)
1954{
1955/* The Trie is constructed and compressed now so we can build a fail array now if its needed
1956
1957 This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
1958 "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
1959 ISBN 0-201-10088-6
1960
1961 We find the fail state for each state in the trie, this state is the longest proper
1962 suffix of the current states 'word' that is also a proper prefix of another word in our
1963 trie. State 1 represents the word '' and is the thus the default fail state. This allows
1964 the DFA not to have to restart after its tried and failed a word at a given point, it
1965 simply continues as though it had been matching the other word in the first place.
1966 Consider
1967 'abcdgu'=~/abcdefg|cdgu/
1968 When we get to 'd' we are still matching the first word, we would encounter 'g' which would
1969 fail, which would bring use to the state representing 'd' in the second word where we would
1970 try 'g' and succeed, prodceding to match 'cdgu'.
1971 */
1972 /* add a fail transition */
1973 reg_trie_data *trie=(reg_trie_data *)RExC_rx->data->data[ARG(source)];
1974 U32 *q;
1975 const U32 ucharcount = trie->uniquecharcount;
1976 const U32 numstates = trie->laststate;
1977 const U32 ubound = trie->lasttrans + ucharcount;
1978 U32 q_read = 0;
1979 U32 q_write = 0;
1980 U32 charid;
1981 U32 base = trie->states[ 1 ].trans.base;
1982 U32 *fail;
1983 reg_ac_data *aho;
1984 const U32 data_slot = add_data( pRExC_state, 1, "T" );
1985 GET_RE_DEBUG_FLAGS_DECL;
1986#ifndef DEBUGGING
1987 PERL_UNUSED_ARG(depth);
1988#endif
1989
1990
1991 ARG_SET( stclass, data_slot );
1992 Newxz( aho, 1, reg_ac_data );
1993 RExC_rx->data->data[ data_slot ] = (void*)aho;
1994 aho->trie=trie;
1995 aho->states=(reg_trie_state *)savepvn((const char*)trie->states,
1996 (trie->laststate+1)*sizeof(reg_trie_state));
1997 Newxz( q, numstates, U32);
1998 Newxz( aho->fail, numstates, U32 );
1999 aho->refcount = 1;
2000 fail = aho->fail;
2001 /* initialize fail[0..1] to be 1 so that we always have
2002 a valid final fail state */
2003 fail[ 0 ] = fail[ 1 ] = 1;
2004
2005 for ( charid = 0; charid < ucharcount ; charid++ ) {
2006 const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
2007 if ( newstate ) {
2008 q[ q_write ] = newstate;
2009 /* set to point at the root */
2010 fail[ q[ q_write++ ] ]=1;
2011 }
2012 }
2013 while ( q_read < q_write) {
2014 const U32 cur = q[ q_read++ % numstates ];
2015 base = trie->states[ cur ].trans.base;
2016
2017 for ( charid = 0 ; charid < ucharcount ; charid++ ) {
2018 const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
2019 if (ch_state) {
2020 U32 fail_state = cur;
2021 U32 fail_base;
2022 do {
2023 fail_state = fail[ fail_state ];
2024 fail_base = aho->states[ fail_state ].trans.base;
2025 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
2026
2027 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
2028 fail[ ch_state ] = fail_state;
2029 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
2030 {
2031 aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum;
2032 }
2033 q[ q_write++ % numstates] = ch_state;
2034 }
2035 }
2036 }
2037 /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
2038 when we fail in state 1, this allows us to use the
2039 charclass scan to find a valid start char. This is based on the principle
2040 that theres a good chance the string being searched contains lots of stuff
2041 that cant be a start char.
2042 */
2043 fail[ 0 ] = fail[ 1 ] = 0;
2044 DEBUG_TRIE_COMPILE_r({
2045 PerlIO_printf(Perl_debug_log, "%*sStclass Failtable: 0", (int)(depth * 2), "");
2046 for( q_read=1; q_read<numstates; q_read++ ) {
2047 PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
2048 }
2049 PerlIO_printf(Perl_debug_log, "\n");
2050 });
2051 Safefree(q);
2052 /*RExC_seen |= REG_SEEN_TRIEDFA;*/
a3621e74
YO
2053}
2054
786e8c11 2055
a3621e74 2056/*
5d1c421c
JH
2057 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
2058 * These need to be revisited when a newer toolchain becomes available.
2059 */
2060#if defined(__sparc64__) && defined(__GNUC__)
2061# if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
2062# undef SPARC64_GCC_WORKAROUND
2063# define SPARC64_GCC_WORKAROUND 1
2064# endif
2065#endif
2066
07be1b83
YO
2067#define DEBUG_PEEP(str,scan,depth) \
2068 DEBUG_OPTIMISE_r({ \
2069 SV * const mysv=sv_newmortal(); \
2070 regnode *Next = regnext(scan); \
2071 regprop(RExC_rx, mysv, scan); \
2072 PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s [%d]\n", \
2073 (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
2074 Next ? (REG_NODE_NUM(Next)) : 0 ); \
2075 });
2076
1de06328
YO
2077
2078
2079
2080
07be1b83
YO
2081#define JOIN_EXACT(scan,min,flags) \
2082 if (PL_regkind[OP(scan)] == EXACT) \
2083 join_exact(pRExC_state,(scan),(min),(flags),NULL,depth+1)
2084
be8e71aa 2085STATIC U32
07be1b83
YO
2086S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, I32 *min, U32 flags,regnode *val, U32 depth) {
2087 /* Merge several consecutive EXACTish nodes into one. */
2088 regnode *n = regnext(scan);
2089 U32 stringok = 1;
2090 regnode *next = scan + NODE_SZ_STR(scan);
2091 U32 merged = 0;
2092 U32 stopnow = 0;
2093#ifdef DEBUGGING
2094 regnode *stop = scan;
72f13be8 2095 GET_RE_DEBUG_FLAGS_DECL;
f9049ba1 2096#else
d47053eb
RGS
2097 PERL_UNUSED_ARG(depth);
2098#endif
2099#ifndef EXPERIMENTAL_INPLACESCAN
f9049ba1
SP
2100 PERL_UNUSED_ARG(flags);
2101 PERL_UNUSED_ARG(val);
07be1b83 2102#endif
07be1b83
YO
2103 DEBUG_PEEP("join",scan,depth);
2104
2105 /* Skip NOTHING, merge EXACT*. */
2106 while (n &&
2107 ( PL_regkind[OP(n)] == NOTHING ||
2108 (stringok && (OP(n) == OP(scan))))
2109 && NEXT_OFF(n)
2110 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
2111
2112 if (OP(n) == TAIL || n > next)
2113 stringok = 0;
2114 if (PL_regkind[OP(n)] == NOTHING) {
07be1b83
YO
2115 DEBUG_PEEP("skip:",n,depth);
2116 NEXT_OFF(scan) += NEXT_OFF(n);
2117 next = n + NODE_STEP_REGNODE;
2118#ifdef DEBUGGING
2119 if (stringok)
2120 stop = n;
2121#endif
2122 n = regnext(n);
2123 }
2124 else if (stringok) {
786e8c11 2125 const unsigned int oldl = STR_LEN(scan);
07be1b83
YO
2126 regnode * const nnext = regnext(n);
2127
2128 DEBUG_PEEP("merg",n,depth);
2129
2130 merged++;
2131 if (oldl + STR_LEN(n) > U8_MAX)
2132 break;
2133 NEXT_OFF(scan) += NEXT_OFF(n);
2134 STR_LEN(scan) += STR_LEN(n);
2135 next = n + NODE_SZ_STR(n);
2136 /* Now we can overwrite *n : */
2137 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
2138#ifdef DEBUGGING
2139 stop = next - 1;
2140#endif
2141 n = nnext;
2142 if (stopnow) break;
2143 }
2144
d47053eb
RGS
2145#ifdef EXPERIMENTAL_INPLACESCAN
2146 if (flags && !NEXT_OFF(n)) {
2147 DEBUG_PEEP("atch", val, depth);
2148 if (reg_off_by_arg[OP(n)]) {
2149 ARG_SET(n, val - n);
2150 }
2151 else {
2152 NEXT_OFF(n) = val - n;
2153 }
2154 stopnow = 1;
2155 }
07be1b83
YO
2156#endif
2157 }
2158
2159 if (UTF && ( OP(scan) == EXACTF ) && ( STR_LEN(scan) >= 6 ) ) {
2160 /*
2161 Two problematic code points in Unicode casefolding of EXACT nodes:
2162
2163 U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
2164 U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
2165
2166 which casefold to
2167
2168 Unicode UTF-8
2169
2170 U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
2171 U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
2172
2173 This means that in case-insensitive matching (or "loose matching",
2174 as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
2175 length of the above casefolded versions) can match a target string
2176 of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
2177 This would rather mess up the minimum length computation.
2178
2179 What we'll do is to look for the tail four bytes, and then peek
2180 at the preceding two bytes to see whether we need to decrease
2181 the minimum length by four (six minus two).
2182
2183 Thanks to the design of UTF-8, there cannot be false matches:
2184 A sequence of valid UTF-8 bytes cannot be a subsequence of
2185 another valid sequence of UTF-8 bytes.
2186
2187 */
2188 char * const s0 = STRING(scan), *s, *t;
2189 char * const s1 = s0 + STR_LEN(scan) - 1;
2190 char * const s2 = s1 - 4;
e294cc5d
JH
2191#ifdef EBCDIC /* RD tunifold greek 0390 and 03B0 */
2192 const char t0[] = "\xaf\x49\xaf\x42";
2193#else
07be1b83 2194 const char t0[] = "\xcc\x88\xcc\x81";
e294cc5d 2195#endif
07be1b83
YO
2196 const char * const t1 = t0 + 3;
2197
2198 for (s = s0 + 2;
2199 s < s2 && (t = ninstr(s, s1, t0, t1));
2200 s = t + 4) {
e294cc5d
JH
2201#ifdef EBCDIC
2202 if (((U8)t[-1] == 0x68 && (U8)t[-2] == 0xB4) ||
2203 ((U8)t[-1] == 0x46 && (U8)t[-2] == 0xB5))
2204#else
07be1b83
YO
2205 if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
2206 ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
e294cc5d 2207#endif
07be1b83
YO
2208 *min -= 4;
2209 }
2210 }
2211
2212#ifdef DEBUGGING
2213 /* Allow dumping */
2214 n = scan + NODE_SZ_STR(scan);
2215 while (n <= stop) {
2216 if (PL_regkind[OP(n)] != NOTHING || OP(n) == NOTHING) {
2217 OP(n) = OPTIMIZED;
2218 NEXT_OFF(n) = 0;
2219 }
2220 n++;
2221 }
2222#endif
2223 DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
2224 return stopnow;
2225}
2226
653099ff
GS
2227/* REx optimizer. Converts nodes into quickier variants "in place".
2228 Finds fixed substrings. */
2229
a0288114 2230/* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
c277df42
IZ
2231 to the position after last scanned or to NULL. */
2232
07be1b83
YO
2233
2234
76e3520e 2235STATIC I32
1de06328
YO
2236S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
2237 I32 *minlenp, I32 *deltap,
9a957fbc 2238 regnode *last, scan_data_t *data, U32 flags, U32 depth)
c277df42
IZ
2239 /* scanp: Start here (read-write). */
2240 /* deltap: Write maxlen-minlen here. */
2241 /* last: Stop before this one. */
2242{
97aff369 2243 dVAR;
c277df42
IZ
2244 I32 min = 0, pars = 0, code;
2245 regnode *scan = *scanp, *next;
2246 I32 delta = 0;
2247 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
aca2d497 2248 int is_inf_internal = 0; /* The studied chunk is infinite */
c277df42
IZ
2249 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
2250 scan_data_t data_fake;
653099ff 2251 struct regnode_charclass_class and_with; /* Valid if flags & SCF_DO_STCLASS_OR */
a3621e74 2252 SV *re_trie_maxbuff = NULL;
786e8c11
YO
2253 regnode *first_non_open = scan;
2254
a3621e74
YO
2255
2256 GET_RE_DEBUG_FLAGS_DECL;
13a24bad
YO
2257#ifdef DEBUGGING
2258 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
2259#endif
786e8c11
YO
2260 if ( depth == 0 ) {
2261 while (first_non_open && OP(first_non_open) == OPEN)
2262 first_non_open=regnext(first_non_open);
2263 }
2264
b81d288d 2265
c277df42
IZ
2266 while (scan && OP(scan) != END && scan < last) {
2267 /* Peephole optimizer: */
1de06328 2268 DEBUG_STUDYDATA(data,depth);
07be1b83 2269 DEBUG_PEEP("Peep",scan,depth);
07be1b83 2270 JOIN_EXACT(scan,&min,0);
a3621e74 2271
653099ff
GS
2272 /* Follow the next-chain of the current node and optimize
2273 away all the NOTHINGs from it. */
c277df42 2274 if (OP(scan) != CURLYX) {
a3b680e6 2275 const int max = (reg_off_by_arg[OP(scan)]
048cfca1
GS
2276 ? I32_MAX
2277 /* I32 may be smaller than U16 on CRAYs! */
2278 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
c277df42
IZ
2279 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
2280 int noff;
2281 regnode *n = scan;
b81d288d 2282
c277df42
IZ
2283 /* Skip NOTHING and LONGJMP. */
2284 while ((n = regnext(n))
3dab1dad 2285 && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
c277df42
IZ
2286 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
2287 && off + noff < max)
2288 off += noff;
2289 if (reg_off_by_arg[OP(scan)])
2290 ARG(scan) = off;
b81d288d 2291 else
c277df42
IZ
2292 NEXT_OFF(scan) = off;
2293 }
a3621e74 2294
07be1b83 2295
3dab1dad 2296
653099ff
GS
2297 /* The principal pseudo-switch. Cannot be a switch, since we
2298 look into several different things. */
b81d288d 2299 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
c277df42
IZ
2300 || OP(scan) == IFTHEN || OP(scan) == SUSPEND) {
2301 next = regnext(scan);
2302 code = OP(scan);
a3621e74 2303 /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
b81d288d
AB
2304
2305 if (OP(next) == code || code == IFTHEN || code == SUSPEND) {
786e8c11
YO
2306 /* NOTE - There is similar code to this block below for handling
2307 TRIE nodes on a re-study. If you change stuff here check there
2308 too. */
c277df42 2309 I32 max1 = 0, min1 = I32_MAX, num = 0;
653099ff 2310 struct regnode_charclass_class accum;
d4c19fe8 2311 regnode * const startbranch=scan;
c277df42 2312
653099ff 2313 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
1de06328 2314 scan_commit(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
653099ff 2315 if (flags & SCF_DO_STCLASS)
830247a4 2316 cl_init_zero(pRExC_state, &accum);
a3621e74 2317
c277df42 2318 while (OP(scan) == code) {
830247a4 2319 I32 deltanext, minnext, f = 0, fake;
653099ff 2320 struct regnode_charclass_class this_class;
c277df42
IZ
2321
2322 num++;
2323 data_fake.flags = 0;
b81d288d 2324 if (data) {
2c2d71f5 2325 data_fake.whilem_c = data->whilem_c;
cb434fcc
IZ
2326 data_fake.last_closep = data->last_closep;
2327 }
2328 else
2329 data_fake.last_closep = &fake;
c277df42
IZ
2330 next = regnext(scan);
2331 scan = NEXTOPER(scan);
2332 if (code != BRANCH)
2333 scan = NEXTOPER(scan);
653099ff 2334 if (flags & SCF_DO_STCLASS) {
830247a4 2335 cl_init(pRExC_state, &this_class);
653099ff
GS
2336 data_fake.start_class = &this_class;
2337 f = SCF_DO_STCLASS_AND;
b81d288d 2338 }
e1901655
IZ
2339 if (flags & SCF_WHILEM_VISITED_POS)
2340 f |= SCF_WHILEM_VISITED_POS;
a3621e74 2341
653099ff 2342 /* we suppose the run is continuous, last=next...*/
1de06328 2343 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
a3621e74 2344 next, &data_fake, f,depth+1);
b81d288d 2345 if (min1 > minnext)
c277df42
IZ
2346 min1 = minnext;
2347 if (max1 < minnext + deltanext)
2348 max1 = minnext + deltanext;
2349 if (deltanext == I32_MAX)
aca2d497 2350 is_inf = is_inf_internal = 1;
c277df42
IZ
2351 scan = next;
2352 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
2353 pars++;
3dab1dad
YO
2354 if (data) {
2355 if (data_fake.flags & SF_HAS_EVAL)
07be1b83 2356 data->flags |= SF_HAS_EVAL;
2c2d71f5 2357 data->whilem_c = data_fake.whilem_c;
3dab1dad 2358 }
653099ff 2359 if (flags & SCF_DO_STCLASS)
830247a4 2360 cl_or(pRExC_state, &accum, &this_class);
b81d288d 2361 if (code == SUSPEND)
c277df42
IZ
2362 break;
2363 }
2364 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
2365 min1 = 0;
2366 if (flags & SCF_DO_SUBSTR) {
2367 data->pos_min += min1;
2368 data->pos_delta += max1 - min1;
2369 if (max1 != min1 || is_inf)
2370 data->longest = &(data->longest_float);
2371 }
2372 min += min1;
2373 delta += max1 - min1;
653099ff 2374 if (flags & SCF_DO_STCLASS_OR) {
830247a4 2375 cl_or(pRExC_state, data->start_class, &accum);
653099ff
GS
2376 if (min1) {
2377 cl_and(data->start_class, &and_with);
2378 flags &= ~SCF_DO_STCLASS;
2379 }
2380 }
2381 else if (flags & SCF_DO_STCLASS_AND) {
de0c8cb8
GS
2382 if (min1) {
2383 cl_and(data->start_class, &accum);
653099ff 2384 flags &= ~SCF_DO_STCLASS;
de0c8cb8
GS
2385 }
2386 else {
b81d288d 2387 /* Switch to OR mode: cache the old value of
de0c8cb8
GS
2388 * data->start_class */
2389 StructCopy(data->start_class, &and_with,
2390 struct regnode_charclass_class);
2391 flags &= ~SCF_DO_STCLASS_AND;
2392 StructCopy(&accum, data->start_class,
2393 struct regnode_charclass_class);
2394 flags |= SCF_DO_STCLASS_OR;
2395 data->start_class->flags |= ANYOF_EOS;
2396 }
653099ff 2397 }
a3621e74 2398
786e8c11 2399 if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) {
a3621e74
YO
2400 /* demq.
2401
2402 Assuming this was/is a branch we are dealing with: 'scan' now
2403 points at the item that follows the branch sequence, whatever
2404 it is. We now start at the beginning of the sequence and look
2405 for subsequences of
2406
786e8c11
YO
2407 BRANCH->EXACT=>x1
2408 BRANCH->EXACT=>x2
2409 tail
a3621e74
YO
2410
2411 which would be constructed from a pattern like /A|LIST|OF|WORDS/
2412
2413 If we can find such a subseqence we need to turn the first
2414 element into a trie and then add the subsequent branch exact
2415 strings to the trie.
2416
2417 We have two cases
2418
786e8c11 2419 1. patterns where the whole set of branch can be converted.
a3621e74 2420
786e8c11 2421 2. patterns where only a subset can be converted.
a3621e74
YO
2422
2423 In case 1 we can replace the whole set with a single regop
2424 for the trie. In case 2 we need to keep the start and end
2425 branchs so
2426
2427 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
2428 becomes BRANCH TRIE; BRANCH X;
2429
786e8c11
YO
2430 There is an additional case, that being where there is a
2431 common prefix, which gets split out into an EXACT like node
2432 preceding the TRIE node.
2433
2434 If x(1..n)==tail then we can do a simple trie, if not we make
2435 a "jump" trie, such that when we match the appropriate word
2436 we "jump" to the appopriate tail node. Essentailly we turn
2437 a nested if into a case structure of sorts.
a3621e74
YO
2438
2439 */
786e8c11 2440
3dab1dad 2441 int made=0;
0111c4fd
RGS
2442 if (!re_trie_maxbuff) {
2443 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
2444 if (!SvIOK(re_trie_maxbuff))
2445 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2446 }
786e8c11 2447 if ( SvIV(re_trie_maxbuff)>=0 ) {
a3621e74
YO
2448 regnode *cur;
2449 regnode *first = (regnode *)NULL;
2450 regnode *last = (regnode *)NULL;
2451 regnode *tail = scan;
2452 U8 optype = 0;
2453 U32 count=0;
2454
2455#ifdef DEBUGGING
c445ea15 2456 SV * const mysv = sv_newmortal(); /* for dumping */
a3621e74
YO
2457#endif
2458 /* var tail is used because there may be a TAIL
2459 regop in the way. Ie, the exacts will point to the
2460 thing following the TAIL, but the last branch will
2461 point at the TAIL. So we advance tail. If we
2462 have nested (?:) we may have to move through several
2463 tails.
2464 */
2465
2466 while ( OP( tail ) == TAIL ) {
2467 /* this is the TAIL generated by (?:) */
2468 tail = regnext( tail );
2469 }
2470
3dab1dad 2471
a3621e74 2472 DEBUG_OPTIMISE_r({
32fc9b6a 2473 regprop(RExC_rx, mysv, tail );
3dab1dad
YO
2474 PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
2475 (int)depth * 2 + 2, "",
2476 "Looking for TRIE'able sequences. Tail node is: ",
2477 SvPV_nolen_const( mysv )
a3621e74
YO
2478 );
2479 });
3dab1dad 2480
a3621e74
YO
2481 /*
2482
2483 step through the branches, cur represents each
2484 branch, noper is the first thing to be matched
2485 as part of that branch and noper_next is the
2486 regnext() of that node. if noper is an EXACT
2487 and noper_next is the same as scan (our current
2488 position in the regex) then the EXACT branch is
2489 a possible optimization target. Once we have
2490 two or more consequetive such branches we can
2491 create a trie of the EXACT's contents and stich
2492 it in place. If the sequence represents all of
2493 the branches we eliminate the whole thing and
2494 replace it with a single TRIE. If it is a
2495 subsequence then we need to stitch it in. This
2496 means the first branch has to remain, and needs
2497 to be repointed at the item on the branch chain
2498 following the last branch optimized. This could
2499 be either a BRANCH, in which case the
2500 subsequence is internal, or it could be the
2501 item following the branch sequence in which
2502 case the subsequence is at the end.
2503
2504 */
2505
2506 /* dont use tail as the end marker for this traverse */
2507 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
aec46f14 2508 regnode * const noper = NEXTOPER( cur );
be981c67 2509#if defined(DEBUGGING) || defined(NOJUMPTRIE)
aec46f14 2510 regnode * const noper_next = regnext( noper );
be981c67 2511#endif
a3621e74 2512
a3621e74 2513 DEBUG_OPTIMISE_r({
32fc9b6a 2514 regprop(RExC_rx, mysv, cur);
3dab1dad
YO
2515 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
2516 (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
a3621e74 2517
32fc9b6a 2518 regprop(RExC_rx, mysv, noper);
a3621e74 2519 PerlIO_printf( Perl_debug_log, " -> %s",
cfd0369c 2520 SvPV_nolen_const(mysv));
a3621e74
YO
2521
2522 if ( noper_next ) {
32fc9b6a 2523 regprop(RExC_rx, mysv, noper_next );
a3621e74 2524 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
cfd0369c 2525 SvPV_nolen_const(mysv));
a3621e74 2526 }
3dab1dad
YO
2527 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d)\n",
2528 REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur) );
a3621e74 2529 });
3dab1dad
YO
2530 if ( (((first && optype!=NOTHING) ? OP( noper ) == optype
2531 : PL_regkind[ OP( noper ) ] == EXACT )
2532 || OP(noper) == NOTHING )
786e8c11
YO
2533#ifdef NOJUMPTRIE
2534 && noper_next == tail
2535#endif
2536 && count < U16_MAX)
a3621e74
YO
2537 {
2538 count++;
3dab1dad
YO
2539 if ( !first || optype == NOTHING ) {
2540 if (!first) first = cur;
a3621e74
YO
2541 optype = OP( noper );
2542 } else {
a3621e74 2543 last = cur;
a3621e74
YO
2544 }
2545 } else {
2546 if ( last ) {
786e8c11
YO
2547 make_trie( pRExC_state,
2548 startbranch, first, cur, tail, count,
2549 optype, depth+1 );
a3621e74 2550 }
3dab1dad 2551 if ( PL_regkind[ OP( noper ) ] == EXACT
786e8c11
YO
2552#ifdef NOJUMPTRIE
2553 && noper_next == tail
2554#endif
2555 ){
a3621e74
YO
2556 count = 1;
2557 first = cur;
2558 optype = OP( noper );
2559 } else {
2560 count = 0;
2561 first = NULL;
2562 optype = 0;
2563 }
2564 last = NULL;
2565 }
2566 }
2567 DEBUG_OPTIMISE_r({
32fc9b6a 2568 regprop(RExC_rx, mysv, cur);
a3621e74 2569 PerlIO_printf( Perl_debug_log,
3dab1dad
YO
2570 "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
2571 "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
a3621e74
YO
2572
2573 });
2574 if ( last ) {
786e8c11 2575 made= make_trie( pRExC_state, startbranch, first, scan, tail, count, optype, depth+1 );
3dab1dad 2576#ifdef TRIE_STUDY_OPT
786e8c11
YO
2577 if ( ((made == MADE_EXACT_TRIE &&
2578 startbranch == first)
2579 || ( first_non_open == first )) &&
2580 depth==0 )
2581 flags |= SCF_TRIE_RESTUDY;
3dab1dad 2582#endif
07be1b83 2583 }
a3621e74 2584 }
3dab1dad
YO
2585
2586 } /* do trie */
786e8c11 2587
a0ed51b3 2588 }
a3621e74 2589 else if ( code == BRANCHJ ) { /* single branch is optimized. */
c277df42 2590 scan = NEXTOPER(NEXTOPER(scan));
a3621e74 2591 } else /* single branch is optimized. */
c277df42
IZ
2592 scan = NEXTOPER(scan);
2593 continue;
a0ed51b3
LW
2594 }
2595 else if (OP(scan) == EXACT) {
cd439c50 2596 I32 l = STR_LEN(scan);
c445ea15 2597 UV uc;
a0ed51b3 2598 if (UTF) {
a3b680e6 2599 const U8 * const s = (U8*)STRING(scan);
1aa99e6b 2600 l = utf8_length(s, s + l);
9041c2e3 2601 uc = utf8_to_uvchr(s, NULL);
c445ea15
AL
2602 } else {
2603 uc = *((U8*)STRING(scan));
a0ed51b3
LW
2604 }
2605 min += l;
c277df42 2606 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
c277df42
IZ
2607 /* The code below prefers earlier match for fixed
2608 offset, later match for variable offset. */
2609 if (data->last_end == -1) { /* Update the start info. */
2610 data->last_start_min = data->pos_min;
2611 data->last_start_max = is_inf
b81d288d 2612 ? I32_MAX : data->pos_min + data->pos_delta;
c277df42 2613 }
cd439c50 2614 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
45f47268
NC
2615 if (UTF)
2616 SvUTF8_on(data->last_found);
0eda9292 2617 {
9a957fbc 2618 SV * const sv = data->last_found;
a28509cc 2619 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
0eda9292
JH
2620 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2621 if (mg && mg->mg_len >= 0)
5e43f467
JH
2622 mg->mg_len += utf8_length((U8*)STRING(scan),
2623 (U8*)STRING(scan)+STR_LEN(scan));
0eda9292 2624 }
c277df42
IZ
2625 data->last_end = data->pos_min + l;
2626 data->pos_min += l; /* As in the first entry. */
2627 data->flags &= ~SF_BEFORE_EOL;
2628 }
653099ff
GS
2629 if (flags & SCF_DO_STCLASS_AND) {
2630 /* Check whether it is compatible with what we know already! */
2631 int compat = 1;
2632
1aa99e6b 2633 if (uc >= 0x100 ||
516a5887 2634 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
1aa99e6b 2635 && !ANYOF_BITMAP_TEST(data->start_class, uc)
653099ff 2636 && (!(data->start_class->flags & ANYOF_FOLD)
1aa99e6b 2637 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
516a5887 2638 )
653099ff
GS
2639 compat = 0;
2640 ANYOF_CLASS_ZERO(data->start_class);
2641 ANYOF_BITMAP_ZERO(data->start_class);
2642 if (compat)
1aa99e6b 2643 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff 2644 data->start_class->flags &= ~ANYOF_EOS;
9b877dbb
IH
2645 if (uc < 0x100)
2646 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
653099ff
GS
2647 }
2648 else if (flags & SCF_DO_STCLASS_OR) {
2649 /* false positive possible if the class is case-folded */
1aa99e6b 2650 if (uc < 0x100)
9b877dbb
IH
2651 ANYOF_BITMAP_SET(data->start_class, uc);
2652 else
2653 data->start_class->flags |= ANYOF_UNICODE_ALL;
653099ff
GS
2654 data->start_class->flags &= ~ANYOF_EOS;
2655 cl_and(data->start_class, &and_with);
2656 }
2657 flags &= ~SCF_DO_STCLASS;
a0ed51b3 2658 }
3dab1dad 2659 else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
cd439c50 2660 I32 l = STR_LEN(scan);
1aa99e6b 2661 UV uc = *((U8*)STRING(scan));
653099ff
GS
2662
2663 /* Search for fixed substrings supports EXACT only. */
ecaa9b9c
NC
2664 if (flags & SCF_DO_SUBSTR) {
2665 assert(data);
1de06328 2666 scan_commit(pRExC_state, data, minlenp);
ecaa9b9c 2667 }
a0ed51b3 2668 if (UTF) {
6136c704 2669 const U8 * const s = (U8 *)STRING(scan);
1aa99e6b 2670 l = utf8_length(s, s + l);
9041c2e3 2671 uc = utf8_to_uvchr(s, NULL);
a0ed51b3
LW
2672 }
2673 min += l;
ecaa9b9c 2674 if (flags & SCF_DO_SUBSTR)
a0ed51b3 2675 data->pos_min += l;
653099ff
GS
2676 if (flags & SCF_DO_STCLASS_AND) {
2677 /* Check whether it is compatible with what we know already! */
2678 int compat = 1;
2679
1aa99e6b 2680 if (uc >= 0x100 ||
516a5887 2681 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
1aa99e6b 2682 && !ANYOF_BITMAP_TEST(data->start_class, uc)
516a5887 2683 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
653099ff
GS
2684 compat = 0;
2685 ANYOF_CLASS_ZERO(data->start_class);
2686 ANYOF_BITMAP_ZERO(data->start_class);
2687 if (compat) {
1aa99e6b 2688 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff
GS
2689 data->start_class->flags &= ~ANYOF_EOS;
2690 data->start_class->flags |= ANYOF_FOLD;
2691 if (OP(scan) == EXACTFL)
2692 data->start_class->flags |= ANYOF_LOCALE;
2693 }
2694 }
2695 else if (flags & SCF_DO_STCLASS_OR) {
2696 if (data->start_class->flags & ANYOF_FOLD) {
2697 /* false positive possible if the class is case-folded.
2698 Assume that the locale settings are the same... */
1aa99e6b
IH
2699 if (uc < 0x100)
2700 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff
GS
2701 data->start_class->flags &= ~ANYOF_EOS;
2702 }
2703 cl_and(data->start_class, &and_with);
2704 }
2705 flags &= ~SCF_DO_STCLASS;
a0ed51b3 2706 }
bfed75c6 2707 else if (strchr((const char*)PL_varies,OP(scan))) {
9c5ffd7c 2708 I32 mincount, maxcount, minnext, deltanext, fl = 0;
aa7a4b56 2709 I32 f = flags, pos_before = 0;
d4c19fe8 2710 regnode * const oscan = scan;
653099ff
GS
2711 struct regnode_charclass_class this_class;
2712 struct regnode_charclass_class *oclass = NULL;
727f22e3 2713 I32 next_is_eval = 0;
653099ff 2714
3dab1dad 2715 switch (PL_regkind[OP(scan)]) {
653099ff 2716 case WHILEM: /* End of (?:...)* . */
c277df42
IZ
2717 scan = NEXTOPER(scan);
2718 goto finish;
2719 case PLUS:
653099ff 2720 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
c277df42 2721 next = NEXTOPER(scan);
653099ff 2722 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
b81d288d
AB
2723 mincount = 1;
2724 maxcount = REG_INFTY;
c277df42
IZ
2725 next = regnext(scan);
2726 scan = NEXTOPER(scan);
2727 goto do_curly;
2728 }
2729 }
2730 if (flags & SCF_DO_SUBSTR)
2731 data->pos_min++;
2732 min++;
2733 /* Fall through. */
2734 case STAR:
653099ff
GS
2735 if (flags & SCF_DO_STCLASS) {
2736 mincount = 0;
b81d288d 2737 maxcount = REG_INFTY;
653099ff
GS
2738 next = regnext(scan);
2739 scan = NEXTOPER(scan);
2740 goto do_curly;
2741 }
b81d288d 2742 is_inf = is_inf_internal = 1;
c277df42
IZ
2743 scan = regnext(scan);
2744 if (flags & SCF_DO_SUBSTR) {
1de06328 2745 scan_commit(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */
c277df42
IZ
2746 data->longest = &(data->longest_float);
2747 }
2748 goto optimize_curly_tail;
2749 case CURLY:
b81d288d 2750 mincount = ARG1(scan);
c277df42
IZ
2751 maxcount = ARG2(scan);
2752 next = regnext(scan);
cb434fcc
IZ
2753 if (OP(scan) == CURLYX) {
2754 I32 lp = (data ? *(data->last_closep) : 0);
786e8c11 2755 scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
cb434fcc 2756 }
c277df42 2757 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
727f22e3 2758 next_is_eval = (OP(scan) == EVAL);
c277df42
IZ
2759 do_curly:
2760 if (flags & SCF_DO_SUBSTR) {
1de06328 2761 if (mincount == 0) scan_commit(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */
c277df42
IZ
2762 pos_before = data->pos_min;
2763 }
2764 if (data) {
2765 fl = data->flags;
2766 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
2767 if (is_inf)
2768 data->flags |= SF_IS_INF;
2769 }
653099ff 2770 if (flags & SCF_DO_STCLASS) {
830247a4 2771 cl_init(pRExC_state, &this_class);
653099ff
GS
2772 oclass = data->start_class;
2773 data->start_class = &this_class;
2774 f |= SCF_DO_STCLASS_AND;
2775 f &= ~SCF_DO_STCLASS_OR;
2776 }
e1901655
IZ
2777 /* These are the cases when once a subexpression
2778 fails at a particular position, it cannot succeed
2779 even after backtracking at the enclosing scope.
b81d288d 2780
e1901655
IZ
2781 XXXX what if minimal match and we are at the
2782 initial run of {n,m}? */
2783 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
2784 f &= ~SCF_WHILEM_VISITED_POS;
653099ff 2785
c277df42 2786 /* This will finish on WHILEM, setting scan, or on NULL: */
1de06328 2787 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext, last, data,
a3621e74
YO
2788 (mincount == 0
2789 ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
653099ff
GS
2790
2791 if (flags & SCF_DO_STCLASS)
2792 data->start_class = oclass;
2793 if (mincount == 0 || minnext == 0) {
2794 if (flags & SCF_DO_STCLASS_OR) {
830247a4 2795 cl_or(pRExC_state, data->start_class, &this_class);
653099ff
GS
2796 }
2797 else if (flags & SCF_DO_STCLASS_AND) {
b81d288d 2798 /* Switch to OR mode: cache the old value of
653099ff
GS
2799 * data->start_class */
2800 StructCopy(data->start_class, &and_with,
2801 struct regnode_charclass_class);
2802 flags &= ~SCF_DO_STCLASS_AND;
2803 StructCopy(&this_class, data->start_class,
2804 struct regnode_charclass_class);
2805 flags |= SCF_DO_STCLASS_OR;
2806 data->start_class->flags |= ANYOF_EOS;
2807 }
2808 } else { /* Non-zero len */
2809 if (flags & SCF_DO_STCLASS_OR) {
830247a4 2810 cl_or(pRExC_state, data->start_class, &this_class);
653099ff
GS
2811 cl_and(data->start_class, &and_with);
2812 }
2813 else if (flags & SCF_DO_STCLASS_AND)
2814 cl_and(data->start_class, &this_class);
2815 flags &= ~SCF_DO_STCLASS;
2816 }
c277df42
IZ
2817 if (!scan) /* It was not CURLYX, but CURLY. */
2818 scan = next;
041457d9
DM
2819 if ( /* ? quantifier ok, except for (?{ ... }) */
2820 (next_is_eval || !(mincount == 0 && maxcount == 1))
84037bb0 2821 && (minnext == 0) && (deltanext == 0)
99799961 2822 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
041457d9
DM
2823 && maxcount <= REG_INFTY/3 /* Complement check for big count */
2824 && ckWARN(WARN_REGEXP))
b45f050a 2825 {
830247a4 2826 vWARN(RExC_parse,
b45f050a
JF
2827 "Quantifier unexpected on zero-length expression");
2828 }
2829
c277df42 2830 min += minnext * mincount;
b81d288d 2831 is_inf_internal |= ((maxcount == REG_INFTY
155aba94
GS
2832 && (minnext + deltanext) > 0)
2833 || deltanext == I32_MAX);
aca2d497 2834 is_inf |= is_inf_internal;
c277df42
IZ
2835 delta += (minnext + deltanext) * maxcount - minnext * mincount;
2836
2837 /* Try powerful optimization CURLYX => CURLYN. */
b81d288d 2838 if ( OP(oscan) == CURLYX && data
c277df42
IZ
2839 && data->flags & SF_IN_PAR
2840 && !(data->flags & SF_HAS_EVAL)
2841 && !deltanext && minnext == 1 ) {
2842 /* Try to optimize to CURLYN. */
2843 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
d4c19fe8 2844 regnode * const nxt1 = nxt;
497b47a8
JH
2845#ifdef DEBUGGING
2846 regnode *nxt2;
2847#endif
c277df42
IZ
2848
2849 /* Skip open. */
2850 nxt = regnext(nxt);
bfed75c6 2851 if (!strchr((const char*)PL_simple,OP(nxt))
3dab1dad 2852 && !(PL_regkind[OP(nxt)] == EXACT
b81d288d 2853 && STR_LEN(nxt) == 1))
c277df42 2854 goto nogo;
497b47a8 2855#ifdef DEBUGGING
c277df42 2856 nxt2 = nxt;
497b47a8 2857#endif
c277df42 2858 nxt = regnext(nxt);
b81d288d 2859 if (OP(nxt) != CLOSE)
c277df42
IZ
2860 goto nogo;
2861 /* Now we know that nxt2 is the only contents: */
eb160463 2862 oscan->flags = (U8)ARG(nxt);
c277df42
IZ
2863 OP(oscan) = CURLYN;
2864 OP(nxt1) = NOTHING; /* was OPEN. */
2865#ifdef DEBUGGING
2866 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
2867 NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
2868 NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
2869 OP(nxt) = OPTIMIZED; /* was CLOSE. */
2870 OP(nxt + 1) = OPTIMIZED; /* was count. */
2871 NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
b81d288d 2872#endif
c277df42 2873 }
c277df42
IZ
2874 nogo:
2875
2876 /* Try optimization CURLYX => CURLYM. */
b81d288d 2877 if ( OP(oscan) == CURLYX && data
c277df42 2878 && !(data->flags & SF_HAS_PAR)
c277df42 2879 && !(data->flags & SF_HAS_EVAL)
0e788c72
HS
2880 && !deltanext /* atom is fixed width */
2881 && minnext != 0 /* CURLYM can't handle zero width */
2882 ) {
c277df42
IZ
2883 /* XXXX How to optimize if data == 0? */
2884 /* Optimize to a simpler form. */
2885 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
2886 regnode *nxt2;
2887
2888 OP(oscan) = CURLYM;
2889 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
b81d288d 2890 && (OP(nxt2) != WHILEM))
c277df42
IZ
2891 nxt = nxt2;
2892 OP(nxt2) = SUCCEED; /* Whas WHILEM */
c277df42
IZ
2893 /* Need to optimize away parenths. */
2894 if (data->flags & SF_IN_PAR) {
2895 /* Set the parenth number. */
2896 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
2897
b81d288d 2898 if (OP(nxt) != CLOSE)
b45f050a 2899 FAIL("Panic opt close");
eb160463 2900 oscan->flags = (U8)ARG(nxt);
c277df42
IZ
2901 OP(nxt1) = OPTIMIZED; /* was OPEN. */
2902 OP(nxt) = OPTIMIZED; /* was CLOSE. */
2903#ifdef DEBUGGING
2904 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
2905 OP(nxt + 1) = OPTIMIZED; /* was count. */
2906 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
2907 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
b81d288d 2908#endif
c277df42
IZ
2909#if 0
2910 while ( nxt1 && (OP(nxt1) != WHILEM)) {
2911 regnode *nnxt = regnext(nxt1);
b81d288d 2912
c277df42
IZ
2913 if (nnxt == nxt) {
2914 if (reg_off_by_arg[OP(nxt1)])
2915 ARG_SET(nxt1, nxt2 - nxt1);
2916 else if (nxt2 - nxt1 < U16_MAX)
2917 NEXT_OFF(nxt1) = nxt2 - nxt1;
2918 else
2919 OP(nxt) = NOTHING; /* Cannot beautify */
2920 }
2921 nxt1 = nnxt;
2922 }
2923#endif
2924 /* Optimize again: */
1de06328 2925 study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
a3621e74 2926 NULL, 0,depth+1);
a0ed51b3
LW
2927 }
2928 else
c277df42 2929 oscan->flags = 0;
c277df42 2930 }
e1901655
IZ
2931 else if ((OP(oscan) == CURLYX)
2932 && (flags & SCF_WHILEM_VISITED_POS)
2933 /* See the comment on a similar expression above.
2934 However, this time it not a subexpression
2935 we care about, but the expression itself. */
2936 && (maxcount == REG_INFTY)
2937 && data && ++data->whilem_c < 16) {
2938 /* This stays as CURLYX, we can put the count/of pair. */
2c2d71f5
JH
2939 /* Find WHILEM (as in regexec.c) */
2940 regnode *nxt = oscan + NEXT_OFF(oscan);
2941
2942 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
2943 nxt += ARG(nxt);
eb160463
GS
2944 PREVOPER(nxt)->flags = (U8)(data->whilem_c
2945 | (RExC_whilem_seen << 4)); /* On WHILEM */
2c2d71f5 2946 }
b81d288d 2947 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
c277df42
IZ
2948 pars++;
2949 if (flags & SCF_DO_SUBSTR) {
c445ea15 2950 SV *last_str = NULL;
c277df42
IZ
2951 int counted = mincount != 0;
2952
2953 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
5d1c421c
JH
2954#if defined(SPARC64_GCC_WORKAROUND)
2955 I32 b = 0;
2956 STRLEN l = 0;
cfd0369c 2957 const char *s = NULL;
5d1c421c
JH
2958 I32 old = 0;
2959
2960 if (pos_before >= data->last_start_min)
2961 b = pos_before;
2962 else
2963 b = data->last_start_min;
2964
2965 l = 0;
cfd0369c 2966 s = SvPV_const(data->last_found, l);
5d1c421c
JH
2967 old = b - data->last_start_min;
2968
2969#else
b81d288d 2970 I32 b = pos_before >= data->last_start_min
c277df42
IZ
2971 ? pos_before : data->last_start_min;
2972 STRLEN l;
d4c19fe8 2973 const char * const s = SvPV_const(data->last_found, l);
a0ed51b3 2974 I32 old = b - data->last_start_min;
5d1c421c 2975#endif
a0ed51b3
LW
2976
2977 if (UTF)
2978 old = utf8_hop((U8*)s, old) - (U8*)s;
c277df42 2979
a0ed51b3 2980 l -= old;
c277df42 2981 /* Get the added string: */
79cb57f6 2982 last_str = newSVpvn(s + old, l);
0e933229
IH
2983 if (UTF)
2984 SvUTF8_on(last_str);
c277df42
IZ
2985 if (deltanext == 0 && pos_before == b) {
2986 /* What was added is a constant string */
2987 if (mincount > 1) {
2988 SvGROW(last_str, (mincount * l) + 1);
b81d288d 2989 repeatcpy(SvPVX(last_str) + l,
3f7c398e 2990 SvPVX_const(last_str), l, mincount - 1);
b162af07 2991 SvCUR_set(last_str, SvCUR(last_str) * mincount);
c277df42 2992 /* Add additional parts. */
b81d288d 2993 SvCUR_set(data->last_found,
c277df42
IZ
2994 SvCUR(data->last_found) - l);
2995 sv_catsv(data->last_found, last_str);
0eda9292
JH
2996 {
2997 SV * sv = data->last_found;
2998 MAGIC *mg =
2999 SvUTF8(sv) && SvMAGICAL(sv) ?
3000 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3001 if (mg && mg->mg_len >= 0)
3002 mg->mg_len += CHR_SVLEN(last_str);
3003 }
c277df42
IZ
3004 data->last_end += l * (mincount - 1);
3005 }
2a8d9689
HS
3006 } else {
3007 /* start offset must point into the last copy */
3008 data->last_start_min += minnext * (mincount - 1);
c152dc43
HS
3009 data->last_start_max += is_inf ? I32_MAX
3010 : (maxcount - 1) * (minnext + data->pos_delta);
c277df42
IZ
3011 }
3012 }
3013 /* It is counted once already... */
3014 data->pos_min += minnext * (mincount - counted);
3015 data->pos_delta += - counted * deltanext +
3016 (minnext + deltanext) * maxcount - minnext * mincount;
3017 if (mincount != maxcount) {
653099ff
GS
3018 /* Cannot extend fixed substrings found inside
3019 the group. */
1de06328 3020 scan_commit(pRExC_state,data,minlenp);
c277df42 3021 if (mincount && last_str) {
d4c19fe8
AL
3022 SV * const sv = data->last_found;
3023 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
45f47268
NC
3024 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3025
3026 if (mg)
3027 mg->mg_len = -1;
3028 sv_setsv(sv, last_str);
c277df42 3029 data->last_end = data->pos_min;
b81d288d 3030 data->last_start_min =
a0ed51b3 3031 data->pos_min - CHR_SVLEN(last_str);
b81d288d
AB
3032 data->last_start_max = is_inf
3033 ? I32_MAX
c277df42 3034 : data->pos_min + data->pos_delta
a0ed51b3 3035 - CHR_SVLEN(last_str);
c277df42
IZ
3036 }
3037 data->longest = &(data->longest_float);
3038 }
aca2d497 3039 SvREFCNT_dec(last_str);
c277df42 3040 }
405ff068 3041 if (data && (fl & SF_HAS_EVAL))
c277df42
IZ
3042 data->flags |= SF_HAS_EVAL;
3043 optimize_curly_tail:
c277df42 3044 if (OP(oscan) != CURLYX) {
3dab1dad 3045 while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
c277df42
IZ
3046 && NEXT_OFF(next))
3047 NEXT_OFF(oscan) += NEXT_OFF(next);
3048 }
c277df42 3049 continue;
653099ff 3050 default: /* REF and CLUMP only? */
c277df42 3051 if (flags & SCF_DO_SUBSTR) {
1de06328 3052 scan_commit(pRExC_state,data,minlenp); /* Cannot expect anything... */
c277df42
IZ
3053 data->longest = &(data->longest_float);
3054 }
aca2d497 3055 is_inf = is_inf_internal = 1;
653099ff 3056 if (flags & SCF_DO_STCLASS_OR)
830247a4 3057 cl_anything(pRExC_state, data->start_class);
653099ff 3058 flags &= ~SCF_DO_STCLASS;
c277df42
IZ
3059 break;
3060 }
a0ed51b3 3061 }
bfed75c6 3062 else if (strchr((const char*)PL_simple,OP(scan))) {
9c5ffd7c 3063 int value = 0;
653099ff 3064
c277df42 3065 if (flags & SCF_DO_SUBSTR) {
1de06328 3066 scan_commit(pRExC_state,data,minlenp);
c277df42
IZ
3067 data->pos_min++;
3068 }
3069 min++;
653099ff
GS
3070 if (flags & SCF_DO_STCLASS) {
3071 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
3072
3073 /* Some of the logic below assumes that switching
3074 locale on will only add false positives. */
3dab1dad 3075 switch (PL_regkind[OP(scan)]) {
653099ff 3076 case SANY:
653099ff
GS
3077 default:
3078 do_default:
3079 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
3080 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
830247a4 3081 cl_anything(pRExC_state, data->start_class);
653099ff
GS
3082 break;
3083 case REG_ANY:
3084 if (OP(scan) == SANY)
3085 goto do_default;
3086 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
3087 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
3088 || (data->start_class->flags & ANYOF_CLASS));
830247a4 3089 cl_anything(pRExC_state, data->start_class);
653099ff
GS
3090 }
3091 if (flags & SCF_DO_STCLASS_AND || !value)
3092 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
3093 break;
3094 case ANYOF:
3095 if (flags & SCF_DO_STCLASS_AND)
3096 cl_and(data->start_class,
3097 (struct regnode_charclass_class*)scan);
3098 else
830247a4 3099 cl_or(pRExC_state, data->start_class,
653099ff
GS
3100 (struct regnode_charclass_class*)scan);
3101 break;
3102 case ALNUM:
3103 if (flags & SCF_DO_STCLASS_AND) {
3104 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3105 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3106 for (value = 0; value < 256; value++)
3107 if (!isALNUM(value))
3108 ANYOF_BITMAP_CLEAR(data->start_class, value);
3109 }
3110 }
3111 else {
3112 if (data->start_class->flags & ANYOF_LOCALE)
3113 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3114 else {
3115 for (value = 0; value < 256; value++)
3116 if (isALNUM(value))
b81d288d 3117 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
3118 }
3119 }
3120 break;
3121 case ALNUML:
3122 if (flags & SCF_DO_STCLASS_AND) {
3123 if (data->start_class->flags & ANYOF_LOCALE)
3124 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3125 }
3126 else {
3127 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3128 data->start_class->flags |= ANYOF_LOCALE;
3129 }
3130 break;
3131 case NALNUM:
3132 if (flags & SCF_DO_STCLASS_AND) {
3133 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3134 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
3135 for (value = 0; value < 256; value++)
3136 if (isALNUM(value))
3137 ANYOF_BITMAP_CLEAR(data->start_class, value);
3138 }
3139 }
3140 else {
3141 if (data->start_class->flags & ANYOF_LOCALE)
3142 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3143 else {
3144 for (value = 0; value < 256; value++)
3145 if (!isALNUM(value))
b81d288d 3146 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
3147 }
3148 }
3149 break;
3150 case NALNUML:
3151 if (flags & SCF_DO_STCLASS_AND) {
3152 if (data->start_class->flags & ANYOF_LOCALE)
3153 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
3154 }
3155 else {
3156 data->start_class->flags |= ANYOF_LOCALE;
3157 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3158 }
3159 break;
3160 case SPACE:
3161 if (flags & SCF_DO_STCLASS_AND) {
3162 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3163 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3164 for (value = 0; value < 256; value++)
3165 if (!isSPACE(value))
3166 ANYOF_BITMAP_CLEAR(data->start_class, value);
3167 }
3168 }
3169 else {
3170 if (data->start_class->flags & ANYOF_LOCALE)
3171 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3172 else {
3173 for (value = 0; value < 256; value++)
3174 if (isSPACE(value))
b81d288d 3175 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
3176 }
3177 }
3178 break;
3179 case SPACEL:
3180 if (flags & SCF_DO_STCLASS_AND) {
3181 if (data->start_class->flags & ANYOF_LOCALE)
3182 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3183 }
3184 else {
3185 data->start_class->flags |= ANYOF_LOCALE;
3186 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3187 }
3188 break;
3189 case NSPACE:
3190 if (flags & SCF_DO_STCLASS_AND) {
3191 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3192 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3193 for (value = 0; value < 256; value++)
3194 if (isSPACE(value))
3195 ANYOF_BITMAP_CLEAR(data->start_class, value);
3196 }
3197 }
3198 else {
3199 if (data->start_class->flags & ANYOF_LOCALE)
3200 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3201 else {
3202 for (value = 0; value < 256; value++)
3203 if (!isSPACE(value))
b81d288d 3204 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
3205 }
3206 }
3207 break;
3208 case NSPACEL:
3209 if (flags & SCF_DO_STCLASS_AND) {
3210 if (data->start_class->flags & ANYOF_LOCALE) {
3211 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3212 for (value = 0; value < 256; value++)
3213 if (!isSPACE(value))
3214 ANYOF_BITMAP_CLEAR(data->start_class, value);
3215 }
3216 }
3217 else {
3218 data->start_class->flags |= ANYOF_LOCALE;
3219 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3220 }
3221 break;
3222 case DIGIT:
3223 if (flags & SCF_DO_STCLASS_AND) {
3224 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
3225 for (value = 0; value < 256; value++)
3226 if (!isDIGIT(value))
3227 ANYOF_BITMAP_CLEAR(data->start_class, value);
3228 }
3229 else {
3230 if (data->start_class->flags & ANYOF_LOCALE)
3231 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
3232 else {
3233 for (value = 0; value < 256; value++)
3234 if (isDIGIT(value))
b81d288d 3235 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
3236 }
3237 }
3238 break;
3239 case NDIGIT:
3240 if (flags & SCF_DO_STCLASS_AND) {
3241 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
3242 for (value = 0; value < 256; value++)
3243 if (isDIGIT(value))
3244 ANYOF_BITMAP_CLEAR(data->start_class, value);
3245 }
3246 else {
3247 if (data->start_class->flags & ANYOF_LOCALE)
3248 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
3249 else {
3250 for (value = 0; value < 256; value++)
3251 if (!isDIGIT(value))
b81d288d 3252 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
3253 }
3254 }
3255 break;
3256 }
3257 if (flags & SCF_DO_STCLASS_OR)
3258 cl_and(data->start_class, &and_with);
3259 flags &= ~SCF_DO_STCLASS;
3260 }
a0ed51b3 3261 }
3dab1dad 3262 else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
c277df42
IZ
3263 data->flags |= (OP(scan) == MEOL
3264 ? SF_BEFORE_MEOL
3265 : SF_BEFORE_SEOL);
a0ed51b3 3266 }
3dab1dad 3267 else if ( PL_regkind[OP(scan)] == BRANCHJ
653099ff
GS
3268 /* Lookbehind, or need to calculate parens/evals/stclass: */
3269 && (scan->flags || data || (flags & SCF_DO_STCLASS))
c277df42 3270 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
1de06328
YO
3271 if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3272 || OP(scan) == UNLESSM )
3273 {
3274 /* Negative Lookahead/lookbehind
3275 In this case we can't do fixed string optimisation.
3276 */
3277
3278 I32 deltanext, minnext, fake = 0;
3279 regnode *nscan;
3280 struct regnode_charclass_class intrnl;
3281 int f = 0;
3282
3283 data_fake.flags = 0;
3284 if (data) {
3285 data_fake.whilem_c = data->whilem_c;
3286 data_fake.last_closep = data->last_closep;
a0ed51b3 3287 }
1de06328
YO
3288 else
3289 data_fake.last_closep = &fake;
3290 if ( flags & SCF_DO_STCLASS && !scan->flags
3291 && OP(scan) == IFMATCH ) { /* Lookahead */
3292 cl_init(pRExC_state, &intrnl);
3293 data_fake.start_class = &intrnl;
3294 f |= SCF_DO_STCLASS_AND;
c277df42 3295 }
1de06328
YO
3296 if (flags & SCF_WHILEM_VISITED_POS)
3297 f |= SCF_WHILEM_VISITED_POS;
3298 next = regnext(scan);
3299 nscan = NEXTOPER(NEXTOPER(scan));
3300 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext, last, &data_fake, f,depth+1);
3301 if (scan->flags) {
3302 if (deltanext) {
3303 vFAIL("Variable length lookbehind not implemented");
3304 }
3305 else if (minnext > (I32)U8_MAX) {
3306 vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
3307 }
3308 scan->flags = (U8)minnext;
3309 }
3310 if (data) {
3311 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3312 pars++;
3313 if (data_fake.flags & SF_HAS_EVAL)
3314 data->flags |= SF_HAS_EVAL;
3315 data->whilem_c = data_fake.whilem_c;
3316 }
3317 if (f & SCF_DO_STCLASS_AND) {
3318 const int was = (data->start_class->flags & ANYOF_EOS);
3319
3320 cl_and(data->start_class, &intrnl);
3321 if (was)
3322 data->start_class->flags |= ANYOF_EOS;
3323 }
be8e71aa 3324 }
1de06328
YO
3325#if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3326 else {
3327 /* Positive Lookahead/lookbehind
3328 In this case we can do fixed string optimisation,
3329 but we must be careful about it. Note in the case of
3330 lookbehind the positions will be offset by the minimum
3331 length of the pattern, something we won't know about
3332 until after the recurse.
3333 */
3334 I32 deltanext, fake = 0;
3335 regnode *nscan;
3336 struct regnode_charclass_class intrnl;
3337 int f = 0;
3338 /* We use SAVEFREEPV so that when the full compile
3339 is finished perl will clean up the allocated
3340 minlens when its all done. This was we don't
3341 have to worry about freeing them when we know
3342 they wont be used, which would be a pain.
3343 */
3344 I32 *minnextp;
3345 Newx( minnextp, 1, I32 );
3346 SAVEFREEPV(minnextp);
3347
3348 if (data) {
3349 StructCopy(data, &data_fake, scan_data_t);
3350 if ((flags & SCF_DO_SUBSTR) && data->last_found) {
3351 f |= SCF_DO_SUBSTR;
3352 if (scan->flags)
3353 scan_commit(pRExC_state, &data_fake,minlenp);
3354 data_fake.last_found=newSVsv(data->last_found);
3355 }
3356 }
3357 else
3358 data_fake.last_closep = &fake;
3359 data_fake.flags = 0;
3360 if (is_inf)
3361 data_fake.flags |= SF_IS_INF;
3362 if ( flags & SCF_DO_STCLASS && !scan->flags
3363 && OP(scan) == IFMATCH ) { /* Lookahead */
3364 cl_init(pRExC_state, &intrnl);
3365 data_fake.start_class = &intrnl;
3366 f |= SCF_DO_STCLASS_AND;
3367 }
3368 if (flags & SCF_WHILEM_VISITED_POS)
3369 f |= SCF_WHILEM_VISITED_POS;
3370 next = regnext(scan);
3371 nscan = NEXTOPER(NEXTOPER(scan));
3372
3373 *minnextp = study_chunk(pRExC_state, &nscan, minnextp, &deltanext, last, &data_fake, f,depth+1);
3374 if (scan->flags) {
3375 if (deltanext) {
3376 vFAIL("Variable length lookbehind not implemented");
3377 }
3378 else if (*minnextp > (I32)U8_MAX) {
3379 vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
3380 }
3381 scan->flags = (U8)*minnextp;
3382 }
3383
3384 *minnextp += min;
3385
3386
3387 if (f & SCF_DO_STCLASS_AND) {
3388 const int was = (data->start_class->flags & ANYOF_EOS);
3389
3390 cl_and(data->start_class, &intrnl);
3391 if (was)
3392 data->start_class->flags |= ANYOF_EOS;
3393 }
3394 if (data) {
3395 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3396 pars++;
3397 if (data_fake.flags & SF_HAS_EVAL)
3398 data->flags |= SF_HAS_EVAL;
3399 data->whilem_c = data_fake.whilem_c;
3400 if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
3401 if (RExC_rx->minlen<*minnextp)
3402 RExC_rx->minlen=*minnextp;
3403 scan_commit(pRExC_state, &data_fake, minnextp);
3404 SvREFCNT_dec(data_fake.last_found);
3405
3406 if ( data_fake.minlen_fixed != minlenp )
3407 {
3408 data->offset_fixed= data_fake.offset_fixed;
3409 data->minlen_fixed= data_fake.minlen_fixed;
3410 data->lookbehind_fixed+= scan->flags;
3411 }
3412 if ( data_fake.minlen_float != minlenp )
3413 {
3414 data->minlen_float= data_fake.minlen_float;
3415 data->offset_float_min=data_fake.offset_float_min;
3416 data->offset_float_max=data_fake.offset_float_max;
3417 data->lookbehind_float+= scan->flags;
3418 }
3419 }
3420 }
3421
653099ff 3422
653099ff 3423 }
1de06328 3424#endif
a0ed51b3
LW
3425 }
3426 else if (OP(scan) == OPEN) {
c277df42 3427 pars++;
a0ed51b3 3428 }
cb434fcc 3429 else if (OP(scan) == CLOSE) {
eb160463 3430 if ((I32)ARG(scan) == is_par) {
cb434fcc 3431 next = regnext(scan);
c277df42 3432
cb434fcc
IZ
3433 if ( next && (OP(next) != WHILEM) && next < last)
3434 is_par = 0; /* Disable optimization */
3435 }
3436 if (data)
3437 *(data->last_closep) = ARG(scan);
a0ed51b3
LW
3438 }
3439 else if (OP(scan) == EVAL) {
c277df42
IZ
3440 if (data)
3441 data->flags |= SF_HAS_EVAL;
3442 }
96776eda 3443 else if (OP(scan) == LOGICAL && scan->flags == 2) { /* Embedded follows */
0f5d15d6 3444 if (flags & SCF_DO_SUBSTR) {
1de06328 3445 scan_commit(pRExC_state,data,minlenp);
0f5d15d6
IZ
3446 data->longest = &(data->longest_float);
3447 }
3448 is_inf = is_inf_internal = 1;
653099ff 3449 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
830247a4 3450 cl_anything(pRExC_state, data->start_class);
96776eda 3451 flags &= ~SCF_DO_STCLASS;
0f5d15d6 3452 }
786e8c11
YO
3453#ifdef TRIE_STUDY_OPT
3454#ifdef FULL_TRIE_STUDY
3455 else if (PL_regkind[OP(scan)] == TRIE) {
3456 /* NOTE - There is similar code to this block above for handling
3457 BRANCH nodes on the initial study. If you change stuff here
3458 check there too. */
3459 regnode *tail= regnext(scan);
3460 reg_trie_data *trie = (reg_trie_data*)RExC_rx->data->data[ ARG(scan) ];
3461 I32 max1 = 0, min1 = I32_MAX;
3462 struct regnode_charclass_class accum;
3463
3464 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
1de06328 3465 scan_commit(pRExC_state, data,minlenp); /* Cannot merge strings after this. */
786e8c11
YO
3466 if (flags & SCF_DO_STCLASS)
3467 cl_init_zero(pRExC_state, &accum);
3468
3469 if (!trie->jump) {
3470 min1= trie->minlen;
3471 max1= trie->maxlen;
3472 } else {
3473 const regnode *nextbranch= NULL;
3474 U32 word;
3475
3476 for ( word=1 ; word <= trie->wordcount ; word++)
3477 {
3478 I32 deltanext=0, minnext=0, f = 0, fake;
3479 struct regnode_charclass_class this_class;
3480
3481 data_fake.flags = 0;
3482 if (data) {
3483 data_fake.whilem_c = data->whilem_c;
3484 data_fake.last_closep = data->last_closep;
3485 }
3486 else
3487 data_fake.last_closep = &fake;
3488
3489 if (flags & SCF_DO_STCLASS) {
3490 cl_init(pRExC_state, &this_class);
3491 data_fake.start_class = &this_class;
3492 f = SCF_DO_STCLASS_AND;
3493 }
3494 if (flags & SCF_WHILEM_VISITED_POS)
3495 f |= SCF_WHILEM_VISITED_POS;
3496
3497 if (trie->jump[word]) {
3498 if (!nextbranch)
3499 nextbranch = tail - trie->jump[0];
3500 scan= tail - trie->jump[word];
3501 /* We go from the jump point to the branch that follows
3502 it. Note this means we need the vestigal unused branches
3503 even though they arent otherwise used.
3504 */
1de06328 3505 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
786e8c11
YO
3506 (regnode *)nextbranch, &data_fake, f,depth+1);
3507 }
3508 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
3509 nextbranch= regnext((regnode*)nextbranch);
3510
3511 if (min1 > (I32)(minnext + trie->minlen))
3512 min1 = minnext + trie->minlen;
3513 if (max1 < (I32)(minnext + deltanext + trie->maxlen))
3514 max1 = minnext + deltanext + trie->maxlen;
3515 if (deltanext == I32_MAX)
3516 is_inf = is_inf_internal = 1;
3517
3518 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3519 pars++;
3520
3521 if (data) {
3522 if (data_fake.flags & SF_HAS_EVAL)
3523 data->flags |= SF_HAS_EVAL;
3524 data->whilem_c = data_fake.whilem_c;
3525 }
3526 if (flags & SCF_DO_STCLASS)
3527 cl_or(pRExC_state, &accum, &this_class);
3528 }
3529 }
3530 if (flags & SCF_DO_SUBSTR) {
3531 data->pos_min += min1;
3532 data->pos_delta += max1 - min1;
3533 if (max1 != min1 || is_inf)
3534 data->longest = &(data->longest_float);
3535 }
3536 min += min1;
3537 delta += max1 - min1;
3538 if (flags & SCF_DO_STCLASS_OR) {
3539 cl_or(pRExC_state, data->start_class, &accum);
3540 if (min1) {
3541 cl_and(data->start_class, &and_with);
3542 flags &= ~SCF_DO_STCLASS;
3543 }
3544 }
3545 else if (flags & SCF_DO_STCLASS_AND) {
3546 if (min1) {
3547 cl_and(data->start_class, &accum);
3548 flags &= ~SCF_DO_STCLASS;
3549 }
3550 else {
3551 /* Switch to OR mode: cache the old value of
3552 * data->start_class */
3553 StructCopy(data->start_class, &and_with,
3554 struct regnode_charclass_class);
3555 flags &= ~SCF_DO_STCLASS_AND;
3556 StructCopy(&accum, data->start_class,
3557 struct regnode_charclass_class);
3558 flags |= SCF_DO_STCLASS_OR;
3559 data->start_class->flags |= ANYOF_EOS;
3560 }
3561 }
3562 scan= tail;
3563 continue;
3564 }
3565#else
3566 else if (PL_regkind[OP(scan)] == TRIE) {
3567 reg_trie_data *trie = (reg_trie_data*)RExC_rx->data->data[ ARG(scan) ];
3568 U8*bang=NULL;
3569
3570 min += trie->minlen;
3571 delta += (trie->maxlen - trie->minlen);
3572 flags &= ~SCF_DO_STCLASS; /* xxx */
3573 if (flags & SCF_DO_SUBSTR) {
1de06328 3574 scan_commit(pRExC_state,data,minlenp); /* Cannot expect anything... */
786e8c11
YO
3575 data->pos_min += trie->minlen;
3576 data->pos_delta += (trie->maxlen - trie->minlen);
3577 if (trie->maxlen != trie->minlen)
3578 data->longest = &(data->longest_float);
3579 }
3580 if (trie->jump) /* no more substrings -- for now /grr*/
3581 flags &= ~SCF_DO_SUBSTR;
3582 }
3583#endif /* old or new */
3584#endif /* TRIE_STUDY_OPT */
c277df42
IZ
3585 /* Else: zero-length, ignore. */
3586 scan = regnext(scan);
3587 }
3588
3589 finish:
3590 *scanp = scan;
aca2d497 3591 *deltap = is_inf_internal ? I32_MAX : delta;
b81d288d 3592 if (flags & SCF_DO_SUBSTR && is_inf)
c277df42 3593 data->pos_delta = I32_MAX - data->pos_min;
786e8c11 3594 if (is_par > (I32)U8_MAX)
c277df42
IZ
3595 is_par = 0;
3596 if (is_par && pars==1 && data) {
3597 data->flags |= SF_IN_PAR;
3598 data->flags &= ~SF_HAS_PAR;
a0ed51b3
LW
3599 }
3600 else if (pars && data) {
c277df42
IZ
3601 data->flags |= SF_HAS_PAR;
3602 data->flags &= ~SF_IN_PAR;
3603 }
653099ff
GS
3604 if (flags & SCF_DO_STCLASS_OR)
3605 cl_and(data->start_class, &and_with);
786e8c11
YO
3606 if (flags & SCF_TRIE_RESTUDY)
3607 data->flags |= SCF_TRIE_RESTUDY;
1de06328
YO
3608
3609 DEBUG_STUDYDATA(data,depth);
3610
c277df42
IZ
3611 return min;
3612}
3613
76e3520e 3614STATIC I32
5f66b61c 3615S_add_data(RExC_state_t *pRExC_state, I32 n, const char *s)
c277df42 3616{
830247a4 3617 if (RExC_rx->data) {
b81d288d
AB
3618 Renewc(RExC_rx->data,
3619 sizeof(*RExC_rx->data) + sizeof(void*) * (RExC_rx->data->count + n - 1),
c277df42 3620 char, struct reg_data);
830247a4
IZ
3621 Renew(RExC_rx->data->what, RExC_rx->data->count + n, U8);
3622 RExC_rx->data->count += n;
a0ed51b3
LW
3623 }
3624 else {
a02a5408 3625 Newxc(RExC_rx->data, sizeof(*RExC_rx->data) + sizeof(void*) * (n - 1),
c277df42 3626 char, struct reg_data);
a02a5408 3627 Newx(RExC_rx->data->what, n, U8);
830247a4 3628 RExC_rx->data->count = n;
c277df42 3629 }
830247a4
IZ
3630 Copy(s, RExC_rx->data->what + RExC_rx->data->count - n, n, U8);
3631 return RExC_rx->data->count - n;
c277df42
IZ
3632}
3633
76234dfb 3634#ifndef PERL_IN_XSUB_RE
d88dccdf 3635void
864dbfa3 3636Perl_reginitcolors(pTHX)
d88dccdf 3637{
97aff369 3638 dVAR;
1df70142 3639 const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
d88dccdf 3640 if (s) {
1df70142
AL
3641 char *t = savepv(s);
3642 int i = 0;
3643 PL_colors[0] = t;
d88dccdf 3644 while (++i < 6) {
1df70142
AL
3645 t = strchr(t, '\t');
3646 if (t) {
3647 *t = '\0';
3648 PL_colors[i] = ++t;
d88dccdf
IZ
3649 }
3650 else
1df70142 3651 PL_colors[i] = t = (char *)"";
d88dccdf
IZ
3652 }
3653 } else {
1df70142 3654 int i = 0;
b81d288d 3655 while (i < 6)
06b5626a 3656 PL_colors[i++] = (char *)"";
d88dccdf
IZ
3657 }
3658 PL_colorset = 1;
3659}
76234dfb 3660#endif
8615cb43 3661
07be1b83 3662
786e8c11
YO
3663#ifdef TRIE_STUDY_OPT
3664#define CHECK_RESTUDY_GOTO \
3665 if ( \
3666 (data.flags & SCF_TRIE_RESTUDY) \
3667 && ! restudied++ \
3668 ) goto reStudy
3669#else
3670#define CHECK_RESTUDY_GOTO
3671#endif
a687059c 3672/*
e50aee73 3673 - pregcomp - compile a regular expression into internal code
a687059c
LW
3674 *
3675 * We can't allocate space until we know how big the compiled form will be,
3676 * but we can't compile it (and thus know how big it is) until we've got a
3677 * place to put the code. So we cheat: we compile it twice, once with code
3678 * generation turned off and size counting turned on, and once "for real".
3679 * This also means that we don't allocate space until we are sure that the
3680 * thing really will compile successfully, and we never have to move the
3681 * code and thus invalidate pointers into it. (Note that it has to be in
3682 * one piece because free() must be able to free it all.) [NB: not true in perl]
3683 *
3684 * Beware that the optimization-preparation code in here knows about some
3685 * of the structure of the compiled regexp. [I'll say.]
3686 */
3687regexp *
864dbfa3 3688Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
a687059c 3689{
97aff369 3690 dVAR;
a0d0e21e 3691 register regexp *r;
c277df42 3692 regnode *scan;
c277df42 3693 regnode *first;
a0d0e21e 3694 I32 flags;
a0d0e21e
LW
3695 I32 minlen = 0;
3696 I32 sawplus = 0;
3697 I32 sawopen = 0;
2c2d71f5 3698 scan_data_t data;
830247a4 3699 RExC_state_t RExC_state;
be8e71aa 3700 RExC_state_t * const pRExC_state = &RExC_state;
07be1b83
YO
3701#ifdef TRIE_STUDY_OPT
3702 int restudied= 0;
3703 RExC_state_t copyRExC_state;
3704#endif
a0d0e21e 3705
a3621e74
YO
3706 GET_RE_DEBUG_FLAGS_DECL;
3707
a0d0e21e 3708 if (exp == NULL)
c277df42 3709 FAIL("NULL regexp argument");
a0d0e21e 3710
a5961de5 3711 RExC_utf8 = pm->op_pmdynflags & PMdf_CMP_UTF8;
a0ed51b3 3712
5cfc7842 3713 RExC_precomp = exp;
a3621e74
YO
3714 DEBUG_r(if (!PL_colorset) reginitcolors());
3715 DEBUG_COMPILE_r({
ab3bbdeb
YO
3716 SV *dsv= sv_newmortal();
3717 RE_PV_QUOTED_DECL(s, RExC_utf8,
3718 dsv, RExC_precomp, (xend - exp), 60);
3719 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
3720 PL_colors[4],PL_colors[5],s);
a5961de5 3721 });
e2509266 3722 RExC_flags = pm->op_pmflags;
830247a4 3723 RExC_sawback = 0;
bbce6d69 3724
830247a4
IZ
3725 RExC_seen = 0;
3726 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
3727 RExC_seen_evals = 0;
3728 RExC_extralen = 0;
c277df42 3729
bbce6d69 3730 /* First pass: determine size, legality. */
830247a4 3731 RExC_parse = exp;
fac92740 3732 RExC_start = exp;
830247a4
IZ
3733 RExC_end = xend;
3734 RExC_naughty = 0;
3735 RExC_npar = 1;
3736 RExC_size = 0L;
3737 RExC_emit = &PL_regdummy;
3738 RExC_whilem_seen = 0;
fc8cd66c
YO
3739 RExC_charnames = NULL;
3740
85ddcde9
JH
3741#if 0 /* REGC() is (currently) a NOP at the first pass.
3742 * Clever compilers notice this and complain. --jhi */
830247a4 3743 REGC((U8)REG_MAGIC, (char*)RExC_emit);
85ddcde9 3744#endif
3dab1dad
YO
3745 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n"));
3746 if (reg(pRExC_state, 0, &flags,1) == NULL) {
c445ea15 3747 RExC_precomp = NULL;
a0d0e21e
LW
3748 return(NULL);
3749 }
3dab1dad
YO
3750 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "Required "));
3751 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "size %"IVdf" nodes ", (IV)RExC_size));
3752 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nStarting second pass (creation)\n"));
07be1b83
YO
3753 DEBUG_PARSE_r({
3754 RExC_lastnum=0;
3755 RExC_lastparse=NULL;
3756 });
c277df42 3757
07be1b83 3758
c277df42
IZ
3759 /* Small enough for pointer-storage convention?
3760 If extralen==0, this means that we will not need long jumps. */
830247a4
IZ
3761 if (RExC_size >= 0x10000L && RExC_extralen)
3762 RExC_size += RExC_extralen;
c277df42 3763 else
830247a4
IZ
3764 RExC_extralen = 0;
3765 if (RExC_whilem_seen > 15)
3766 RExC_whilem_seen = 15;
a0d0e21e 3767
bbce6d69 3768 /* Allocate space and initialize. */
a02a5408 3769 Newxc(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode),
c277df42 3770 char, regexp);
a0d0e21e 3771 if (r == NULL)
b45f050a
JF
3772 FAIL("Regexp out of space");
3773
0f79a09d
GS
3774#ifdef DEBUGGING
3775 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
830247a4 3776 Zero(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode), char);
0f79a09d 3777#endif
c277df42 3778 r->refcnt = 1;
bbce6d69 3779 r->prelen = xend - exp;
5cfc7842 3780 r->precomp = savepvn(RExC_precomp, r->prelen);
cf93c79d 3781 r->subbeg = NULL;
f8c7b90f 3782#ifdef PERL_OLD_COPY_ON_WRITE
c445ea15 3783 r->saved_copy = NULL;
ed252734 3784#endif
cf93c79d 3785 r->reganch = pm->op_pmflags & PMf_COMPILETIME;
830247a4 3786 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
38d1b06f 3787 r->lastparen = 0; /* mg.c reads this. */
4327152a
IZ
3788
3789 r->substrs = 0; /* Useful during FAIL. */
3790 r->startp = 0; /* Useful during FAIL. */
3791 r->endp = 0; /* Useful during FAIL. */
3792
a02a5408 3793 Newxz(r->offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
fac92740 3794 if (r->offsets) {
2af232bd 3795 r->offsets[0] = RExC_size;
fac92740 3796 }
a3621e74 3797 DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
2af232bd
SS
3798 "%s %"UVuf" bytes for offset annotations.\n",
3799 r->offsets ? "Got" : "Couldn't get",
392fbf5d 3800 (UV)((2*RExC_size+1) * sizeof(U32))));
fac92740 3801
830247a4 3802 RExC_rx = r;
bbce6d69 3803
3804 /* Second pass: emit code. */
e2509266 3805 RExC_flags = pm->op_pmflags; /* don't let top level (?i) bleed */
830247a4
IZ
3806 RExC_parse = exp;
3807 RExC_end = xend;
3808 RExC_naughty = 0;
3809 RExC_npar = 1;
fac92740 3810 RExC_emit_start = r->program;
830247a4 3811 RExC_emit = r->program;
2cd61cdb 3812 /* Store the count of eval-groups for security checks: */
786e8c11 3813 RExC_emit->next_off = (RExC_seen_evals > (I32)U16_MAX) ? U16_MAX : (U16)RExC_seen_evals;
830247a4 3814 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
c277df42 3815 r->data = 0;
3dab1dad 3816 if (reg(pRExC_state, 0, &flags,1) == NULL)
a0d0e21e 3817 return(NULL);
07be1b83
YO
3818 /* XXXX To minimize changes to RE engine we always allocate
3819 3-units-long substrs field. */
3820 Newx(r->substrs, 1, struct reg_substr_data);
a0d0e21e 3821
07be1b83 3822reStudy:
1de06328 3823 r->minlen = minlen = sawplus = sawopen = 0;
07be1b83
YO
3824 Zero(r->substrs, 1, struct reg_substr_data);
3825 StructCopy(&zero_scan_data, &data, scan_data_t);
a3621e74 3826
07be1b83
YO
3827#ifdef TRIE_STUDY_OPT
3828 if ( restudied ) {
3829 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
3830 RExC_state=copyRExC_state;
1de06328 3831 if (data.last_found) {
07be1b83 3832 SvREFCNT_dec(data.longest_fixed);
07be1b83 3833 SvREFCNT_dec(data.longest_float);
07be1b83 3834 SvREFCNT_dec(data.last_found);
1de06328 3835 }
07be1b83
YO
3836 } else {
3837 copyRExC_state=RExC_state;
3838 }
3839#endif
fc8cd66c 3840
a0d0e21e 3841 /* Dig out information for optimizations. */
cf93c79d 3842 r->reganch = pm->op_pmflags & PMf_COMPILETIME; /* Again? */
e2509266 3843 pm->op_pmflags = RExC_flags;
a0ed51b3 3844 if (UTF)
5ff6fc6d 3845 r->reganch |= ROPT_UTF8; /* Unicode in it? */
c277df42 3846 r->regstclass = NULL;
830247a4 3847 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
a0ed51b3 3848 r->reganch |= ROPT_NAUGHTY;
c277df42 3849 scan = r->program + 1; /* First BRANCH. */
2779dcf1 3850
1de06328
YO
3851 /* testing for BRANCH here tells us whether there is "must appear"
3852 data in the pattern. If there is then we can use it for optimisations */
c277df42 3853 if (OP(scan) != BRANCH) { /* Only one top-level choice. */
c277df42 3854 I32 fake;
c5254dd6 3855 STRLEN longest_float_length, longest_fixed_length;
07be1b83 3856 struct regnode_charclass_class ch_class; /* pointed to by data */
653099ff 3857 int stclass_flag;
07be1b83 3858 I32 last_close = 0; /* pointed to by data */
a0d0e21e
LW
3859
3860 first = scan;
c277df42 3861 /* Skip introductions and multiplicators >= 1. */
a0d0e21e 3862 while ((OP(first) == OPEN && (sawopen = 1)) ||
653099ff 3863 /* An OR of *one* alternative - should not happen now. */
a0d0e21e 3864 (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) ||
07be1b83
YO
3865 /* for now we can't handle lookbehind IFMATCH*/
3866 (OP(first) == IFMATCH && !first->flags) ||
a0d0e21e
LW
3867 (OP(first) == PLUS) ||
3868 (OP(first) == MINMOD) ||
653099ff 3869 /* An {n,m} with n>0 */
07be1b83
YO
3870 (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) )
3871 {
786e8c11 3872
a0d0e21e
LW
3873 if (OP(first) == PLUS)
3874 sawplus = 1;
3875 else
3dab1dad 3876 first += regarglen[OP(first)];
07be1b83
YO
3877 if (OP(first) == IFMATCH) {
3878 first = NEXTOPER(first);
3879 first += EXTRA_STEP_2ARGS;
7c167cea 3880 } else /* XXX possible optimisation for /(?=)/ */
07be1b83 3881 first = NEXTOPER(first);
a687059c
LW
3882 }
3883
a0d0e21e
LW
3884 /* Starting-point info. */
3885 again:
786e8c11 3886 DEBUG_PEEP("first:",first,0);
07be1b83 3887 /* Ignore EXACT as we deal with it later. */
3dab1dad 3888 if (PL_regkind[OP(first)] == EXACT) {
1aa99e6b 3889 if (OP(first) == EXACT)
6f207bd3 3890 NOOP; /* Empty, get anchored substr later. */
1aa99e6b 3891 else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
b3c9acc1
IZ
3892 r->regstclass = first;
3893 }
07be1b83 3894#ifdef TRIE_STCLASS
786e8c11 3895 else if (PL_regkind[OP(first)] == TRIE &&
07be1b83
YO
3896 ((reg_trie_data *)r->data->data[ ARG(first) ])->minlen>0)
3897 {
786e8c11 3898 regnode *trie_op;
07be1b83 3899 /* this can happen only on restudy */
786e8c11
YO
3900 if ( OP(first) == TRIE ) {
3901 struct regnode_1 *trieop;
3902 Newxz(trieop,1,struct regnode_1);
3903 StructCopy(first,trieop,struct regnode_1);
3904 trie_op=(regnode *)trieop;
3905 } else {
3906 struct regnode_charclass *trieop;
3907 Newxz(trieop,1,struct regnode_charclass);
3908 StructCopy(first,trieop,struct regnode_charclass);
3909 trie_op=(regnode *)trieop;
3910 }
1de06328 3911 OP(trie_op)+=2;
786e8c11
YO
3912 make_trie_failtable(pRExC_state, (regnode *)first, trie_op, 0);
3913 r->regstclass = trie_op;
07be1b83
YO
3914 }
3915#endif
bfed75c6 3916 else if (strchr((const char*)PL_simple,OP(first)))
a0d0e21e 3917 r->regstclass = first;
3dab1dad
YO
3918 else if (PL_regkind[OP(first)] == BOUND ||
3919 PL_regkind[OP(first)] == NBOUND)
a0d0e21e 3920 r->regstclass = first;
3dab1dad 3921 else if (PL_regkind[OP(first)] == BOL) {
cad2e5aa
JH
3922 r->reganch |= (OP(first) == MBOL
3923 ? ROPT_ANCH_MBOL
3924 : (OP(first) == SBOL
3925 ? ROPT_ANCH_SBOL
3926 : ROPT_ANCH_BOL));
a0d0e21e 3927 first = NEXTOPER(first);
774d564b 3928 goto again;
3929 }
3930 else if (OP(first) == GPOS) {
3931 r->reganch |= ROPT_ANCH_GPOS;
3932 first = NEXTOPER(first);
3933 goto again;
a0d0e21e 3934 }
e09294f4 3935 else if (!sawopen && (OP(first) == STAR &&
3dab1dad 3936 PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
a0d0e21e
LW
3937 !(r->reganch & ROPT_ANCH) )
3938 {
3939 /* turn .* into ^.* with an implied $*=1 */
1df70142
AL
3940 const int type =
3941 (OP(NEXTOPER(first)) == REG_ANY)
3942 ? ROPT_ANCH_MBOL
3943 : ROPT_ANCH_SBOL;
cad2e5aa 3944 r->reganch |= type | ROPT_IMPLICIT;
a0d0e21e 3945 first = NEXTOPER(first);
774d564b 3946 goto again;
a0d0e21e 3947 }
b81d288d 3948 if (sawplus && (!sawopen || !RExC_sawback)
830247a4 3949 && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
cad2e5aa
JH
3950 /* x+ must match at the 1st pos of run of x's */
3951 r->reganch |= ROPT_SKIP;
a0d0e21e 3952
c277df42 3953 /* Scan is after the zeroth branch, first is atomic matcher. */
be8e71aa
YO
3954#ifdef TRIE_STUDY_OPT
3955 DEBUG_COMPILE_r(
3956 if (!restudied)
3957 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
3958 (IV)(first - scan + 1))
3959 );
3960#else
3961 DEBUG_COMPILE_r(
3962 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
3963 (IV)(first - scan + 1))
3964 );
3965#endif
3966
3967
a0d0e21e
LW
3968 /*
3969 * If there's something expensive in the r.e., find the
3970 * longest literal string that must appear and make it the
3971 * regmust. Resolve ties in favor of later strings, since
3972 * the regstart check works with the beginning of the r.e.
3973 * and avoiding duplication strengthens checking. Not a
3974 * strong reason, but sufficient in the absence of others.
3975 * [Now we resolve ties in favor of the earlier string if
c277df42 3976 * it happens that c_offset_min has been invalidated, since the
a0d0e21e
LW
3977 * earlier string may buy us something the later one won't.]
3978 */
a0d0e21e 3979 minlen = 0;
a687059c 3980
396482e1
GA
3981 data.longest_fixed = newSVpvs("");
3982 data.longest_float = newSVpvs("");
3983 data.last_found = newSVpvs("");
c277df42
IZ
3984 data.longest = &(data.longest_fixed);
3985 first = scan;
653099ff 3986 if (!r->regstclass) {
830247a4 3987 cl_init(pRExC_state, &ch_class);
653099ff
GS
3988 data.start_class = &ch_class;
3989 stclass_flag = SCF_DO_STCLASS_AND;
3990 } else /* XXXX Check for BOUND? */
3991 stclass_flag = 0;
cb434fcc 3992 data.last_closep = &last_close;
653099ff 3993
1de06328 3994 minlen = study_chunk(pRExC_state, &first, &minlen, &fake, scan + RExC_size, /* Up to end */
a3621e74 3995 &data, SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
07be1b83 3996
07be1b83 3997
786e8c11
YO
3998 CHECK_RESTUDY_GOTO;
3999
4000
830247a4 4001 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
b81d288d 4002 && data.last_start_min == 0 && data.last_end > 0
830247a4
IZ
4003 && !RExC_seen_zerolen
4004 && (!(RExC_seen & REG_SEEN_GPOS) || (r->reganch & ROPT_ANCH_GPOS)))
c277df42 4005 r->reganch |= ROPT_CHECK_ALL;
1de06328 4006 scan_commit(pRExC_state, &data,&minlen);
c277df42
IZ
4007 SvREFCNT_dec(data.last_found);
4008
1de06328
YO
4009 /* Note that code very similar to this but for anchored string
4010 follows immediately below, changes may need to be made to both.
4011 Be careful.
4012 */
a0ed51b3 4013 longest_float_length = CHR_SVLEN(data.longest_float);
c5254dd6 4014 if (longest_float_length
c277df42
IZ
4015 || (data.flags & SF_FL_BEFORE_EOL
4016 && (!(data.flags & SF_FL_BEFORE_MEOL)
1de06328
YO
4017 || (RExC_flags & PMf_MULTILINE))))
4018 {
1182767e 4019 I32 t,ml;
cf93c79d 4020
1de06328 4021 if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
aca2d497
IZ
4022 && data.offset_fixed == data.offset_float_min
4023 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
4024 goto remove_float; /* As in (a)+. */
4025
1de06328
YO
4026 /* copy the information about the longest float from the reg_scan_data
4027 over to the program. */
33b8afdf
JH
4028 if (SvUTF8(data.longest_float)) {
4029 r->float_utf8 = data.longest_float;
c445ea15 4030 r->float_substr = NULL;
33b8afdf
JH
4031 } else {
4032 r->float_substr = data.longest_float;
c445ea15 4033 r->float_utf8 = NULL;
33b8afdf 4034 }
1de06328
YO
4035 /* float_end_shift is how many chars that must be matched that
4036 follow this item. We calculate it ahead of time as once the
4037 lookbehind offset is added in we lose the ability to correctly
4038 calculate it.*/
4039 ml = data.minlen_float ? *(data.minlen_float)
1182767e 4040 : (I32)longest_float_length;
1de06328
YO
4041 r->float_end_shift = ml - data.offset_float_min
4042 - longest_float_length + (SvTAIL(data.longest_float) != 0)
4043 + data.lookbehind_float;
4044 r->float_min_offset = data.offset_float_min - data.lookbehind_float;
c277df42 4045 r->float_max_offset = data.offset_float_max;
1182767e 4046 if (data.offset_float_max < I32_MAX) /* Don't offset infinity */
1de06328
YO
4047 r->float_max_offset -= data.lookbehind_float;
4048
cf93c79d
IZ
4049 t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
4050 && (!(data.flags & SF_FL_BEFORE_MEOL)
e2509266 4051 || (RExC_flags & PMf_MULTILINE)));
33b8afdf 4052 fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
a0ed51b3
LW
4053 }
4054 else {
aca2d497 4055 remove_float:
c445ea15 4056 r->float_substr = r->float_utf8 = NULL;
c277df42 4057 SvREFCNT_dec(data.longest_float);
c5254dd6 4058 longest_float_length = 0;
a0d0e21e 4059 }
c277df42 4060
1de06328
YO
4061 /* Note that code very similar to this but for floating string
4062 is immediately above, changes may need to be made to both.
4063 Be careful.
4064 */
a0ed51b3 4065 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
c5254dd6 4066 if (longest_fixed_length
c277df42
IZ
4067 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
4068 && (!(data.flags & SF_FIX_BEFORE_MEOL)
1de06328
YO
4069 || (RExC_flags & PMf_MULTILINE))))
4070 {
1182767e 4071 I32 t,ml;
cf93c79d 4072
1de06328
YO
4073 /* copy the information about the longest fixed
4074 from the reg_scan_data over to the program. */
33b8afdf
JH
4075 if (SvUTF8(data.longest_fixed)) {
4076 r->anchored_utf8 = data.longest_fixed;
c445ea15 4077 r->anchored_substr = NULL;
33b8afdf
JH
4078 } else {
4079 r->anchored_substr = data.longest_fixed;
c445ea15 4080 r->anchored_utf8 = NULL;
33b8afdf 4081 }
1de06328
YO
4082 /* fixed_end_shift is how many chars that must be matched that
4083 follow this item. We calculate it ahead of time as once the
4084 lookbehind offset is added in we lose the ability to correctly
4085 calculate it.*/
4086 ml = data.minlen_fixed ? *(data.minlen_fixed)
1182767e 4087 : (I32)longest_fixed_length;
1de06328
YO
4088 r->anchored_end_shift = ml - data.offset_fixed
4089 - longest_fixed_length + (SvTAIL(data.longest_fixed) != 0)
4090 + data.lookbehind_fixed;
4091 r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
4092
cf93c79d
IZ
4093 t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
4094 && (!(data.flags & SF_FIX_BEFORE_MEOL)
e2509266 4095 || (RExC_flags & PMf_MULTILINE)));
33b8afdf 4096 fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
a0ed51b3
LW
4097 }
4098 else {
c445ea15 4099 r->anchored_substr = r->anchored_utf8 = NULL;
c277df42 4100 SvREFCNT_dec(data.longest_fixed);
c5254dd6 4101 longest_fixed_length = 0;
a0d0e21e 4102 }
b81d288d 4103 if (r->regstclass
ffc61ed2 4104 && (OP(r->regstclass) == REG_ANY || OP(r->regstclass) == SANY))
653099ff 4105 r->regstclass = NULL;
33b8afdf
JH
4106 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
4107 && stclass_flag
653099ff 4108 && !(data.start_class->flags & ANYOF_EOS)
eb160463
GS
4109 && !cl_is_anything(data.start_class))
4110 {
1df70142 4111 const I32 n = add_data(pRExC_state, 1, "f");
653099ff 4112
a02a5408 4113 Newx(RExC_rx->data->data[n], 1,
653099ff
GS
4114 struct regnode_charclass_class);
4115 StructCopy(data.start_class,
830247a4 4116 (struct regnode_charclass_class*)RExC_rx->data->data[n],
653099ff 4117 struct regnode_charclass_class);
830247a4 4118 r->regstclass = (regnode*)RExC_rx->data->data[n];
653099ff 4119 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
a3621e74 4120 DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
32fc9b6a 4121 regprop(r, sv, (regnode*)data.start_class);
9c5ffd7c 4122 PerlIO_printf(Perl_debug_log,
a0288114 4123 "synthetic stclass \"%s\".\n",
3f7c398e 4124 SvPVX_const(sv));});
653099ff 4125 }
c277df42
IZ
4126
4127 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
c5254dd6 4128 if (longest_fixed_length > longest_float_length) {
1de06328 4129 r->check_end_shift = r->anchored_end_shift;
c277df42 4130 r->check_substr = r->anchored_substr;
33b8afdf 4131 r->check_utf8 = r->anchored_utf8;
c277df42
IZ
4132 r->check_offset_min = r->check_offset_max = r->anchored_offset;
4133 if (r->reganch & ROPT_ANCH_SINGLE)
4134 r->reganch |= ROPT_NOSCAN;
a0ed51b3
LW
4135 }
4136 else {
1de06328 4137 r->check_end_shift = r->float_end_shift;
c277df42 4138 r->check_substr = r->float_substr;
33b8afdf 4139 r->check_utf8 = r->float_utf8;
1de06328
YO
4140 r->check_offset_min = r->float_min_offset;
4141 r->check_offset_max = r->float_max_offset;
a0d0e21e 4142 }
30382c73
IZ
4143 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
4144 This should be changed ASAP! */
33b8afdf 4145 if ((r->check_substr || r->check_utf8) && !(r->reganch & ROPT_ANCH_GPOS)) {
cad2e5aa 4146 r->reganch |= RE_USE_INTUIT;
33b8afdf 4147 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
cad2e5aa
JH
4148 r->reganch |= RE_INTUIT_TAIL;
4149 }
1de06328
YO
4150 /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
4151 if ( (STRLEN)minlen < longest_float_length )
4152 minlen= longest_float_length;
4153 if ( (STRLEN)minlen < longest_fixed_length )
4154 minlen= longest_fixed_length;
4155 */
a0ed51b3
LW
4156 }
4157 else {
c277df42
IZ
4158 /* Several toplevels. Best we can is to set minlen. */
4159 I32 fake;
653099ff 4160 struct regnode_charclass_class ch_class;
cb434fcc 4161 I32 last_close = 0;
c277df42 4162
a3621e74 4163 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "\n"));
07be1b83 4164
c277df42 4165 scan = r->program + 1;
830247a4 4166 cl_init(pRExC_state, &ch_class);
653099ff 4167 data.start_class = &ch_class;
cb434fcc 4168 data.last_closep = &last_close;
07be1b83 4169
1de06328 4170 minlen = study_chunk(pRExC_state, &scan, &minlen, &fake, scan + RExC_size,
07be1b83
YO
4171 &data, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
4172
786e8c11 4173 CHECK_RESTUDY_GOTO;
07be1b83 4174
33b8afdf 4175 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
c445ea15 4176 = r->float_substr = r->float_utf8 = NULL;
653099ff 4177 if (!(data.start_class->flags & ANYOF_EOS)
eb160463
GS
4178 && !cl_is_anything(data.start_class))
4179 {
1df70142 4180 const I32 n = add_data(pRExC_state, 1, "f");
653099ff 4181
a02a5408 4182 Newx(RExC_rx->data->data[n], 1,
653099ff
GS
4183 struct regnode_charclass_class);
4184 StructCopy(data.start_class,
830247a4 4185 (struct regnode_charclass_class*)RExC_rx->data->data[n],
653099ff 4186 struct regnode_charclass_class);
830247a4 4187 r->regstclass = (regnode*)RExC_rx->data->data[n];
653099ff 4188 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
a3621e74 4189 DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
32fc9b6a 4190 regprop(r, sv, (regnode*)data.start_class);
9c5ffd7c 4191 PerlIO_printf(Perl_debug_log,
a0288114 4192 "synthetic stclass \"%s\".\n",
3f7c398e 4193 SvPVX_const(sv));});
653099ff 4194 }
a0d0e21e
LW
4195 }
4196
1de06328
YO
4197 /* Guard against an embedded (?=) or (?<=) with a longer minlen than
4198 the "real" pattern. */
4199 if (r->minlen < minlen)
4200 r->minlen = minlen;
4201
b81d288d 4202 if (RExC_seen & REG_SEEN_GPOS)
c277df42 4203 r->reganch |= ROPT_GPOS_SEEN;
830247a4 4204 if (RExC_seen & REG_SEEN_LOOKBEHIND)
c277df42 4205 r->reganch |= ROPT_LOOKBEHIND_SEEN;
830247a4 4206 if (RExC_seen & REG_SEEN_EVAL)
ce862d02 4207 r->reganch |= ROPT_EVAL_SEEN;
f33976b4
DB
4208 if (RExC_seen & REG_SEEN_CANY)
4209 r->reganch |= ROPT_CANY_SEEN;
a02a5408
JC
4210 Newxz(r->startp, RExC_npar, I32);
4211 Newxz(r->endp, RExC_npar, I32);
fc8cd66c
YO
4212 if (RExC_charnames)
4213 SvREFCNT_dec((SV*)(RExC_charnames));
07be1b83 4214
f2278c82 4215 DEBUG_r( RX_DEBUG_on(r) );
be8e71aa
YO
4216 DEBUG_DUMP_r({
4217 PerlIO_printf(Perl_debug_log,"Final program:\n");
3dab1dad
YO
4218 regdump(r);
4219 });
8e9a8a48
YO
4220 DEBUG_OFFSETS_r(if (r->offsets) {
4221 const U32 len = r->offsets[0];
4222 U32 i;
4223 GET_RE_DEBUG_FLAGS_DECL;
4224 PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)r->offsets[0]);
4225 for (i = 1; i <= len; i++) {
4226 if (r->offsets[i*2-1] || r->offsets[i*2])
4227 PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
786e8c11 4228 (UV)i, (UV)r->offsets[i*2-1], (UV)r->offsets[i*2]);
8e9a8a48
YO
4229 }
4230 PerlIO_printf(Perl_debug_log, "\n");
4231 });
a0d0e21e 4232 return(r);
a687059c
LW
4233}
4234
3dab1dad
YO
4235
4236#define DEBUG_PARSE_MSG(funcname) DEBUG_PARSE_r({ \
4237 int rem=(int)(RExC_end - RExC_parse); \
4238 int cut; \
4239 int num; \
4240 int iscut=0; \
4241 if (rem>10) { \
4242 rem=10; \
4243 iscut=1; \
4244 } \
4245 cut=10-rem; \
4246 if (RExC_lastparse!=RExC_parse) \
4247 PerlIO_printf(Perl_debug_log," >%.*s%-*s", \
4248 rem, RExC_parse, \
4249 cut + 4, \
4250 iscut ? "..." : "<" \
4251 ); \
4252 else \
4253 PerlIO_printf(Perl_debug_log,"%16s",""); \
4254 \
4255 if (SIZE_ONLY) \
4256 num=RExC_size; \
4257 else \
4258 num=REG_NODE_NUM(RExC_emit); \
4259 if (RExC_lastnum!=num) \
be8e71aa 4260 PerlIO_printf(Perl_debug_log,"|%4d",num); \
3dab1dad 4261 else \
be8e71aa
YO
4262 PerlIO_printf(Perl_debug_log,"|%4s",""); \
4263 PerlIO_printf(Perl_debug_log,"|%*s%-4s", \
4264 (int)((depth*2)), "", \
3dab1dad
YO
4265 (funcname) \
4266 ); \
4267 RExC_lastnum=num; \
4268 RExC_lastparse=RExC_parse; \
4269})
4270
07be1b83
YO
4271
4272
3dab1dad
YO
4273#define DEBUG_PARSE(funcname) DEBUG_PARSE_r({ \
4274 DEBUG_PARSE_MSG((funcname)); \
4275 PerlIO_printf(Perl_debug_log,"%4s","\n"); \
4276})
a687059c
LW
4277/*
4278 - reg - regular expression, i.e. main body or parenthesized thing
4279 *
4280 * Caller must absorb opening parenthesis.
4281 *
4282 * Combining parenthesis handling with the base level of regular expression
4283 * is a trifle forced, but the need to tie the tails of the branches to what
4284 * follows makes it hard to avoid.
4285 */
07be1b83
YO
4286#define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
4287#ifdef DEBUGGING
4288#define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
4289#else
4290#define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
4291#endif
3dab1dad 4292
76e3520e 4293STATIC regnode *
3dab1dad 4294S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
c277df42 4295 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
a687059c 4296{
27da23d5 4297 dVAR;
c277df42
IZ
4298 register regnode *ret; /* Will be the head of the group. */
4299 register regnode *br;
4300 register regnode *lastbr;
cbbf8932 4301 register regnode *ender = NULL;
a0d0e21e 4302 register I32 parno = 0;
cbbf8932
AL
4303 I32 flags;
4304 const I32 oregflags = RExC_flags;
6136c704
AL
4305 bool have_branch = 0;
4306 bool is_open = 0;
9d1d55b5
JP
4307
4308 /* for (?g), (?gc), and (?o) warnings; warning
4309 about (?c) will warn about (?g) -- japhy */
4310
6136c704
AL
4311#define WASTED_O 0x01
4312#define WASTED_G 0x02
4313#define WASTED_C 0x04
4314#define WASTED_GC (0x02|0x04)
cbbf8932 4315 I32 wastedflags = 0x00;
9d1d55b5 4316
fac92740 4317 char * parse_start = RExC_parse; /* MJD */
a28509cc 4318 char * const oregcomp_parse = RExC_parse;
a0d0e21e 4319
3dab1dad
YO
4320 GET_RE_DEBUG_FLAGS_DECL;
4321 DEBUG_PARSE("reg ");
4322
4323
821b33a5 4324 *flagp = 0; /* Tentatively. */
a0d0e21e 4325
9d1d55b5 4326
a0d0e21e
LW
4327 /* Make an OPEN node, if parenthesized. */
4328 if (paren) {
fac92740 4329 if (*RExC_parse == '?') { /* (?...) */
2b36a5a0
JH
4330 U32 posflags = 0, negflags = 0;
4331 U32 *flagsp = &posflags;
6136c704 4332 bool is_logical = 0;
a28509cc 4333 const char * const seqstart = RExC_parse;
ca9dfc88 4334
830247a4
IZ
4335 RExC_parse++;
4336 paren = *RExC_parse++;
c277df42 4337 ret = NULL; /* For look-ahead/behind. */
a0d0e21e 4338 switch (paren) {
fac92740 4339 case '<': /* (?<...) */
830247a4 4340 RExC_seen |= REG_SEEN_LOOKBEHIND;
b81d288d 4341 if (*RExC_parse == '!')
c277df42 4342 paren = ',';
b81d288d 4343 if (*RExC_parse != '=' && *RExC_parse != '!')
c277df42 4344 goto unknown;
830247a4 4345 RExC_parse++;
fac92740
MJD
4346 case '=': /* (?=...) */
4347 case '!': /* (?!...) */
830247a4 4348 RExC_seen_zerolen++;
fac92740
MJD
4349 case ':': /* (?:...) */
4350 case '>': /* (?>...) */
a0d0e21e 4351 break;
fac92740
MJD
4352 case '$': /* (?$...) */
4353 case '@': /* (?@...) */
8615cb43 4354 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
a0d0e21e 4355 break;
fac92740 4356 case '#': /* (?#...) */
830247a4
IZ
4357 while (*RExC_parse && *RExC_parse != ')')
4358 RExC_parse++;
4359 if (*RExC_parse != ')')
c277df42 4360 FAIL("Sequence (?#... not terminated");
830247a4 4361 nextchar(pRExC_state);
a0d0e21e
LW
4362 *flagp = TRYAGAIN;
4363 return NULL;
fac92740 4364 case 'p': /* (?p...) */
9014280d 4365 if (SIZE_ONLY && ckWARN2(WARN_DEPRECATED, WARN_REGEXP))
f9373011 4366 vWARNdep(RExC_parse, "(?p{}) is deprecated - use (??{})");
8c8ad484 4367 /* FALL THROUGH*/
fac92740 4368 case '?': /* (??...) */
6136c704 4369 is_logical = 1;
438a3801
YST
4370 if (*RExC_parse != '{')
4371 goto unknown;
830247a4 4372 paren = *RExC_parse++;
0f5d15d6 4373 /* FALL THROUGH */
fac92740 4374 case '{': /* (?{...}) */
c277df42 4375 {
c277df42
IZ
4376 I32 count = 1, n = 0;
4377 char c;
830247a4 4378 char *s = RExC_parse;
c277df42 4379
830247a4
IZ
4380 RExC_seen_zerolen++;
4381 RExC_seen |= REG_SEEN_EVAL;
4382 while (count && (c = *RExC_parse)) {
6136c704
AL
4383 if (c == '\\') {
4384 if (RExC_parse[1])
4385 RExC_parse++;
4386 }
b81d288d 4387 else if (c == '{')
c277df42 4388 count++;
b81d288d 4389 else if (c == '}')
c277df42 4390 count--;
830247a4 4391 RExC_parse++;
c277df42 4392 }
6136c704 4393 if (*RExC_parse != ')') {
b81d288d 4394 RExC_parse = s;
b45f050a
JF
4395 vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
4396 }
c277df42 4397 if (!SIZE_ONLY) {
f3548bdc 4398 PAD *pad;
6136c704
AL
4399 OP_4tree *sop, *rop;
4400 SV * const sv = newSVpvn(s, RExC_parse - 1 - s);
c277df42 4401
569233ed
SB
4402 ENTER;
4403 Perl_save_re_context(aTHX);
f3548bdc 4404 rop = sv_compile_2op(sv, &sop, "re", &pad);
9b978d73
DM
4405 sop->op_private |= OPpREFCOUNTED;
4406 /* re_dup will OpREFCNT_inc */
4407 OpREFCNT_set(sop, 1);
569233ed 4408 LEAVE;
c277df42 4409
830247a4
IZ
4410 n = add_data(pRExC_state, 3, "nop");
4411 RExC_rx->data->data[n] = (void*)rop;
4412 RExC_rx->data->data[n+1] = (void*)sop;
f3548bdc 4413 RExC_rx->data->data[n+2] = (void*)pad;
c277df42 4414 SvREFCNT_dec(sv);
a0ed51b3 4415 }
e24b16f9 4416 else { /* First pass */
830247a4 4417 if (PL_reginterp_cnt < ++RExC_seen_evals
923e4eb5 4418 && IN_PERL_RUNTIME)
2cd61cdb
IZ
4419 /* No compiled RE interpolated, has runtime
4420 components ===> unsafe. */
4421 FAIL("Eval-group not allowed at runtime, use re 'eval'");
5b61d3f7 4422 if (PL_tainting && PL_tainted)
cc6b7395 4423 FAIL("Eval-group in insecure regular expression");
54df2634 4424#if PERL_VERSION > 8
923e4eb5 4425 if (IN_PERL_COMPILETIME)
b5c19bd7 4426 PL_cv_has_eval = 1;
54df2634 4427#endif
c277df42 4428 }
b5c19bd7 4429
830247a4 4430 nextchar(pRExC_state);
6136c704 4431 if (is_logical) {
830247a4 4432 ret = reg_node(pRExC_state, LOGICAL);
0f5d15d6
IZ
4433 if (!SIZE_ONLY)
4434 ret->flags = 2;
3dab1dad 4435 REGTAIL(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
fac92740 4436 /* deal with the length of this later - MJD */
0f5d15d6
IZ
4437 return ret;
4438 }
ccb2c380
MP
4439 ret = reganode(pRExC_state, EVAL, n);
4440 Set_Node_Length(ret, RExC_parse - parse_start + 1);
4441 Set_Node_Offset(ret, parse_start);
4442 return ret;
c277df42 4443 }
fac92740 4444 case '(': /* (?(?{...})...) and (?(?=...)...) */
c277df42 4445 {
fac92740 4446 if (RExC_parse[0] == '?') { /* (?(?...)) */
b81d288d
AB
4447 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
4448 || RExC_parse[1] == '<'
830247a4 4449 || RExC_parse[1] == '{') { /* Lookahead or eval. */
c277df42
IZ
4450 I32 flag;
4451
830247a4 4452 ret = reg_node(pRExC_state, LOGICAL);
0f5d15d6
IZ
4453 if (!SIZE_ONLY)
4454 ret->flags = 1;
3dab1dad 4455 REGTAIL(pRExC_state, ret, reg(pRExC_state, 1, &flag,depth+1));
c277df42 4456 goto insert_if;
b81d288d 4457 }
a0ed51b3 4458 }
830247a4 4459 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
fac92740 4460 /* (?(1)...) */
6136c704 4461 char c;
830247a4 4462 parno = atoi(RExC_parse++);
c277df42 4463
830247a4
IZ
4464 while (isDIGIT(*RExC_parse))
4465 RExC_parse++;
fac92740 4466 ret = reganode(pRExC_state, GROUPP, parno);
2af232bd 4467
830247a4 4468 if ((c = *nextchar(pRExC_state)) != ')')
b45f050a 4469 vFAIL("Switch condition not recognized");
c277df42 4470 insert_if:
3dab1dad
YO
4471 REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
4472 br = regbranch(pRExC_state, &flags, 1,depth+1);
c277df42 4473 if (br == NULL)
830247a4 4474 br = reganode(pRExC_state, LONGJMP, 0);
c277df42 4475 else
3dab1dad 4476 REGTAIL(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
830247a4 4477 c = *nextchar(pRExC_state);
d1b80229
IZ
4478 if (flags&HASWIDTH)
4479 *flagp |= HASWIDTH;
c277df42 4480 if (c == '|') {
830247a4 4481 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
3dab1dad
YO
4482 regbranch(pRExC_state, &flags, 1,depth+1);
4483 REGTAIL(pRExC_state, ret, lastbr);
d1b80229
IZ
4484 if (flags&HASWIDTH)
4485 *flagp |= HASWIDTH;
830247a4 4486 c = *nextchar(pRExC_state);
a0ed51b3
LW
4487 }
4488 else
c277df42
IZ
4489 lastbr = NULL;
4490 if (c != ')')
8615cb43 4491 vFAIL("Switch (?(condition)... contains too many branches");
830247a4 4492 ender = reg_node(pRExC_state, TAIL);
3dab1dad 4493 REGTAIL(pRExC_state, br, ender);
c277df42 4494 if (lastbr) {
3dab1dad
YO
4495 REGTAIL(pRExC_state, lastbr, ender);
4496 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
a0ed51b3
LW
4497 }
4498 else
3dab1dad 4499 REGTAIL(pRExC_state, ret, ender);
c277df42 4500 return ret;
a0ed51b3
LW
4501 }
4502 else {
830247a4 4503 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
c277df42
IZ
4504 }
4505 }
1b1626e4 4506 case 0:
830247a4 4507 RExC_parse--; /* for vFAIL to print correctly */
8615cb43 4508 vFAIL("Sequence (? incomplete");
1b1626e4 4509 break;
a0d0e21e 4510 default:
830247a4 4511 --RExC_parse;
fac92740 4512 parse_flags: /* (?i) */
830247a4 4513 while (*RExC_parse && strchr("iogcmsx", *RExC_parse)) {
9d1d55b5
JP
4514 /* (?g), (?gc) and (?o) are useless here
4515 and must be globally applied -- japhy */
4516
4517 if (*RExC_parse == 'o' || *RExC_parse == 'g') {
4518 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6136c704 4519 const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G;
9d1d55b5
JP
4520 if (! (wastedflags & wflagbit) ) {
4521 wastedflags |= wflagbit;
4522 vWARN5(
4523 RExC_parse + 1,
4524 "Useless (%s%c) - %suse /%c modifier",
4525 flagsp == &negflags ? "?-" : "?",
4526 *RExC_parse,
4527 flagsp == &negflags ? "don't " : "",
4528 *RExC_parse
4529 );
4530 }
4531 }
4532 }
4533 else if (*RExC_parse == 'c') {
4534 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6136c704
AL
4535 if (! (wastedflags & WASTED_C) ) {
4536 wastedflags |= WASTED_GC;
9d1d55b5
JP
4537 vWARN3(
4538 RExC_parse + 1,
4539 "Useless (%sc) - %suse /gc modifier",
4540 flagsp == &negflags ? "?-" : "?",
4541 flagsp == &negflags ? "don't " : ""
4542 );
4543 }
4544 }
4545 }
4546 else { pmflag(flagsp, *RExC_parse); }
4547
830247a4 4548 ++RExC_parse;
ca9dfc88 4549 }
830247a4 4550 if (*RExC_parse == '-') {
ca9dfc88 4551 flagsp = &negflags;
9d1d55b5 4552 wastedflags = 0; /* reset so (?g-c) warns twice */
830247a4 4553 ++RExC_parse;
ca9dfc88 4554 goto parse_flags;
48c036b1 4555 }
e2509266
JH
4556 RExC_flags |= posflags;
4557 RExC_flags &= ~negflags;
830247a4
IZ
4558 if (*RExC_parse == ':') {
4559 RExC_parse++;
ca9dfc88
IZ
4560 paren = ':';
4561 break;
4562 }
c277df42 4563 unknown:
830247a4
IZ
4564 if (*RExC_parse != ')') {
4565 RExC_parse++;
4566 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
b45f050a 4567 }
830247a4 4568 nextchar(pRExC_state);
a0d0e21e
LW
4569 *flagp = TRYAGAIN;
4570 return NULL;
4571 }
4572 }
fac92740 4573 else { /* (...) */
830247a4
IZ
4574 parno = RExC_npar;
4575 RExC_npar++;
4576 ret = reganode(pRExC_state, OPEN, parno);
fac92740
MJD
4577 Set_Node_Length(ret, 1); /* MJD */
4578 Set_Node_Offset(ret, RExC_parse); /* MJD */
6136c704 4579 is_open = 1;
a0d0e21e 4580 }
a0ed51b3 4581 }
fac92740 4582 else /* ! paren */
a0d0e21e
LW
4583 ret = NULL;
4584
4585 /* Pick up the branches, linking them together. */
fac92740 4586 parse_start = RExC_parse; /* MJD */
3dab1dad 4587 br = regbranch(pRExC_state, &flags, 1,depth+1);
fac92740 4588 /* branch_len = (paren != 0); */
2af232bd 4589
a0d0e21e
LW
4590 if (br == NULL)
4591 return(NULL);
830247a4
IZ
4592 if (*RExC_parse == '|') {
4593 if (!SIZE_ONLY && RExC_extralen) {
4594 reginsert(pRExC_state, BRANCHJ, br);
a0ed51b3 4595 }
fac92740 4596 else { /* MJD */
830247a4 4597 reginsert(pRExC_state, BRANCH, br);
fac92740
MJD
4598 Set_Node_Length(br, paren != 0);
4599 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
4600 }
c277df42
IZ
4601 have_branch = 1;
4602 if (SIZE_ONLY)
830247a4 4603 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
a0ed51b3
LW
4604 }
4605 else if (paren == ':') {
c277df42
IZ
4606 *flagp |= flags&SIMPLE;
4607 }
6136c704 4608 if (is_open) { /* Starts with OPEN. */
3dab1dad 4609 REGTAIL(pRExC_state, ret, br); /* OPEN -> first. */
a0ed51b3
LW
4610 }
4611 else if (paren != '?') /* Not Conditional */
a0d0e21e 4612 ret = br;
32a0ca98 4613 *flagp |= flags & (SPSTART | HASWIDTH);
c277df42 4614 lastbr = br;
830247a4
IZ
4615 while (*RExC_parse == '|') {
4616 if (!SIZE_ONLY && RExC_extralen) {
4617 ender = reganode(pRExC_state, LONGJMP,0);
3dab1dad 4618 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
c277df42
IZ
4619 }
4620 if (SIZE_ONLY)
830247a4
IZ
4621 RExC_extralen += 2; /* Account for LONGJMP. */
4622 nextchar(pRExC_state);
3dab1dad 4623 br = regbranch(pRExC_state, &flags, 0, depth+1);
2af232bd 4624
a687059c 4625 if (br == NULL)
a0d0e21e 4626 return(NULL);
3dab1dad 4627 REGTAIL(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
c277df42 4628 lastbr = br;
821b33a5
IZ
4629 if (flags&HASWIDTH)
4630 *flagp |= HASWIDTH;
a687059c 4631 *flagp |= flags&SPSTART;
a0d0e21e
LW
4632 }
4633
c277df42
IZ
4634 if (have_branch || paren != ':') {
4635 /* Make a closing node, and hook it on the end. */
4636 switch (paren) {
4637 case ':':
830247a4 4638 ender = reg_node(pRExC_state, TAIL);
c277df42
IZ
4639 break;
4640 case 1:
830247a4 4641 ender = reganode(pRExC_state, CLOSE, parno);
fac92740
MJD
4642 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
4643 Set_Node_Length(ender,1); /* MJD */
c277df42
IZ
4644 break;
4645 case '<':
c277df42
IZ
4646 case ',':
4647 case '=':
4648 case '!':
c277df42 4649 *flagp &= ~HASWIDTH;
821b33a5
IZ
4650 /* FALL THROUGH */
4651 case '>':
830247a4 4652 ender = reg_node(pRExC_state, SUCCEED);
c277df42
IZ
4653 break;
4654 case 0:
830247a4 4655 ender = reg_node(pRExC_state, END);
c277df42
IZ
4656 break;
4657 }
07be1b83 4658 REGTAIL_STUDY(pRExC_state, lastbr, ender);
a0d0e21e 4659
9674d46a 4660 if (have_branch && !SIZE_ONLY) {
c277df42 4661 /* Hook the tails of the branches to the closing node. */
9674d46a
AL
4662 for (br = ret; br; br = regnext(br)) {
4663 const U8 op = PL_regkind[OP(br)];
4664 if (op == BRANCH) {
07be1b83 4665 REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
9674d46a
AL
4666 }
4667 else if (op == BRANCHJ) {
07be1b83 4668 REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
9674d46a 4669 }
c277df42
IZ
4670 }
4671 }
a0d0e21e 4672 }
c277df42
IZ
4673
4674 {
e1ec3a88
AL
4675 const char *p;
4676 static const char parens[] = "=!<,>";
c277df42
IZ
4677
4678 if (paren && (p = strchr(parens, paren))) {
eb160463 4679 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
c277df42
IZ
4680 int flag = (p - parens) > 1;
4681
4682 if (paren == '>')
4683 node = SUSPEND, flag = 0;
830247a4 4684 reginsert(pRExC_state, node,ret);
45948336
EP
4685 Set_Node_Cur_Length(ret);
4686 Set_Node_Offset(ret, parse_start + 1);
c277df42 4687 ret->flags = flag;
07be1b83 4688 REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
c277df42 4689 }
a0d0e21e
LW
4690 }
4691
4692 /* Check for proper termination. */
ce3e6498 4693 if (paren) {
e2509266 4694 RExC_flags = oregflags;
830247a4
IZ
4695 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
4696 RExC_parse = oregcomp_parse;
380a0633 4697 vFAIL("Unmatched (");
ce3e6498 4698 }
a0ed51b3 4699 }
830247a4
IZ
4700 else if (!paren && RExC_parse < RExC_end) {
4701 if (*RExC_parse == ')') {
4702 RExC_parse++;
380a0633 4703 vFAIL("Unmatched )");
a0ed51b3
LW
4704 }
4705 else
b45f050a 4706 FAIL("Junk on end of regexp"); /* "Can't happen". */
a0d0e21e
LW
4707 /* NOTREACHED */
4708 }
a687059c 4709
a0d0e21e 4710 return(ret);
a687059c
LW
4711}
4712
4713/*
4714 - regbranch - one alternative of an | operator
4715 *
4716 * Implements the concatenation operator.
4717 */
76e3520e 4718STATIC regnode *
3dab1dad 4719S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
a687059c 4720{
97aff369 4721 dVAR;
c277df42
IZ
4722 register regnode *ret;
4723 register regnode *chain = NULL;
4724 register regnode *latest;
4725 I32 flags = 0, c = 0;
3dab1dad
YO
4726 GET_RE_DEBUG_FLAGS_DECL;
4727 DEBUG_PARSE("brnc");
b81d288d 4728 if (first)
c277df42
IZ
4729 ret = NULL;
4730 else {
b81d288d 4731 if (!SIZE_ONLY && RExC_extralen)
830247a4 4732 ret = reganode(pRExC_state, BRANCHJ,0);
fac92740 4733 else {
830247a4 4734 ret = reg_node(pRExC_state, BRANCH);
fac92740
MJD
4735 Set_Node_Length(ret, 1);
4736 }
c277df42
IZ
4737 }
4738
b81d288d 4739 if (!first && SIZE_ONLY)
830247a4 4740 RExC_extralen += 1; /* BRANCHJ */
b81d288d 4741
c277df42 4742 *flagp = WORST; /* Tentatively. */
a0d0e21e 4743
830247a4
IZ
4744 RExC_parse--;
4745 nextchar(pRExC_state);
4746 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
a0d0e21e 4747 flags &= ~TRYAGAIN;
3dab1dad 4748 latest = regpiece(pRExC_state, &flags,depth+1);
a0d0e21e
LW
4749 if (latest == NULL) {
4750 if (flags & TRYAGAIN)
4751 continue;
4752 return(NULL);
a0ed51b3
LW
4753 }
4754 else if (ret == NULL)
c277df42 4755 ret = latest;
a0d0e21e 4756 *flagp |= flags&HASWIDTH;
c277df42 4757 if (chain == NULL) /* First piece. */
a0d0e21e
LW
4758 *flagp |= flags&SPSTART;
4759 else {
830247a4 4760 RExC_naughty++;
3dab1dad 4761 REGTAIL(pRExC_state, chain, latest);
a687059c 4762 }
a0d0e21e 4763 chain = latest;
c277df42
IZ
4764 c++;
4765 }
4766 if (chain == NULL) { /* Loop ran zero times. */
830247a4 4767 chain = reg_node(pRExC_state, NOTHING);
c277df42
IZ
4768 if (ret == NULL)
4769 ret = chain;
4770 }
4771 if (c == 1) {
4772 *flagp |= flags&SIMPLE;
a0d0e21e 4773 }
a687059c 4774
d4c19fe8 4775 return ret;
a687059c
LW
4776}
4777
4778/*
4779 - regpiece - something followed by possible [*+?]
4780 *
4781 * Note that the branching code sequences used for ? and the general cases
4782 * of * and + are somewhat optimized: they use the same NOTHING node as
4783 * both the endmarker for their branch list and the body of the last branch.
4784 * It might seem that this node could be dispensed with entirely, but the
4785 * endmarker role is not redundant.
4786 */
76e3520e 4787STATIC regnode *
3dab1dad 4788S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
a687059c 4789{
97aff369 4790 dVAR;
c277df42 4791 register regnode *ret;
a0d0e21e
LW
4792 register char op;
4793 register char *next;
4794 I32 flags;
1df70142 4795 const char * const origparse = RExC_parse;
a0d0e21e 4796 I32 min;
c277df42 4797 I32 max = REG_INFTY;
fac92740 4798 char *parse_start;
10edeb5d 4799 const char *maxpos = NULL;
3dab1dad
YO
4800 GET_RE_DEBUG_FLAGS_DECL;
4801 DEBUG_PARSE("piec");
a0d0e21e 4802
3dab1dad 4803 ret = regatom(pRExC_state, &flags,depth+1);
a0d0e21e
LW
4804 if (ret == NULL) {
4805 if (flags & TRYAGAIN)
4806 *flagp |= TRYAGAIN;
4807 return(NULL);
4808 }
4809
830247a4 4810 op = *RExC_parse;
a0d0e21e 4811
830247a4 4812 if (op == '{' && regcurly(RExC_parse)) {
10edeb5d 4813 maxpos = NULL;
fac92740 4814 parse_start = RExC_parse; /* MJD */
830247a4 4815 next = RExC_parse + 1;
a0d0e21e
LW
4816 while (isDIGIT(*next) || *next == ',') {
4817 if (*next == ',') {
4818 if (maxpos)
4819 break;
4820 else
4821 maxpos = next;
a687059c 4822 }
a0d0e21e
LW
4823 next++;
4824 }
4825 if (*next == '}') { /* got one */
4826 if (!maxpos)
4827 maxpos = next;
830247a4
IZ
4828 RExC_parse++;
4829 min = atoi(RExC_parse);
a0d0e21e
LW
4830 if (*maxpos == ',')
4831 maxpos++;
4832 else
830247a4 4833 maxpos = RExC_parse;
a0d0e21e
LW
4834 max = atoi(maxpos);
4835 if (!max && *maxpos != '0')
c277df42
IZ
4836 max = REG_INFTY; /* meaning "infinity" */
4837 else if (max >= REG_INFTY)
8615cb43 4838 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
830247a4
IZ
4839 RExC_parse = next;
4840 nextchar(pRExC_state);
a0d0e21e
LW
4841
4842 do_curly:
4843 if ((flags&SIMPLE)) {
830247a4
IZ
4844 RExC_naughty += 2 + RExC_naughty / 2;
4845 reginsert(pRExC_state, CURLY, ret);
fac92740
MJD
4846 Set_Node_Offset(ret, parse_start+1); /* MJD */
4847 Set_Node_Cur_Length(ret);
a0d0e21e
LW
4848 }
4849 else {
3dab1dad 4850 regnode * const w = reg_node(pRExC_state, WHILEM);
2c2d71f5
JH
4851
4852 w->flags = 0;
3dab1dad 4853 REGTAIL(pRExC_state, ret, w);
830247a4
IZ
4854 if (!SIZE_ONLY && RExC_extralen) {
4855 reginsert(pRExC_state, LONGJMP,ret);
4856 reginsert(pRExC_state, NOTHING,ret);
c277df42
IZ
4857 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
4858 }
830247a4 4859 reginsert(pRExC_state, CURLYX,ret);
fac92740
MJD
4860 /* MJD hk */
4861 Set_Node_Offset(ret, parse_start+1);
2af232bd 4862 Set_Node_Length(ret,
fac92740 4863 op == '{' ? (RExC_parse - parse_start) : 1);
2af232bd 4864
830247a4 4865 if (!SIZE_ONLY && RExC_extralen)
c277df42 4866 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
3dab1dad 4867 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
c277df42 4868 if (SIZE_ONLY)
830247a4
IZ
4869 RExC_whilem_seen++, RExC_extralen += 3;
4870 RExC_naughty += 4 + RExC_naughty; /* compound interest */
a0d0e21e 4871 }
c277df42 4872 ret->flags = 0;
a0d0e21e
LW
4873
4874 if (min > 0)
821b33a5
IZ
4875 *flagp = WORST;
4876 if (max > 0)
4877 *flagp |= HASWIDTH;
a0d0e21e 4878 if (max && max < min)
8615cb43 4879 vFAIL("Can't do {n,m} with n > m");
c277df42 4880 if (!SIZE_ONLY) {
eb160463
GS
4881 ARG1_SET(ret, (U16)min);
4882 ARG2_SET(ret, (U16)max);
a687059c 4883 }
a687059c 4884
a0d0e21e 4885 goto nest_check;
a687059c 4886 }
a0d0e21e 4887 }
a687059c 4888
a0d0e21e
LW
4889 if (!ISMULT1(op)) {
4890 *flagp = flags;
a687059c 4891 return(ret);
a0d0e21e 4892 }
bb20fd44 4893
c277df42 4894#if 0 /* Now runtime fix should be reliable. */
b45f050a
JF
4895
4896 /* if this is reinstated, don't forget to put this back into perldiag:
4897
4898 =item Regexp *+ operand could be empty at {#} in regex m/%s/
4899
4900 (F) The part of the regexp subject to either the * or + quantifier
4901 could match an empty string. The {#} shows in the regular
4902 expression about where the problem was discovered.
4903
4904 */
4905
bb20fd44 4906 if (!(flags&HASWIDTH) && op != '?')
b45f050a 4907 vFAIL("Regexp *+ operand could be empty");
b81d288d 4908#endif
bb20fd44 4909
fac92740 4910 parse_start = RExC_parse;
830247a4 4911 nextchar(pRExC_state);
a0d0e21e 4912
821b33a5 4913 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
a0d0e21e
LW
4914
4915 if (op == '*' && (flags&SIMPLE)) {
830247a4 4916 reginsert(pRExC_state, STAR, ret);
c277df42 4917 ret->flags = 0;
830247a4 4918 RExC_naughty += 4;
a0d0e21e
LW
4919 }
4920 else if (op == '*') {
4921 min = 0;
4922 goto do_curly;
a0ed51b3
LW
4923 }
4924 else if (op == '+' && (flags&SIMPLE)) {
830247a4 4925 reginsert(pRExC_state, PLUS, ret);
c277df42 4926 ret->flags = 0;
830247a4 4927 RExC_naughty += 3;
a0d0e21e
LW
4928 }
4929 else if (op == '+') {
4930 min = 1;
4931 goto do_curly;
a0ed51b3
LW
4932 }
4933 else if (op == '?') {
a0d0e21e
LW
4934 min = 0; max = 1;
4935 goto do_curly;
4936 }
4937 nest_check:
041457d9 4938 if (!SIZE_ONLY && !(flags&HASWIDTH) && max > REG_INFTY/3 && ckWARN(WARN_REGEXP)) {
830247a4 4939 vWARN3(RExC_parse,
b45f050a 4940 "%.*s matches null string many times",
afd78fd5 4941 (int)(RExC_parse >= origparse ? RExC_parse - origparse : 0),
b45f050a 4942 origparse);
a0d0e21e
LW
4943 }
4944
830247a4
IZ
4945 if (*RExC_parse == '?') {
4946 nextchar(pRExC_state);
4947 reginsert(pRExC_state, MINMOD, ret);
3dab1dad 4948 REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
a0d0e21e 4949 }
830247a4
IZ
4950 if (ISMULT2(RExC_parse)) {
4951 RExC_parse++;
b45f050a
JF
4952 vFAIL("Nested quantifiers");
4953 }
a0d0e21e
LW
4954
4955 return(ret);
a687059c
LW
4956}
4957
fc8cd66c
YO
4958
4959/* reg_namedseq(pRExC_state,UVp)
4960
4961 This is expected to be called by a parser routine that has
4962 recognized'\N' and needs to handle the rest. RExC_parse is
4963 expected to point at the first char following the N at the time
4964 of the call.
4965
4966 If valuep is non-null then it is assumed that we are parsing inside
4967 of a charclass definition and the first codepoint in the resolved
4968 string is returned via *valuep and the routine will return NULL.
4969 In this mode if a multichar string is returned from the charnames
4970 handler a warning will be issued, and only the first char in the
4971 sequence will be examined. If the string returned is zero length
4972 then the value of *valuep is undefined and NON-NULL will
4973 be returned to indicate failure. (This will NOT be a valid pointer
4974 to a regnode.)
4975
4976 If value is null then it is assumed that we are parsing normal text
4977 and inserts a new EXACT node into the program containing the resolved
4978 string and returns a pointer to the new node. If the string is
4979 zerolength a NOTHING node is emitted.
4980
4981 On success RExC_parse is set to the char following the endbrace.
4982 Parsing failures will generate a fatal errorvia vFAIL(...)
4983
4984 NOTE: We cache all results from the charnames handler locally in
4985 the RExC_charnames hash (created on first use) to prevent a charnames
4986 handler from playing silly-buggers and returning a short string and
4987 then a long string for a given pattern. Since the regexp program
4988 size is calculated during an initial parse this would result
4989 in a buffer overrun so we cache to prevent the charname result from
4990 changing during the course of the parse.
4991
4992 */
4993STATIC regnode *
4994S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep)
4995{
4996 char * name; /* start of the content of the name */
4997 char * endbrace; /* endbrace following the name */
4998 SV *sv_str = NULL;
4999 SV *sv_name = NULL;
5000 STRLEN len; /* this has various purposes throughout the code */
5001 bool cached = 0; /* if this is true then we shouldn't refcount dev sv_str */
5002 regnode *ret = NULL;
5003
5004 if (*RExC_parse != '{') {
5005 vFAIL("Missing braces on \\N{}");
5006 }
5007 name = RExC_parse+1;
5008 endbrace = strchr(RExC_parse, '}');
5009 if ( ! endbrace ) {
5010 RExC_parse++;
5011 vFAIL("Missing right brace on \\N{}");
5012 }
5013 RExC_parse = endbrace + 1;
5014
5015
5016 /* RExC_parse points at the beginning brace,
5017 endbrace points at the last */
5018 if ( name[0]=='U' && name[1]=='+' ) {
5019 /* its a "unicode hex" notation {U+89AB} */
5020 I32 fl = PERL_SCAN_ALLOW_UNDERSCORES
5021 | PERL_SCAN_DISALLOW_PREFIX
5022 | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
5023 UV cp;
196f1508 5024 len = (STRLEN)(endbrace - name - 2);
fc8cd66c 5025 cp = grok_hex(name + 2, &len, &fl, NULL);
196f1508 5026 if ( len != (STRLEN)(endbrace - name - 2) ) {
fc8cd66c
YO
5027 cp = 0xFFFD;
5028 }
5029 if (cp > 0xff)
5030 RExC_utf8 = 1;
5031 if ( valuep ) {
5032 *valuep = cp;
5033 return NULL;
5034 }
5035 sv_str= Perl_newSVpvf_nocontext("%c",(int)cp);
5036 } else {
5037 /* fetch the charnames handler for this scope */
5038 HV * const table = GvHV(PL_hintgv);
5039 SV **cvp= table ?
5040 hv_fetchs(table, "charnames", FALSE) :
5041 NULL;
5042 SV *cv= cvp ? *cvp : NULL;
5043 HE *he_str;
5044 int count;
5045 /* create an SV with the name as argument */
5046 sv_name = newSVpvn(name, endbrace - name);
5047
5048 if (!table || !(PL_hints & HINT_LOCALIZE_HH)) {
5049 vFAIL2("Constant(\\N{%s}) unknown: "
5050 "(possibly a missing \"use charnames ...\")",
5051 SvPVX(sv_name));
5052 }
5053 if (!cvp || !SvOK(*cvp)) { /* when $^H{charnames} = undef; */
5054 vFAIL2("Constant(\\N{%s}): "
5055 "$^H{charnames} is not defined",SvPVX(sv_name));
5056 }
5057
5058
5059
5060 if (!RExC_charnames) {
5061 /* make sure our cache is allocated */
5062 RExC_charnames = newHV();
5063 }
5064 /* see if we have looked this one up before */
5065 he_str = hv_fetch_ent( RExC_charnames, sv_name, 0, 0 );
5066 if ( he_str ) {
5067 sv_str = HeVAL(he_str);
5068 cached = 1;
5069 } else {
5070 dSP ;
5071
5072 ENTER ;
5073 SAVETMPS ;
5074 PUSHMARK(SP) ;
5075
5076 XPUSHs(sv_name);
5077
5078 PUTBACK ;
5079
5080 count= call_sv(cv, G_SCALAR);
5081
5082 if (count == 1) { /* XXXX is this right? dmq */
5083 sv_str = POPs;
5084 SvREFCNT_inc_simple_void(sv_str);
5085 }
5086
5087 SPAGAIN ;
5088 PUTBACK ;
5089 FREETMPS ;
5090 LEAVE ;
5091
5092 if ( !sv_str || !SvOK(sv_str) ) {
5093 vFAIL2("Constant(\\N{%s}): Call to &{$^H{charnames}} "
5094 "did not return a defined value",SvPVX(sv_name));
5095 }
5096 if (hv_store_ent( RExC_charnames, sv_name, sv_str, 0))
5097 cached = 1;
5098 }
5099 }
5100 if (valuep) {
5101 char *p = SvPV(sv_str, len);
5102 if (len) {
5103 STRLEN numlen = 1;
5104 if ( SvUTF8(sv_str) ) {
196f1508 5105 *valuep = utf8_to_uvchr((U8*)p, &numlen);
fc8cd66c
YO
5106 if (*valuep > 0x7F)
5107 RExC_utf8 = 1;
5108 /* XXXX
5109 We have to turn on utf8 for high bit chars otherwise
5110 we get failures with
5111
5112 "ss" =~ /[\N{LATIN SMALL LETTER SHARP S}]/i
5113 "SS" =~ /[\N{LATIN SMALL LETTER SHARP S}]/i
5114
5115 This is different from what \x{} would do with the same
5116 codepoint, where the condition is > 0xFF.
5117 - dmq
5118 */
5119
5120
5121 } else {
5122 *valuep = (UV)*p;
5123 /* warn if we havent used the whole string? */
5124 }
5125 if (numlen<len && SIZE_ONLY && ckWARN(WARN_REGEXP)) {
5126 vWARN2(RExC_parse,
5127 "Ignoring excess chars from \\N{%s} in character class",
5128 SvPVX(sv_name)
5129 );
5130 }
5131 } else if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
5132 vWARN2(RExC_parse,
5133 "Ignoring zero length \\N{%s} in character class",
5134 SvPVX(sv_name)
5135 );
5136 }
5137 if (sv_name)
5138 SvREFCNT_dec(sv_name);
5139 if (!cached)
5140 SvREFCNT_dec(sv_str);
5141 return len ? NULL : (regnode *)&len;
5142 } else if(SvCUR(sv_str)) {
5143
5144 char *s;
5145 char *p, *pend;
5146 STRLEN charlen = 1;
5147 char * parse_start = name-3; /* needed for the offsets */
5148 GET_RE_DEBUG_FLAGS_DECL; /* needed for the offsets */
5149
5150 ret = reg_node(pRExC_state,
5151 (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
5152 s= STRING(ret);
5153
5154 if ( RExC_utf8 && !SvUTF8(sv_str) ) {
5155 sv_utf8_upgrade(sv_str);
5156 } else if ( !RExC_utf8 && SvUTF8(sv_str) ) {
5157 RExC_utf8= 1;
5158 }
5159
5160 p = SvPV(sv_str, len);
5161 pend = p + len;
5162 /* len is the length written, charlen is the size the char read */
5163 for ( len = 0; p < pend; p += charlen ) {
5164 if (UTF) {
196f1508 5165 UV uvc = utf8_to_uvchr((U8*)p, &charlen);
fc8cd66c
YO
5166 if (FOLD) {
5167 STRLEN foldlen,numlen;
5168 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
5169 uvc = toFOLD_uni(uvc, tmpbuf, &foldlen);
5170 /* Emit all the Unicode characters. */
5171
5172 for (foldbuf = tmpbuf;
5173 foldlen;
5174 foldlen -= numlen)
5175 {
5176 uvc = utf8_to_uvchr(foldbuf, &numlen);
5177 if (numlen > 0) {
5178 const STRLEN unilen = reguni(pRExC_state, uvc, s);
5179 s += unilen;
5180 len += unilen;
5181 /* In EBCDIC the numlen
5182 * and unilen can differ. */
5183 foldbuf += numlen;
5184 if (numlen >= foldlen)
5185 break;
5186 }
5187 else
5188 break; /* "Can't happen." */
5189 }
5190 } else {
5191 const STRLEN unilen = reguni(pRExC_state, uvc, s);
5192 if (unilen > 0) {
5193 s += unilen;
5194 len += unilen;
5195 }
5196 }
5197 } else {
5198 len++;
5199 REGC(*p, s++);
5200 }
5201 }
5202 if (SIZE_ONLY) {
5203 RExC_size += STR_SZ(len);
5204 } else {
5205 STR_LEN(ret) = len;
5206 RExC_emit += STR_SZ(len);
5207 }
5208 Set_Node_Cur_Length(ret); /* MJD */
5209 RExC_parse--;
5210 nextchar(pRExC_state);
5211 } else {
5212 ret = reg_node(pRExC_state,NOTHING);
5213 }
5214 if (!cached) {
5215 SvREFCNT_dec(sv_str);
5216 }
5217 if (sv_name) {
5218 SvREFCNT_dec(sv_name);
5219 }
5220 return ret;
5221
5222}
5223
5224
5225
a687059c
LW
5226/*
5227 - regatom - the lowest level
5228 *
5229 * Optimization: gobbles an entire sequence of ordinary characters so that
5230 * it can turn them into a single node, which is smaller to store and
5231 * faster to run. Backslashed characters are exceptions, each becoming a
5232 * separate node; the code is simpler that way and it's not worth fixing.
5233 *
7f6f358c
YO
5234 * [Yes, it is worth fixing, some scripts can run twice the speed.]
5235 * [It looks like its ok, as in S_study_chunk we merge adjacent EXACT nodes]
5236 */
76e3520e 5237STATIC regnode *
3dab1dad 5238S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
a687059c 5239{
97aff369 5240 dVAR;
cbbf8932 5241 register regnode *ret = NULL;
a0d0e21e 5242 I32 flags;
45948336 5243 char *parse_start = RExC_parse;
3dab1dad
YO
5244 GET_RE_DEBUG_FLAGS_DECL;
5245 DEBUG_PARSE("atom");
a0d0e21e
LW
5246 *flagp = WORST; /* Tentatively. */
5247
5248tryagain:
830247a4 5249 switch (*RExC_parse) {
a0d0e21e 5250 case '^':
830247a4
IZ
5251 RExC_seen_zerolen++;
5252 nextchar(pRExC_state);
e2509266 5253 if (RExC_flags & PMf_MULTILINE)
830247a4 5254 ret = reg_node(pRExC_state, MBOL);
e2509266 5255 else if (RExC_flags & PMf_SINGLELINE)
830247a4 5256 ret = reg_node(pRExC_state, SBOL);
a0d0e21e 5257 else
830247a4 5258 ret = reg_node(pRExC_state, BOL);
fac92740 5259 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
5260 break;
5261 case '$':
830247a4 5262 nextchar(pRExC_state);
b81d288d 5263 if (*RExC_parse)
830247a4 5264 RExC_seen_zerolen++;
e2509266 5265 if (RExC_flags & PMf_MULTILINE)
830247a4 5266 ret = reg_node(pRExC_state, MEOL);
e2509266 5267 else if (RExC_flags & PMf_SINGLELINE)
830247a4 5268 ret = reg_node(pRExC_state, SEOL);
a0d0e21e 5269 else
830247a4 5270 ret = reg_node(pRExC_state, EOL);
fac92740 5271 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
5272 break;
5273 case '.':
830247a4 5274 nextchar(pRExC_state);
e2509266 5275 if (RExC_flags & PMf_SINGLELINE)
ffc61ed2
JH
5276 ret = reg_node(pRExC_state, SANY);
5277 else
5278 ret = reg_node(pRExC_state, REG_ANY);
5279 *flagp |= HASWIDTH|SIMPLE;
830247a4 5280 RExC_naughty++;
fac92740 5281 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
5282 break;
5283 case '[':
b45f050a 5284 {
3dab1dad
YO
5285 char * const oregcomp_parse = ++RExC_parse;
5286 ret = regclass(pRExC_state,depth+1);
830247a4
IZ
5287 if (*RExC_parse != ']') {
5288 RExC_parse = oregcomp_parse;
b45f050a
JF
5289 vFAIL("Unmatched [");
5290 }
830247a4 5291 nextchar(pRExC_state);
a0d0e21e 5292 *flagp |= HASWIDTH|SIMPLE;
fac92740 5293 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
a0d0e21e 5294 break;
b45f050a 5295 }
a0d0e21e 5296 case '(':
830247a4 5297 nextchar(pRExC_state);
3dab1dad 5298 ret = reg(pRExC_state, 1, &flags,depth+1);
a0d0e21e 5299 if (ret == NULL) {
bf93d4cc 5300 if (flags & TRYAGAIN) {
830247a4 5301 if (RExC_parse == RExC_end) {
bf93d4cc
GS
5302 /* Make parent create an empty node if needed. */
5303 *flagp |= TRYAGAIN;
5304 return(NULL);
5305 }
a0d0e21e 5306 goto tryagain;
bf93d4cc 5307 }
a0d0e21e
LW
5308 return(NULL);
5309 }
c277df42 5310 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE);
a0d0e21e
LW
5311 break;
5312 case '|':
5313 case ')':
5314 if (flags & TRYAGAIN) {
5315 *flagp |= TRYAGAIN;
5316 return NULL;
5317 }
b45f050a 5318 vFAIL("Internal urp");
a0d0e21e
LW
5319 /* Supposed to be caught earlier. */
5320 break;
85afd4ae 5321 case '{':
830247a4
IZ
5322 if (!regcurly(RExC_parse)) {
5323 RExC_parse++;
85afd4ae
CS
5324 goto defchar;
5325 }
5326 /* FALL THROUGH */
a0d0e21e
LW
5327 case '?':
5328 case '+':
5329 case '*':
830247a4 5330 RExC_parse++;
b45f050a 5331 vFAIL("Quantifier follows nothing");
a0d0e21e
LW
5332 break;
5333 case '\\':
830247a4 5334 switch (*++RExC_parse) {
a0d0e21e 5335 case 'A':
830247a4
IZ
5336 RExC_seen_zerolen++;
5337 ret = reg_node(pRExC_state, SBOL);
a0d0e21e 5338 *flagp |= SIMPLE;
830247a4 5339 nextchar(pRExC_state);
fac92740 5340 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
5341 break;
5342 case 'G':
830247a4
IZ
5343 ret = reg_node(pRExC_state, GPOS);
5344 RExC_seen |= REG_SEEN_GPOS;
a0d0e21e 5345 *flagp |= SIMPLE;
830247a4 5346 nextchar(pRExC_state);
fac92740 5347 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
5348 break;
5349 case 'Z':
830247a4 5350 ret = reg_node(pRExC_state, SEOL);
a0d0e21e 5351 *flagp |= SIMPLE;
a1917ab9 5352 RExC_seen_zerolen++; /* Do not optimize RE away */
830247a4 5353 nextchar(pRExC_state);
a0d0e21e 5354 break;
b85d18e9 5355 case 'z':
830247a4 5356 ret = reg_node(pRExC_state, EOS);
b85d18e9 5357 *flagp |= SIMPLE;
830247a4
IZ
5358 RExC_seen_zerolen++; /* Do not optimize RE away */
5359 nextchar(pRExC_state);
fac92740 5360 Set_Node_Length(ret, 2); /* MJD */
b85d18e9 5361 break;
4a2d328f 5362 case 'C':
f33976b4
DB
5363 ret = reg_node(pRExC_state, CANY);
5364 RExC_seen |= REG_SEEN_CANY;
a0ed51b3 5365 *flagp |= HASWIDTH|SIMPLE;
830247a4 5366 nextchar(pRExC_state);
fac92740 5367 Set_Node_Length(ret, 2); /* MJD */
a0ed51b3
LW
5368 break;
5369 case 'X':
830247a4 5370 ret = reg_node(pRExC_state, CLUMP);
a0ed51b3 5371 *flagp |= HASWIDTH;
830247a4 5372 nextchar(pRExC_state);
fac92740 5373 Set_Node_Length(ret, 2); /* MJD */
a0ed51b3 5374 break;
a0d0e21e 5375 case 'w':
eb160463 5376 ret = reg_node(pRExC_state, (U8)(LOC ? ALNUML : ALNUM));
a0d0e21e 5377 *flagp |= HASWIDTH|SIMPLE;
830247a4 5378 nextchar(pRExC_state);
fac92740 5379 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
5380 break;
5381 case 'W':
eb160463 5382 ret = reg_node(pRExC_state, (U8)(LOC ? NALNUML : NALNUM));
a0d0e21e 5383 *flagp |= HASWIDTH|SIMPLE;
830247a4 5384 nextchar(pRExC_state);
fac92740 5385 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
5386 break;
5387 case 'b':
830247a4
IZ
5388 RExC_seen_zerolen++;
5389 RExC_seen |= REG_SEEN_LOOKBEHIND;
eb160463 5390 ret = reg_node(pRExC_state, (U8)(LOC ? BOUNDL : BOUND));
a0d0e21e 5391 *flagp |= SIMPLE;
830247a4 5392 nextchar(pRExC_state);
fac92740 5393 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
5394 break;
5395 case 'B':
830247a4
IZ
5396 RExC_seen_zerolen++;
5397 RExC_seen |= REG_SEEN_LOOKBEHIND;
eb160463 5398 ret = reg_node(pRExC_state, (U8)(LOC ? NBOUNDL : NBOUND));
a0d0e21e 5399 *flagp |= SIMPLE;
830247a4 5400 nextchar(pRExC_state);
fac92740 5401 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
5402 break;
5403 case 's':
eb160463 5404 ret = reg_node(pRExC_state, (U8)(LOC ? SPACEL : SPACE));
a0d0e21e 5405 *flagp |= HASWIDTH|SIMPLE;
830247a4 5406 nextchar(pRExC_state);
fac92740 5407 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
5408 break;
5409 case 'S':
eb160463 5410 ret = reg_node(pRExC_state, (U8)(LOC ? NSPACEL : NSPACE));
a0d0e21e 5411 *flagp |= HASWIDTH|SIMPLE;
830247a4 5412 nextchar(pRExC_state);
fac92740 5413 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
5414 break;
5415 case 'd':
ffc61ed2 5416 ret = reg_node(pRExC_state, DIGIT);
a0d0e21e 5417 *flagp |= HASWIDTH|SIMPLE;
830247a4 5418 nextchar(pRExC_state);
fac92740 5419 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
5420 break;
5421 case 'D':
ffc61ed2 5422 ret = reg_node(pRExC_state, NDIGIT);
a0d0e21e 5423 *flagp |= HASWIDTH|SIMPLE;
830247a4 5424 nextchar(pRExC_state);
fac92740 5425 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e 5426 break;
a14b48bc
LW
5427 case 'p':
5428 case 'P':
3568d838 5429 {
3dab1dad 5430 char* const oldregxend = RExC_end;
ccb2c380 5431 char* parse_start = RExC_parse - 2;
a14b48bc 5432
830247a4 5433 if (RExC_parse[1] == '{') {
3568d838 5434 /* a lovely hack--pretend we saw [\pX] instead */
830247a4
IZ
5435 RExC_end = strchr(RExC_parse, '}');
5436 if (!RExC_end) {
3dab1dad 5437 const U8 c = (U8)*RExC_parse;
830247a4
IZ
5438 RExC_parse += 2;
5439 RExC_end = oldregxend;
0da60cf5 5440 vFAIL2("Missing right brace on \\%c{}", c);
b45f050a 5441 }
830247a4 5442 RExC_end++;
a14b48bc 5443 }
af6f566e 5444 else {
830247a4 5445 RExC_end = RExC_parse + 2;
af6f566e
HS
5446 if (RExC_end > oldregxend)
5447 RExC_end = oldregxend;
5448 }
830247a4 5449 RExC_parse--;
a14b48bc 5450
3dab1dad 5451 ret = regclass(pRExC_state,depth+1);
a14b48bc 5452
830247a4
IZ
5453 RExC_end = oldregxend;
5454 RExC_parse--;
ccb2c380
MP
5455
5456 Set_Node_Offset(ret, parse_start + 2);
5457 Set_Node_Cur_Length(ret);
830247a4 5458 nextchar(pRExC_state);
a14b48bc
LW
5459 *flagp |= HASWIDTH|SIMPLE;
5460 }
5461 break;
fc8cd66c
YO
5462 case 'N':
5463 /* Handle \N{NAME} here and not below because it can be
5464 multicharacter. join_exact() will join them up later on.
5465 Also this makes sure that things like /\N{BLAH}+/ and
5466 \N{BLAH} being multi char Just Happen. dmq*/
5467 ++RExC_parse;
5468 ret= reg_namedseq(pRExC_state, NULL);
5469 break;
a0d0e21e
LW
5470 case 'n':
5471 case 'r':
5472 case 't':
5473 case 'f':
5474 case 'e':
5475 case 'a':
5476 case 'x':
5477 case 'c':
5478 case '0':
5479 goto defchar;
5480 case '1': case '2': case '3': case '4':
5481 case '5': case '6': case '7': case '8': case '9':
5482 {
1df70142 5483 const I32 num = atoi(RExC_parse);
a0d0e21e 5484
830247a4 5485 if (num > 9 && num >= RExC_npar)
a0d0e21e
LW
5486 goto defchar;
5487 else {
3dab1dad 5488 char * const parse_start = RExC_parse - 1; /* MJD */
830247a4
IZ
5489 while (isDIGIT(*RExC_parse))
5490 RExC_parse++;
b45f050a 5491
eb160463 5492 if (!SIZE_ONLY && num > (I32)RExC_rx->nparens)
9baa0206 5493 vFAIL("Reference to nonexistent group");
830247a4 5494 RExC_sawback = 1;
eb160463
GS
5495 ret = reganode(pRExC_state,
5496 (U8)(FOLD ? (LOC ? REFFL : REFF) : REF),
5497 num);
a0d0e21e 5498 *flagp |= HASWIDTH;
2af232bd 5499
fac92740 5500 /* override incorrect value set in reganode MJD */
2af232bd 5501 Set_Node_Offset(ret, parse_start+1);
fac92740 5502 Set_Node_Cur_Length(ret); /* MJD */
830247a4
IZ
5503 RExC_parse--;
5504 nextchar(pRExC_state);
a0d0e21e
LW
5505 }
5506 }
5507 break;
5508 case '\0':
830247a4 5509 if (RExC_parse >= RExC_end)
b45f050a 5510 FAIL("Trailing \\");
a0d0e21e
LW
5511 /* FALL THROUGH */
5512 default:
a0288114 5513 /* Do not generate "unrecognized" warnings here, we fall
c9f97d15 5514 back into the quick-grab loop below */
45948336 5515 parse_start--;
a0d0e21e
LW
5516 goto defchar;
5517 }
5518 break;
4633a7c4
LW
5519
5520 case '#':
e2509266 5521 if (RExC_flags & PMf_EXTENDED) {
3dab1dad
YO
5522 while (RExC_parse < RExC_end && *RExC_parse != '\n')
5523 RExC_parse++;
830247a4 5524 if (RExC_parse < RExC_end)
4633a7c4
LW
5525 goto tryagain;
5526 }
5527 /* FALL THROUGH */
5528
a0d0e21e 5529 default: {
ba210ebe 5530 register STRLEN len;
58ae7d3f 5531 register UV ender;
a0d0e21e 5532 register char *p;
3dab1dad 5533 char *s;
80aecb99 5534 STRLEN foldlen;
89ebb4a3 5535 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
f06dbbb7
JH
5536
5537 parse_start = RExC_parse - 1;
a0d0e21e 5538
830247a4 5539 RExC_parse++;
a0d0e21e
LW
5540
5541 defchar:
58ae7d3f 5542 ender = 0;
eb160463
GS
5543 ret = reg_node(pRExC_state,
5544 (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
cd439c50 5545 s = STRING(ret);
830247a4
IZ
5546 for (len = 0, p = RExC_parse - 1;
5547 len < 127 && p < RExC_end;
a0d0e21e
LW
5548 len++)
5549 {
3dab1dad 5550 char * const oldp = p;
5b5a24f7 5551
e2509266 5552 if (RExC_flags & PMf_EXTENDED)
830247a4 5553 p = regwhite(p, RExC_end);
a0d0e21e
LW
5554 switch (*p) {
5555 case '^':
5556 case '$':
5557 case '.':
5558 case '[':
5559 case '(':
5560 case ')':
5561 case '|':
5562 goto loopdone;
5563 case '\\':
5564 switch (*++p) {
5565 case 'A':
1ed8eac0
JF
5566 case 'C':
5567 case 'X':
a0d0e21e
LW
5568 case 'G':
5569 case 'Z':
b85d18e9 5570 case 'z':
a0d0e21e
LW
5571 case 'w':
5572 case 'W':
5573 case 'b':
5574 case 'B':
5575 case 's':
5576 case 'S':
5577 case 'd':
5578 case 'D':
a14b48bc
LW
5579 case 'p':
5580 case 'P':
fc8cd66c 5581 case 'N':
a0d0e21e
LW
5582 --p;
5583 goto loopdone;
5584 case 'n':
5585 ender = '\n';
5586 p++;
a687059c 5587 break;
a0d0e21e
LW
5588 case 'r':
5589 ender = '\r';
5590 p++;
a687059c 5591 break;
a0d0e21e
LW
5592 case 't':
5593 ender = '\t';
5594 p++;
a687059c 5595 break;
a0d0e21e
LW
5596 case 'f':
5597 ender = '\f';
5598 p++;
a687059c 5599 break;
a0d0e21e 5600 case 'e':
c7f1f016 5601 ender = ASCII_TO_NATIVE('\033');
a0d0e21e 5602 p++;
a687059c 5603 break;
a0d0e21e 5604 case 'a':
c7f1f016 5605 ender = ASCII_TO_NATIVE('\007');
a0d0e21e 5606 p++;
a687059c 5607 break;
a0d0e21e 5608 case 'x':
a0ed51b3 5609 if (*++p == '{') {
1df70142 5610 char* const e = strchr(p, '}');
b81d288d 5611
b45f050a 5612 if (!e) {
830247a4 5613 RExC_parse = p + 1;
b45f050a
JF
5614 vFAIL("Missing right brace on \\x{}");
5615 }
de5f0749 5616 else {
a4c04bdc
NC
5617 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
5618 | PERL_SCAN_DISALLOW_PREFIX;
1df70142 5619 STRLEN numlen = e - p - 1;
53305cf1 5620 ender = grok_hex(p + 1, &numlen, &flags, NULL);
aaa80028
JH
5621 if (ender > 0xff)
5622 RExC_utf8 = 1;
a0ed51b3
LW
5623 p = e + 1;
5624 }
a0ed51b3
LW
5625 }
5626 else {
a4c04bdc 5627 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
1df70142 5628 STRLEN numlen = 2;
53305cf1 5629 ender = grok_hex(p, &numlen, &flags, NULL);
a0ed51b3
LW
5630 p += numlen;
5631 }
a687059c 5632 break;
a0d0e21e
LW
5633 case 'c':
5634 p++;
bbce6d69 5635 ender = UCHARAT(p++);
5636 ender = toCTRL(ender);
a687059c 5637 break;
a0d0e21e
LW
5638 case '0': case '1': case '2': case '3':case '4':
5639 case '5': case '6': case '7': case '8':case '9':
5640 if (*p == '0' ||
830247a4 5641 (isDIGIT(p[1]) && atoi(p) >= RExC_npar) ) {
53305cf1 5642 I32 flags = 0;
1df70142 5643 STRLEN numlen = 3;
53305cf1 5644 ender = grok_oct(p, &numlen, &flags, NULL);
a0d0e21e
LW
5645 p += numlen;
5646 }
5647 else {
5648 --p;
5649 goto loopdone;
a687059c
LW
5650 }
5651 break;
a0d0e21e 5652 case '\0':
830247a4 5653 if (p >= RExC_end)
b45f050a 5654 FAIL("Trailing \\");
a687059c 5655 /* FALL THROUGH */
a0d0e21e 5656 default:
041457d9 5657 if (!SIZE_ONLY&& isALPHA(*p) && ckWARN(WARN_REGEXP))
4193bef7 5658 vWARN2(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
a0ed51b3 5659 goto normal_default;
a0d0e21e
LW
5660 }
5661 break;
a687059c 5662 default:
a0ed51b3 5663 normal_default:
fd400ab9 5664 if (UTF8_IS_START(*p) && UTF) {
1df70142 5665 STRLEN numlen;
5e12f4fb 5666 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
9f7f3913 5667 &numlen, UTF8_ALLOW_DEFAULT);
a0ed51b3
LW
5668 p += numlen;
5669 }
5670 else
5671 ender = *p++;
a0d0e21e 5672 break;
a687059c 5673 }
e2509266 5674 if (RExC_flags & PMf_EXTENDED)
830247a4 5675 p = regwhite(p, RExC_end);
60a8b682
JH
5676 if (UTF && FOLD) {
5677 /* Prime the casefolded buffer. */
ac7e0132 5678 ender = toFOLD_uni(ender, tmpbuf, &foldlen);
60a8b682 5679 }
a0d0e21e
LW
5680 if (ISMULT2(p)) { /* Back off on ?+*. */
5681 if (len)
5682 p = oldp;
16ea2a2e 5683 else if (UTF) {
80aecb99 5684 if (FOLD) {
60a8b682 5685 /* Emit all the Unicode characters. */
1df70142 5686 STRLEN numlen;
80aecb99
JH
5687 for (foldbuf = tmpbuf;
5688 foldlen;
5689 foldlen -= numlen) {
5690 ender = utf8_to_uvchr(foldbuf, &numlen);
9dc45d57 5691 if (numlen > 0) {
71207a34 5692 const STRLEN unilen = reguni(pRExC_state, ender, s);
0ebc6274
JH
5693 s += unilen;
5694 len += unilen;
5695 /* In EBCDIC the numlen
5696 * and unilen can differ. */
9dc45d57 5697 foldbuf += numlen;
47654450
JH
5698 if (numlen >= foldlen)
5699 break;
9dc45d57
JH
5700 }
5701 else
5702 break; /* "Can't happen." */
80aecb99
JH
5703 }
5704 }
5705 else {
71207a34 5706 const STRLEN unilen = reguni(pRExC_state, ender, s);
9ede7db1 5707 if (unilen > 0) {
0ebc6274
JH
5708 s += unilen;
5709 len += unilen;
9dc45d57 5710 }
80aecb99 5711 }
a0ed51b3 5712 }
a0d0e21e
LW
5713 else {
5714 len++;
eb160463 5715 REGC((char)ender, s++);
a0d0e21e
LW
5716 }
5717 break;
a687059c 5718 }
16ea2a2e 5719 if (UTF) {
80aecb99 5720 if (FOLD) {
60a8b682 5721 /* Emit all the Unicode characters. */
1df70142 5722 STRLEN numlen;
80aecb99
JH
5723 for (foldbuf = tmpbuf;
5724 foldlen;
5725 foldlen -= numlen) {
5726 ender = utf8_to_uvchr(foldbuf, &numlen);
9dc45d57 5727 if (numlen > 0) {
71207a34 5728 const STRLEN unilen = reguni(pRExC_state, ender, s);
0ebc6274
JH
5729 len += unilen;
5730 s += unilen;
5731 /* In EBCDIC the numlen
5732 * and unilen can differ. */
9dc45d57 5733 foldbuf += numlen;
47654450
JH
5734 if (numlen >= foldlen)
5735 break;
9dc45d57
JH
5736 }
5737 else
5738 break;
80aecb99
JH
5739 }
5740 }
5741 else {
71207a34 5742 const STRLEN unilen = reguni(pRExC_state, ender, s);
9ede7db1 5743 if (unilen > 0) {
0ebc6274
JH
5744 s += unilen;
5745 len += unilen;
9dc45d57 5746 }
80aecb99
JH
5747 }
5748 len--;
a0ed51b3
LW
5749 }
5750 else
eb160463 5751 REGC((char)ender, s++);
a0d0e21e
LW
5752 }
5753 loopdone:
830247a4 5754 RExC_parse = p - 1;
fac92740 5755 Set_Node_Cur_Length(ret); /* MJD */
830247a4 5756 nextchar(pRExC_state);
793db0cb
JH
5757 {
5758 /* len is STRLEN which is unsigned, need to copy to signed */
5759 IV iv = len;
5760 if (iv < 0)
5761 vFAIL("Internal disaster");
5762 }
a0d0e21e
LW
5763 if (len > 0)
5764 *flagp |= HASWIDTH;
090f7165 5765 if (len == 1 && UNI_IS_INVARIANT(ender))
a0d0e21e 5766 *flagp |= SIMPLE;
3dab1dad 5767
cd439c50 5768 if (SIZE_ONLY)
830247a4 5769 RExC_size += STR_SZ(len);
3dab1dad
YO
5770 else {
5771 STR_LEN(ret) = len;
830247a4 5772 RExC_emit += STR_SZ(len);
07be1b83 5773 }
3dab1dad 5774 }
a0d0e21e
LW
5775 break;
5776 }
a687059c 5777
60a8b682
JH
5778 /* If the encoding pragma is in effect recode the text of
5779 * any EXACT-kind nodes. */
fc8cd66c 5780 if (ret && PL_encoding && PL_regkind[OP(ret)] == EXACT) {
3dab1dad
YO
5781 const STRLEN oldlen = STR_LEN(ret);
5782 SV * const sv = sv_2mortal(newSVpvn(STRING(ret), oldlen));
d0063567
DK
5783
5784 if (RExC_utf8)
5785 SvUTF8_on(sv);
5786 if (sv_utf8_downgrade(sv, TRUE)) {
1df70142
AL
5787 const char * const s = sv_recode_to_utf8(sv, PL_encoding);
5788 const STRLEN newlen = SvCUR(sv);
d0063567
DK
5789
5790 if (SvUTF8(sv))
5791 RExC_utf8 = 1;
5792 if (!SIZE_ONLY) {
a3621e74
YO
5793 GET_RE_DEBUG_FLAGS_DECL;
5794 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "recode %*s to %*s\n",
d0063567
DK
5795 (int)oldlen, STRING(ret),
5796 (int)newlen, s));
5797 Copy(s, STRING(ret), newlen, char);
5798 STR_LEN(ret) += newlen - oldlen;
5799 RExC_emit += STR_SZ(newlen) - STR_SZ(oldlen);
5800 } else
5801 RExC_size += STR_SZ(newlen) - STR_SZ(oldlen);
5802 }
a72c7584
JH
5803 }
5804
a0d0e21e 5805 return(ret);
a687059c
LW
5806}
5807
873ef191 5808STATIC char *
5f66b61c 5809S_regwhite(char *p, const char *e)
5b5a24f7
CS
5810{
5811 while (p < e) {
5812 if (isSPACE(*p))
5813 ++p;
5814 else if (*p == '#') {
5815 do {
5816 p++;
5817 } while (p < e && *p != '\n');
5818 }
5819 else
5820 break;
5821 }
5822 return p;
5823}
5824
b8c5462f
JH
5825/* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
5826 Character classes ([:foo:]) can also be negated ([:^foo:]).
5827 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
5828 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
beeb77fc 5829 but trigger failures because they are currently unimplemented. */
9a86a77b
JH
5830
5831#define POSIXCC_DONE(c) ((c) == ':')
5832#define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
5833#define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
5834
b8c5462f 5835STATIC I32
830247a4 5836S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
620e46c5 5837{
97aff369 5838 dVAR;
936ed897 5839 I32 namedclass = OOB_NAMEDCLASS;
620e46c5 5840
830247a4 5841 if (value == '[' && RExC_parse + 1 < RExC_end &&
620e46c5 5842 /* I smell either [: or [= or [. -- POSIX has been here, right? */
9a86a77b 5843 POSIXCC(UCHARAT(RExC_parse))) {
1df70142 5844 const char c = UCHARAT(RExC_parse);
097eb12c 5845 char* const s = RExC_parse++;
b81d288d 5846
9a86a77b 5847 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
830247a4
IZ
5848 RExC_parse++;
5849 if (RExC_parse == RExC_end)
620e46c5 5850 /* Grandfather lone [:, [=, [. */
830247a4 5851 RExC_parse = s;
620e46c5 5852 else {
3dab1dad 5853 const char* const t = RExC_parse++; /* skip over the c */
80916619
NC
5854 assert(*t == c);
5855
9a86a77b 5856 if (UCHARAT(RExC_parse) == ']') {
3dab1dad 5857 const char *posixcc = s + 1;
830247a4 5858 RExC_parse++; /* skip over the ending ] */
3dab1dad 5859
b8c5462f 5860 if (*s == ':') {
1df70142
AL
5861 const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
5862 const I32 skip = t - posixcc;
80916619
NC
5863
5864 /* Initially switch on the length of the name. */
5865 switch (skip) {
5866 case 4:
3dab1dad
YO
5867 if (memEQ(posixcc, "word", 4)) /* this is not POSIX, this is the Perl \w */
5868 namedclass = complement ? ANYOF_NALNUM : ANYOF_ALNUM;
cc4319de 5869 break;
80916619
NC
5870 case 5:
5871 /* Names all of length 5. */
5872 /* alnum alpha ascii blank cntrl digit graph lower
5873 print punct space upper */
5874 /* Offset 4 gives the best switch position. */
5875 switch (posixcc[4]) {
5876 case 'a':
3dab1dad
YO
5877 if (memEQ(posixcc, "alph", 4)) /* alpha */
5878 namedclass = complement ? ANYOF_NALPHA : ANYOF_ALPHA;
80916619
NC
5879 break;
5880 case 'e':
3dab1dad
YO
5881 if (memEQ(posixcc, "spac", 4)) /* space */
5882 namedclass = complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
80916619
NC
5883 break;
5884 case 'h':
3dab1dad
YO
5885 if (memEQ(posixcc, "grap", 4)) /* graph */
5886 namedclass = complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
80916619
NC
5887 break;
5888 case 'i':
3dab1dad
YO
5889 if (memEQ(posixcc, "asci", 4)) /* ascii */
5890 namedclass = complement ? ANYOF_NASCII : ANYOF_ASCII;
80916619
NC
5891 break;
5892 case 'k':
3dab1dad
YO
5893 if (memEQ(posixcc, "blan", 4)) /* blank */
5894 namedclass = complement ? ANYOF_NBLANK : ANYOF_BLANK;
80916619
NC
5895 break;
5896 case 'l':
3dab1dad
YO
5897 if (memEQ(posixcc, "cntr", 4)) /* cntrl */
5898 namedclass = complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
80916619
NC
5899 break;
5900 case 'm':
3dab1dad
YO
5901 if (memEQ(posixcc, "alnu", 4)) /* alnum */
5902 namedclass = complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
80916619
NC
5903 break;
5904 case 'r':
3dab1dad
YO
5905 if (memEQ(posixcc, "lowe", 4)) /* lower */
5906 namedclass = complement ? ANYOF_NLOWER : ANYOF_LOWER;
5907 else if (memEQ(posixcc, "uppe", 4)) /* upper */
5908 namedclass = complement ? ANYOF_NUPPER : ANYOF_UPPER;
80916619
NC
5909 break;
5910 case 't':
3dab1dad
YO
5911 if (memEQ(posixcc, "digi", 4)) /* digit */
5912 namedclass = complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
5913 else if (memEQ(posixcc, "prin", 4)) /* print */
5914 namedclass = complement ? ANYOF_NPRINT : ANYOF_PRINT;
5915 else if (memEQ(posixcc, "punc", 4)) /* punct */
5916 namedclass = complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
80916619 5917 break;
b8c5462f
JH
5918 }
5919 break;
80916619 5920 case 6:
3dab1dad
YO
5921 if (memEQ(posixcc, "xdigit", 6))
5922 namedclass = complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
b8c5462f
JH
5923 break;
5924 }
80916619
NC
5925
5926 if (namedclass == OOB_NAMEDCLASS)
b45f050a
JF
5927 Simple_vFAIL3("POSIX class [:%.*s:] unknown",
5928 t - s - 1, s + 1);
80916619
NC
5929 assert (posixcc[skip] == ':');
5930 assert (posixcc[skip+1] == ']');
b45f050a 5931 } else if (!SIZE_ONLY) {
b8c5462f 5932 /* [[=foo=]] and [[.foo.]] are still future. */
b45f050a 5933
830247a4 5934 /* adjust RExC_parse so the warning shows after
b45f050a 5935 the class closes */
9a86a77b 5936 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
830247a4 5937 RExC_parse++;
b45f050a
JF
5938 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
5939 }
b8c5462f
JH
5940 } else {
5941 /* Maternal grandfather:
5942 * "[:" ending in ":" but not in ":]" */
830247a4 5943 RExC_parse = s;
767d463e 5944 }
620e46c5
JH
5945 }
5946 }
5947
b8c5462f
JH
5948 return namedclass;
5949}
5950
5951STATIC void
830247a4 5952S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
b8c5462f 5953{
97aff369 5954 dVAR;
3dab1dad 5955 if (POSIXCC(UCHARAT(RExC_parse))) {
1df70142
AL
5956 const char *s = RExC_parse;
5957 const char c = *s++;
b8c5462f 5958
3dab1dad 5959 while (isALNUM(*s))
b8c5462f
JH
5960 s++;
5961 if (*s && c == *s && s[1] == ']') {
cd84f5b2
RGS
5962 if (ckWARN(WARN_REGEXP))
5963 vWARN3(s+2,
5964 "POSIX syntax [%c %c] belongs inside character classes",
5965 c, c);
b45f050a
JF
5966
5967 /* [[=foo=]] and [[.foo.]] are still future. */
9a86a77b 5968 if (POSIXCC_NOTYET(c)) {
830247a4 5969 /* adjust RExC_parse so the error shows after
b45f050a 5970 the class closes */
9a86a77b 5971 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
3dab1dad 5972 NOOP;
b45f050a
JF
5973 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
5974 }
b8c5462f
JH
5975 }
5976 }
620e46c5
JH
5977}
5978
7f6f358c
YO
5979
5980/*
5981 parse a class specification and produce either an ANYOF node that
5982 matches the pattern. If the pattern matches a single char only and
5983 that char is < 256 then we produce an EXACT node instead.
5984*/
76e3520e 5985STATIC regnode *
3dab1dad 5986S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth)
a687059c 5987{
97aff369 5988 dVAR;
9ef43ace 5989 register UV value = 0;
9a86a77b 5990 register UV nextvalue;
3568d838 5991 register IV prevvalue = OOB_UNICODE;
ffc61ed2 5992 register IV range = 0;
c277df42 5993 register regnode *ret;
ba210ebe 5994 STRLEN numlen;
ffc61ed2 5995 IV namedclass;
cbbf8932 5996 char *rangebegin = NULL;
936ed897 5997 bool need_class = 0;
c445ea15 5998 SV *listsv = NULL;
ffc61ed2 5999 UV n;
9e55ce06 6000 bool optimize_invert = TRUE;
cbbf8932 6001 AV* unicode_alternate = NULL;
1b2d223b
JH
6002#ifdef EBCDIC
6003 UV literal_endpoint = 0;
6004#endif
7f6f358c 6005 UV stored = 0; /* number of chars stored in the class */
ffc61ed2 6006
3dab1dad 6007 regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
7f6f358c 6008 case we need to change the emitted regop to an EXACT. */
07be1b83 6009 const char * orig_parse = RExC_parse;
72f13be8 6010 GET_RE_DEBUG_FLAGS_DECL;
76e84362
SH
6011#ifndef DEBUGGING
6012 PERL_UNUSED_ARG(depth);
6013#endif
72f13be8 6014
3dab1dad 6015 DEBUG_PARSE("clas");
7f6f358c
YO
6016
6017 /* Assume we are going to generate an ANYOF node. */
ffc61ed2
JH
6018 ret = reganode(pRExC_state, ANYOF, 0);
6019
6020 if (!SIZE_ONLY)
6021 ANYOF_FLAGS(ret) = 0;
6022
9a86a77b 6023 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
ffc61ed2
JH
6024 RExC_naughty++;
6025 RExC_parse++;
6026 if (!SIZE_ONLY)
6027 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
6028 }
a0d0e21e 6029
73060fc4 6030 if (SIZE_ONLY) {
830247a4 6031 RExC_size += ANYOF_SKIP;
73060fc4
JH
6032 listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
6033 }
936ed897 6034 else {
830247a4 6035 RExC_emit += ANYOF_SKIP;
936ed897
IZ
6036 if (FOLD)
6037 ANYOF_FLAGS(ret) |= ANYOF_FOLD;
6038 if (LOC)
6039 ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
ffc61ed2 6040 ANYOF_BITMAP_ZERO(ret);
396482e1 6041 listsv = newSVpvs("# comment\n");
a0d0e21e 6042 }
b8c5462f 6043
9a86a77b
JH
6044 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
6045
b938889d 6046 if (!SIZE_ONLY && POSIXCC(nextvalue))
830247a4 6047 checkposixcc(pRExC_state);
b8c5462f 6048
f064b6ad
HS
6049 /* allow 1st char to be ] (allowing it to be - is dealt with later) */
6050 if (UCHARAT(RExC_parse) == ']')
6051 goto charclassloop;
ffc61ed2 6052
fc8cd66c 6053parseit:
9a86a77b 6054 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
ffc61ed2
JH
6055
6056 charclassloop:
6057
6058 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
6059
73b437c8 6060 if (!range)
830247a4 6061 rangebegin = RExC_parse;
ffc61ed2 6062 if (UTF) {
5e12f4fb 6063 value = utf8n_to_uvchr((U8*)RExC_parse,
3568d838 6064 RExC_end - RExC_parse,
9f7f3913 6065 &numlen, UTF8_ALLOW_DEFAULT);
ffc61ed2
JH
6066 RExC_parse += numlen;
6067 }
6068 else
6069 value = UCHARAT(RExC_parse++);
7f6f358c 6070
9a86a77b
JH
6071 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
6072 if (value == '[' && POSIXCC(nextvalue))
830247a4 6073 namedclass = regpposixcc(pRExC_state, value);
620e46c5 6074 else if (value == '\\') {
ffc61ed2 6075 if (UTF) {
5e12f4fb 6076 value = utf8n_to_uvchr((U8*)RExC_parse,
ffc61ed2 6077 RExC_end - RExC_parse,
9f7f3913 6078 &numlen, UTF8_ALLOW_DEFAULT);
ffc61ed2
JH
6079 RExC_parse += numlen;
6080 }
6081 else
6082 value = UCHARAT(RExC_parse++);
470c3474 6083 /* Some compilers cannot handle switching on 64-bit integer
ffc61ed2 6084 * values, therefore value cannot be an UV. Yes, this will
e2962f66
JH
6085 * be a problem later if we want switch on Unicode.
6086 * A similar issue a little bit later when switching on
6087 * namedclass. --jhi */
ffc61ed2 6088 switch ((I32)value) {
b8c5462f
JH
6089 case 'w': namedclass = ANYOF_ALNUM; break;
6090 case 'W': namedclass = ANYOF_NALNUM; break;
6091 case 's': namedclass = ANYOF_SPACE; break;
6092 case 'S': namedclass = ANYOF_NSPACE; break;
6093 case 'd': namedclass = ANYOF_DIGIT; break;
6094 case 'D': namedclass = ANYOF_NDIGIT; break;
fc8cd66c
YO
6095 case 'N': /* Handle \N{NAME} in class */
6096 {
6097 /* We only pay attention to the first char of
6098 multichar strings being returned. I kinda wonder
6099 if this makes sense as it does change the behaviour
6100 from earlier versions, OTOH that behaviour was broken
6101 as well. */
6102 UV v; /* value is register so we cant & it /grrr */
6103 if (reg_namedseq(pRExC_state, &v)) {
6104 goto parseit;
6105 }
6106 value= v;
6107 }
6108 break;
ffc61ed2
JH
6109 case 'p':
6110 case 'P':
3dab1dad
YO
6111 {
6112 char *e;
af6f566e 6113 if (RExC_parse >= RExC_end)
2a4859cd 6114 vFAIL2("Empty \\%c{}", (U8)value);
ffc61ed2 6115 if (*RExC_parse == '{') {
1df70142 6116 const U8 c = (U8)value;
ffc61ed2
JH
6117 e = strchr(RExC_parse++, '}');
6118 if (!e)
0da60cf5 6119 vFAIL2("Missing right brace on \\%c{}", c);
ab13f0c7
JH
6120 while (isSPACE(UCHARAT(RExC_parse)))
6121 RExC_parse++;
6122 if (e == RExC_parse)
0da60cf5 6123 vFAIL2("Empty \\%c{}", c);
ffc61ed2 6124 n = e - RExC_parse;
ab13f0c7
JH
6125 while (isSPACE(UCHARAT(RExC_parse + n - 1)))
6126 n--;
ffc61ed2
JH
6127 }
6128 else {
6129 e = RExC_parse;
6130 n = 1;
6131 }
6132 if (!SIZE_ONLY) {
ab13f0c7
JH
6133 if (UCHARAT(RExC_parse) == '^') {
6134 RExC_parse++;
6135 n--;
6136 value = value == 'p' ? 'P' : 'p'; /* toggle */
6137 while (isSPACE(UCHARAT(RExC_parse))) {
6138 RExC_parse++;
6139 n--;
6140 }
6141 }
097eb12c
AL
6142 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%.*s\n",
6143 (value=='p' ? '+' : '!'), (int)n, RExC_parse);
ffc61ed2
JH
6144 }
6145 RExC_parse = e + 1;
6146 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
f81125e2 6147 namedclass = ANYOF_MAX; /* no official name, but it's named */
3dab1dad 6148 }
f81125e2 6149 break;
b8c5462f
JH
6150 case 'n': value = '\n'; break;
6151 case 'r': value = '\r'; break;
6152 case 't': value = '\t'; break;
6153 case 'f': value = '\f'; break;
6154 case 'b': value = '\b'; break;
c7f1f016
NIS
6155 case 'e': value = ASCII_TO_NATIVE('\033');break;
6156 case 'a': value = ASCII_TO_NATIVE('\007');break;
b8c5462f 6157 case 'x':
ffc61ed2 6158 if (*RExC_parse == '{') {
a4c04bdc
NC
6159 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
6160 | PERL_SCAN_DISALLOW_PREFIX;
3dab1dad 6161 char * const e = strchr(RExC_parse++, '}');
b81d288d 6162 if (!e)
ffc61ed2 6163 vFAIL("Missing right brace on \\x{}");
53305cf1
NC
6164
6165 numlen = e - RExC_parse;
6166 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
ffc61ed2
JH
6167 RExC_parse = e + 1;
6168 }
6169 else {
a4c04bdc 6170 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
53305cf1
NC
6171 numlen = 2;
6172 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
ffc61ed2
JH
6173 RExC_parse += numlen;
6174 }
b8c5462f
JH
6175 break;
6176 case 'c':
830247a4 6177 value = UCHARAT(RExC_parse++);
b8c5462f
JH
6178 value = toCTRL(value);
6179 break;
6180 case '0': case '1': case '2': case '3': case '4':
6181 case '5': case '6': case '7': case '8': case '9':
53305cf1
NC
6182 {
6183 I32 flags = 0;
6184 numlen = 3;
6185 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
830247a4 6186 RExC_parse += numlen;
b8c5462f 6187 break;
53305cf1 6188 }
1028017a 6189 default:
041457d9 6190 if (!SIZE_ONLY && isALPHA(value) && ckWARN(WARN_REGEXP))
ffc61ed2
JH
6191 vWARN2(RExC_parse,
6192 "Unrecognized escape \\%c in character class passed through",
6193 (int)value);
1028017a 6194 break;
b8c5462f 6195 }
ffc61ed2 6196 } /* end of \blah */
1b2d223b
JH
6197#ifdef EBCDIC
6198 else
6199 literal_endpoint++;
6200#endif
ffc61ed2
JH
6201
6202 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
6203
6204 if (!SIZE_ONLY && !need_class)
936ed897 6205 ANYOF_CLASS_ZERO(ret);
ffc61ed2 6206
936ed897 6207 need_class = 1;
ffc61ed2
JH
6208
6209 /* a bad range like a-\d, a-[:digit:] ? */
6210 if (range) {
73b437c8 6211 if (!SIZE_ONLY) {
afd78fd5 6212 if (ckWARN(WARN_REGEXP)) {
097eb12c 6213 const int w =
afd78fd5
JH
6214 RExC_parse >= rangebegin ?
6215 RExC_parse - rangebegin : 0;
830247a4 6216 vWARN4(RExC_parse,
b45f050a 6217 "False [] range \"%*.*s\"",
097eb12c 6218 w, w, rangebegin);
afd78fd5 6219 }
3568d838
JH
6220 if (prevvalue < 256) {
6221 ANYOF_BITMAP_SET(ret, prevvalue);
ffc61ed2
JH
6222 ANYOF_BITMAP_SET(ret, '-');
6223 }
6224 else {
6225 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
6226 Perl_sv_catpvf(aTHX_ listsv,
3568d838 6227 "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
ffc61ed2 6228 }
b8c5462f 6229 }
ffc61ed2
JH
6230
6231 range = 0; /* this was not a true range */
73b437c8 6232 }
ffc61ed2 6233
73b437c8 6234 if (!SIZE_ONLY) {
c49a72a9
NC
6235 const char *what = NULL;
6236 char yesno = 0;
6237
3568d838
JH
6238 if (namedclass > OOB_NAMEDCLASS)
6239 optimize_invert = FALSE;
e2962f66
JH
6240 /* Possible truncation here but in some 64-bit environments
6241 * the compiler gets heartburn about switch on 64-bit values.
6242 * A similar issue a little earlier when switching on value.
98f323fa 6243 * --jhi */
e2962f66 6244 switch ((I32)namedclass) {
73b437c8
JH
6245 case ANYOF_ALNUM:
6246 if (LOC)
936ed897 6247 ANYOF_CLASS_SET(ret, ANYOF_ALNUM);
73b437c8
JH
6248 else {
6249 for (value = 0; value < 256; value++)
6250 if (isALNUM(value))
936ed897 6251 ANYOF_BITMAP_SET(ret, value);
73b437c8 6252 }
c49a72a9
NC
6253 yesno = '+';
6254 what = "Word";
73b437c8
JH
6255 break;
6256 case ANYOF_NALNUM:
6257 if (LOC)
936ed897 6258 ANYOF_CLASS_SET(ret, ANYOF_NALNUM);
73b437c8
JH
6259 else {
6260 for (value = 0; value < 256; value++)
6261 if (!isALNUM(value))
936ed897 6262 ANYOF_BITMAP_SET(ret, value);
73b437c8 6263 }
c49a72a9
NC
6264 yesno = '!';
6265 what = "Word";
73b437c8 6266 break;
ffc61ed2 6267 case ANYOF_ALNUMC:
73b437c8 6268 if (LOC)
ffc61ed2 6269 ANYOF_CLASS_SET(ret, ANYOF_ALNUMC);
73b437c8
JH
6270 else {
6271 for (value = 0; value < 256; value++)
ffc61ed2 6272 if (isALNUMC(value))
936ed897 6273 ANYOF_BITMAP_SET(ret, value);
73b437c8 6274 }
c49a72a9
NC
6275 yesno = '+';
6276 what = "Alnum";
73b437c8
JH
6277 break;
6278 case ANYOF_NALNUMC:
6279 if (LOC)
936ed897 6280 ANYOF_CLASS_SET(ret, ANYOF_NALNUMC);
73b437c8
JH
6281 else {
6282 for (value = 0; value < 256; value++)
6283 if (!isALNUMC(value))
936ed897 6284 ANYOF_BITMAP_SET(ret, value);
73b437c8 6285 }
c49a72a9
NC
6286 yesno = '!';
6287 what = "Alnum";
73b437c8
JH
6288 break;
6289 case ANYOF_ALPHA:
6290 if (LOC)
936ed897 6291 ANYOF_CLASS_SET(ret, ANYOF_ALPHA);
73b437c8
JH
6292 else {
6293 for (value = 0; value < 256; value++)
6294 if (isALPHA(value))
936ed897 6295 ANYOF_BITMAP_SET(ret, value);
73b437c8 6296 }
c49a72a9
NC
6297 yesno = '+';
6298 what = "Alpha";
73b437c8
JH
6299 break;
6300 case ANYOF_NALPHA:
6301 if (LOC)
936ed897 6302 ANYOF_CLASS_SET(ret, ANYOF_NALPHA);
73b437c8
JH
6303 else {
6304 for (value = 0; value < 256; value++)
6305 if (!isALPHA(value))
936ed897 6306 ANYOF_BITMAP_SET(ret, value);
73b437c8 6307 }
c49a72a9
NC
6308 yesno = '!';
6309 what = "Alpha";
73b437c8
JH
6310 break;
6311 case ANYOF_ASCII:
6312 if (LOC)
936ed897 6313 ANYOF_CLASS_SET(ret, ANYOF_ASCII);
73b437c8 6314 else {
c7f1f016 6315#ifndef EBCDIC
1ba5c669
JH
6316 for (value = 0; value < 128; value++)
6317 ANYOF_BITMAP_SET(ret, value);
6318#else /* EBCDIC */
ffbc6a93 6319 for (value = 0; value < 256; value++) {
3a3c4447
JH
6320 if (isASCII(value))
6321 ANYOF_BITMAP_SET(ret, value);
ffbc6a93 6322 }
1ba5c669 6323#endif /* EBCDIC */
73b437c8 6324 }
c49a72a9
NC
6325 yesno = '+';
6326 what = "ASCII";
73b437c8
JH
6327 break;
6328 case ANYOF_NASCII:
6329 if (LOC)
936ed897 6330 ANYOF_CLASS_SET(ret, ANYOF_NASCII);
73b437c8 6331 else {
c7f1f016 6332#ifndef EBCDIC
1ba5c669
JH
6333 for (value = 128; value < 256; value++)
6334 ANYOF_BITMAP_SET(ret, value);
6335#else /* EBCDIC */
ffbc6a93 6336 for (value = 0; value < 256; value++) {
3a3c4447
JH
6337 if (!isASCII(value))
6338 ANYOF_BITMAP_SET(ret, value);
ffbc6a93 6339 }
1ba5c669 6340#endif /* EBCDIC */
73b437c8 6341 }
c49a72a9
NC
6342 yesno = '!';
6343 what = "ASCII";
73b437c8 6344 break;
aaa51d5e
JF
6345 case ANYOF_BLANK:
6346 if (LOC)
6347 ANYOF_CLASS_SET(ret, ANYOF_BLANK);
6348 else {
6349 for (value = 0; value < 256; value++)
6350 if (isBLANK(value))
6351 ANYOF_BITMAP_SET(ret, value);
6352 }
c49a72a9
NC
6353 yesno = '+';
6354 what = "Blank";
aaa51d5e
JF
6355 break;
6356 case ANYOF_NBLANK:
6357 if (LOC)
6358 ANYOF_CLASS_SET(ret, ANYOF_NBLANK);
6359 else {
6360 for (value = 0; value < 256; value++)
6361 if (!isBLANK(value))
6362 ANYOF_BITMAP_SET(ret, value);
6363 }
c49a72a9
NC
6364 yesno = '!';
6365 what = "Blank";
aaa51d5e 6366 break;
73b437c8
JH
6367 case ANYOF_CNTRL:
6368 if (LOC)
936ed897 6369 ANYOF_CLASS_SET(ret, ANYOF_CNTRL);
73b437c8
JH
6370 else {
6371 for (value = 0; value < 256; value++)
6372 if (isCNTRL(value))
936ed897 6373 ANYOF_BITMAP_SET(ret, value);
73b437c8 6374 }
c49a72a9
NC
6375 yesno = '+';
6376 what = "Cntrl";
73b437c8
JH
6377 break;
6378 case ANYOF_NCNTRL:
6379 if (LOC)
936ed897 6380 ANYOF_CLASS_SET(ret, ANYOF_NCNTRL);
73b437c8
JH
6381 else {
6382 for (value = 0; value < 256; value++)
6383 if (!isCNTRL(value))
936ed897 6384 ANYOF_BITMAP_SET(ret, value);
73b437c8 6385 }
c49a72a9
NC
6386 yesno = '!';
6387 what = "Cntrl";
ffc61ed2
JH
6388 break;
6389 case ANYOF_DIGIT:
6390 if (LOC)
6391 ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
6392 else {
6393 /* consecutive digits assumed */
6394 for (value = '0'; value <= '9'; value++)
6395 ANYOF_BITMAP_SET(ret, value);
6396 }
c49a72a9
NC
6397 yesno = '+';
6398 what = "Digit";
ffc61ed2
JH
6399 break;
6400 case ANYOF_NDIGIT:
6401 if (LOC)
6402 ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
6403 else {
6404 /* consecutive digits assumed */
6405 for (value = 0; value < '0'; value++)
6406 ANYOF_BITMAP_SET(ret, value);
6407 for (value = '9' + 1; value < 256; value++)
6408 ANYOF_BITMAP_SET(ret, value);
6409 }
c49a72a9
NC
6410 yesno = '!';
6411 what = "Digit";
73b437c8
JH
6412 break;
6413 case ANYOF_GRAPH:
6414 if (LOC)
936ed897 6415 ANYOF_CLASS_SET(ret, ANYOF_GRAPH);
73b437c8
JH
6416 else {
6417 for (value = 0; value < 256; value++)
6418 if (isGRAPH(value))
936ed897 6419 ANYOF_BITMAP_SET(ret, value);
73b437c8 6420 }
c49a72a9
NC
6421 yesno = '+';
6422 what = "Graph";
73b437c8
JH
6423 break;
6424 case ANYOF_NGRAPH:
6425 if (LOC)
936ed897 6426 ANYOF_CLASS_SET(ret, ANYOF_NGRAPH);
73b437c8
JH
6427 else {
6428 for (value = 0; value < 256; value++)
6429 if (!isGRAPH(value))
936ed897 6430 ANYOF_BITMAP_SET(ret, value);
73b437c8 6431 }
c49a72a9
NC
6432 yesno = '!';
6433 what = "Graph";
73b437c8
JH
6434 break;
6435 case ANYOF_LOWER:
6436 if (LOC)
936ed897 6437 ANYOF_CLASS_SET(ret, ANYOF_LOWER);
73b437c8
JH
6438 else {
6439 for (value = 0; value < 256; value++)
6440 if (isLOWER(value))
936ed897 6441 ANYOF_BITMAP_SET(ret, value);
73b437c8 6442 }
c49a72a9
NC
6443 yesno = '+';
6444 what = "Lower";
73b437c8
JH
6445 break;
6446 case ANYOF_NLOWER:
6447 if (LOC)
936ed897 6448 ANYOF_CLASS_SET(ret, ANYOF_NLOWER);
73b437c8
JH
6449 else {
6450 for (value = 0; value < 256; value++)
6451 if (!isLOWER(value))
936ed897 6452 ANYOF_BITMAP_SET(ret, value);
73b437c8 6453 }
c49a72a9
NC
6454 yesno = '!';
6455 what = "Lower";
73b437c8
JH
6456 break;
6457 case ANYOF_PRINT:
6458 if (LOC)
936ed897 6459 ANYOF_CLASS_SET(ret, ANYOF_PRINT);
73b437c8
JH
6460 else {
6461 for (value = 0; value < 256; value++)
6462 if (isPRINT(value))
936ed897 6463 ANYOF_BITMAP_SET(ret, value);
73b437c8 6464 }
c49a72a9
NC
6465 yesno = '+';
6466 what = "Print";
73b437c8
JH
6467 break;
6468 case ANYOF_NPRINT:
6469 if (LOC)
936ed897 6470 ANYOF_CLASS_SET(ret, ANYOF_NPRINT);
73b437c8
JH
6471 else {
6472 for (value = 0; value < 256; value++)
6473 if (!isPRINT(value))
936ed897 6474 ANYOF_BITMAP_SET(ret, value);
73b437c8 6475 }
c49a72a9
NC
6476 yesno = '!';
6477 what = "Print";
73b437c8 6478 break;
aaa51d5e
JF
6479 case ANYOF_PSXSPC:
6480 if (LOC)
6481 ANYOF_CLASS_SET(ret, ANYOF_PSXSPC);
6482 else {
6483 for (value = 0; value < 256; value++)
6484 if (isPSXSPC(value))
6485 ANYOF_BITMAP_SET(ret, value);
6486 }
c49a72a9
NC
6487 yesno = '+';
6488 what = "Space";
aaa51d5e
JF
6489 break;
6490 case ANYOF_NPSXSPC:
6491 if (LOC)
6492 ANYOF_CLASS_SET(ret, ANYOF_NPSXSPC);
6493 else {
6494 for (value = 0; value < 256; value++)
6495 if (!isPSXSPC(value))
6496 ANYOF_BITMAP_SET(ret, value);
6497 }
c49a72a9
NC
6498 yesno = '!';
6499 what = "Space";
aaa51d5e 6500 break;
73b437c8
JH
6501 case ANYOF_PUNCT:
6502 if (LOC)
936ed897 6503 ANYOF_CLASS_SET(ret, ANYOF_PUNCT);
73b437c8
JH
6504 else {
6505 for (value = 0; value < 256; value++)
6506 if (isPUNCT(value))
936ed897 6507 ANYOF_BITMAP_SET(ret, value);
73b437c8 6508 }
c49a72a9
NC
6509 yesno = '+';
6510 what = "Punct";
73b437c8
JH
6511 break;
6512 case ANYOF_NPUNCT:
6513 if (LOC)
936ed897 6514 ANYOF_CLASS_SET(ret, ANYOF_NPUNCT);
73b437c8
JH
6515 else {
6516 for (value = 0; value < 256; value++)
6517 if (!isPUNCT(value))
936ed897 6518 ANYOF_BITMAP_SET(ret, value);
73b437c8 6519 }
c49a72a9
NC
6520 yesno = '!';
6521 what = "Punct";
ffc61ed2
JH
6522 break;
6523 case ANYOF_SPACE:
6524 if (LOC)
6525 ANYOF_CLASS_SET(ret, ANYOF_SPACE);
6526 else {
6527 for (value = 0; value < 256; value++)
6528 if (isSPACE(value))
6529 ANYOF_BITMAP_SET(ret, value);
6530 }
c49a72a9
NC
6531 yesno = '+';
6532 what = "SpacePerl";
ffc61ed2
JH
6533 break;
6534 case ANYOF_NSPACE:
6535 if (LOC)
6536 ANYOF_CLASS_SET(ret, ANYOF_NSPACE);
6537 else {
6538 for (value = 0; value < 256; value++)
6539 if (!isSPACE(value))
6540 ANYOF_BITMAP_SET(ret, value);
6541 }
c49a72a9
NC
6542 yesno = '!';
6543 what = "SpacePerl";
73b437c8
JH
6544 break;
6545 case ANYOF_UPPER:
6546 if (LOC)
936ed897 6547 ANYOF_CLASS_SET(ret, ANYOF_UPPER);
73b437c8
JH
6548 else {
6549 for (value = 0; value < 256; value++)
6550 if (isUPPER(value))
936ed897 6551 ANYOF_BITMAP_SET(ret, value);
73b437c8 6552 }
c49a72a9
NC
6553 yesno = '+';
6554 what = "Upper";
73b437c8
JH
6555 break;
6556 case ANYOF_NUPPER:
6557 if (LOC)
936ed897 6558 ANYOF_CLASS_SET(ret, ANYOF_NUPPER);
73b437c8
JH
6559 else {
6560 for (value = 0; value < 256; value++)
6561 if (!isUPPER(value))
936ed897 6562 ANYOF_BITMAP_SET(ret, value);
73b437c8 6563 }
c49a72a9
NC
6564 yesno = '!';
6565 what = "Upper";
73b437c8
JH
6566 break;
6567 case ANYOF_XDIGIT:
6568 if (LOC)
936ed897 6569 ANYOF_CLASS_SET(ret, ANYOF_XDIGIT);
73b437c8
JH
6570 else {
6571 for (value = 0; value < 256; value++)
6572 if (isXDIGIT(value))
936ed897 6573 ANYOF_BITMAP_SET(ret, value);
73b437c8 6574 }
c49a72a9
NC
6575 yesno = '+';
6576 what = "XDigit";
73b437c8
JH
6577 break;
6578 case ANYOF_NXDIGIT:
6579 if (LOC)
936ed897 6580 ANYOF_CLASS_SET(ret, ANYOF_NXDIGIT);
73b437c8
JH
6581 else {
6582 for (value = 0; value < 256; value++)
6583 if (!isXDIGIT(value))
936ed897 6584 ANYOF_BITMAP_SET(ret, value);
73b437c8 6585 }
c49a72a9
NC
6586 yesno = '!';
6587 what = "XDigit";
73b437c8 6588 break;
f81125e2
JP
6589 case ANYOF_MAX:
6590 /* this is to handle \p and \P */
6591 break;
73b437c8 6592 default:
b45f050a 6593 vFAIL("Invalid [::] class");
73b437c8 6594 break;
b8c5462f 6595 }
c49a72a9
NC
6596 if (what) {
6597 /* Strings such as "+utf8::isWord\n" */
6598 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::Is%s\n", yesno, what);
6599 }
b8c5462f 6600 if (LOC)
936ed897 6601 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
73b437c8 6602 continue;
a0d0e21e 6603 }
ffc61ed2
JH
6604 } /* end of namedclass \blah */
6605
a0d0e21e 6606 if (range) {
eb160463 6607 if (prevvalue > (IV)value) /* b-a */ {
d4c19fe8
AL
6608 const int w = RExC_parse - rangebegin;
6609 Simple_vFAIL4("Invalid [] range \"%*.*s\"", w, w, rangebegin);
3568d838 6610 range = 0; /* not a valid range */
73b437c8 6611 }
a0d0e21e
LW
6612 }
6613 else {
3568d838 6614 prevvalue = value; /* save the beginning of the range */
830247a4
IZ
6615 if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
6616 RExC_parse[1] != ']') {
6617 RExC_parse++;
ffc61ed2
JH
6618
6619 /* a bad range like \w-, [:word:]- ? */
6620 if (namedclass > OOB_NAMEDCLASS) {
afd78fd5 6621 if (ckWARN(WARN_REGEXP)) {
d4c19fe8 6622 const int w =
afd78fd5
JH
6623 RExC_parse >= rangebegin ?
6624 RExC_parse - rangebegin : 0;
830247a4 6625 vWARN4(RExC_parse,
b45f050a 6626 "False [] range \"%*.*s\"",
097eb12c 6627 w, w, rangebegin);
afd78fd5 6628 }
73b437c8 6629 if (!SIZE_ONLY)
936ed897 6630 ANYOF_BITMAP_SET(ret, '-');
73b437c8 6631 } else
ffc61ed2
JH
6632 range = 1; /* yeah, it's a range! */
6633 continue; /* but do it the next time */
a0d0e21e 6634 }
a687059c 6635 }
ffc61ed2 6636
93733859 6637 /* now is the next time */
07be1b83 6638 /*stored += (value - prevvalue + 1);*/
ae5c130c 6639 if (!SIZE_ONLY) {
3568d838 6640 if (prevvalue < 256) {
1df70142 6641 const IV ceilvalue = value < 256 ? value : 255;
3dab1dad 6642 IV i;
3568d838 6643#ifdef EBCDIC
1b2d223b
JH
6644 /* In EBCDIC [\x89-\x91] should include
6645 * the \x8e but [i-j] should not. */
6646 if (literal_endpoint == 2 &&
6647 ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
6648 (isUPPER(prevvalue) && isUPPER(ceilvalue))))
ffc61ed2 6649 {
3568d838
JH
6650 if (isLOWER(prevvalue)) {
6651 for (i = prevvalue; i <= ceilvalue; i++)
ffc61ed2
JH
6652 if (isLOWER(i))
6653 ANYOF_BITMAP_SET(ret, i);
6654 } else {
3568d838 6655 for (i = prevvalue; i <= ceilvalue; i++)
ffc61ed2
JH
6656 if (isUPPER(i))
6657 ANYOF_BITMAP_SET(ret, i);
6658 }
8ada0baa 6659 }
ffc61ed2 6660 else
8ada0baa 6661#endif
07be1b83
YO
6662 for (i = prevvalue; i <= ceilvalue; i++) {
6663 if (!ANYOF_BITMAP_TEST(ret,i)) {
6664 stored++;
6665 ANYOF_BITMAP_SET(ret, i);
6666 }
6667 }
3568d838 6668 }
a5961de5 6669 if (value > 255 || UTF) {
1df70142
AL
6670 const UV prevnatvalue = NATIVE_TO_UNI(prevvalue);
6671 const UV natvalue = NATIVE_TO_UNI(value);
07be1b83 6672 stored+=2; /* can't optimize this class */
ffc61ed2 6673 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
b08decb7 6674 if (prevnatvalue < natvalue) { /* what about > ? */
ffc61ed2 6675 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
b08decb7
JH
6676 prevnatvalue, natvalue);
6677 }
6678 else if (prevnatvalue == natvalue) {
6679 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue);
09091399 6680 if (FOLD) {
89ebb4a3 6681 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
254ba52a 6682 STRLEN foldlen;
1df70142 6683 const UV f = to_uni_fold(natvalue, foldbuf, &foldlen);
254ba52a 6684
e294cc5d
JH
6685#ifdef EBCDIC /* RD t/uni/fold ff and 6b */
6686 if (RExC_precomp[0] == ':' &&
6687 RExC_precomp[1] == '[' &&
6688 (f == 0xDF || f == 0x92)) {
6689 f = NATIVE_TO_UNI(f);
6690 }
6691#endif
c840d2a2
JH
6692 /* If folding and foldable and a single
6693 * character, insert also the folded version
6694 * to the charclass. */
9e55ce06 6695 if (f != value) {
e294cc5d
JH
6696#ifdef EBCDIC /* RD tunifold ligatures s,t fb05, fb06 */
6697 if ((RExC_precomp[0] == ':' &&
6698 RExC_precomp[1] == '[' &&
6699 (f == 0xA2 &&
6700 (value == 0xFB05 || value == 0xFB06))) ?
6701 foldlen == ((STRLEN)UNISKIP(f) - 1) :
6702 foldlen == (STRLEN)UNISKIP(f) )
6703#else
eb160463 6704 if (foldlen == (STRLEN)UNISKIP(f))
e294cc5d 6705#endif
9e55ce06
JH
6706 Perl_sv_catpvf(aTHX_ listsv,
6707 "%04"UVxf"\n", f);
6708 else {
6709 /* Any multicharacter foldings
6710 * require the following transform:
6711 * [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst)
6712 * where E folds into "pq" and F folds
6713 * into "rst", all other characters
6714 * fold to single characters. We save
6715 * away these multicharacter foldings,
6716 * to be later saved as part of the
6717 * additional "s" data. */
6718 SV *sv;
6719
6720 if (!unicode_alternate)
6721 unicode_alternate = newAV();
6722 sv = newSVpvn((char*)foldbuf, foldlen);
6723 SvUTF8_on(sv);
6724 av_push(unicode_alternate, sv);
6725 }
6726 }
254ba52a 6727
60a8b682
JH
6728 /* If folding and the value is one of the Greek
6729 * sigmas insert a few more sigmas to make the
6730 * folding rules of the sigmas to work right.
6731 * Note that not all the possible combinations
6732 * are handled here: some of them are handled
9e55ce06
JH
6733 * by the standard folding rules, and some of
6734 * them (literal or EXACTF cases) are handled
6735 * during runtime in regexec.c:S_find_byclass(). */
09091399
JH
6736 if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) {
6737 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 6738 (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA);
09091399 6739 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 6740 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
09091399
JH
6741 }
6742 else if (value == UNICODE_GREEK_CAPITAL_LETTER_SIGMA)
6743 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 6744 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
09091399
JH
6745 }
6746 }
ffc61ed2 6747 }
1b2d223b
JH
6748#ifdef EBCDIC
6749 literal_endpoint = 0;
6750#endif
8ada0baa 6751 }
ffc61ed2
JH
6752
6753 range = 0; /* this range (if it was one) is done now */
a0d0e21e 6754 }
ffc61ed2 6755
936ed897 6756 if (need_class) {
4f66b38d 6757 ANYOF_FLAGS(ret) |= ANYOF_LARGE;
936ed897 6758 if (SIZE_ONLY)
830247a4 6759 RExC_size += ANYOF_CLASS_ADD_SKIP;
936ed897 6760 else
830247a4 6761 RExC_emit += ANYOF_CLASS_ADD_SKIP;
936ed897 6762 }
ffc61ed2 6763
7f6f358c
YO
6764
6765 if (SIZE_ONLY)
6766 return ret;
6767 /****** !SIZE_ONLY AFTER HERE *********/
6768
6769 if( stored == 1 && value < 256
6770 && !( ANYOF_FLAGS(ret) & ( ANYOF_FLAGS_ALL ^ ANYOF_FOLD ) )
6771 ) {
6772 /* optimize single char class to an EXACT node
6773 but *only* when its not a UTF/high char */
07be1b83
YO
6774 const char * cur_parse= RExC_parse;
6775 RExC_emit = (regnode *)orig_emit;
6776 RExC_parse = (char *)orig_parse;
7f6f358c
YO
6777 ret = reg_node(pRExC_state,
6778 (U8)((ANYOF_FLAGS(ret) & ANYOF_FOLD) ? EXACTF : EXACT));
07be1b83 6779 RExC_parse = (char *)cur_parse;
7f6f358c
YO
6780 *STRING(ret)= (char)value;
6781 STR_LEN(ret)= 1;
6782 RExC_emit += STR_SZ(1);
6783 return ret;
6784 }
ae5c130c 6785 /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
7f6f358c 6786 if ( /* If the only flag is folding (plus possibly inversion). */
516a5887
JH
6787 ((ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD)
6788 ) {
a0ed51b3 6789 for (value = 0; value < 256; ++value) {
936ed897 6790 if (ANYOF_BITMAP_TEST(ret, value)) {
eb160463 6791 UV fold = PL_fold[value];
ffc61ed2
JH
6792
6793 if (fold != value)
6794 ANYOF_BITMAP_SET(ret, fold);
ae5c130c
GS
6795 }
6796 }
936ed897 6797 ANYOF_FLAGS(ret) &= ~ANYOF_FOLD;
ae5c130c 6798 }
ffc61ed2 6799
ae5c130c 6800 /* optimize inverted simple patterns (e.g. [^a-z]) */
7f6f358c 6801 if (optimize_invert &&
ffc61ed2
JH
6802 /* If the only flag is inversion. */
6803 (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
b8c5462f 6804 for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
936ed897 6805 ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL;
1aa99e6b 6806 ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
ae5c130c 6807 }
7f6f358c 6808 {
097eb12c 6809 AV * const av = newAV();
ffc61ed2 6810 SV *rv;
9e55ce06 6811 /* The 0th element stores the character class description
6a0407ee 6812 * in its textual form: used later (regexec.c:Perl_regclass_swash())
9e55ce06
JH
6813 * to initialize the appropriate swash (which gets stored in
6814 * the 1st element), and also useful for dumping the regnode.
6815 * The 2nd element stores the multicharacter foldings,
6a0407ee 6816 * used later (regexec.c:S_reginclass()). */
ffc61ed2
JH
6817 av_store(av, 0, listsv);
6818 av_store(av, 1, NULL);
9e55ce06 6819 av_store(av, 2, (SV*)unicode_alternate);
ffc61ed2 6820 rv = newRV_noinc((SV*)av);
19860706 6821 n = add_data(pRExC_state, 1, "s");
830247a4 6822 RExC_rx->data->data[n] = (void*)rv;
ffc61ed2 6823 ARG_SET(ret, n);
a0ed51b3 6824 }
a0ed51b3
LW
6825 return ret;
6826}
6827
76e3520e 6828STATIC char*
830247a4 6829S_nextchar(pTHX_ RExC_state_t *pRExC_state)
a0d0e21e 6830{
097eb12c 6831 char* const retval = RExC_parse++;
a0d0e21e 6832
4633a7c4 6833 for (;;) {
830247a4
IZ
6834 if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
6835 RExC_parse[2] == '#') {
e994fd66
AE
6836 while (*RExC_parse != ')') {
6837 if (RExC_parse == RExC_end)
6838 FAIL("Sequence (?#... not terminated");
830247a4 6839 RExC_parse++;
e994fd66 6840 }
830247a4 6841 RExC_parse++;
4633a7c4
LW
6842 continue;
6843 }
e2509266 6844 if (RExC_flags & PMf_EXTENDED) {
830247a4
IZ
6845 if (isSPACE(*RExC_parse)) {
6846 RExC_parse++;
748a9306
LW
6847 continue;
6848 }
830247a4 6849 else if (*RExC_parse == '#') {
e994fd66
AE
6850 while (RExC_parse < RExC_end)
6851 if (*RExC_parse++ == '\n') break;
748a9306
LW
6852 continue;
6853 }
748a9306 6854 }
4633a7c4 6855 return retval;
a0d0e21e 6856 }
a687059c
LW
6857}
6858
6859/*
c277df42 6860- reg_node - emit a node
a0d0e21e 6861*/
76e3520e 6862STATIC regnode * /* Location. */
830247a4 6863S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
a687059c 6864{
97aff369 6865 dVAR;
c277df42 6866 register regnode *ptr;
504618e9 6867 regnode * const ret = RExC_emit;
07be1b83 6868 GET_RE_DEBUG_FLAGS_DECL;
a687059c 6869
c277df42 6870 if (SIZE_ONLY) {
830247a4
IZ
6871 SIZE_ALIGN(RExC_size);
6872 RExC_size += 1;
a0d0e21e
LW
6873 return(ret);
6874 }
c277df42 6875 NODE_ALIGN_FILL(ret);
a0d0e21e 6876 ptr = ret;
c277df42 6877 FILL_ADVANCE_NODE(ptr, op);
fac92740 6878 if (RExC_offsets) { /* MJD */
07be1b83 6879 MJD_OFFSET_DEBUG(("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
fac92740
MJD
6880 "reg_node", __LINE__,
6881 reg_name[op],
07be1b83
YO
6882 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
6883 ? "Overwriting end of array!\n" : "OK",
6884 (UV)(RExC_emit - RExC_emit_start),
6885 (UV)(RExC_parse - RExC_start),
6886 (UV)RExC_offsets[0]));
ccb2c380 6887 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
fac92740 6888 }
07be1b83 6889
830247a4 6890 RExC_emit = ptr;
a687059c 6891
a0d0e21e 6892 return(ret);
a687059c
LW
6893}
6894
6895/*
a0d0e21e
LW
6896- reganode - emit a node with an argument
6897*/
76e3520e 6898STATIC regnode * /* Location. */
830247a4 6899S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
fe14fcc3 6900{
97aff369 6901 dVAR;
c277df42 6902 register regnode *ptr;
504618e9 6903 regnode * const ret = RExC_emit;
07be1b83 6904 GET_RE_DEBUG_FLAGS_DECL;
fe14fcc3 6905
c277df42 6906 if (SIZE_ONLY) {
830247a4
IZ
6907 SIZE_ALIGN(RExC_size);
6908 RExC_size += 2;
a0d0e21e
LW
6909 return(ret);
6910 }
fe14fcc3 6911
c277df42 6912 NODE_ALIGN_FILL(ret);
a0d0e21e 6913 ptr = ret;
c277df42 6914 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
fac92740 6915 if (RExC_offsets) { /* MJD */
07be1b83 6916 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
fac92740 6917 "reganode",
ccb2c380
MP
6918 __LINE__,
6919 reg_name[op],
07be1b83 6920 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ?
fac92740 6921 "Overwriting end of array!\n" : "OK",
07be1b83
YO
6922 (UV)(RExC_emit - RExC_emit_start),
6923 (UV)(RExC_parse - RExC_start),
6924 (UV)RExC_offsets[0]));
ccb2c380 6925 Set_Cur_Node_Offset;
fac92740
MJD
6926 }
6927
830247a4 6928 RExC_emit = ptr;
fe14fcc3 6929
a0d0e21e 6930 return(ret);
fe14fcc3
LW
6931}
6932
6933/*
cd439c50 6934- reguni - emit (if appropriate) a Unicode character
a0ed51b3 6935*/
71207a34
AL
6936STATIC STRLEN
6937S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
a0ed51b3 6938{
97aff369 6939 dVAR;
71207a34 6940 return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
a0ed51b3
LW
6941}
6942
6943/*
a0d0e21e
LW
6944- reginsert - insert an operator in front of already-emitted operand
6945*
6946* Means relocating the operand.
6947*/
76e3520e 6948STATIC void
830247a4 6949S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd)
a687059c 6950{
97aff369 6951 dVAR;
c277df42
IZ
6952 register regnode *src;
6953 register regnode *dst;
6954 register regnode *place;
504618e9 6955 const int offset = regarglen[(U8)op];
07be1b83 6956 GET_RE_DEBUG_FLAGS_DECL;
22c35a8c 6957/* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
c277df42
IZ
6958
6959 if (SIZE_ONLY) {
830247a4 6960 RExC_size += NODE_STEP_REGNODE + offset;
a0d0e21e
LW
6961 return;
6962 }
a687059c 6963
830247a4
IZ
6964 src = RExC_emit;
6965 RExC_emit += NODE_STEP_REGNODE + offset;
6966 dst = RExC_emit;
fac92740 6967 while (src > opnd) {
c277df42 6968 StructCopy(--src, --dst, regnode);
fac92740 6969 if (RExC_offsets) { /* MJD 20010112 */
07be1b83 6970 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
fac92740 6971 "reg_insert",
ccb2c380
MP
6972 __LINE__,
6973 reg_name[op],
07be1b83
YO
6974 (UV)(dst - RExC_emit_start) > RExC_offsets[0]
6975 ? "Overwriting end of array!\n" : "OK",
6976 (UV)(src - RExC_emit_start),
6977 (UV)(dst - RExC_emit_start),
6978 (UV)RExC_offsets[0]));
ccb2c380
MP
6979 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
6980 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
fac92740
MJD
6981 }
6982 }
6983
a0d0e21e
LW
6984
6985 place = opnd; /* Op node, where operand used to be. */
fac92740 6986 if (RExC_offsets) { /* MJD */
07be1b83 6987 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
fac92740 6988 "reginsert",
ccb2c380
MP
6989 __LINE__,
6990 reg_name[op],
07be1b83 6991 (UV)(place - RExC_emit_start) > RExC_offsets[0]
fac92740 6992 ? "Overwriting end of array!\n" : "OK",
07be1b83
YO
6993 (UV)(place - RExC_emit_start),
6994 (UV)(RExC_parse - RExC_start),
786e8c11 6995 (UV)RExC_offsets[0]));
ccb2c380 6996 Set_Node_Offset(place, RExC_parse);
45948336 6997 Set_Node_Length(place, 1);
fac92740 6998 }
c277df42
IZ
6999 src = NEXTOPER(place);
7000 FILL_ADVANCE_NODE(place, op);
7001 Zero(src, offset, regnode);
a687059c
LW
7002}
7003
7004/*
c277df42 7005- regtail - set the next-pointer at the end of a node chain of p to val.
3dab1dad 7006- SEE ALSO: regtail_study
a0d0e21e 7007*/
097eb12c 7008/* TODO: All three parms should be const */
76e3520e 7009STATIC void
3dab1dad 7010S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
a687059c 7011{
97aff369 7012 dVAR;
c277df42 7013 register regnode *scan;
72f13be8 7014 GET_RE_DEBUG_FLAGS_DECL;
f9049ba1
SP
7015#ifndef DEBUGGING
7016 PERL_UNUSED_ARG(depth);
7017#endif
a0d0e21e 7018
c277df42 7019 if (SIZE_ONLY)
a0d0e21e
LW
7020 return;
7021
7022 /* Find last node. */
7023 scan = p;
7024 for (;;) {
504618e9 7025 regnode * const temp = regnext(scan);
3dab1dad
YO
7026 DEBUG_PARSE_r({
7027 SV * const mysv=sv_newmortal();
7028 DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
7029 regprop(RExC_rx, mysv, scan);
7030 PerlIO_printf(Perl_debug_log, "~ %s (%d)\n",
7031 SvPV_nolen_const(mysv), REG_NODE_NUM(scan));
7032 });
7033 if (temp == NULL)
7034 break;
7035 scan = temp;
7036 }
7037
7038 if (reg_off_by_arg[OP(scan)]) {
7039 ARG_SET(scan, val - scan);
7040 }
7041 else {
7042 NEXT_OFF(scan) = val - scan;
7043 }
7044}
7045
07be1b83 7046#ifdef DEBUGGING
3dab1dad
YO
7047/*
7048- regtail_study - set the next-pointer at the end of a node chain of p to val.
7049- Look for optimizable sequences at the same time.
7050- currently only looks for EXACT chains.
07be1b83
YO
7051
7052This is expermental code. The idea is to use this routine to perform
7053in place optimizations on branches and groups as they are constructed,
7054with the long term intention of removing optimization from study_chunk so
7055that it is purely analytical.
7056
7057Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
7058to control which is which.
7059
3dab1dad
YO
7060*/
7061/* TODO: All four parms should be const */
07be1b83 7062
3dab1dad
YO
7063STATIC U8
7064S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
7065{
7066 dVAR;
7067 register regnode *scan;
07be1b83
YO
7068 U8 exact = PSEUDO;
7069#ifdef EXPERIMENTAL_INPLACESCAN
7070 I32 min = 0;
7071#endif
7072
3dab1dad
YO
7073 GET_RE_DEBUG_FLAGS_DECL;
7074
07be1b83 7075
3dab1dad
YO
7076 if (SIZE_ONLY)
7077 return exact;
7078
7079 /* Find last node. */
7080
7081 scan = p;
7082 for (;;) {
7083 regnode * const temp = regnext(scan);
07be1b83
YO
7084#ifdef EXPERIMENTAL_INPLACESCAN
7085 if (PL_regkind[OP(scan)] == EXACT)
7086 if (join_exact(pRExC_state,scan,&min,1,val,depth+1))
7087 return EXACT;
7088#endif
3dab1dad
YO
7089 if ( exact ) {
7090 switch (OP(scan)) {
7091 case EXACT:
7092 case EXACTF:
7093 case EXACTFL:
7094 if( exact == PSEUDO )
7095 exact= OP(scan);
07be1b83
YO
7096 else if ( exact != OP(scan) )
7097 exact= 0;
3dab1dad
YO
7098 case NOTHING:
7099 break;
7100 default:
7101 exact= 0;
7102 }
7103 }
7104 DEBUG_PARSE_r({
7105 SV * const mysv=sv_newmortal();
7106 DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
7107 regprop(RExC_rx, mysv, scan);
7108 PerlIO_printf(Perl_debug_log, "~ %s (%s) (%d)\n",
7109 SvPV_nolen_const(mysv),
7110 reg_name[exact],
7111 REG_NODE_NUM(scan));
7112 });
a0d0e21e
LW
7113 if (temp == NULL)
7114 break;
7115 scan = temp;
7116 }
07be1b83
YO
7117 DEBUG_PARSE_r({
7118 SV * const mysv_val=sv_newmortal();
7119 DEBUG_PARSE_MSG("");
7120 regprop(RExC_rx, mysv_val, val);
7121 PerlIO_printf(Perl_debug_log, "~ attach to %s (%d) offset to %d\n",
7122 SvPV_nolen_const(mysv_val),
7123 REG_NODE_NUM(val),
7124 val - scan
7125 );
7126 });
c277df42
IZ
7127 if (reg_off_by_arg[OP(scan)]) {
7128 ARG_SET(scan, val - scan);
a0ed51b3
LW
7129 }
7130 else {
c277df42
IZ
7131 NEXT_OFF(scan) = val - scan;
7132 }
3dab1dad
YO
7133
7134 return exact;
a687059c 7135}
07be1b83 7136#endif
a687059c
LW
7137
7138/*
a687059c
LW
7139 - regcurly - a little FSA that accepts {\d+,?\d*}
7140 */
79072805 7141STATIC I32
5f66b61c 7142S_regcurly(register const char *s)
a687059c
LW
7143{
7144 if (*s++ != '{')
7145 return FALSE;
f0fcb552 7146 if (!isDIGIT(*s))
a687059c 7147 return FALSE;
f0fcb552 7148 while (isDIGIT(*s))
a687059c
LW
7149 s++;
7150 if (*s == ',')
7151 s++;
f0fcb552 7152 while (isDIGIT(*s))
a687059c
LW
7153 s++;
7154 if (*s != '}')
7155 return FALSE;
7156 return TRUE;
7157}
7158
a687059c
LW
7159
7160/*
fd181c75 7161 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
a687059c
LW
7162 */
7163void
097eb12c 7164Perl_regdump(pTHX_ const regexp *r)
a687059c 7165{
35ff7856 7166#ifdef DEBUGGING
97aff369 7167 dVAR;
c445ea15 7168 SV * const sv = sv_newmortal();
ab3bbdeb 7169 SV *dsv= sv_newmortal();
a687059c 7170
786e8c11 7171 (void)dumpuntil(r, r->program, r->program + 1, NULL, NULL, sv, 0, 0);
a0d0e21e
LW
7172
7173 /* Header fields of interest. */
ab3bbdeb
YO
7174 if (r->anchored_substr) {
7175 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
7176 RE_SV_DUMPLEN(r->anchored_substr), 30);
7b0972df 7177 PerlIO_printf(Perl_debug_log,
ab3bbdeb
YO
7178 "anchored %s%s at %"IVdf" ",
7179 s, RE_SV_TAIL(r->anchored_substr),
7b0972df 7180 (IV)r->anchored_offset);
ab3bbdeb
YO
7181 } else if (r->anchored_utf8) {
7182 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
7183 RE_SV_DUMPLEN(r->anchored_utf8), 30);
33b8afdf 7184 PerlIO_printf(Perl_debug_log,
ab3bbdeb
YO
7185 "anchored utf8 %s%s at %"IVdf" ",
7186 s, RE_SV_TAIL(r->anchored_utf8),
33b8afdf 7187 (IV)r->anchored_offset);
ab3bbdeb
YO
7188 }
7189 if (r->float_substr) {
7190 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
7191 RE_SV_DUMPLEN(r->float_substr), 30);
7b0972df 7192 PerlIO_printf(Perl_debug_log,
ab3bbdeb
YO
7193 "floating %s%s at %"IVdf"..%"UVuf" ",
7194 s, RE_SV_TAIL(r->float_substr),
7b0972df 7195 (IV)r->float_min_offset, (UV)r->float_max_offset);
ab3bbdeb
YO
7196 } else if (r->float_utf8) {
7197 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
7198 RE_SV_DUMPLEN(r->float_utf8), 30);
33b8afdf 7199 PerlIO_printf(Perl_debug_log,
ab3bbdeb
YO
7200 "floating utf8 %s%s at %"IVdf"..%"UVuf" ",
7201 s, RE_SV_TAIL(r->float_utf8),
33b8afdf 7202 (IV)r->float_min_offset, (UV)r->float_max_offset);
ab3bbdeb 7203 }
33b8afdf 7204 if (r->check_substr || r->check_utf8)
b81d288d 7205 PerlIO_printf(Perl_debug_log,
10edeb5d
JH
7206 (const char *)
7207 (r->check_substr == r->float_substr
7208 && r->check_utf8 == r->float_utf8
7209 ? "(checking floating" : "(checking anchored"));
c277df42
IZ
7210 if (r->reganch & ROPT_NOSCAN)
7211 PerlIO_printf(Perl_debug_log, " noscan");
7212 if (r->reganch & ROPT_CHECK_ALL)
7213 PerlIO_printf(Perl_debug_log, " isall");
33b8afdf 7214 if (r->check_substr || r->check_utf8)
c277df42
IZ
7215 PerlIO_printf(Perl_debug_log, ") ");
7216
46fc3d4c 7217 if (r->regstclass) {
32fc9b6a 7218 regprop(r, sv, r->regstclass);
1de06328 7219 PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
46fc3d4c 7220 }
774d564b 7221 if (r->reganch & ROPT_ANCH) {
7222 PerlIO_printf(Perl_debug_log, "anchored");
7223 if (r->reganch & ROPT_ANCH_BOL)
7224 PerlIO_printf(Perl_debug_log, "(BOL)");
c277df42
IZ
7225 if (r->reganch & ROPT_ANCH_MBOL)
7226 PerlIO_printf(Perl_debug_log, "(MBOL)");
cad2e5aa
JH
7227 if (r->reganch & ROPT_ANCH_SBOL)
7228 PerlIO_printf(Perl_debug_log, "(SBOL)");
774d564b 7229 if (r->reganch & ROPT_ANCH_GPOS)
7230 PerlIO_printf(Perl_debug_log, "(GPOS)");
7231 PerlIO_putc(Perl_debug_log, ' ');
7232 }
c277df42
IZ
7233 if (r->reganch & ROPT_GPOS_SEEN)
7234 PerlIO_printf(Perl_debug_log, "GPOS ");
a0d0e21e 7235 if (r->reganch & ROPT_SKIP)
760ac839 7236 PerlIO_printf(Perl_debug_log, "plus ");
a0d0e21e 7237 if (r->reganch & ROPT_IMPLICIT)
760ac839 7238 PerlIO_printf(Perl_debug_log, "implicit ");
760ac839 7239 PerlIO_printf(Perl_debug_log, "minlen %ld ", (long) r->minlen);
ce862d02
IZ
7240 if (r->reganch & ROPT_EVAL_SEEN)
7241 PerlIO_printf(Perl_debug_log, "with eval ");
760ac839 7242 PerlIO_printf(Perl_debug_log, "\n");
65e66c80 7243#else
96a5add6 7244 PERL_UNUSED_CONTEXT;
65e66c80 7245 PERL_UNUSED_ARG(r);
17c3b450 7246#endif /* DEBUGGING */
a687059c
LW
7247}
7248
7249/*
a0d0e21e
LW
7250- regprop - printable representation of opcode
7251*/
46fc3d4c 7252void
32fc9b6a 7253Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
a687059c 7254{
35ff7856 7255#ifdef DEBUGGING
97aff369 7256 dVAR;
9b155405 7257 register int k;
1de06328 7258 GET_RE_DEBUG_FLAGS_DECL;
a0d0e21e 7259
54dc92de 7260 sv_setpvn(sv, "", 0);
9b155405 7261 if (OP(o) >= reg_num) /* regnode.type is unsigned */
830247a4
IZ
7262 /* It would be nice to FAIL() here, but this may be called from
7263 regexec.c, and it would be hard to supply pRExC_state. */
7264 Perl_croak(aTHX_ "Corrupted regexp opcode");
bfed75c6 7265 sv_catpv(sv, reg_name[OP(o)]); /* Take off const! */
9b155405 7266
3dab1dad 7267 k = PL_regkind[OP(o)];
9b155405 7268
2a782b5b 7269 if (k == EXACT) {
396482e1 7270 SV * const dsv = sv_2mortal(newSVpvs(""));
ab3bbdeb
YO
7271 /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
7272 * is a crude hack but it may be the best for now since
7273 * we have no flag "this EXACTish node was UTF-8"
7274 * --jhi */
7275 const char * const s =
ddc5bc0f 7276 pv_pretty(dsv, STRING(o), STR_LEN(o), 60,
ab3bbdeb
YO
7277 PL_colors[0], PL_colors[1],
7278 PERL_PV_ESCAPE_UNI_DETECT |
7279 PERL_PV_PRETTY_ELIPSES |
7280 PERL_PV_PRETTY_LTGT
7281 );
7282 Perl_sv_catpvf(aTHX_ sv, " %s", s );
bb263b4e 7283 } else if (k == TRIE) {
3dab1dad 7284 /* print the details of the trie in dumpuntil instead, as
4f639d21 7285 * prog->data isn't available here */
1de06328
YO
7286 const char op = OP(o);
7287 const I32 n = ARG(o);
7288 const reg_ac_data * const ac = IS_TRIE_AC(op) ?
7289 (reg_ac_data *)prog->data->data[n] :
7290 NULL;
7291 const reg_trie_data * const trie = !IS_TRIE_AC(op) ?
7292 (reg_trie_data*)prog->data->data[n] :
7293 ac->trie;
7294
7295 Perl_sv_catpvf(aTHX_ sv, "-%s",reg_name[o->flags]);
7296 DEBUG_TRIE_COMPILE_r(
7297 Perl_sv_catpvf(aTHX_ sv,
7298 "<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
7299 (UV)trie->startstate,
7300 (IV)trie->laststate-1,
7301 (UV)trie->wordcount,
7302 (UV)trie->minlen,
7303 (UV)trie->maxlen,
7304 (UV)TRIE_CHARCOUNT(trie),
7305 (UV)trie->uniquecharcount
7306 )
7307 );
7308 if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
7309 int i;
7310 int rangestart = -1;
f46cb337 7311 U8* bitmap = IS_ANYOF_TRIE(op) ? (U8*)ANYOF_BITMAP(o) : (U8*)TRIE_BITMAP(trie);
1de06328
YO
7312 Perl_sv_catpvf(aTHX_ sv, "[");
7313 for (i = 0; i <= 256; i++) {
7314 if (i < 256 && BITMAP_TEST(bitmap,i)) {
7315 if (rangestart == -1)
7316 rangestart = i;
7317 } else if (rangestart != -1) {
7318 if (i <= rangestart + 3)
7319 for (; rangestart < i; rangestart++)
7320 put_byte(sv, rangestart);
7321 else {
7322 put_byte(sv, rangestart);
7323 sv_catpvs(sv, "-");
7324 put_byte(sv, i - 1);
7325 }
7326 rangestart = -1;
7327 }
7328 }
7329 Perl_sv_catpvf(aTHX_ sv, "]");
7330 }
7331
a3621e74 7332 } else if (k == CURLY) {
cb434fcc 7333 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
cea2e8a9
GS
7334 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
7335 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
a0d0e21e 7336 }
2c2d71f5
JH
7337 else if (k == WHILEM && o->flags) /* Ordinal/of */
7338 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
9b155405 7339 else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP )
894356b3 7340 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
9b155405 7341 else if (k == LOGICAL)
04ebc1ab 7342 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
653099ff
GS
7343 else if (k == ANYOF) {
7344 int i, rangestart = -1;
2d03de9c 7345 const U8 flags = ANYOF_FLAGS(o);
0bd48802
AL
7346
7347 /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
7348 static const char * const anyofs[] = {
653099ff
GS
7349 "\\w",
7350 "\\W",
7351 "\\s",
7352 "\\S",
7353 "\\d",
7354 "\\D",
7355 "[:alnum:]",
7356 "[:^alnum:]",
7357 "[:alpha:]",
7358 "[:^alpha:]",
7359 "[:ascii:]",
7360 "[:^ascii:]",
7361 "[:ctrl:]",
7362 "[:^ctrl:]",
7363 "[:graph:]",
7364 "[:^graph:]",
7365 "[:lower:]",
7366 "[:^lower:]",
7367 "[:print:]",
7368 "[:^print:]",
7369 "[:punct:]",
7370 "[:^punct:]",
7371 "[:upper:]",
aaa51d5e 7372 "[:^upper:]",
653099ff 7373 "[:xdigit:]",
aaa51d5e
JF
7374 "[:^xdigit:]",
7375 "[:space:]",
7376 "[:^space:]",
7377 "[:blank:]",
7378 "[:^blank:]"
653099ff
GS
7379 };
7380
19860706 7381 if (flags & ANYOF_LOCALE)
396482e1 7382 sv_catpvs(sv, "{loc}");
19860706 7383 if (flags & ANYOF_FOLD)
396482e1 7384 sv_catpvs(sv, "{i}");
653099ff 7385 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
19860706 7386 if (flags & ANYOF_INVERT)
396482e1 7387 sv_catpvs(sv, "^");
ffc61ed2
JH
7388 for (i = 0; i <= 256; i++) {
7389 if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
7390 if (rangestart == -1)
7391 rangestart = i;
7392 } else if (rangestart != -1) {
7393 if (i <= rangestart + 3)
7394 for (; rangestart < i; rangestart++)
653099ff 7395 put_byte(sv, rangestart);
ffc61ed2
JH
7396 else {
7397 put_byte(sv, rangestart);
396482e1 7398 sv_catpvs(sv, "-");
ffc61ed2 7399 put_byte(sv, i - 1);
653099ff 7400 }
ffc61ed2 7401 rangestart = -1;
653099ff 7402 }
847a199f 7403 }
ffc61ed2
JH
7404
7405 if (o->flags & ANYOF_CLASS)
bb7a0f54 7406 for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++)
ffc61ed2
JH
7407 if (ANYOF_CLASS_TEST(o,i))
7408 sv_catpv(sv, anyofs[i]);
7409
7410 if (flags & ANYOF_UNICODE)
396482e1 7411 sv_catpvs(sv, "{unicode}");
1aa99e6b 7412 else if (flags & ANYOF_UNICODE_ALL)
396482e1 7413 sv_catpvs(sv, "{unicode_all}");
ffc61ed2
JH
7414
7415 {
7416 SV *lv;
32fc9b6a 7417 SV * const sw = regclass_swash(prog, o, FALSE, &lv, 0);
b81d288d 7418
ffc61ed2
JH
7419 if (lv) {
7420 if (sw) {
89ebb4a3 7421 U8 s[UTF8_MAXBYTES_CASE+1];
b81d288d 7422
ffc61ed2 7423 for (i = 0; i <= 256; i++) { /* just the first 256 */
1df70142 7424 uvchr_to_utf8(s, i);
ffc61ed2 7425
3568d838 7426 if (i < 256 && swash_fetch(sw, s, TRUE)) {
ffc61ed2
JH
7427 if (rangestart == -1)
7428 rangestart = i;
7429 } else if (rangestart != -1) {
ffc61ed2
JH
7430 if (i <= rangestart + 3)
7431 for (; rangestart < i; rangestart++) {
2d03de9c
AL
7432 const U8 * const e = uvchr_to_utf8(s,rangestart);
7433 U8 *p;
7434 for(p = s; p < e; p++)
ffc61ed2
JH
7435 put_byte(sv, *p);
7436 }
7437 else {
2d03de9c
AL
7438 const U8 *e = uvchr_to_utf8(s,rangestart);
7439 U8 *p;
7440 for (p = s; p < e; p++)
ffc61ed2 7441 put_byte(sv, *p);
396482e1 7442 sv_catpvs(sv, "-");
2d03de9c
AL
7443 e = uvchr_to_utf8(s, i-1);
7444 for (p = s; p < e; p++)
1df70142 7445 put_byte(sv, *p);
ffc61ed2
JH
7446 }
7447 rangestart = -1;
7448 }
19860706 7449 }
ffc61ed2 7450
396482e1 7451 sv_catpvs(sv, "..."); /* et cetera */
19860706 7452 }
fde631ed 7453
ffc61ed2 7454 {
2e0de35c 7455 char *s = savesvpv(lv);
c445ea15 7456 char * const origs = s;
b81d288d 7457
3dab1dad
YO
7458 while (*s && *s != '\n')
7459 s++;
b81d288d 7460
ffc61ed2 7461 if (*s == '\n') {
2d03de9c 7462 const char * const t = ++s;
ffc61ed2
JH
7463
7464 while (*s) {
7465 if (*s == '\n')
7466 *s = ' ';
7467 s++;
7468 }
7469 if (s[-1] == ' ')
7470 s[-1] = 0;
7471
7472 sv_catpv(sv, t);
fde631ed 7473 }
b81d288d 7474
ffc61ed2 7475 Safefree(origs);
fde631ed
JH
7476 }
7477 }
653099ff 7478 }
ffc61ed2 7479
653099ff
GS
7480 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
7481 }
9b155405 7482 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
07be1b83 7483 Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
65e66c80 7484#else
96a5add6 7485 PERL_UNUSED_CONTEXT;
65e66c80
SP
7486 PERL_UNUSED_ARG(sv);
7487 PERL_UNUSED_ARG(o);
f9049ba1 7488 PERL_UNUSED_ARG(prog);
17c3b450 7489#endif /* DEBUGGING */
35ff7856 7490}
a687059c 7491
cad2e5aa
JH
7492SV *
7493Perl_re_intuit_string(pTHX_ regexp *prog)
7494{ /* Assume that RE_INTUIT is set */
97aff369 7495 dVAR;
a3621e74 7496 GET_RE_DEBUG_FLAGS_DECL;
96a5add6
AL
7497 PERL_UNUSED_CONTEXT;
7498
a3621e74 7499 DEBUG_COMPILE_r(
cfd0369c 7500 {
2d03de9c 7501 const char * const s = SvPV_nolen_const(prog->check_substr
cfd0369c 7502 ? prog->check_substr : prog->check_utf8);
cad2e5aa
JH
7503
7504 if (!PL_colorset) reginitcolors();
7505 PerlIO_printf(Perl_debug_log,
a0288114 7506 "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
33b8afdf
JH
7507 PL_colors[4],
7508 prog->check_substr ? "" : "utf8 ",
7509 PL_colors[5],PL_colors[0],
cad2e5aa
JH
7510 s,
7511 PL_colors[1],
7512 (strlen(s) > 60 ? "..." : ""));
7513 } );
7514
33b8afdf 7515 return prog->check_substr ? prog->check_substr : prog->check_utf8;
cad2e5aa
JH
7516}
7517
2b69d0c2 7518void
864dbfa3 7519Perl_pregfree(pTHX_ struct regexp *r)
a687059c 7520{
27da23d5 7521 dVAR;
0df25f3d 7522
fc32ee4a 7523 GET_RE_DEBUG_FLAGS_DECL;
a3621e74 7524
7821416a
IZ
7525 if (!r || (--r->refcnt > 0))
7526 return;
ab3bbdeb 7527 DEBUG_COMPILE_r({
0df25f3d
YO
7528 if (!PL_colorset)
7529 reginitcolors();
ab3bbdeb
YO
7530 if (RX_DEBUG(r)){
7531 SV *dsv= sv_newmortal();
7532 RE_PV_QUOTED_DECL(s, (r->reganch & ROPT_UTF8),
7533 dsv, r->precomp, r->prelen, 60);
7534 PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
7535 PL_colors[4],PL_colors[5],s);
7536 }
9e55ce06 7537 });
cad2e5aa 7538
43c5f42d
NC
7539 /* gcov results gave these as non-null 100% of the time, so there's no
7540 optimisation in checking them before calling Safefree */
7541 Safefree(r->precomp);
7542 Safefree(r->offsets); /* 20010421 MJD */
ed252734 7543 RX_MATCH_COPY_FREE(r);
f8c7b90f 7544#ifdef PERL_OLD_COPY_ON_WRITE
ed252734
NC
7545 if (r->saved_copy)
7546 SvREFCNT_dec(r->saved_copy);
7547#endif
a193d654
GS
7548 if (r->substrs) {
7549 if (r->anchored_substr)
7550 SvREFCNT_dec(r->anchored_substr);
33b8afdf
JH
7551 if (r->anchored_utf8)
7552 SvREFCNT_dec(r->anchored_utf8);
a193d654
GS
7553 if (r->float_substr)
7554 SvREFCNT_dec(r->float_substr);
33b8afdf
JH
7555 if (r->float_utf8)
7556 SvREFCNT_dec(r->float_utf8);
2779dcf1 7557 Safefree(r->substrs);
a193d654 7558 }
c277df42
IZ
7559 if (r->data) {
7560 int n = r->data->count;
f3548bdc
DM
7561 PAD* new_comppad = NULL;
7562 PAD* old_comppad;
4026c95a 7563 PADOFFSET refcnt;
dfad63ad 7564
c277df42 7565 while (--n >= 0) {
261faec3 7566 /* If you add a ->what type here, update the comment in regcomp.h */
c277df42
IZ
7567 switch (r->data->what[n]) {
7568 case 's':
7569 SvREFCNT_dec((SV*)r->data->data[n]);
7570 break;
653099ff
GS
7571 case 'f':
7572 Safefree(r->data->data[n]);
7573 break;
dfad63ad
HS
7574 case 'p':
7575 new_comppad = (AV*)r->data->data[n];
7576 break;
c277df42 7577 case 'o':
dfad63ad 7578 if (new_comppad == NULL)
cea2e8a9 7579 Perl_croak(aTHX_ "panic: pregfree comppad");
f3548bdc
DM
7580 PAD_SAVE_LOCAL(old_comppad,
7581 /* Watch out for global destruction's random ordering. */
c445ea15 7582 (SvTYPE(new_comppad) == SVt_PVAV) ? new_comppad : NULL
f3548bdc 7583 );
b34c0dd4 7584 OP_REFCNT_LOCK;
4026c95a
SH
7585 refcnt = OpREFCNT_dec((OP_4tree*)r->data->data[n]);
7586 OP_REFCNT_UNLOCK;
7587 if (!refcnt)
9b978d73 7588 op_free((OP_4tree*)r->data->data[n]);
9b978d73 7589
f3548bdc 7590 PAD_RESTORE_LOCAL(old_comppad);
dfad63ad
HS
7591 SvREFCNT_dec((SV*)new_comppad);
7592 new_comppad = NULL;
c277df42
IZ
7593 break;
7594 case 'n':
9e55ce06 7595 break;
07be1b83 7596 case 'T':
be8e71aa
YO
7597 { /* Aho Corasick add-on structure for a trie node.
7598 Used in stclass optimization only */
07be1b83
YO
7599 U32 refcount;
7600 reg_ac_data *aho=(reg_ac_data*)r->data->data[n];
7601 OP_REFCNT_LOCK;
7602 refcount = --aho->refcount;
7603 OP_REFCNT_UNLOCK;
7604 if ( !refcount ) {
7605 Safefree(aho->states);
7606 Safefree(aho->fail);
7607 aho->trie=NULL; /* not necessary to free this as it is
7608 handled by the 't' case */
7609 Safefree(r->data->data[n]); /* do this last!!!! */
be8e71aa 7610 Safefree(r->regstclass);
07be1b83
YO
7611 }
7612 }
7613 break;
a3621e74 7614 case 't':
07be1b83 7615 {
be8e71aa 7616 /* trie structure. */
07be1b83
YO
7617 U32 refcount;
7618 reg_trie_data *trie=(reg_trie_data*)r->data->data[n];
7619 OP_REFCNT_LOCK;
7620 refcount = --trie->refcount;
7621 OP_REFCNT_UNLOCK;
7622 if ( !refcount ) {
7623 Safefree(trie->charmap);
7624 if (trie->widecharmap)
7625 SvREFCNT_dec((SV*)trie->widecharmap);
7626 Safefree(trie->states);
7627 Safefree(trie->trans);
7628 if (trie->bitmap)
7629 Safefree(trie->bitmap);
7630 if (trie->wordlen)
7631 Safefree(trie->wordlen);
786e8c11
YO
7632 if (trie->jump)
7633 Safefree(trie->jump);
7634 if (trie->nextword)
7635 Safefree(trie->nextword);
a3621e74 7636#ifdef DEBUGGING
be8e71aa
YO
7637 if (RX_DEBUG(r)) {
7638 if (trie->words)
7639 SvREFCNT_dec((SV*)trie->words);
7640 if (trie->revcharmap)
7641 SvREFCNT_dec((SV*)trie->revcharmap);
7642 }
a3621e74 7643#endif
07be1b83 7644 Safefree(r->data->data[n]); /* do this last!!!! */
a3621e74 7645 }
07be1b83
YO
7646 }
7647 break;
c277df42 7648 default:
830247a4 7649 Perl_croak(aTHX_ "panic: regfree data code '%c'", r->data->what[n]);
c277df42
IZ
7650 }
7651 }
7652 Safefree(r->data->what);
7653 Safefree(r->data);
a0d0e21e
LW
7654 }
7655 Safefree(r->startp);
7656 Safefree(r->endp);
7657 Safefree(r);
a687059c 7658}
c277df42 7659
76234dfb 7660#ifndef PERL_IN_XSUB_RE
c277df42
IZ
7661/*
7662 - regnext - dig the "next" pointer out of a node
c277df42
IZ
7663 */
7664regnode *
864dbfa3 7665Perl_regnext(pTHX_ register regnode *p)
c277df42 7666{
97aff369 7667 dVAR;
c277df42
IZ
7668 register I32 offset;
7669
3280af22 7670 if (p == &PL_regdummy)
c277df42
IZ
7671 return(NULL);
7672
7673 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
7674 if (offset == 0)
7675 return(NULL);
7676
c277df42 7677 return(p+offset);
c277df42 7678}
76234dfb 7679#endif
c277df42 7680
01f988be 7681STATIC void
cea2e8a9 7682S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
c277df42
IZ
7683{
7684 va_list args;
7685 STRLEN l1 = strlen(pat1);
7686 STRLEN l2 = strlen(pat2);
7687 char buf[512];
06bf62c7 7688 SV *msv;
73d840c0 7689 const char *message;
c277df42
IZ
7690
7691 if (l1 > 510)
7692 l1 = 510;
7693 if (l1 + l2 > 510)
7694 l2 = 510 - l1;
7695 Copy(pat1, buf, l1 , char);
7696 Copy(pat2, buf + l1, l2 , char);
3b818b81
GS
7697 buf[l1 + l2] = '\n';
7698 buf[l1 + l2 + 1] = '\0';
8736538c
AS
7699#ifdef I_STDARG
7700 /* ANSI variant takes additional second argument */
c277df42 7701 va_start(args, pat2);
8736538c
AS
7702#else
7703 va_start(args);
7704#endif
5a844595 7705 msv = vmess(buf, &args);
c277df42 7706 va_end(args);
cfd0369c 7707 message = SvPV_const(msv,l1);
c277df42
IZ
7708 if (l1 > 512)
7709 l1 = 512;
7710 Copy(message, buf, l1 , char);
197cf9b9 7711 buf[l1-1] = '\0'; /* Overwrite \n */
cea2e8a9 7712 Perl_croak(aTHX_ "%s", buf);
c277df42 7713}
a0ed51b3
LW
7714
7715/* XXX Here's a total kludge. But we need to re-enter for swash routines. */
7716
76234dfb 7717#ifndef PERL_IN_XSUB_RE
a0ed51b3 7718void
864dbfa3 7719Perl_save_re_context(pTHX)
b81d288d 7720{
97aff369 7721 dVAR;
1ade1aa1
NC
7722
7723 struct re_save_state *state;
7724
7725 SAVEVPTR(PL_curcop);
7726 SSGROW(SAVESTACK_ALLOC_FOR_RE_SAVE_STATE + 1);
7727
7728 state = (struct re_save_state *)(PL_savestack + PL_savestack_ix);
7729 PL_savestack_ix += SAVESTACK_ALLOC_FOR_RE_SAVE_STATE;
7730 SSPUSHINT(SAVEt_RE_STATE);
7731
46ab3289 7732 Copy(&PL_reg_state, state, 1, struct re_save_state);
1ade1aa1 7733
a0ed51b3 7734 PL_reg_start_tmp = 0;
a0ed51b3 7735 PL_reg_start_tmpl = 0;
c445ea15 7736 PL_reg_oldsaved = NULL;
a5db57d6 7737 PL_reg_oldsavedlen = 0;
a5db57d6 7738 PL_reg_maxiter = 0;
a5db57d6 7739 PL_reg_leftiter = 0;
c445ea15 7740 PL_reg_poscache = NULL;
a5db57d6 7741 PL_reg_poscache_size = 0;
1ade1aa1
NC
7742#ifdef PERL_OLD_COPY_ON_WRITE
7743 PL_nrs = NULL;
7744#endif
ada6e8a9 7745
c445ea15
AL
7746 /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
7747 if (PL_curpm) {
7748 const REGEXP * const rx = PM_GETRE(PL_curpm);
7749 if (rx) {
1df70142 7750 U32 i;
ada6e8a9 7751 for (i = 1; i <= rx->nparens; i++) {
1df70142 7752 char digits[TYPE_CHARS(long)];
d9fad198 7753 const STRLEN len = my_snprintf(digits, sizeof(digits), "%lu", (long)i);
49f27e4b
NC
7754 GV *const *const gvp
7755 = (GV**)hv_fetch(PL_defstash, digits, len, 0);
7756
b37c2d43
AL
7757 if (gvp) {
7758 GV * const gv = *gvp;
7759 if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
7760 save_scalar(gv);
49f27e4b 7761 }
ada6e8a9
AMS
7762 }
7763 }
7764 }
a0ed51b3 7765}
76234dfb 7766#endif
51371543 7767
51371543 7768static void
acfe0abc 7769clear_re(pTHX_ void *r)
51371543 7770{
97aff369 7771 dVAR;
51371543
GS
7772 ReREFCNT_dec((regexp *)r);
7773}
ffbc6a93 7774
a28509cc
AL
7775#ifdef DEBUGGING
7776
7777STATIC void
7778S_put_byte(pTHX_ SV *sv, int c)
7779{
7780 if (isCNTRL(c) || c == 255 || !isPRINT(c))
7781 Perl_sv_catpvf(aTHX_ sv, "\\%o", c);
7782 else if (c == '-' || c == ']' || c == '\\' || c == '^')
7783 Perl_sv_catpvf(aTHX_ sv, "\\%c", c);
7784 else
7785 Perl_sv_catpvf(aTHX_ sv, "%c", c);
7786}
7787
786e8c11 7788
3dab1dad
YO
7789#define CLEAR_OPTSTART \
7790 if (optstart) STMT_START { \
07be1b83 7791 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, " (%d nodes)\n", node - optstart)); \
3dab1dad
YO
7792 optstart=NULL; \
7793 } STMT_END
7794
786e8c11 7795#define DUMPUNTIL(b,e) CLEAR_OPTSTART; node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
3dab1dad 7796
b5a2f8d8
NC
7797STATIC const regnode *
7798S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
786e8c11
YO
7799 const regnode *last, const regnode *plast,
7800 SV* sv, I32 indent, U32 depth)
a28509cc 7801{
97aff369 7802 dVAR;
786e8c11 7803 register U8 op = PSEUDO; /* Arbitrary non-END op. */
b5a2f8d8 7804 register const regnode *next;
3dab1dad
YO
7805 const regnode *optstart= NULL;
7806 GET_RE_DEBUG_FLAGS_DECL;
a28509cc 7807
786e8c11
YO
7808#ifdef DEBUG_DUMPUNTIL
7809 PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
7810 last ? last-start : 0,plast ? plast-start : 0);
7811#endif
7812
7813 if (plast && plast < last)
7814 last= plast;
7815
7816 while (PL_regkind[op] != END && (!last || node < last)) {
a28509cc
AL
7817 /* While that wasn't END last time... */
7818
7819 NODE_ALIGN(node);
7820 op = OP(node);
7821 if (op == CLOSE)
786e8c11 7822 indent--;
b5a2f8d8 7823 next = regnext((regnode *)node);
07be1b83 7824
a28509cc 7825 /* Where, what. */
8e11feef 7826 if (OP(node) == OPTIMIZED) {
e68ec53f 7827 if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
8e11feef 7828 optstart = node;
3dab1dad 7829 else
8e11feef 7830 goto after_print;
3dab1dad
YO
7831 } else
7832 CLEAR_OPTSTART;
07be1b83 7833
32fc9b6a 7834 regprop(r, sv, node);
a28509cc 7835 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
786e8c11 7836 (int)(2*indent + 1), "", SvPVX_const(sv));
3dab1dad
YO
7837
7838 if (OP(node) != OPTIMIZED) {
8e11feef
RGS
7839 if (next == NULL) /* Next ptr. */
7840 PerlIO_printf(Perl_debug_log, "(0)");
786e8c11
YO
7841 else if (PL_regkind[(U8)op] == BRANCH && PL_regkind[OP(next)] != BRANCH )
7842 PerlIO_printf(Perl_debug_log, "(FAIL)");
8e11feef
RGS
7843 else
7844 PerlIO_printf(Perl_debug_log, "(%"IVdf")", (IV)(next - start));
786e8c11 7845
1de06328 7846 /*if (PL_regkind[(U8)op] != TRIE)*/
786e8c11 7847 (void)PerlIO_putc(Perl_debug_log, '\n');
3dab1dad
YO
7848 }
7849
a28509cc
AL
7850 after_print:
7851 if (PL_regkind[(U8)op] == BRANCHJ) {
be8e71aa
YO
7852 assert(next);
7853 {
7854 register const regnode *nnode = (OP(next) == LONGJMP
b5a2f8d8
NC
7855 ? regnext((regnode *)next)
7856 : next);
be8e71aa
YO
7857 if (last && nnode > last)
7858 nnode = last;
786e8c11 7859 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
be8e71aa 7860 }
a28509cc
AL
7861 }
7862 else if (PL_regkind[(U8)op] == BRANCH) {
be8e71aa 7863 assert(next);
786e8c11 7864 DUMPUNTIL(NEXTOPER(node), next);
a28509cc
AL
7865 }
7866 else if ( PL_regkind[(U8)op] == TRIE ) {
1de06328 7867 const char op = OP(node);
a28509cc 7868 const I32 n = ARG(node);
1de06328
YO
7869 const reg_ac_data * const ac = op>=AHOCORASICK ?
7870 (reg_ac_data *)r->data->data[n] :
7871 NULL;
7872 const reg_trie_data * const trie = op<AHOCORASICK ?
7873 (reg_trie_data*)r->data->data[n] :
7874 ac->trie;
786e8c11 7875 const regnode *nextbranch= NULL;
a28509cc 7876 I32 word_idx;
1de06328 7877 sv_setpvn(sv, "", 0);
786e8c11 7878 for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
097eb12c 7879 SV ** const elem_ptr = av_fetch(trie->words,word_idx,0);
786e8c11
YO
7880
7881 PerlIO_printf(Perl_debug_log, "%*s%s ",
7882 (int)(2*(indent+3)), "",
7883 elem_ptr ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr), SvCUR(*elem_ptr), 60,
ab3bbdeb
YO
7884 PL_colors[0], PL_colors[1],
7885 (SvUTF8(*elem_ptr) ? PERL_PV_ESCAPE_UNI : 0) |
7886 PERL_PV_PRETTY_ELIPSES |
7887 PERL_PV_PRETTY_LTGT
786e8c11
YO
7888 )
7889 : "???"
7890 );
7891 if (trie->jump) {
7892 U16 dist= trie->jump[word_idx+1];
7893 PerlIO_printf(Perl_debug_log, "(%u)\n",(next - dist) - start);
7894 if (dist) {
7895 if (!nextbranch)
7896 nextbranch= next - trie->jump[0];
7897 DUMPUNTIL(next - dist, nextbranch);
7898 }
7899 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
7900 nextbranch= regnext((regnode *)nextbranch);
7901 } else {
7902 PerlIO_printf(Perl_debug_log, "\n");
a28509cc 7903 }
786e8c11
YO
7904 }
7905 if (last && next > last)
7906 node= last;
7907 else
7908 node= next;
a28509cc 7909 }
786e8c11
YO
7910 else if ( op == CURLY ) { /* "next" might be very big: optimizer */
7911 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
7912 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
a28509cc
AL
7913 }
7914 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
be8e71aa 7915 assert(next);
786e8c11 7916 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
a28509cc
AL
7917 }
7918 else if ( op == PLUS || op == STAR) {
786e8c11 7919 DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
a28509cc
AL
7920 }
7921 else if (op == ANYOF) {
7922 /* arglen 1 + class block */
7923 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_LARGE)
7924 ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
7925 node = NEXTOPER(node);
7926 }
7927 else if (PL_regkind[(U8)op] == EXACT) {
7928 /* Literal string, where present. */
7929 node += NODE_SZ_STR(node) - 1;
7930 node = NEXTOPER(node);
7931 }
7932 else {
7933 node = NEXTOPER(node);
7934 node += regarglen[(U8)op];
7935 }
7936 if (op == CURLYX || op == OPEN)
786e8c11 7937 indent++;
a28509cc 7938 else if (op == WHILEM)
786e8c11 7939 indent--;
a28509cc 7940 }
3dab1dad 7941 CLEAR_OPTSTART;
786e8c11
YO
7942#ifdef DEBUG_DUMPUNTIL
7943 PerlIO_printf(Perl_debug_log, "--- %d\n",indent);
7944#endif
1de06328 7945 return node;
a28509cc
AL
7946}
7947
7948#endif /* DEBUGGING */
7949
241d1a3b
NC
7950/*
7951 * Local variables:
7952 * c-indentation-style: bsd
7953 * c-basic-offset: 4
7954 * indent-tabs-mode: t
7955 * End:
7956 *
37442d52
RGS
7957 * ex: set ts=8 sts=4 sw=4 noet:
7958 */