This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Typo fix, by Robin Barker
[perl5.git] / regcomp.c
... / ...
CommitLineData
1/* regcomp.c
2 */
3
4/*
5 * "A fair jaw-cracker dwarf-language must be." --Samwise Gamgee
6 */
7
8/* This file contains functions for compiling a regular expression. See
9 * also regexec.c which funnily enough, contains functions for executing
10 * a regular expression.
11 *
12 * This file is also copied at build time to ext/re/re_comp.c, where
13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14 * This causes the main functions to be compiled under new names and with
15 * debugging support added, which makes "use re 'debug'" work.
16 */
17
18/* NOTE: this is derived from Henry Spencer's regexp code, and should not
19 * confused with the original package (see point 3 below). Thanks, Henry!
20 */
21
22/* Additional note: this code is very heavily munged from Henry's version
23 * in places. In some spots I've traded clarity for efficiency, so don't
24 * blame Henry for some of the lack of readability.
25 */
26
27/* The names of the functions have been changed from regcomp and
28 * regexec to pregcomp and pregexec in order to avoid conflicts
29 * with the POSIX routines of the same names.
30*/
31
32#ifdef PERL_EXT_RE_BUILD
33#include "re_top.h"
34#endif
35
36/*
37 * pregcomp and pregexec -- regsub and regerror are not used in perl
38 *
39 * Copyright (c) 1986 by University of Toronto.
40 * Written by Henry Spencer. Not derived from licensed software.
41 *
42 * Permission is granted to anyone to use this software for any
43 * purpose on any computer system, and to redistribute it freely,
44 * subject to the following restrictions:
45 *
46 * 1. The author is not responsible for the consequences of use of
47 * this software, no matter how awful, even if they arise
48 * from defects in it.
49 *
50 * 2. The origin of this software must not be misrepresented, either
51 * by explicit claim or by omission.
52 *
53 * 3. Altered versions must be plainly marked as such, and must not
54 * be misrepresented as being the original software.
55 *
56 *
57 **** Alterations to Henry's code are...
58 ****
59 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
60 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007 by Larry Wall and others
61 ****
62 **** You may distribute under the terms of either the GNU General Public
63 **** License or the Artistic License, as specified in the README file.
64
65 *
66 * Beware that some of this code is subtly aware of the way operator
67 * precedence is structured in regular expressions. Serious changes in
68 * regular-expression syntax might require a total rethink.
69 */
70#include "EXTERN.h"
71#define PERL_IN_REGCOMP_C
72#include "perl.h"
73
74#ifndef PERL_IN_XSUB_RE
75# include "INTERN.h"
76#endif
77
78#define REG_COMP_C
79#ifdef PERL_IN_XSUB_RE
80# include "re_comp.h"
81#else
82# include "regcomp.h"
83#endif
84
85#ifdef op
86#undef op
87#endif /* op */
88
89#ifdef MSDOS
90# if defined(BUGGY_MSC6)
91 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
92# pragma optimize("a",off)
93 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
94# pragma optimize("w",on )
95# endif /* BUGGY_MSC6 */
96#endif /* MSDOS */
97
98#ifndef STATIC
99#define STATIC static
100#endif
101
102typedef struct RExC_state_t {
103 U32 flags; /* are we folding, multilining? */
104 char *precomp; /* uncompiled string. */
105 regexp *rx; /* perl core regexp structure */
106 regexp_internal *rxi; /* internal data for regexp object pprivate field */
107 char *start; /* Start of input for compile */
108 char *end; /* End of input for compile */
109 char *parse; /* Input-scan pointer. */
110 I32 whilem_seen; /* number of WHILEM in this expr */
111 regnode *emit_start; /* Start of emitted-code area */
112 regnode *emit; /* Code-emit pointer; &regdummy = don't = compiling */
113 I32 naughty; /* How bad is this pattern? */
114 I32 sawback; /* Did we see \1, ...? */
115 U32 seen;
116 I32 size; /* Code size. */
117 I32 npar; /* Capture buffer count, (OPEN). */
118 I32 cpar; /* Capture buffer count, (CLOSE). */
119 I32 nestroot; /* root parens we are in - used by accept */
120 I32 extralen;
121 I32 seen_zerolen;
122 I32 seen_evals;
123 regnode **open_parens; /* pointers to open parens */
124 regnode **close_parens; /* pointers to close parens */
125 regnode *opend; /* END node in program */
126 I32 utf8;
127 HV *charnames; /* cache of named sequences */
128 HV *paren_names; /* Paren names */
129
130 regnode **recurse; /* Recurse regops */
131 I32 recurse_count; /* Number of recurse regops */
132#if ADD_TO_REGEXEC
133 char *starttry; /* -Dr: where regtry was called. */
134#define RExC_starttry (pRExC_state->starttry)
135#endif
136#ifdef DEBUGGING
137 const char *lastparse;
138 I32 lastnum;
139 AV *paren_name_list; /* idx -> name */
140#define RExC_lastparse (pRExC_state->lastparse)
141#define RExC_lastnum (pRExC_state->lastnum)
142#define RExC_paren_name_list (pRExC_state->paren_name_list)
143#endif
144} RExC_state_t;
145
146#define RExC_flags (pRExC_state->flags)
147#define RExC_precomp (pRExC_state->precomp)
148#define RExC_rx (pRExC_state->rx)
149#define RExC_rxi (pRExC_state->rxi)
150#define RExC_start (pRExC_state->start)
151#define RExC_end (pRExC_state->end)
152#define RExC_parse (pRExC_state->parse)
153#define RExC_whilem_seen (pRExC_state->whilem_seen)
154#define RExC_offsets (pRExC_state->rxi->offsets) /* I am not like the others */
155#define RExC_emit (pRExC_state->emit)
156#define RExC_emit_start (pRExC_state->emit_start)
157#define RExC_naughty (pRExC_state->naughty)
158#define RExC_sawback (pRExC_state->sawback)
159#define RExC_seen (pRExC_state->seen)
160#define RExC_size (pRExC_state->size)
161#define RExC_npar (pRExC_state->npar)
162#define RExC_nestroot (pRExC_state->nestroot)
163#define RExC_extralen (pRExC_state->extralen)
164#define RExC_seen_zerolen (pRExC_state->seen_zerolen)
165#define RExC_seen_evals (pRExC_state->seen_evals)
166#define RExC_utf8 (pRExC_state->utf8)
167#define RExC_charnames (pRExC_state->charnames)
168#define RExC_open_parens (pRExC_state->open_parens)
169#define RExC_close_parens (pRExC_state->close_parens)
170#define RExC_opend (pRExC_state->opend)
171#define RExC_paren_names (pRExC_state->paren_names)
172#define RExC_recurse (pRExC_state->recurse)
173#define RExC_recurse_count (pRExC_state->recurse_count)
174
175
176#define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
177#define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
178 ((*s) == '{' && regcurly(s)))
179
180#ifdef SPSTART
181#undef SPSTART /* dratted cpp namespace... */
182#endif
183/*
184 * Flags to be passed up and down.
185 */
186#define WORST 0 /* Worst case. */
187#define HASWIDTH 0x1 /* Known to match non-null strings. */
188#define SIMPLE 0x2 /* Simple enough to be STAR/PLUS operand. */
189#define SPSTART 0x4 /* Starts with * or +. */
190#define TRYAGAIN 0x8 /* Weeded out a declaration. */
191
192#define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
193
194/* whether trie related optimizations are enabled */
195#if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
196#define TRIE_STUDY_OPT
197#define FULL_TRIE_STUDY
198#define TRIE_STCLASS
199#endif
200
201
202
203#define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
204#define PBITVAL(paren) (1 << ((paren) & 7))
205#define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
206#define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
207#define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
208
209
210/* About scan_data_t.
211
212 During optimisation we recurse through the regexp program performing
213 various inplace (keyhole style) optimisations. In addition study_chunk
214 and scan_commit populate this data structure with information about
215 what strings MUST appear in the pattern. We look for the longest
216 string that must appear for at a fixed location, and we look for the
217 longest string that may appear at a floating location. So for instance
218 in the pattern:
219
220 /FOO[xX]A.*B[xX]BAR/
221
222 Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
223 strings (because they follow a .* construct). study_chunk will identify
224 both FOO and BAR as being the longest fixed and floating strings respectively.
225
226 The strings can be composites, for instance
227
228 /(f)(o)(o)/
229
230 will result in a composite fixed substring 'foo'.
231
232 For each string some basic information is maintained:
233
234 - offset or min_offset
235 This is the position the string must appear at, or not before.
236 It also implicitly (when combined with minlenp) tells us how many
237 character must match before the string we are searching.
238 Likewise when combined with minlenp and the length of the string
239 tells us how many characters must appear after the string we have
240 found.
241
242 - max_offset
243 Only used for floating strings. This is the rightmost point that
244 the string can appear at. Ifset to I32 max it indicates that the
245 string can occur infinitely far to the right.
246
247 - minlenp
248 A pointer to the minimum length of the pattern that the string
249 was found inside. This is important as in the case of positive
250 lookahead or positive lookbehind we can have multiple patterns
251 involved. Consider
252
253 /(?=FOO).*F/
254
255 The minimum length of the pattern overall is 3, the minimum length
256 of the lookahead part is 3, but the minimum length of the part that
257 will actually match is 1. So 'FOO's minimum length is 3, but the
258 minimum length for the F is 1. This is important as the minimum length
259 is used to determine offsets in front of and behind the string being
260 looked for. Since strings can be composites this is the length of the
261 pattern at the time it was commited with a scan_commit. Note that
262 the length is calculated by study_chunk, so that the minimum lengths
263 are not known until the full pattern has been compiled, thus the
264 pointer to the value.
265
266 - lookbehind
267
268 In the case of lookbehind the string being searched for can be
269 offset past the start point of the final matching string.
270 If this value was just blithely removed from the min_offset it would
271 invalidate some of the calculations for how many chars must match
272 before or after (as they are derived from min_offset and minlen and
273 the length of the string being searched for).
274 When the final pattern is compiled and the data is moved from the
275 scan_data_t structure into the regexp structure the information
276 about lookbehind is factored in, with the information that would
277 have been lost precalculated in the end_shift field for the
278 associated string.
279
280 The fields pos_min and pos_delta are used to store the minimum offset
281 and the delta to the maximum offset at the current point in the pattern.
282
283*/
284
285typedef struct scan_data_t {
286 /*I32 len_min; unused */
287 /*I32 len_delta; unused */
288 I32 pos_min;
289 I32 pos_delta;
290 SV *last_found;
291 I32 last_end; /* min value, <0 unless valid. */
292 I32 last_start_min;
293 I32 last_start_max;
294 SV **longest; /* Either &l_fixed, or &l_float. */
295 SV *longest_fixed; /* longest fixed string found in pattern */
296 I32 offset_fixed; /* offset where it starts */
297 I32 *minlen_fixed; /* pointer to the minlen relevent to the string */
298 I32 lookbehind_fixed; /* is the position of the string modfied by LB */
299 SV *longest_float; /* longest floating string found in pattern */
300 I32 offset_float_min; /* earliest point in string it can appear */
301 I32 offset_float_max; /* latest point in string it can appear */
302 I32 *minlen_float; /* pointer to the minlen relevent to the string */
303 I32 lookbehind_float; /* is the position of the string modified by LB */
304 I32 flags;
305 I32 whilem_c;
306 I32 *last_closep;
307 struct regnode_charclass_class *start_class;
308} scan_data_t;
309
310/*
311 * Forward declarations for pregcomp()'s friends.
312 */
313
314static const scan_data_t zero_scan_data =
315 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
316
317#define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
318#define SF_BEFORE_SEOL 0x0001
319#define SF_BEFORE_MEOL 0x0002
320#define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
321#define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
322
323#ifdef NO_UNARY_PLUS
324# define SF_FIX_SHIFT_EOL (0+2)
325# define SF_FL_SHIFT_EOL (0+4)
326#else
327# define SF_FIX_SHIFT_EOL (+2)
328# define SF_FL_SHIFT_EOL (+4)
329#endif
330
331#define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
332#define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
333
334#define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
335#define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
336#define SF_IS_INF 0x0040
337#define SF_HAS_PAR 0x0080
338#define SF_IN_PAR 0x0100
339#define SF_HAS_EVAL 0x0200
340#define SCF_DO_SUBSTR 0x0400
341#define SCF_DO_STCLASS_AND 0x0800
342#define SCF_DO_STCLASS_OR 0x1000
343#define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
344#define SCF_WHILEM_VISITED_POS 0x2000
345
346#define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */
347#define SCF_SEEN_ACCEPT 0x8000
348
349#define UTF (RExC_utf8 != 0)
350#define LOC ((RExC_flags & RXf_PMf_LOCALE) != 0)
351#define FOLD ((RExC_flags & RXf_PMf_FOLD) != 0)
352
353#define OOB_UNICODE 12345678
354#define OOB_NAMEDCLASS -1
355
356#define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
357#define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
358
359
360/* length of regex to show in messages that don't mark a position within */
361#define RegexLengthToShowInErrorMessages 127
362
363/*
364 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
365 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
366 * op/pragma/warn/regcomp.
367 */
368#define MARKER1 "<-- HERE" /* marker as it appears in the description */
369#define MARKER2 " <-- HERE " /* marker as it appears within the regex */
370
371#define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
372
373/*
374 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
375 * arg. Show regex, up to a maximum length. If it's too long, chop and add
376 * "...".
377 */
378#define _FAIL(code) STMT_START { \
379 const char *ellipses = ""; \
380 IV len = RExC_end - RExC_precomp; \
381 \
382 if (!SIZE_ONLY) \
383 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
384 if (len > RegexLengthToShowInErrorMessages) { \
385 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
386 len = RegexLengthToShowInErrorMessages - 10; \
387 ellipses = "..."; \
388 } \
389 code; \
390} STMT_END
391
392#define FAIL(msg) _FAIL( \
393 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
394 msg, (int)len, RExC_precomp, ellipses))
395
396#define FAIL2(msg,arg) _FAIL( \
397 Perl_croak(aTHX_ msg " in regex m/%.*s%s/", \
398 arg, (int)len, RExC_precomp, ellipses))
399
400/*
401 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
402 */
403#define Simple_vFAIL(m) STMT_START { \
404 const IV offset = RExC_parse - RExC_precomp; \
405 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
406 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
407} STMT_END
408
409/*
410 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
411 */
412#define vFAIL(m) STMT_START { \
413 if (!SIZE_ONLY) \
414 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
415 Simple_vFAIL(m); \
416} STMT_END
417
418/*
419 * Like Simple_vFAIL(), but accepts two arguments.
420 */
421#define Simple_vFAIL2(m,a1) STMT_START { \
422 const IV offset = RExC_parse - RExC_precomp; \
423 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
424 (int)offset, RExC_precomp, RExC_precomp + offset); \
425} STMT_END
426
427/*
428 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
429 */
430#define vFAIL2(m,a1) STMT_START { \
431 if (!SIZE_ONLY) \
432 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
433 Simple_vFAIL2(m, a1); \
434} STMT_END
435
436
437/*
438 * Like Simple_vFAIL(), but accepts three arguments.
439 */
440#define Simple_vFAIL3(m, a1, a2) STMT_START { \
441 const IV offset = RExC_parse - RExC_precomp; \
442 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
443 (int)offset, RExC_precomp, RExC_precomp + offset); \
444} STMT_END
445
446/*
447 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
448 */
449#define vFAIL3(m,a1,a2) STMT_START { \
450 if (!SIZE_ONLY) \
451 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
452 Simple_vFAIL3(m, a1, a2); \
453} STMT_END
454
455/*
456 * Like Simple_vFAIL(), but accepts four arguments.
457 */
458#define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
459 const IV offset = RExC_parse - RExC_precomp; \
460 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
461 (int)offset, RExC_precomp, RExC_precomp + offset); \
462} STMT_END
463
464#define vWARN(loc,m) STMT_START { \
465 const IV offset = loc - RExC_precomp; \
466 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \
467 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
468} STMT_END
469
470#define vWARNdep(loc,m) STMT_START { \
471 const IV offset = loc - RExC_precomp; \
472 Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
473 "%s" REPORT_LOCATION, \
474 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
475} STMT_END
476
477
478#define vWARN2(loc, m, a1) STMT_START { \
479 const IV offset = loc - RExC_precomp; \
480 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
481 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
482} STMT_END
483
484#define vWARN3(loc, m, a1, a2) STMT_START { \
485 const IV offset = loc - RExC_precomp; \
486 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
487 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
488} STMT_END
489
490#define vWARN4(loc, m, a1, a2, a3) STMT_START { \
491 const IV offset = loc - RExC_precomp; \
492 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
493 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
494} STMT_END
495
496#define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
497 const IV offset = loc - RExC_precomp; \
498 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
499 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
500} STMT_END
501
502
503/* Allow for side effects in s */
504#define REGC(c,s) STMT_START { \
505 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
506} STMT_END
507
508/* Macros for recording node offsets. 20001227 mjd@plover.com
509 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
510 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
511 * Element 0 holds the number n.
512 * Position is 1 indexed.
513 */
514
515#define Set_Node_Offset_To_R(node,byte) STMT_START { \
516 if (! SIZE_ONLY) { \
517 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
518 __LINE__, (int)(node), (int)(byte))); \
519 if((node) < 0) { \
520 Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
521 } else { \
522 RExC_offsets[2*(node)-1] = (byte); \
523 } \
524 } \
525} STMT_END
526
527#define Set_Node_Offset(node,byte) \
528 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
529#define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
530
531#define Set_Node_Length_To_R(node,len) STMT_START { \
532 if (! SIZE_ONLY) { \
533 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
534 __LINE__, (int)(node), (int)(len))); \
535 if((node) < 0) { \
536 Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
537 } else { \
538 RExC_offsets[2*(node)] = (len); \
539 } \
540 } \
541} STMT_END
542
543#define Set_Node_Length(node,len) \
544 Set_Node_Length_To_R((node)-RExC_emit_start, len)
545#define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
546#define Set_Node_Cur_Length(node) \
547 Set_Node_Length(node, RExC_parse - parse_start)
548
549/* Get offsets and lengths */
550#define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
551#define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
552
553#define Set_Node_Offset_Length(node,offset,len) STMT_START { \
554 Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
555 Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
556} STMT_END
557
558
559#if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
560#define EXPERIMENTAL_INPLACESCAN
561#endif
562
563#define DEBUG_STUDYDATA(str,data,depth) \
564DEBUG_OPTIMISE_MORE_r(if(data){ \
565 PerlIO_printf(Perl_debug_log, \
566 "%*s" str "Pos:%"IVdf"/%"IVdf \
567 " Flags: 0x%"UVXf" Whilem_c: %"IVdf" Lcp: %"IVdf" %s", \
568 (int)(depth)*2, "", \
569 (IV)((data)->pos_min), \
570 (IV)((data)->pos_delta), \
571 (UV)((data)->flags), \
572 (IV)((data)->whilem_c), \
573 (IV)((data)->last_closep ? *((data)->last_closep) : -1), \
574 is_inf ? "INF " : "" \
575 ); \
576 if ((data)->last_found) \
577 PerlIO_printf(Perl_debug_log, \
578 "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
579 " %sFloat: '%s' @ %"IVdf"/%"IVdf"", \
580 SvPVX_const((data)->last_found), \
581 (IV)((data)->last_end), \
582 (IV)((data)->last_start_min), \
583 (IV)((data)->last_start_max), \
584 ((data)->longest && \
585 (data)->longest==&((data)->longest_fixed)) ? "*" : "", \
586 SvPVX_const((data)->longest_fixed), \
587 (IV)((data)->offset_fixed), \
588 ((data)->longest && \
589 (data)->longest==&((data)->longest_float)) ? "*" : "", \
590 SvPVX_const((data)->longest_float), \
591 (IV)((data)->offset_float_min), \
592 (IV)((data)->offset_float_max) \
593 ); \
594 PerlIO_printf(Perl_debug_log,"\n"); \
595});
596
597static void clear_re(pTHX_ void *r);
598
599/* Mark that we cannot extend a found fixed substring at this point.
600 Update the longest found anchored substring and the longest found
601 floating substrings if needed. */
602
603STATIC void
604S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *minlenp, int is_inf)
605{
606 const STRLEN l = CHR_SVLEN(data->last_found);
607 const STRLEN old_l = CHR_SVLEN(*data->longest);
608 GET_RE_DEBUG_FLAGS_DECL;
609
610 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
611 SvSetMagicSV(*data->longest, data->last_found);
612 if (*data->longest == data->longest_fixed) {
613 data->offset_fixed = l ? data->last_start_min : data->pos_min;
614 if (data->flags & SF_BEFORE_EOL)
615 data->flags
616 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
617 else
618 data->flags &= ~SF_FIX_BEFORE_EOL;
619 data->minlen_fixed=minlenp;
620 data->lookbehind_fixed=0;
621 }
622 else { /* *data->longest == data->longest_float */
623 data->offset_float_min = l ? data->last_start_min : data->pos_min;
624 data->offset_float_max = (l
625 ? data->last_start_max
626 : data->pos_min + data->pos_delta);
627 if (is_inf || (U32)data->offset_float_max > (U32)I32_MAX)
628 data->offset_float_max = I32_MAX;
629 if (data->flags & SF_BEFORE_EOL)
630 data->flags
631 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
632 else
633 data->flags &= ~SF_FL_BEFORE_EOL;
634 data->minlen_float=minlenp;
635 data->lookbehind_float=0;
636 }
637 }
638 SvCUR_set(data->last_found, 0);
639 {
640 SV * const sv = data->last_found;
641 if (SvUTF8(sv) && SvMAGICAL(sv)) {
642 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
643 if (mg)
644 mg->mg_len = 0;
645 }
646 }
647 data->last_end = -1;
648 data->flags &= ~SF_BEFORE_EOL;
649 DEBUG_STUDYDATA("cl_anything: ",data,0);
650}
651
652/* Can match anything (initialization) */
653STATIC void
654S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
655{
656 ANYOF_CLASS_ZERO(cl);
657 ANYOF_BITMAP_SETALL(cl);
658 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
659 if (LOC)
660 cl->flags |= ANYOF_LOCALE;
661}
662
663/* Can match anything (initialization) */
664STATIC int
665S_cl_is_anything(const struct regnode_charclass_class *cl)
666{
667 int value;
668
669 for (value = 0; value <= ANYOF_MAX; value += 2)
670 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
671 return 1;
672 if (!(cl->flags & ANYOF_UNICODE_ALL))
673 return 0;
674 if (!ANYOF_BITMAP_TESTALLSET((const void*)cl))
675 return 0;
676 return 1;
677}
678
679/* Can match anything (initialization) */
680STATIC void
681S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
682{
683 Zero(cl, 1, struct regnode_charclass_class);
684 cl->type = ANYOF;
685 cl_anything(pRExC_state, cl);
686}
687
688STATIC void
689S_cl_init_zero(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
690{
691 Zero(cl, 1, struct regnode_charclass_class);
692 cl->type = ANYOF;
693 cl_anything(pRExC_state, cl);
694 if (LOC)
695 cl->flags |= ANYOF_LOCALE;
696}
697
698/* 'And' a given class with another one. Can create false positives */
699/* We assume that cl is not inverted */
700STATIC void
701S_cl_and(struct regnode_charclass_class *cl,
702 const struct regnode_charclass_class *and_with)
703{
704
705 assert(and_with->type == ANYOF);
706 if (!(and_with->flags & ANYOF_CLASS)
707 && !(cl->flags & ANYOF_CLASS)
708 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
709 && !(and_with->flags & ANYOF_FOLD)
710 && !(cl->flags & ANYOF_FOLD)) {
711 int i;
712
713 if (and_with->flags & ANYOF_INVERT)
714 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
715 cl->bitmap[i] &= ~and_with->bitmap[i];
716 else
717 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
718 cl->bitmap[i] &= and_with->bitmap[i];
719 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
720 if (!(and_with->flags & ANYOF_EOS))
721 cl->flags &= ~ANYOF_EOS;
722
723 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE &&
724 !(and_with->flags & ANYOF_INVERT)) {
725 cl->flags &= ~ANYOF_UNICODE_ALL;
726 cl->flags |= ANYOF_UNICODE;
727 ARG_SET(cl, ARG(and_with));
728 }
729 if (!(and_with->flags & ANYOF_UNICODE_ALL) &&
730 !(and_with->flags & ANYOF_INVERT))
731 cl->flags &= ~ANYOF_UNICODE_ALL;
732 if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)) &&
733 !(and_with->flags & ANYOF_INVERT))
734 cl->flags &= ~ANYOF_UNICODE;
735}
736
737/* 'OR' a given class with another one. Can create false positives */
738/* We assume that cl is not inverted */
739STATIC void
740S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
741{
742 if (or_with->flags & ANYOF_INVERT) {
743 /* We do not use
744 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
745 * <= (B1 | !B2) | (CL1 | !CL2)
746 * which is wasteful if CL2 is small, but we ignore CL2:
747 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
748 * XXXX Can we handle case-fold? Unclear:
749 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
750 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
751 */
752 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
753 && !(or_with->flags & ANYOF_FOLD)
754 && !(cl->flags & ANYOF_FOLD) ) {
755 int i;
756
757 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
758 cl->bitmap[i] |= ~or_with->bitmap[i];
759 } /* XXXX: logic is complicated otherwise */
760 else {
761 cl_anything(pRExC_state, cl);
762 }
763 } else {
764 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
765 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
766 && (!(or_with->flags & ANYOF_FOLD)
767 || (cl->flags & ANYOF_FOLD)) ) {
768 int i;
769
770 /* OR char bitmap and class bitmap separately */
771 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
772 cl->bitmap[i] |= or_with->bitmap[i];
773 if (or_with->flags & ANYOF_CLASS) {
774 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
775 cl->classflags[i] |= or_with->classflags[i];
776 cl->flags |= ANYOF_CLASS;
777 }
778 }
779 else { /* XXXX: logic is complicated, leave it along for a moment. */
780 cl_anything(pRExC_state, cl);
781 }
782 }
783 if (or_with->flags & ANYOF_EOS)
784 cl->flags |= ANYOF_EOS;
785
786 if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
787 ARG(cl) != ARG(or_with)) {
788 cl->flags |= ANYOF_UNICODE_ALL;
789 cl->flags &= ~ANYOF_UNICODE;
790 }
791 if (or_with->flags & ANYOF_UNICODE_ALL) {
792 cl->flags |= ANYOF_UNICODE_ALL;
793 cl->flags &= ~ANYOF_UNICODE;
794 }
795}
796
797#define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
798#define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
799#define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
800#define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
801
802
803#ifdef DEBUGGING
804/*
805 dump_trie(trie,widecharmap,revcharmap)
806 dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
807 dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
808
809 These routines dump out a trie in a somewhat readable format.
810 The _interim_ variants are used for debugging the interim
811 tables that are used to generate the final compressed
812 representation which is what dump_trie expects.
813
814 Part of the reason for their existance is to provide a form
815 of documentation as to how the different representations function.
816
817*/
818
819/*
820 Dumps the final compressed table form of the trie to Perl_debug_log.
821 Used for debugging make_trie().
822*/
823
824STATIC void
825S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
826 AV *revcharmap, U32 depth)
827{
828 U32 state;
829 SV *sv=sv_newmortal();
830 int colwidth= widecharmap ? 6 : 4;
831 GET_RE_DEBUG_FLAGS_DECL;
832
833
834 PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
835 (int)depth * 2 + 2,"",
836 "Match","Base","Ofs" );
837
838 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
839 SV ** const tmp = av_fetch( revcharmap, state, 0);
840 if ( tmp ) {
841 PerlIO_printf( Perl_debug_log, "%*s",
842 colwidth,
843 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
844 PL_colors[0], PL_colors[1],
845 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
846 PERL_PV_ESCAPE_FIRSTCHAR
847 )
848 );
849 }
850 }
851 PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
852 (int)depth * 2 + 2,"");
853
854 for( state = 0 ; state < trie->uniquecharcount ; state++ )
855 PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
856 PerlIO_printf( Perl_debug_log, "\n");
857
858 for( state = 1 ; state < trie->statecount ; state++ ) {
859 const U32 base = trie->states[ state ].trans.base;
860
861 PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
862
863 if ( trie->states[ state ].wordnum ) {
864 PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
865 } else {
866 PerlIO_printf( Perl_debug_log, "%6s", "" );
867 }
868
869 PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
870
871 if ( base ) {
872 U32 ofs = 0;
873
874 while( ( base + ofs < trie->uniquecharcount ) ||
875 ( base + ofs - trie->uniquecharcount < trie->lasttrans
876 && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
877 ofs++;
878
879 PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
880
881 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
882 if ( ( base + ofs >= trie->uniquecharcount ) &&
883 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
884 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
885 {
886 PerlIO_printf( Perl_debug_log, "%*"UVXf,
887 colwidth,
888 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
889 } else {
890 PerlIO_printf( Perl_debug_log, "%*s",colwidth," ." );
891 }
892 }
893
894 PerlIO_printf( Perl_debug_log, "]");
895
896 }
897 PerlIO_printf( Perl_debug_log, "\n" );
898 }
899}
900/*
901 Dumps a fully constructed but uncompressed trie in list form.
902 List tries normally only are used for construction when the number of
903 possible chars (trie->uniquecharcount) is very high.
904 Used for debugging make_trie().
905*/
906STATIC void
907S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
908 HV *widecharmap, AV *revcharmap, U32 next_alloc,
909 U32 depth)
910{
911 U32 state;
912 SV *sv=sv_newmortal();
913 int colwidth= widecharmap ? 6 : 4;
914 GET_RE_DEBUG_FLAGS_DECL;
915 /* print out the table precompression. */
916 PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
917 (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
918 "------:-----+-----------------\n" );
919
920 for( state=1 ; state < next_alloc ; state ++ ) {
921 U16 charid;
922
923 PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
924 (int)depth * 2 + 2,"", (UV)state );
925 if ( ! trie->states[ state ].wordnum ) {
926 PerlIO_printf( Perl_debug_log, "%5s| ","");
927 } else {
928 PerlIO_printf( Perl_debug_log, "W%4x| ",
929 trie->states[ state ].wordnum
930 );
931 }
932 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
933 SV ** const tmp = av_fetch( revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
934 if ( tmp ) {
935 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
936 colwidth,
937 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
938 PL_colors[0], PL_colors[1],
939 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
940 PERL_PV_ESCAPE_FIRSTCHAR
941 ) ,
942 TRIE_LIST_ITEM(state,charid).forid,
943 (UV)TRIE_LIST_ITEM(state,charid).newstate
944 );
945 if (!(charid % 10))
946 PerlIO_printf(Perl_debug_log, "\n%*s| ",
947 (int)((depth * 2) + 14), "");
948 }
949 }
950 PerlIO_printf( Perl_debug_log, "\n");
951 }
952}
953
954/*
955 Dumps a fully constructed but uncompressed trie in table form.
956 This is the normal DFA style state transition table, with a few
957 twists to facilitate compression later.
958 Used for debugging make_trie().
959*/
960STATIC void
961S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
962 HV *widecharmap, AV *revcharmap, U32 next_alloc,
963 U32 depth)
964{
965 U32 state;
966 U16 charid;
967 SV *sv=sv_newmortal();
968 int colwidth= widecharmap ? 6 : 4;
969 GET_RE_DEBUG_FLAGS_DECL;
970
971 /*
972 print out the table precompression so that we can do a visual check
973 that they are identical.
974 */
975
976 PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
977
978 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
979 SV ** const tmp = av_fetch( revcharmap, charid, 0);
980 if ( tmp ) {
981 PerlIO_printf( Perl_debug_log, "%*s",
982 colwidth,
983 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
984 PL_colors[0], PL_colors[1],
985 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
986 PERL_PV_ESCAPE_FIRSTCHAR
987 )
988 );
989 }
990 }
991
992 PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
993
994 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
995 PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
996 }
997
998 PerlIO_printf( Perl_debug_log, "\n" );
999
1000 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1001
1002 PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
1003 (int)depth * 2 + 2,"",
1004 (UV)TRIE_NODENUM( state ) );
1005
1006 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1007 UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1008 if (v)
1009 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1010 else
1011 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
1012 }
1013 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1014 PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
1015 } else {
1016 PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
1017 trie->states[ TRIE_NODENUM( state ) ].wordnum );
1018 }
1019 }
1020}
1021
1022#endif
1023
1024/* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1025 startbranch: the first branch in the whole branch sequence
1026 first : start branch of sequence of branch-exact nodes.
1027 May be the same as startbranch
1028 last : Thing following the last branch.
1029 May be the same as tail.
1030 tail : item following the branch sequence
1031 count : words in the sequence
1032 flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/
1033 depth : indent depth
1034
1035Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
1036
1037A trie is an N'ary tree where the branches are determined by digital
1038decomposition of the key. IE, at the root node you look up the 1st character and
1039follow that branch repeat until you find the end of the branches. Nodes can be
1040marked as "accepting" meaning they represent a complete word. Eg:
1041
1042 /he|she|his|hers/
1043
1044would convert into the following structure. Numbers represent states, letters
1045following numbers represent valid transitions on the letter from that state, if
1046the number is in square brackets it represents an accepting state, otherwise it
1047will be in parenthesis.
1048
1049 +-h->+-e->[3]-+-r->(8)-+-s->[9]
1050 | |
1051 | (2)
1052 | |
1053 (1) +-i->(6)-+-s->[7]
1054 |
1055 +-s->(3)-+-h->(4)-+-e->[5]
1056
1057 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1058
1059This shows that when matching against the string 'hers' we will begin at state 1
1060read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1061then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1062is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1063single traverse. We store a mapping from accepting to state to which word was
1064matched, and then when we have multiple possibilities we try to complete the
1065rest of the regex in the order in which they occured in the alternation.
1066
1067The only prior NFA like behaviour that would be changed by the TRIE support is
1068the silent ignoring of duplicate alternations which are of the form:
1069
1070 / (DUPE|DUPE) X? (?{ ... }) Y /x
1071
1072Thus EVAL blocks follwing a trie may be called a different number of times with
1073and without the optimisation. With the optimisations dupes will be silently
1074ignored. This inconsistant behaviour of EVAL type nodes is well established as
1075the following demonstrates:
1076
1077 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1078
1079which prints out 'word' three times, but
1080
1081 'words'=~/(word|word|word)(?{ print $1 })S/
1082
1083which doesnt print it out at all. This is due to other optimisations kicking in.
1084
1085Example of what happens on a structural level:
1086
1087The regexp /(ac|ad|ab)+/ will produce the folowing debug output:
1088
1089 1: CURLYM[1] {1,32767}(18)
1090 5: BRANCH(8)
1091 6: EXACT <ac>(16)
1092 8: BRANCH(11)
1093 9: EXACT <ad>(16)
1094 11: BRANCH(14)
1095 12: EXACT <ab>(16)
1096 16: SUCCEED(0)
1097 17: NOTHING(18)
1098 18: END(0)
1099
1100This would be optimizable with startbranch=5, first=5, last=16, tail=16
1101and should turn into:
1102
1103 1: CURLYM[1] {1,32767}(18)
1104 5: TRIE(16)
1105 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1106 <ac>
1107 <ad>
1108 <ab>
1109 16: SUCCEED(0)
1110 17: NOTHING(18)
1111 18: END(0)
1112
1113Cases where tail != last would be like /(?foo|bar)baz/:
1114
1115 1: BRANCH(4)
1116 2: EXACT <foo>(8)
1117 4: BRANCH(7)
1118 5: EXACT <bar>(8)
1119 7: TAIL(8)
1120 8: EXACT <baz>(10)
1121 10: END(0)
1122
1123which would be optimizable with startbranch=1, first=1, last=7, tail=8
1124and would end up looking like:
1125
1126 1: TRIE(8)
1127 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1128 <foo>
1129 <bar>
1130 7: TAIL(8)
1131 8: EXACT <baz>(10)
1132 10: END(0)
1133
1134 d = uvuni_to_utf8_flags(d, uv, 0);
1135
1136is the recommended Unicode-aware way of saying
1137
1138 *(d++) = uv;
1139*/
1140
1141#define TRIE_STORE_REVCHAR \
1142 STMT_START { \
1143 SV *tmp = newSVpvs(""); \
1144 if (UTF) SvUTF8_on(tmp); \
1145 Perl_sv_catpvf( aTHX_ tmp, "%c", (int)uvc ); \
1146 av_push( revcharmap, tmp ); \
1147 } STMT_END
1148
1149#define TRIE_READ_CHAR STMT_START { \
1150 wordlen++; \
1151 if ( UTF ) { \
1152 if ( folder ) { \
1153 if ( foldlen > 0 ) { \
1154 uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags ); \
1155 foldlen -= len; \
1156 scan += len; \
1157 len = 0; \
1158 } else { \
1159 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1160 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
1161 foldlen -= UNISKIP( uvc ); \
1162 scan = foldbuf + UNISKIP( uvc ); \
1163 } \
1164 } else { \
1165 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1166 } \
1167 } else { \
1168 uvc = (U32)*uc; \
1169 len = 1; \
1170 } \
1171} STMT_END
1172
1173
1174
1175#define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
1176 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
1177 U32 ging = TRIE_LIST_LEN( state ) *= 2; \
1178 Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
1179 } \
1180 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
1181 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
1182 TRIE_LIST_CUR( state )++; \
1183} STMT_END
1184
1185#define TRIE_LIST_NEW(state) STMT_START { \
1186 Newxz( trie->states[ state ].trans.list, \
1187 4, reg_trie_trans_le ); \
1188 TRIE_LIST_CUR( state ) = 1; \
1189 TRIE_LIST_LEN( state ) = 4; \
1190} STMT_END
1191
1192#define TRIE_HANDLE_WORD(state) STMT_START { \
1193 U16 dupe= trie->states[ state ].wordnum; \
1194 regnode * const noper_next = regnext( noper ); \
1195 \
1196 if (trie->wordlen) \
1197 trie->wordlen[ curword ] = wordlen; \
1198 DEBUG_r({ \
1199 /* store the word for dumping */ \
1200 SV* tmp; \
1201 if (OP(noper) != NOTHING) \
1202 tmp = newSVpvn(STRING(noper), STR_LEN(noper)); \
1203 else \
1204 tmp = newSVpvn( "", 0 ); \
1205 if ( UTF ) SvUTF8_on( tmp ); \
1206 av_push( trie_words, tmp ); \
1207 }); \
1208 \
1209 curword++; \
1210 \
1211 if ( noper_next < tail ) { \
1212 if (!trie->jump) \
1213 trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, sizeof(U16) ); \
1214 trie->jump[curword] = (U16)(noper_next - convert); \
1215 if (!jumper) \
1216 jumper = noper_next; \
1217 if (!nextbranch) \
1218 nextbranch= regnext(cur); \
1219 } \
1220 \
1221 if ( dupe ) { \
1222 /* So it's a dupe. This means we need to maintain a */\
1223 /* linked-list from the first to the next. */\
1224 /* we only allocate the nextword buffer when there */\
1225 /* a dupe, so first time we have to do the allocation */\
1226 if (!trie->nextword) \
1227 trie->nextword = (U16 *) \
1228 PerlMemShared_calloc( word_count + 1, sizeof(U16)); \
1229 while ( trie->nextword[dupe] ) \
1230 dupe= trie->nextword[dupe]; \
1231 trie->nextword[dupe]= curword; \
1232 } else { \
1233 /* we haven't inserted this word yet. */ \
1234 trie->states[ state ].wordnum = curword; \
1235 } \
1236} STMT_END
1237
1238
1239#define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
1240 ( ( base + charid >= ucharcount \
1241 && base + charid < ubound \
1242 && state == trie->trans[ base - ucharcount + charid ].check \
1243 && trie->trans[ base - ucharcount + charid ].next ) \
1244 ? trie->trans[ base - ucharcount + charid ].next \
1245 : ( state==1 ? special : 0 ) \
1246 )
1247
1248#define MADE_TRIE 1
1249#define MADE_JUMP_TRIE 2
1250#define MADE_EXACT_TRIE 4
1251
1252STATIC I32
1253S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth)
1254{
1255 dVAR;
1256 /* first pass, loop through and scan words */
1257 reg_trie_data *trie;
1258 HV *widecharmap = NULL;
1259 AV *revcharmap = newAV();
1260 regnode *cur;
1261 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1262 STRLEN len = 0;
1263 UV uvc = 0;
1264 U16 curword = 0;
1265 U32 next_alloc = 0;
1266 regnode *jumper = NULL;
1267 regnode *nextbranch = NULL;
1268 regnode *convert = NULL;
1269 /* we just use folder as a flag in utf8 */
1270 const U8 * const folder = ( flags == EXACTF
1271 ? PL_fold
1272 : ( flags == EXACTFL
1273 ? PL_fold_locale
1274 : NULL
1275 )
1276 );
1277
1278#ifdef DEBUGGING
1279 const U32 data_slot = add_data( pRExC_state, 4, "tuuu" );
1280 AV *trie_words = NULL;
1281 /* along with revcharmap, this only used during construction but both are
1282 * useful during debugging so we store them in the struct when debugging.
1283 */
1284#else
1285 const U32 data_slot = add_data( pRExC_state, 2, "tu" );
1286 STRLEN trie_charcount=0;
1287#endif
1288 SV *re_trie_maxbuff;
1289 GET_RE_DEBUG_FLAGS_DECL;
1290#ifndef DEBUGGING
1291 PERL_UNUSED_ARG(depth);
1292#endif
1293
1294 trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
1295 trie->refcount = 1;
1296 trie->startstate = 1;
1297 trie->wordcount = word_count;
1298 RExC_rxi->data->data[ data_slot ] = (void*)trie;
1299 trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
1300 if (!(UTF && folder))
1301 trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
1302 DEBUG_r({
1303 trie_words = newAV();
1304 });
1305
1306 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
1307 if (!SvIOK(re_trie_maxbuff)) {
1308 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
1309 }
1310 DEBUG_OPTIMISE_r({
1311 PerlIO_printf( Perl_debug_log,
1312 "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
1313 (int)depth * 2 + 2, "",
1314 REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
1315 REG_NODE_NUM(last), REG_NODE_NUM(tail),
1316 (int)depth);
1317 });
1318
1319 /* Find the node we are going to overwrite */
1320 if ( first == startbranch && OP( last ) != BRANCH ) {
1321 /* whole branch chain */
1322 convert = first;
1323 } else {
1324 /* branch sub-chain */
1325 convert = NEXTOPER( first );
1326 }
1327
1328 /* -- First loop and Setup --
1329
1330 We first traverse the branches and scan each word to determine if it
1331 contains widechars, and how many unique chars there are, this is
1332 important as we have to build a table with at least as many columns as we
1333 have unique chars.
1334
1335 We use an array of integers to represent the character codes 0..255
1336 (trie->charmap) and we use a an HV* to store unicode characters. We use the
1337 native representation of the character value as the key and IV's for the
1338 coded index.
1339
1340 *TODO* If we keep track of how many times each character is used we can
1341 remap the columns so that the table compression later on is more
1342 efficient in terms of memory by ensuring most common value is in the
1343 middle and the least common are on the outside. IMO this would be better
1344 than a most to least common mapping as theres a decent chance the most
1345 common letter will share a node with the least common, meaning the node
1346 will not be compressable. With a middle is most common approach the worst
1347 case is when we have the least common nodes twice.
1348
1349 */
1350
1351 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1352 regnode * const noper = NEXTOPER( cur );
1353 const U8 *uc = (U8*)STRING( noper );
1354 const U8 * const e = uc + STR_LEN( noper );
1355 STRLEN foldlen = 0;
1356 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1357 const U8 *scan = (U8*)NULL;
1358 U32 wordlen = 0; /* required init */
1359 STRLEN chars=0;
1360
1361 if (OP(noper) == NOTHING) {
1362 trie->minlen= 0;
1363 continue;
1364 }
1365 if (trie->bitmap) {
1366 TRIE_BITMAP_SET(trie,*uc);
1367 if ( folder ) TRIE_BITMAP_SET(trie,folder[ *uc ]);
1368 }
1369 for ( ; uc < e ; uc += len ) {
1370 TRIE_CHARCOUNT(trie)++;
1371 TRIE_READ_CHAR;
1372 chars++;
1373 if ( uvc < 256 ) {
1374 if ( !trie->charmap[ uvc ] ) {
1375 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
1376 if ( folder )
1377 trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
1378 TRIE_STORE_REVCHAR;
1379 }
1380 } else {
1381 SV** svpp;
1382 if ( !widecharmap )
1383 widecharmap = newHV();
1384
1385 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
1386
1387 if ( !svpp )
1388 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
1389
1390 if ( !SvTRUE( *svpp ) ) {
1391 sv_setiv( *svpp, ++trie->uniquecharcount );
1392 TRIE_STORE_REVCHAR;
1393 }
1394 }
1395 }
1396 if( cur == first ) {
1397 trie->minlen=chars;
1398 trie->maxlen=chars;
1399 } else if (chars < trie->minlen) {
1400 trie->minlen=chars;
1401 } else if (chars > trie->maxlen) {
1402 trie->maxlen=chars;
1403 }
1404
1405 } /* end first pass */
1406 DEBUG_TRIE_COMPILE_r(
1407 PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1408 (int)depth * 2 + 2,"",
1409 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
1410 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
1411 (int)trie->minlen, (int)trie->maxlen )
1412 );
1413 trie->wordlen = (U32 *) PerlMemShared_calloc( word_count, sizeof(U32) );
1414
1415 /*
1416 We now know what we are dealing with in terms of unique chars and
1417 string sizes so we can calculate how much memory a naive
1418 representation using a flat table will take. If it's over a reasonable
1419 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
1420 conservative but potentially much slower representation using an array
1421 of lists.
1422
1423 At the end we convert both representations into the same compressed
1424 form that will be used in regexec.c for matching with. The latter
1425 is a form that cannot be used to construct with but has memory
1426 properties similar to the list form and access properties similar
1427 to the table form making it both suitable for fast searches and
1428 small enough that its feasable to store for the duration of a program.
1429
1430 See the comment in the code where the compressed table is produced
1431 inplace from the flat tabe representation for an explanation of how
1432 the compression works.
1433
1434 */
1435
1436
1437 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
1438 /*
1439 Second Pass -- Array Of Lists Representation
1440
1441 Each state will be represented by a list of charid:state records
1442 (reg_trie_trans_le) the first such element holds the CUR and LEN
1443 points of the allocated array. (See defines above).
1444
1445 We build the initial structure using the lists, and then convert
1446 it into the compressed table form which allows faster lookups
1447 (but cant be modified once converted).
1448 */
1449
1450 STRLEN transcount = 1;
1451
1452 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1453 "%*sCompiling trie using list compiler\n",
1454 (int)depth * 2 + 2, ""));
1455
1456 trie->states = (reg_trie_state *)
1457 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1458 sizeof(reg_trie_state) );
1459 TRIE_LIST_NEW(1);
1460 next_alloc = 2;
1461
1462 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1463
1464 regnode * const noper = NEXTOPER( cur );
1465 U8 *uc = (U8*)STRING( noper );
1466 const U8 * const e = uc + STR_LEN( noper );
1467 U32 state = 1; /* required init */
1468 U16 charid = 0; /* sanity init */
1469 U8 *scan = (U8*)NULL; /* sanity init */
1470 STRLEN foldlen = 0; /* required init */
1471 U32 wordlen = 0; /* required init */
1472 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1473
1474 if (OP(noper) != NOTHING) {
1475 for ( ; uc < e ; uc += len ) {
1476
1477 TRIE_READ_CHAR;
1478
1479 if ( uvc < 256 ) {
1480 charid = trie->charmap[ uvc ];
1481 } else {
1482 SV** const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
1483 if ( !svpp ) {
1484 charid = 0;
1485 } else {
1486 charid=(U16)SvIV( *svpp );
1487 }
1488 }
1489 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1490 if ( charid ) {
1491
1492 U16 check;
1493 U32 newstate = 0;
1494
1495 charid--;
1496 if ( !trie->states[ state ].trans.list ) {
1497 TRIE_LIST_NEW( state );
1498 }
1499 for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1500 if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1501 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1502 break;
1503 }
1504 }
1505 if ( ! newstate ) {
1506 newstate = next_alloc++;
1507 TRIE_LIST_PUSH( state, charid, newstate );
1508 transcount++;
1509 }
1510 state = newstate;
1511 } else {
1512 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1513 }
1514 }
1515 }
1516 TRIE_HANDLE_WORD(state);
1517
1518 } /* end second pass */
1519
1520 /* next alloc is the NEXT state to be allocated */
1521 trie->statecount = next_alloc;
1522 trie->states = (reg_trie_state *)
1523 PerlMemShared_realloc( trie->states,
1524 next_alloc
1525 * sizeof(reg_trie_state) );
1526
1527 /* and now dump it out before we compress it */
1528 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
1529 revcharmap, next_alloc,
1530 depth+1)
1531 );
1532
1533 trie->trans = (reg_trie_trans *)
1534 PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
1535 {
1536 U32 state;
1537 U32 tp = 0;
1538 U32 zp = 0;
1539
1540
1541 for( state=1 ; state < next_alloc ; state ++ ) {
1542 U32 base=0;
1543
1544 /*
1545 DEBUG_TRIE_COMPILE_MORE_r(
1546 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1547 );
1548 */
1549
1550 if (trie->states[state].trans.list) {
1551 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1552 U16 maxid=minid;
1553 U16 idx;
1554
1555 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1556 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1557 if ( forid < minid ) {
1558 minid=forid;
1559 } else if ( forid > maxid ) {
1560 maxid=forid;
1561 }
1562 }
1563 if ( transcount < tp + maxid - minid + 1) {
1564 transcount *= 2;
1565 trie->trans = (reg_trie_trans *)
1566 PerlMemShared_realloc( trie->trans,
1567 transcount
1568 * sizeof(reg_trie_trans) );
1569 Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1570 }
1571 base = trie->uniquecharcount + tp - minid;
1572 if ( maxid == minid ) {
1573 U32 set = 0;
1574 for ( ; zp < tp ; zp++ ) {
1575 if ( ! trie->trans[ zp ].next ) {
1576 base = trie->uniquecharcount + zp - minid;
1577 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1578 trie->trans[ zp ].check = state;
1579 set = 1;
1580 break;
1581 }
1582 }
1583 if ( !set ) {
1584 trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1585 trie->trans[ tp ].check = state;
1586 tp++;
1587 zp = tp;
1588 }
1589 } else {
1590 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1591 const U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
1592 trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1593 trie->trans[ tid ].check = state;
1594 }
1595 tp += ( maxid - minid + 1 );
1596 }
1597 Safefree(trie->states[ state ].trans.list);
1598 }
1599 /*
1600 DEBUG_TRIE_COMPILE_MORE_r(
1601 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1602 );
1603 */
1604 trie->states[ state ].trans.base=base;
1605 }
1606 trie->lasttrans = tp + 1;
1607 }
1608 } else {
1609 /*
1610 Second Pass -- Flat Table Representation.
1611
1612 we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1613 We know that we will need Charcount+1 trans at most to store the data
1614 (one row per char at worst case) So we preallocate both structures
1615 assuming worst case.
1616
1617 We then construct the trie using only the .next slots of the entry
1618 structs.
1619
1620 We use the .check field of the first entry of the node temporarily to
1621 make compression both faster and easier by keeping track of how many non
1622 zero fields are in the node.
1623
1624 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1625 transition.
1626
1627 There are two terms at use here: state as a TRIE_NODEIDX() which is a
1628 number representing the first entry of the node, and state as a
1629 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1630 TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1631 are 2 entrys per node. eg:
1632
1633 A B A B
1634 1. 2 4 1. 3 7
1635 2. 0 3 3. 0 5
1636 3. 0 0 5. 0 0
1637 4. 0 0 7. 0 0
1638
1639 The table is internally in the right hand, idx form. However as we also
1640 have to deal with the states array which is indexed by nodenum we have to
1641 use TRIE_NODENUM() to convert.
1642
1643 */
1644 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1645 "%*sCompiling trie using table compiler\n",
1646 (int)depth * 2 + 2, ""));
1647
1648 trie->trans = (reg_trie_trans *)
1649 PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
1650 * trie->uniquecharcount + 1,
1651 sizeof(reg_trie_trans) );
1652 trie->states = (reg_trie_state *)
1653 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1654 sizeof(reg_trie_state) );
1655 next_alloc = trie->uniquecharcount + 1;
1656
1657
1658 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1659
1660 regnode * const noper = NEXTOPER( cur );
1661 const U8 *uc = (U8*)STRING( noper );
1662 const U8 * const e = uc + STR_LEN( noper );
1663
1664 U32 state = 1; /* required init */
1665
1666 U16 charid = 0; /* sanity init */
1667 U32 accept_state = 0; /* sanity init */
1668 U8 *scan = (U8*)NULL; /* sanity init */
1669
1670 STRLEN foldlen = 0; /* required init */
1671 U32 wordlen = 0; /* required init */
1672 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1673
1674 if ( OP(noper) != NOTHING ) {
1675 for ( ; uc < e ; uc += len ) {
1676
1677 TRIE_READ_CHAR;
1678
1679 if ( uvc < 256 ) {
1680 charid = trie->charmap[ uvc ];
1681 } else {
1682 SV* const * const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
1683 charid = svpp ? (U16)SvIV(*svpp) : 0;
1684 }
1685 if ( charid ) {
1686 charid--;
1687 if ( !trie->trans[ state + charid ].next ) {
1688 trie->trans[ state + charid ].next = next_alloc;
1689 trie->trans[ state ].check++;
1690 next_alloc += trie->uniquecharcount;
1691 }
1692 state = trie->trans[ state + charid ].next;
1693 } else {
1694 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1695 }
1696 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1697 }
1698 }
1699 accept_state = TRIE_NODENUM( state );
1700 TRIE_HANDLE_WORD(accept_state);
1701
1702 } /* end second pass */
1703
1704 /* and now dump it out before we compress it */
1705 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
1706 revcharmap,
1707 next_alloc, depth+1));
1708
1709 {
1710 /*
1711 * Inplace compress the table.*
1712
1713 For sparse data sets the table constructed by the trie algorithm will
1714 be mostly 0/FAIL transitions or to put it another way mostly empty.
1715 (Note that leaf nodes will not contain any transitions.)
1716
1717 This algorithm compresses the tables by eliminating most such
1718 transitions, at the cost of a modest bit of extra work during lookup:
1719
1720 - Each states[] entry contains a .base field which indicates the
1721 index in the state[] array wheres its transition data is stored.
1722
1723 - If .base is 0 there are no valid transitions from that node.
1724
1725 - If .base is nonzero then charid is added to it to find an entry in
1726 the trans array.
1727
1728 -If trans[states[state].base+charid].check!=state then the
1729 transition is taken to be a 0/Fail transition. Thus if there are fail
1730 transitions at the front of the node then the .base offset will point
1731 somewhere inside the previous nodes data (or maybe even into a node
1732 even earlier), but the .check field determines if the transition is
1733 valid.
1734
1735 XXX - wrong maybe?
1736 The following process inplace converts the table to the compressed
1737 table: We first do not compress the root node 1,and mark its all its
1738 .check pointers as 1 and set its .base pointer as 1 as well. This
1739 allows to do a DFA construction from the compressed table later, and
1740 ensures that any .base pointers we calculate later are greater than
1741 0.
1742
1743 - We set 'pos' to indicate the first entry of the second node.
1744
1745 - We then iterate over the columns of the node, finding the first and
1746 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
1747 and set the .check pointers accordingly, and advance pos
1748 appropriately and repreat for the next node. Note that when we copy
1749 the next pointers we have to convert them from the original
1750 NODEIDX form to NODENUM form as the former is not valid post
1751 compression.
1752
1753 - If a node has no transitions used we mark its base as 0 and do not
1754 advance the pos pointer.
1755
1756 - If a node only has one transition we use a second pointer into the
1757 structure to fill in allocated fail transitions from other states.
1758 This pointer is independent of the main pointer and scans forward
1759 looking for null transitions that are allocated to a state. When it
1760 finds one it writes the single transition into the "hole". If the
1761 pointer doesnt find one the single transition is appended as normal.
1762
1763 - Once compressed we can Renew/realloc the structures to release the
1764 excess space.
1765
1766 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
1767 specifically Fig 3.47 and the associated pseudocode.
1768
1769 demq
1770 */
1771 const U32 laststate = TRIE_NODENUM( next_alloc );
1772 U32 state, charid;
1773 U32 pos = 0, zp=0;
1774 trie->statecount = laststate;
1775
1776 for ( state = 1 ; state < laststate ; state++ ) {
1777 U8 flag = 0;
1778 const U32 stateidx = TRIE_NODEIDX( state );
1779 const U32 o_used = trie->trans[ stateidx ].check;
1780 U32 used = trie->trans[ stateidx ].check;
1781 trie->trans[ stateidx ].check = 0;
1782
1783 for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
1784 if ( flag || trie->trans[ stateidx + charid ].next ) {
1785 if ( trie->trans[ stateidx + charid ].next ) {
1786 if (o_used == 1) {
1787 for ( ; zp < pos ; zp++ ) {
1788 if ( ! trie->trans[ zp ].next ) {
1789 break;
1790 }
1791 }
1792 trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
1793 trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1794 trie->trans[ zp ].check = state;
1795 if ( ++zp > pos ) pos = zp;
1796 break;
1797 }
1798 used--;
1799 }
1800 if ( !flag ) {
1801 flag = 1;
1802 trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
1803 }
1804 trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1805 trie->trans[ pos ].check = state;
1806 pos++;
1807 }
1808 }
1809 }
1810 trie->lasttrans = pos + 1;
1811 trie->states = (reg_trie_state *)
1812 PerlMemShared_realloc( trie->states, laststate
1813 * sizeof(reg_trie_state) );
1814 DEBUG_TRIE_COMPILE_MORE_r(
1815 PerlIO_printf( Perl_debug_log,
1816 "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
1817 (int)depth * 2 + 2,"",
1818 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
1819 (IV)next_alloc,
1820 (IV)pos,
1821 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
1822 );
1823
1824 } /* end table compress */
1825 }
1826 DEBUG_TRIE_COMPILE_MORE_r(
1827 PerlIO_printf(Perl_debug_log, "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
1828 (int)depth * 2 + 2, "",
1829 (UV)trie->statecount,
1830 (UV)trie->lasttrans)
1831 );
1832 /* resize the trans array to remove unused space */
1833 trie->trans = (reg_trie_trans *)
1834 PerlMemShared_realloc( trie->trans, trie->lasttrans
1835 * sizeof(reg_trie_trans) );
1836
1837 /* and now dump out the compressed format */
1838 DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
1839
1840 { /* Modify the program and insert the new TRIE node*/
1841 U8 nodetype =(U8)(flags & 0xFF);
1842 char *str=NULL;
1843
1844#ifdef DEBUGGING
1845 regnode *optimize = NULL;
1846 U32 mjd_offset = 0;
1847 U32 mjd_nodelen = 0;
1848#endif
1849 /*
1850 This means we convert either the first branch or the first Exact,
1851 depending on whether the thing following (in 'last') is a branch
1852 or not and whther first is the startbranch (ie is it a sub part of
1853 the alternation or is it the whole thing.)
1854 Assuming its a sub part we conver the EXACT otherwise we convert
1855 the whole branch sequence, including the first.
1856 */
1857 /* Find the node we are going to overwrite */
1858 if ( first != startbranch || OP( last ) == BRANCH ) {
1859 /* branch sub-chain */
1860 NEXT_OFF( first ) = (U16)(last - first);
1861 DEBUG_r({
1862 mjd_offset= Node_Offset((convert));
1863 mjd_nodelen= Node_Length((convert));
1864 });
1865 /* whole branch chain */
1866 } else {
1867 DEBUG_r({
1868 const regnode *nop = NEXTOPER( convert );
1869 mjd_offset= Node_Offset((nop));
1870 mjd_nodelen= Node_Length((nop));
1871 });
1872 }
1873
1874 DEBUG_OPTIMISE_r(
1875 PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
1876 (int)depth * 2 + 2, "",
1877 (UV)mjd_offset, (UV)mjd_nodelen)
1878 );
1879
1880 /* But first we check to see if there is a common prefix we can
1881 split out as an EXACT and put in front of the TRIE node. */
1882 trie->startstate= 1;
1883 if ( trie->bitmap && !widecharmap && !trie->jump ) {
1884 U32 state;
1885 for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
1886 U32 ofs = 0;
1887 I32 idx = -1;
1888 U32 count = 0;
1889 const U32 base = trie->states[ state ].trans.base;
1890
1891 if ( trie->states[state].wordnum )
1892 count = 1;
1893
1894 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
1895 if ( ( base + ofs >= trie->uniquecharcount ) &&
1896 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
1897 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1898 {
1899 if ( ++count > 1 ) {
1900 SV **tmp = av_fetch( revcharmap, ofs, 0);
1901 const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
1902 if ( state == 1 ) break;
1903 if ( count == 2 ) {
1904 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
1905 DEBUG_OPTIMISE_r(
1906 PerlIO_printf(Perl_debug_log,
1907 "%*sNew Start State=%"UVuf" Class: [",
1908 (int)depth * 2 + 2, "",
1909 (UV)state));
1910 if (idx >= 0) {
1911 SV ** const tmp = av_fetch( revcharmap, idx, 0);
1912 const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
1913
1914 TRIE_BITMAP_SET(trie,*ch);
1915 if ( folder )
1916 TRIE_BITMAP_SET(trie, folder[ *ch ]);
1917 DEBUG_OPTIMISE_r(
1918 PerlIO_printf(Perl_debug_log, (char*)ch)
1919 );
1920 }
1921 }
1922 TRIE_BITMAP_SET(trie,*ch);
1923 if ( folder )
1924 TRIE_BITMAP_SET(trie,folder[ *ch ]);
1925 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
1926 }
1927 idx = ofs;
1928 }
1929 }
1930 if ( count == 1 ) {
1931 SV **tmp = av_fetch( revcharmap, idx, 0);
1932 char *ch = SvPV_nolen( *tmp );
1933 DEBUG_OPTIMISE_r({
1934 SV *sv=sv_newmortal();
1935 PerlIO_printf( Perl_debug_log,
1936 "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
1937 (int)depth * 2 + 2, "",
1938 (UV)state, (UV)idx,
1939 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
1940 PL_colors[0], PL_colors[1],
1941 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1942 PERL_PV_ESCAPE_FIRSTCHAR
1943 )
1944 );
1945 });
1946 if ( state==1 ) {
1947 OP( convert ) = nodetype;
1948 str=STRING(convert);
1949 STR_LEN(convert)=0;
1950 }
1951 while (*ch) {
1952 *str++ = *ch++;
1953 STR_LEN(convert)++;
1954 }
1955
1956 } else {
1957#ifdef DEBUGGING
1958 if (state>1)
1959 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
1960#endif
1961 break;
1962 }
1963 }
1964 if (str) {
1965 regnode *n = convert+NODE_SZ_STR(convert);
1966 NEXT_OFF(convert) = NODE_SZ_STR(convert);
1967 trie->startstate = state;
1968 trie->minlen -= (state - 1);
1969 trie->maxlen -= (state - 1);
1970 DEBUG_r({
1971 regnode *fix = convert;
1972 U32 word = trie->wordcount;
1973 mjd_nodelen++;
1974 Set_Node_Offset_Length(convert, mjd_offset, state - 1);
1975 while( ++fix < n ) {
1976 Set_Node_Offset_Length(fix, 0, 0);
1977 }
1978 while (word--) {
1979 SV ** const tmp = av_fetch( trie_words, word, 0 );
1980 if (tmp) {
1981 if ( STR_LEN(convert) <= SvCUR(*tmp) )
1982 sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
1983 else
1984 sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
1985 }
1986 }
1987 });
1988 if (trie->maxlen) {
1989 convert = n;
1990 } else {
1991 NEXT_OFF(convert) = (U16)(tail - convert);
1992 DEBUG_r(optimize= n);
1993 }
1994 }
1995 }
1996 if (!jumper)
1997 jumper = last;
1998 if ( trie->maxlen ) {
1999 NEXT_OFF( convert ) = (U16)(tail - convert);
2000 ARG_SET( convert, data_slot );
2001 /* Store the offset to the first unabsorbed branch in
2002 jump[0], which is otherwise unused by the jump logic.
2003 We use this when dumping a trie and during optimisation. */
2004 if (trie->jump)
2005 trie->jump[0] = (U16)(nextbranch - convert);
2006
2007 /* XXXX */
2008 if ( !trie->states[trie->startstate].wordnum && trie->bitmap &&
2009 ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
2010 {
2011 OP( convert ) = TRIEC;
2012 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
2013 PerlMemShared_free(trie->bitmap);
2014 trie->bitmap= NULL;
2015 } else
2016 OP( convert ) = TRIE;
2017
2018 /* store the type in the flags */
2019 convert->flags = nodetype;
2020 DEBUG_r({
2021 optimize = convert
2022 + NODE_STEP_REGNODE
2023 + regarglen[ OP( convert ) ];
2024 });
2025 /* XXX We really should free up the resource in trie now,
2026 as we won't use them - (which resources?) dmq */
2027 }
2028 /* needed for dumping*/
2029 DEBUG_r(if (optimize) {
2030 regnode *opt = convert;
2031 while ( ++opt < optimize) {
2032 Set_Node_Offset_Length(opt,0,0);
2033 }
2034 /*
2035 Try to clean up some of the debris left after the
2036 optimisation.
2037 */
2038 while( optimize < jumper ) {
2039 mjd_nodelen += Node_Length((optimize));
2040 OP( optimize ) = OPTIMIZED;
2041 Set_Node_Offset_Length(optimize,0,0);
2042 optimize++;
2043 }
2044 Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
2045 });
2046 } /* end node insert */
2047 RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
2048#ifdef DEBUGGING
2049 RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
2050 RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
2051#else
2052 SvREFCNT_dec(revcharmap);
2053#endif
2054 return trie->jump
2055 ? MADE_JUMP_TRIE
2056 : trie->startstate>1
2057 ? MADE_EXACT_TRIE
2058 : MADE_TRIE;
2059}
2060
2061STATIC void
2062S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode *stclass, U32 depth)
2063{
2064/* The Trie is constructed and compressed now so we can build a fail array now if its needed
2065
2066 This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
2067 "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
2068 ISBN 0-201-10088-6
2069
2070 We find the fail state for each state in the trie, this state is the longest proper
2071 suffix of the current states 'word' that is also a proper prefix of another word in our
2072 trie. State 1 represents the word '' and is the thus the default fail state. This allows
2073 the DFA not to have to restart after its tried and failed a word at a given point, it
2074 simply continues as though it had been matching the other word in the first place.
2075 Consider
2076 'abcdgu'=~/abcdefg|cdgu/
2077 When we get to 'd' we are still matching the first word, we would encounter 'g' which would
2078 fail, which would bring use to the state representing 'd' in the second word where we would
2079 try 'g' and succeed, prodceding to match 'cdgu'.
2080 */
2081 /* add a fail transition */
2082 const U32 trie_offset = ARG(source);
2083 reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
2084 U32 *q;
2085 const U32 ucharcount = trie->uniquecharcount;
2086 const U32 numstates = trie->statecount;
2087 const U32 ubound = trie->lasttrans + ucharcount;
2088 U32 q_read = 0;
2089 U32 q_write = 0;
2090 U32 charid;
2091 U32 base = trie->states[ 1 ].trans.base;
2092 U32 *fail;
2093 reg_ac_data *aho;
2094 const U32 data_slot = add_data( pRExC_state, 1, "T" );
2095 GET_RE_DEBUG_FLAGS_DECL;
2096#ifndef DEBUGGING
2097 PERL_UNUSED_ARG(depth);
2098#endif
2099
2100
2101 ARG_SET( stclass, data_slot );
2102 aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
2103 RExC_rxi->data->data[ data_slot ] = (void*)aho;
2104 aho->trie=trie_offset;
2105 aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
2106 Copy( trie->states, aho->states, numstates, reg_trie_state );
2107 Newxz( q, numstates, U32);
2108 aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
2109 aho->refcount = 1;
2110 fail = aho->fail;
2111 /* initialize fail[0..1] to be 1 so that we always have
2112 a valid final fail state */
2113 fail[ 0 ] = fail[ 1 ] = 1;
2114
2115 for ( charid = 0; charid < ucharcount ; charid++ ) {
2116 const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
2117 if ( newstate ) {
2118 q[ q_write ] = newstate;
2119 /* set to point at the root */
2120 fail[ q[ q_write++ ] ]=1;
2121 }
2122 }
2123 while ( q_read < q_write) {
2124 const U32 cur = q[ q_read++ % numstates ];
2125 base = trie->states[ cur ].trans.base;
2126
2127 for ( charid = 0 ; charid < ucharcount ; charid++ ) {
2128 const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
2129 if (ch_state) {
2130 U32 fail_state = cur;
2131 U32 fail_base;
2132 do {
2133 fail_state = fail[ fail_state ];
2134 fail_base = aho->states[ fail_state ].trans.base;
2135 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
2136
2137 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
2138 fail[ ch_state ] = fail_state;
2139 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
2140 {
2141 aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum;
2142 }
2143 q[ q_write++ % numstates] = ch_state;
2144 }
2145 }
2146 }
2147 /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
2148 when we fail in state 1, this allows us to use the
2149 charclass scan to find a valid start char. This is based on the principle
2150 that theres a good chance the string being searched contains lots of stuff
2151 that cant be a start char.
2152 */
2153 fail[ 0 ] = fail[ 1 ] = 0;
2154 DEBUG_TRIE_COMPILE_r({
2155 PerlIO_printf(Perl_debug_log,
2156 "%*sStclass Failtable (%"UVuf" states): 0",
2157 (int)(depth * 2), "", (UV)numstates
2158 );
2159 for( q_read=1; q_read<numstates; q_read++ ) {
2160 PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
2161 }
2162 PerlIO_printf(Perl_debug_log, "\n");
2163 });
2164 Safefree(q);
2165 /*RExC_seen |= REG_SEEN_TRIEDFA;*/
2166}
2167
2168
2169/*
2170 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
2171 * These need to be revisited when a newer toolchain becomes available.
2172 */
2173#if defined(__sparc64__) && defined(__GNUC__)
2174# if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
2175# undef SPARC64_GCC_WORKAROUND
2176# define SPARC64_GCC_WORKAROUND 1
2177# endif
2178#endif
2179
2180#define DEBUG_PEEP(str,scan,depth) \
2181 DEBUG_OPTIMISE_r({if (scan){ \
2182 SV * const mysv=sv_newmortal(); \
2183 regnode *Next = regnext(scan); \
2184 regprop(RExC_rx, mysv, scan); \
2185 PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
2186 (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
2187 Next ? (REG_NODE_NUM(Next)) : 0 ); \
2188 }});
2189
2190
2191
2192
2193
2194#define JOIN_EXACT(scan,min,flags) \
2195 if (PL_regkind[OP(scan)] == EXACT) \
2196 join_exact(pRExC_state,(scan),(min),(flags),NULL,depth+1)
2197
2198STATIC U32
2199S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, I32 *min, U32 flags,regnode *val, U32 depth) {
2200 /* Merge several consecutive EXACTish nodes into one. */
2201 regnode *n = regnext(scan);
2202 U32 stringok = 1;
2203 regnode *next = scan + NODE_SZ_STR(scan);
2204 U32 merged = 0;
2205 U32 stopnow = 0;
2206#ifdef DEBUGGING
2207 regnode *stop = scan;
2208 GET_RE_DEBUG_FLAGS_DECL;
2209#else
2210 PERL_UNUSED_ARG(depth);
2211#endif
2212#ifndef EXPERIMENTAL_INPLACESCAN
2213 PERL_UNUSED_ARG(flags);
2214 PERL_UNUSED_ARG(val);
2215#endif
2216 DEBUG_PEEP("join",scan,depth);
2217
2218 /* Skip NOTHING, merge EXACT*. */
2219 while (n &&
2220 ( PL_regkind[OP(n)] == NOTHING ||
2221 (stringok && (OP(n) == OP(scan))))
2222 && NEXT_OFF(n)
2223 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
2224
2225 if (OP(n) == TAIL || n > next)
2226 stringok = 0;
2227 if (PL_regkind[OP(n)] == NOTHING) {
2228 DEBUG_PEEP("skip:",n,depth);
2229 NEXT_OFF(scan) += NEXT_OFF(n);
2230 next = n + NODE_STEP_REGNODE;
2231#ifdef DEBUGGING
2232 if (stringok)
2233 stop = n;
2234#endif
2235 n = regnext(n);
2236 }
2237 else if (stringok) {
2238 const unsigned int oldl = STR_LEN(scan);
2239 regnode * const nnext = regnext(n);
2240
2241 DEBUG_PEEP("merg",n,depth);
2242
2243 merged++;
2244 if (oldl + STR_LEN(n) > U8_MAX)
2245 break;
2246 NEXT_OFF(scan) += NEXT_OFF(n);
2247 STR_LEN(scan) += STR_LEN(n);
2248 next = n + NODE_SZ_STR(n);
2249 /* Now we can overwrite *n : */
2250 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
2251#ifdef DEBUGGING
2252 stop = next - 1;
2253#endif
2254 n = nnext;
2255 if (stopnow) break;
2256 }
2257
2258#ifdef EXPERIMENTAL_INPLACESCAN
2259 if (flags && !NEXT_OFF(n)) {
2260 DEBUG_PEEP("atch", val, depth);
2261 if (reg_off_by_arg[OP(n)]) {
2262 ARG_SET(n, val - n);
2263 }
2264 else {
2265 NEXT_OFF(n) = val - n;
2266 }
2267 stopnow = 1;
2268 }
2269#endif
2270 }
2271
2272 if (UTF && ( OP(scan) == EXACTF ) && ( STR_LEN(scan) >= 6 ) ) {
2273 /*
2274 Two problematic code points in Unicode casefolding of EXACT nodes:
2275
2276 U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
2277 U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
2278
2279 which casefold to
2280
2281 Unicode UTF-8
2282
2283 U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
2284 U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
2285
2286 This means that in case-insensitive matching (or "loose matching",
2287 as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
2288 length of the above casefolded versions) can match a target string
2289 of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
2290 This would rather mess up the minimum length computation.
2291
2292 What we'll do is to look for the tail four bytes, and then peek
2293 at the preceding two bytes to see whether we need to decrease
2294 the minimum length by four (six minus two).
2295
2296 Thanks to the design of UTF-8, there cannot be false matches:
2297 A sequence of valid UTF-8 bytes cannot be a subsequence of
2298 another valid sequence of UTF-8 bytes.
2299
2300 */
2301 char * const s0 = STRING(scan), *s, *t;
2302 char * const s1 = s0 + STR_LEN(scan) - 1;
2303 char * const s2 = s1 - 4;
2304#ifdef EBCDIC /* RD tunifold greek 0390 and 03B0 */
2305 const char t0[] = "\xaf\x49\xaf\x42";
2306#else
2307 const char t0[] = "\xcc\x88\xcc\x81";
2308#endif
2309 const char * const t1 = t0 + 3;
2310
2311 for (s = s0 + 2;
2312 s < s2 && (t = ninstr(s, s1, t0, t1));
2313 s = t + 4) {
2314#ifdef EBCDIC
2315 if (((U8)t[-1] == 0x68 && (U8)t[-2] == 0xB4) ||
2316 ((U8)t[-1] == 0x46 && (U8)t[-2] == 0xB5))
2317#else
2318 if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
2319 ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
2320#endif
2321 *min -= 4;
2322 }
2323 }
2324
2325#ifdef DEBUGGING
2326 /* Allow dumping */
2327 n = scan + NODE_SZ_STR(scan);
2328 while (n <= stop) {
2329 if (PL_regkind[OP(n)] != NOTHING || OP(n) == NOTHING) {
2330 OP(n) = OPTIMIZED;
2331 NEXT_OFF(n) = 0;
2332 }
2333 n++;
2334 }
2335#endif
2336 DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
2337 return stopnow;
2338}
2339
2340/* REx optimizer. Converts nodes into quickier variants "in place".
2341 Finds fixed substrings. */
2342
2343/* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
2344 to the position after last scanned or to NULL. */
2345
2346#define INIT_AND_WITHP \
2347 assert(!and_withp); \
2348 Newx(and_withp,1,struct regnode_charclass_class); \
2349 SAVEFREEPV(and_withp)
2350
2351/* this is a chain of data about sub patterns we are processing that
2352 need to be handled seperately/specially in study_chunk. Its so
2353 we can simulate recursion without losing state. */
2354struct scan_frame;
2355typedef struct scan_frame {
2356 regnode *last; /* last node to process in this frame */
2357 regnode *next; /* next node to process when last is reached */
2358 struct scan_frame *prev; /*previous frame*/
2359 I32 stop; /* what stopparen do we use */
2360} scan_frame;
2361
2362
2363#define SCAN_COMMIT(s, data, m) scan_commit(s, data, m, is_inf)
2364
2365STATIC I32
2366S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
2367 I32 *minlenp, I32 *deltap,
2368 regnode *last,
2369 scan_data_t *data,
2370 I32 stopparen,
2371 U8* recursed,
2372 struct regnode_charclass_class *and_withp,
2373 U32 flags, U32 depth)
2374 /* scanp: Start here (read-write). */
2375 /* deltap: Write maxlen-minlen here. */
2376 /* last: Stop before this one. */
2377 /* data: string data about the pattern */
2378 /* stopparen: treat close N as END */
2379 /* recursed: which subroutines have we recursed into */
2380 /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
2381{
2382 dVAR;
2383 I32 min = 0, pars = 0, code;
2384 regnode *scan = *scanp, *next;
2385 I32 delta = 0;
2386 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
2387 int is_inf_internal = 0; /* The studied chunk is infinite */
2388 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
2389 scan_data_t data_fake;
2390 SV *re_trie_maxbuff = NULL;
2391 regnode *first_non_open = scan;
2392 I32 stopmin = I32_MAX;
2393 scan_frame *frame = NULL;
2394
2395 GET_RE_DEBUG_FLAGS_DECL;
2396
2397#ifdef DEBUGGING
2398 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
2399#endif
2400
2401 if ( depth == 0 ) {
2402 while (first_non_open && OP(first_non_open) == OPEN)
2403 first_non_open=regnext(first_non_open);
2404 }
2405
2406
2407 fake_study_recurse:
2408 while ( scan && OP(scan) != END && scan < last ){
2409 /* Peephole optimizer: */
2410 DEBUG_STUDYDATA("Peep:", data,depth);
2411 DEBUG_PEEP("Peep",scan,depth);
2412 JOIN_EXACT(scan,&min,0);
2413
2414 /* Follow the next-chain of the current node and optimize
2415 away all the NOTHINGs from it. */
2416 if (OP(scan) != CURLYX) {
2417 const int max = (reg_off_by_arg[OP(scan)]
2418 ? I32_MAX
2419 /* I32 may be smaller than U16 on CRAYs! */
2420 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
2421 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
2422 int noff;
2423 regnode *n = scan;
2424
2425 /* Skip NOTHING and LONGJMP. */
2426 while ((n = regnext(n))
2427 && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
2428 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
2429 && off + noff < max)
2430 off += noff;
2431 if (reg_off_by_arg[OP(scan)])
2432 ARG(scan) = off;
2433 else
2434 NEXT_OFF(scan) = off;
2435 }
2436
2437
2438
2439 /* The principal pseudo-switch. Cannot be a switch, since we
2440 look into several different things. */
2441 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
2442 || OP(scan) == IFTHEN) {
2443 next = regnext(scan);
2444 code = OP(scan);
2445 /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
2446
2447 if (OP(next) == code || code == IFTHEN) {
2448 /* NOTE - There is similar code to this block below for handling
2449 TRIE nodes on a re-study. If you change stuff here check there
2450 too. */
2451 I32 max1 = 0, min1 = I32_MAX, num = 0;
2452 struct regnode_charclass_class accum;
2453 regnode * const startbranch=scan;
2454
2455 if (flags & SCF_DO_SUBSTR)
2456 SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
2457 if (flags & SCF_DO_STCLASS)
2458 cl_init_zero(pRExC_state, &accum);
2459
2460 while (OP(scan) == code) {
2461 I32 deltanext, minnext, f = 0, fake;
2462 struct regnode_charclass_class this_class;
2463
2464 num++;
2465 data_fake.flags = 0;
2466 if (data) {
2467 data_fake.whilem_c = data->whilem_c;
2468 data_fake.last_closep = data->last_closep;
2469 }
2470 else
2471 data_fake.last_closep = &fake;
2472
2473 data_fake.pos_delta = delta;
2474 next = regnext(scan);
2475 scan = NEXTOPER(scan);
2476 if (code != BRANCH)
2477 scan = NEXTOPER(scan);
2478 if (flags & SCF_DO_STCLASS) {
2479 cl_init(pRExC_state, &this_class);
2480 data_fake.start_class = &this_class;
2481 f = SCF_DO_STCLASS_AND;
2482 }
2483 if (flags & SCF_WHILEM_VISITED_POS)
2484 f |= SCF_WHILEM_VISITED_POS;
2485
2486 /* we suppose the run is continuous, last=next...*/
2487 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
2488 next, &data_fake,
2489 stopparen, recursed, NULL, f,depth+1);
2490 if (min1 > minnext)
2491 min1 = minnext;
2492 if (max1 < minnext + deltanext)
2493 max1 = minnext + deltanext;
2494 if (deltanext == I32_MAX)
2495 is_inf = is_inf_internal = 1;
2496 scan = next;
2497 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
2498 pars++;
2499 if (data_fake.flags & SCF_SEEN_ACCEPT) {
2500 if ( stopmin > minnext)
2501 stopmin = min + min1;
2502 flags &= ~SCF_DO_SUBSTR;
2503 if (data)
2504 data->flags |= SCF_SEEN_ACCEPT;
2505 }
2506 if (data) {
2507 if (data_fake.flags & SF_HAS_EVAL)
2508 data->flags |= SF_HAS_EVAL;
2509 data->whilem_c = data_fake.whilem_c;
2510 }
2511 if (flags & SCF_DO_STCLASS)
2512 cl_or(pRExC_state, &accum, &this_class);
2513 }
2514 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
2515 min1 = 0;
2516 if (flags & SCF_DO_SUBSTR) {
2517 data->pos_min += min1;
2518 data->pos_delta += max1 - min1;
2519 if (max1 != min1 || is_inf)
2520 data->longest = &(data->longest_float);
2521 }
2522 min += min1;
2523 delta += max1 - min1;
2524 if (flags & SCF_DO_STCLASS_OR) {
2525 cl_or(pRExC_state, data->start_class, &accum);
2526 if (min1) {
2527 cl_and(data->start_class, and_withp);
2528 flags &= ~SCF_DO_STCLASS;
2529 }
2530 }
2531 else if (flags & SCF_DO_STCLASS_AND) {
2532 if (min1) {
2533 cl_and(data->start_class, &accum);
2534 flags &= ~SCF_DO_STCLASS;
2535 }
2536 else {
2537 /* Switch to OR mode: cache the old value of
2538 * data->start_class */
2539 INIT_AND_WITHP;
2540 StructCopy(data->start_class, and_withp,
2541 struct regnode_charclass_class);
2542 flags &= ~SCF_DO_STCLASS_AND;
2543 StructCopy(&accum, data->start_class,
2544 struct regnode_charclass_class);
2545 flags |= SCF_DO_STCLASS_OR;
2546 data->start_class->flags |= ANYOF_EOS;
2547 }
2548 }
2549
2550 if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) {
2551 /* demq.
2552
2553 Assuming this was/is a branch we are dealing with: 'scan' now
2554 points at the item that follows the branch sequence, whatever
2555 it is. We now start at the beginning of the sequence and look
2556 for subsequences of
2557
2558 BRANCH->EXACT=>x1
2559 BRANCH->EXACT=>x2
2560 tail
2561
2562 which would be constructed from a pattern like /A|LIST|OF|WORDS/
2563
2564 If we can find such a subseqence we need to turn the first
2565 element into a trie and then add the subsequent branch exact
2566 strings to the trie.
2567
2568 We have two cases
2569
2570 1. patterns where the whole set of branch can be converted.
2571
2572 2. patterns where only a subset can be converted.
2573
2574 In case 1 we can replace the whole set with a single regop
2575 for the trie. In case 2 we need to keep the start and end
2576 branchs so
2577
2578 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
2579 becomes BRANCH TRIE; BRANCH X;
2580
2581 There is an additional case, that being where there is a
2582 common prefix, which gets split out into an EXACT like node
2583 preceding the TRIE node.
2584
2585 If x(1..n)==tail then we can do a simple trie, if not we make
2586 a "jump" trie, such that when we match the appropriate word
2587 we "jump" to the appopriate tail node. Essentailly we turn
2588 a nested if into a case structure of sorts.
2589
2590 */
2591
2592 int made=0;
2593 if (!re_trie_maxbuff) {
2594 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
2595 if (!SvIOK(re_trie_maxbuff))
2596 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2597 }
2598 if ( SvIV(re_trie_maxbuff)>=0 ) {
2599 regnode *cur;
2600 regnode *first = (regnode *)NULL;
2601 regnode *last = (regnode *)NULL;
2602 regnode *tail = scan;
2603 U8 optype = 0;
2604 U32 count=0;
2605
2606#ifdef DEBUGGING
2607 SV * const mysv = sv_newmortal(); /* for dumping */
2608#endif
2609 /* var tail is used because there may be a TAIL
2610 regop in the way. Ie, the exacts will point to the
2611 thing following the TAIL, but the last branch will
2612 point at the TAIL. So we advance tail. If we
2613 have nested (?:) we may have to move through several
2614 tails.
2615 */
2616
2617 while ( OP( tail ) == TAIL ) {
2618 /* this is the TAIL generated by (?:) */
2619 tail = regnext( tail );
2620 }
2621
2622
2623 DEBUG_OPTIMISE_r({
2624 regprop(RExC_rx, mysv, tail );
2625 PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
2626 (int)depth * 2 + 2, "",
2627 "Looking for TRIE'able sequences. Tail node is: ",
2628 SvPV_nolen_const( mysv )
2629 );
2630 });
2631
2632 /*
2633
2634 step through the branches, cur represents each
2635 branch, noper is the first thing to be matched
2636 as part of that branch and noper_next is the
2637 regnext() of that node. if noper is an EXACT
2638 and noper_next is the same as scan (our current
2639 position in the regex) then the EXACT branch is
2640 a possible optimization target. Once we have
2641 two or more consequetive such branches we can
2642 create a trie of the EXACT's contents and stich
2643 it in place. If the sequence represents all of
2644 the branches we eliminate the whole thing and
2645 replace it with a single TRIE. If it is a
2646 subsequence then we need to stitch it in. This
2647 means the first branch has to remain, and needs
2648 to be repointed at the item on the branch chain
2649 following the last branch optimized. This could
2650 be either a BRANCH, in which case the
2651 subsequence is internal, or it could be the
2652 item following the branch sequence in which
2653 case the subsequence is at the end.
2654
2655 */
2656
2657 /* dont use tail as the end marker for this traverse */
2658 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
2659 regnode * const noper = NEXTOPER( cur );
2660#if defined(DEBUGGING) || defined(NOJUMPTRIE)
2661 regnode * const noper_next = regnext( noper );
2662#endif
2663
2664 DEBUG_OPTIMISE_r({
2665 regprop(RExC_rx, mysv, cur);
2666 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
2667 (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
2668
2669 regprop(RExC_rx, mysv, noper);
2670 PerlIO_printf( Perl_debug_log, " -> %s",
2671 SvPV_nolen_const(mysv));
2672
2673 if ( noper_next ) {
2674 regprop(RExC_rx, mysv, noper_next );
2675 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
2676 SvPV_nolen_const(mysv));
2677 }
2678 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d)\n",
2679 REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur) );
2680 });
2681 if ( (((first && optype!=NOTHING) ? OP( noper ) == optype
2682 : PL_regkind[ OP( noper ) ] == EXACT )
2683 || OP(noper) == NOTHING )
2684#ifdef NOJUMPTRIE
2685 && noper_next == tail
2686#endif
2687 && count < U16_MAX)
2688 {
2689 count++;
2690 if ( !first || optype == NOTHING ) {
2691 if (!first) first = cur;
2692 optype = OP( noper );
2693 } else {
2694 last = cur;
2695 }
2696 } else {
2697 if ( last ) {
2698 make_trie( pRExC_state,
2699 startbranch, first, cur, tail, count,
2700 optype, depth+1 );
2701 }
2702 if ( PL_regkind[ OP( noper ) ] == EXACT
2703#ifdef NOJUMPTRIE
2704 && noper_next == tail
2705#endif
2706 ){
2707 count = 1;
2708 first = cur;
2709 optype = OP( noper );
2710 } else {
2711 count = 0;
2712 first = NULL;
2713 optype = 0;
2714 }
2715 last = NULL;
2716 }
2717 }
2718 DEBUG_OPTIMISE_r({
2719 regprop(RExC_rx, mysv, cur);
2720 PerlIO_printf( Perl_debug_log,
2721 "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
2722 "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
2723
2724 });
2725 if ( last ) {
2726 made= make_trie( pRExC_state, startbranch, first, scan, tail, count, optype, depth+1 );
2727#ifdef TRIE_STUDY_OPT
2728 if ( ((made == MADE_EXACT_TRIE &&
2729 startbranch == first)
2730 || ( first_non_open == first )) &&
2731 depth==0 ) {
2732 flags |= SCF_TRIE_RESTUDY;
2733 if ( startbranch == first
2734 && scan == tail )
2735 {
2736 RExC_seen &=~REG_TOP_LEVEL_BRANCHES;
2737 }
2738 }
2739#endif
2740 }
2741 }
2742
2743 } /* do trie */
2744
2745 }
2746 else if ( code == BRANCHJ ) { /* single branch is optimized. */
2747 scan = NEXTOPER(NEXTOPER(scan));
2748 } else /* single branch is optimized. */
2749 scan = NEXTOPER(scan);
2750 continue;
2751 } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
2752 scan_frame *newframe = NULL;
2753 I32 paren;
2754 regnode *start;
2755 regnode *end;
2756
2757 if (OP(scan) != SUSPEND) {
2758 /* set the pointer */
2759 if (OP(scan) == GOSUB) {
2760 paren = ARG(scan);
2761 RExC_recurse[ARG2L(scan)] = scan;
2762 start = RExC_open_parens[paren-1];
2763 end = RExC_close_parens[paren-1];
2764 } else {
2765 paren = 0;
2766 start = RExC_rxi->program + 1;
2767 end = RExC_opend;
2768 }
2769 if (!recursed) {
2770 Newxz(recursed, (((RExC_npar)>>3) +1), U8);
2771 SAVEFREEPV(recursed);
2772 }
2773 if (!PAREN_TEST(recursed,paren+1)) {
2774 PAREN_SET(recursed,paren+1);
2775 Newx(newframe,1,scan_frame);
2776 } else {
2777 if (flags & SCF_DO_SUBSTR) {
2778 SCAN_COMMIT(pRExC_state,data,minlenp);
2779 data->longest = &(data->longest_float);
2780 }
2781 is_inf = is_inf_internal = 1;
2782 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
2783 cl_anything(pRExC_state, data->start_class);
2784 flags &= ~SCF_DO_STCLASS;
2785 }
2786 } else {
2787 Newx(newframe,1,scan_frame);
2788 paren = stopparen;
2789 start = scan+2;
2790 end = regnext(scan);
2791 }
2792 if (newframe) {
2793 assert(start);
2794 assert(end);
2795 SAVEFREEPV(newframe);
2796 newframe->next = regnext(scan);
2797 newframe->last = last;
2798 newframe->stop = stopparen;
2799 newframe->prev = frame;
2800
2801 frame = newframe;
2802 scan = start;
2803 stopparen = paren;
2804 last = end;
2805
2806 continue;
2807 }
2808 }
2809 else if (OP(scan) == EXACT) {
2810 I32 l = STR_LEN(scan);
2811 UV uc;
2812 if (UTF) {
2813 const U8 * const s = (U8*)STRING(scan);
2814 l = utf8_length(s, s + l);
2815 uc = utf8_to_uvchr(s, NULL);
2816 } else {
2817 uc = *((U8*)STRING(scan));
2818 }
2819 min += l;
2820 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
2821 /* The code below prefers earlier match for fixed
2822 offset, later match for variable offset. */
2823 if (data->last_end == -1) { /* Update the start info. */
2824 data->last_start_min = data->pos_min;
2825 data->last_start_max = is_inf
2826 ? I32_MAX : data->pos_min + data->pos_delta;
2827 }
2828 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
2829 if (UTF)
2830 SvUTF8_on(data->last_found);
2831 {
2832 SV * const sv = data->last_found;
2833 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
2834 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2835 if (mg && mg->mg_len >= 0)
2836 mg->mg_len += utf8_length((U8*)STRING(scan),
2837 (U8*)STRING(scan)+STR_LEN(scan));
2838 }
2839 data->last_end = data->pos_min + l;
2840 data->pos_min += l; /* As in the first entry. */
2841 data->flags &= ~SF_BEFORE_EOL;
2842 }
2843 if (flags & SCF_DO_STCLASS_AND) {
2844 /* Check whether it is compatible with what we know already! */
2845 int compat = 1;
2846
2847 if (uc >= 0x100 ||
2848 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
2849 && !ANYOF_BITMAP_TEST(data->start_class, uc)
2850 && (!(data->start_class->flags & ANYOF_FOLD)
2851 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
2852 )
2853 compat = 0;
2854 ANYOF_CLASS_ZERO(data->start_class);
2855 ANYOF_BITMAP_ZERO(data->start_class);
2856 if (compat)
2857 ANYOF_BITMAP_SET(data->start_class, uc);
2858 data->start_class->flags &= ~ANYOF_EOS;
2859 if (uc < 0x100)
2860 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
2861 }
2862 else if (flags & SCF_DO_STCLASS_OR) {
2863 /* false positive possible if the class is case-folded */
2864 if (uc < 0x100)
2865 ANYOF_BITMAP_SET(data->start_class, uc);
2866 else
2867 data->start_class->flags |= ANYOF_UNICODE_ALL;
2868 data->start_class->flags &= ~ANYOF_EOS;
2869 cl_and(data->start_class, and_withp);
2870 }
2871 flags &= ~SCF_DO_STCLASS;
2872 }
2873 else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
2874 I32 l = STR_LEN(scan);
2875 UV uc = *((U8*)STRING(scan));
2876
2877 /* Search for fixed substrings supports EXACT only. */
2878 if (flags & SCF_DO_SUBSTR) {
2879 assert(data);
2880 SCAN_COMMIT(pRExC_state, data, minlenp);
2881 }
2882 if (UTF) {
2883 const U8 * const s = (U8 *)STRING(scan);
2884 l = utf8_length(s, s + l);
2885 uc = utf8_to_uvchr(s, NULL);
2886 }
2887 min += l;
2888 if (flags & SCF_DO_SUBSTR)
2889 data->pos_min += l;
2890 if (flags & SCF_DO_STCLASS_AND) {
2891 /* Check whether it is compatible with what we know already! */
2892 int compat = 1;
2893
2894 if (uc >= 0x100 ||
2895 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
2896 && !ANYOF_BITMAP_TEST(data->start_class, uc)
2897 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
2898 compat = 0;
2899 ANYOF_CLASS_ZERO(data->start_class);
2900 ANYOF_BITMAP_ZERO(data->start_class);
2901 if (compat) {
2902 ANYOF_BITMAP_SET(data->start_class, uc);
2903 data->start_class->flags &= ~ANYOF_EOS;
2904 data->start_class->flags |= ANYOF_FOLD;
2905 if (OP(scan) == EXACTFL)
2906 data->start_class->flags |= ANYOF_LOCALE;
2907 }
2908 }
2909 else if (flags & SCF_DO_STCLASS_OR) {
2910 if (data->start_class->flags & ANYOF_FOLD) {
2911 /* false positive possible if the class is case-folded.
2912 Assume that the locale settings are the same... */
2913 if (uc < 0x100)
2914 ANYOF_BITMAP_SET(data->start_class, uc);
2915 data->start_class->flags &= ~ANYOF_EOS;
2916 }
2917 cl_and(data->start_class, and_withp);
2918 }
2919 flags &= ~SCF_DO_STCLASS;
2920 }
2921 else if (strchr((const char*)PL_varies,OP(scan))) {
2922 I32 mincount, maxcount, minnext, deltanext, fl = 0;
2923 I32 f = flags, pos_before = 0;
2924 regnode * const oscan = scan;
2925 struct regnode_charclass_class this_class;
2926 struct regnode_charclass_class *oclass = NULL;
2927 I32 next_is_eval = 0;
2928
2929 switch (PL_regkind[OP(scan)]) {
2930 case WHILEM: /* End of (?:...)* . */
2931 scan = NEXTOPER(scan);
2932 goto finish;
2933 case PLUS:
2934 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
2935 next = NEXTOPER(scan);
2936 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
2937 mincount = 1;
2938 maxcount = REG_INFTY;
2939 next = regnext(scan);
2940 scan = NEXTOPER(scan);
2941 goto do_curly;
2942 }
2943 }
2944 if (flags & SCF_DO_SUBSTR)
2945 data->pos_min++;
2946 min++;
2947 /* Fall through. */
2948 case STAR:
2949 if (flags & SCF_DO_STCLASS) {
2950 mincount = 0;
2951 maxcount = REG_INFTY;
2952 next = regnext(scan);
2953 scan = NEXTOPER(scan);
2954 goto do_curly;
2955 }
2956 is_inf = is_inf_internal = 1;
2957 scan = regnext(scan);
2958 if (flags & SCF_DO_SUBSTR) {
2959 SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */
2960 data->longest = &(data->longest_float);
2961 }
2962 goto optimize_curly_tail;
2963 case CURLY:
2964 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
2965 && (scan->flags == stopparen))
2966 {
2967 mincount = 1;
2968 maxcount = 1;
2969 } else {
2970 mincount = ARG1(scan);
2971 maxcount = ARG2(scan);
2972 }
2973 next = regnext(scan);
2974 if (OP(scan) == CURLYX) {
2975 I32 lp = (data ? *(data->last_closep) : 0);
2976 scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
2977 }
2978 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
2979 next_is_eval = (OP(scan) == EVAL);
2980 do_curly:
2981 if (flags & SCF_DO_SUBSTR) {
2982 if (mincount == 0) SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */
2983 pos_before = data->pos_min;
2984 }
2985 if (data) {
2986 fl = data->flags;
2987 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
2988 if (is_inf)
2989 data->flags |= SF_IS_INF;
2990 }
2991 if (flags & SCF_DO_STCLASS) {
2992 cl_init(pRExC_state, &this_class);
2993 oclass = data->start_class;
2994 data->start_class = &this_class;
2995 f |= SCF_DO_STCLASS_AND;
2996 f &= ~SCF_DO_STCLASS_OR;
2997 }
2998 /* These are the cases when once a subexpression
2999 fails at a particular position, it cannot succeed
3000 even after backtracking at the enclosing scope.
3001
3002 XXXX what if minimal match and we are at the
3003 initial run of {n,m}? */
3004 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
3005 f &= ~SCF_WHILEM_VISITED_POS;
3006
3007 /* This will finish on WHILEM, setting scan, or on NULL: */
3008 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
3009 last, data, stopparen, recursed, NULL,
3010 (mincount == 0
3011 ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
3012
3013 if (flags & SCF_DO_STCLASS)
3014 data->start_class = oclass;
3015 if (mincount == 0 || minnext == 0) {
3016 if (flags & SCF_DO_STCLASS_OR) {
3017 cl_or(pRExC_state, data->start_class, &this_class);
3018 }
3019 else if (flags & SCF_DO_STCLASS_AND) {
3020 /* Switch to OR mode: cache the old value of
3021 * data->start_class */
3022 INIT_AND_WITHP;
3023 StructCopy(data->start_class, and_withp,
3024 struct regnode_charclass_class);
3025 flags &= ~SCF_DO_STCLASS_AND;
3026 StructCopy(&this_class, data->start_class,
3027 struct regnode_charclass_class);
3028 flags |= SCF_DO_STCLASS_OR;
3029 data->start_class->flags |= ANYOF_EOS;
3030 }
3031 } else { /* Non-zero len */
3032 if (flags & SCF_DO_STCLASS_OR) {
3033 cl_or(pRExC_state, data->start_class, &this_class);
3034 cl_and(data->start_class, and_withp);
3035 }
3036 else if (flags & SCF_DO_STCLASS_AND)
3037 cl_and(data->start_class, &this_class);
3038 flags &= ~SCF_DO_STCLASS;
3039 }
3040 if (!scan) /* It was not CURLYX, but CURLY. */
3041 scan = next;
3042 if ( /* ? quantifier ok, except for (?{ ... }) */
3043 (next_is_eval || !(mincount == 0 && maxcount == 1))
3044 && (minnext == 0) && (deltanext == 0)
3045 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
3046 && maxcount <= REG_INFTY/3 /* Complement check for big count */
3047 && ckWARN(WARN_REGEXP))
3048 {
3049 vWARN(RExC_parse,
3050 "Quantifier unexpected on zero-length expression");
3051 }
3052
3053 min += minnext * mincount;
3054 is_inf_internal |= ((maxcount == REG_INFTY
3055 && (minnext + deltanext) > 0)
3056 || deltanext == I32_MAX);
3057 is_inf |= is_inf_internal;
3058 delta += (minnext + deltanext) * maxcount - minnext * mincount;
3059
3060 /* Try powerful optimization CURLYX => CURLYN. */
3061 if ( OP(oscan) == CURLYX && data
3062 && data->flags & SF_IN_PAR
3063 && !(data->flags & SF_HAS_EVAL)
3064 && !deltanext && minnext == 1 ) {
3065 /* Try to optimize to CURLYN. */
3066 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
3067 regnode * const nxt1 = nxt;
3068#ifdef DEBUGGING
3069 regnode *nxt2;
3070#endif
3071
3072 /* Skip open. */
3073 nxt = regnext(nxt);
3074 if (!strchr((const char*)PL_simple,OP(nxt))
3075 && !(PL_regkind[OP(nxt)] == EXACT
3076 && STR_LEN(nxt) == 1))
3077 goto nogo;
3078#ifdef DEBUGGING
3079 nxt2 = nxt;
3080#endif
3081 nxt = regnext(nxt);
3082 if (OP(nxt) != CLOSE)
3083 goto nogo;
3084 if (RExC_open_parens) {
3085 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3086 RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
3087 }
3088 /* Now we know that nxt2 is the only contents: */
3089 oscan->flags = (U8)ARG(nxt);
3090 OP(oscan) = CURLYN;
3091 OP(nxt1) = NOTHING; /* was OPEN. */
3092
3093#ifdef DEBUGGING
3094 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3095 NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
3096 NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
3097 OP(nxt) = OPTIMIZED; /* was CLOSE. */
3098 OP(nxt + 1) = OPTIMIZED; /* was count. */
3099 NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
3100#endif
3101 }
3102 nogo:
3103
3104 /* Try optimization CURLYX => CURLYM. */
3105 if ( OP(oscan) == CURLYX && data
3106 && !(data->flags & SF_HAS_PAR)
3107 && !(data->flags & SF_HAS_EVAL)
3108 && !deltanext /* atom is fixed width */
3109 && minnext != 0 /* CURLYM can't handle zero width */
3110 ) {
3111 /* XXXX How to optimize if data == 0? */
3112 /* Optimize to a simpler form. */
3113 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
3114 regnode *nxt2;
3115
3116 OP(oscan) = CURLYM;
3117 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
3118 && (OP(nxt2) != WHILEM))
3119 nxt = nxt2;
3120 OP(nxt2) = SUCCEED; /* Whas WHILEM */
3121 /* Need to optimize away parenths. */
3122 if (data->flags & SF_IN_PAR) {
3123 /* Set the parenth number. */
3124 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
3125
3126 if (OP(nxt) != CLOSE)
3127 FAIL("Panic opt close");
3128 oscan->flags = (U8)ARG(nxt);
3129 if (RExC_open_parens) {
3130 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3131 RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
3132 }
3133 OP(nxt1) = OPTIMIZED; /* was OPEN. */
3134 OP(nxt) = OPTIMIZED; /* was CLOSE. */
3135
3136#ifdef DEBUGGING
3137 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3138 OP(nxt + 1) = OPTIMIZED; /* was count. */
3139 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
3140 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
3141#endif
3142#if 0
3143 while ( nxt1 && (OP(nxt1) != WHILEM)) {
3144 regnode *nnxt = regnext(nxt1);
3145
3146 if (nnxt == nxt) {
3147 if (reg_off_by_arg[OP(nxt1)])
3148 ARG_SET(nxt1, nxt2 - nxt1);
3149 else if (nxt2 - nxt1 < U16_MAX)
3150 NEXT_OFF(nxt1) = nxt2 - nxt1;
3151 else
3152 OP(nxt) = NOTHING; /* Cannot beautify */
3153 }
3154 nxt1 = nnxt;
3155 }
3156#endif
3157 /* Optimize again: */
3158 study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
3159 NULL, stopparen, recursed, NULL, 0,depth+1);
3160 }
3161 else
3162 oscan->flags = 0;
3163 }
3164 else if ((OP(oscan) == CURLYX)
3165 && (flags & SCF_WHILEM_VISITED_POS)
3166 /* See the comment on a similar expression above.
3167 However, this time it not a subexpression
3168 we care about, but the expression itself. */
3169 && (maxcount == REG_INFTY)
3170 && data && ++data->whilem_c < 16) {
3171 /* This stays as CURLYX, we can put the count/of pair. */
3172 /* Find WHILEM (as in regexec.c) */
3173 regnode *nxt = oscan + NEXT_OFF(oscan);
3174
3175 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
3176 nxt += ARG(nxt);
3177 PREVOPER(nxt)->flags = (U8)(data->whilem_c
3178 | (RExC_whilem_seen << 4)); /* On WHILEM */
3179 }
3180 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
3181 pars++;
3182 if (flags & SCF_DO_SUBSTR) {
3183 SV *last_str = NULL;
3184 int counted = mincount != 0;
3185
3186 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
3187#if defined(SPARC64_GCC_WORKAROUND)
3188 I32 b = 0;
3189 STRLEN l = 0;
3190 const char *s = NULL;
3191 I32 old = 0;
3192
3193 if (pos_before >= data->last_start_min)
3194 b = pos_before;
3195 else
3196 b = data->last_start_min;
3197
3198 l = 0;
3199 s = SvPV_const(data->last_found, l);
3200 old = b - data->last_start_min;
3201
3202#else
3203 I32 b = pos_before >= data->last_start_min
3204 ? pos_before : data->last_start_min;
3205 STRLEN l;
3206 const char * const s = SvPV_const(data->last_found, l);
3207 I32 old = b - data->last_start_min;
3208#endif
3209
3210 if (UTF)
3211 old = utf8_hop((U8*)s, old) - (U8*)s;
3212
3213 l -= old;
3214 /* Get the added string: */
3215 last_str = newSVpvn(s + old, l);
3216 if (UTF)
3217 SvUTF8_on(last_str);
3218 if (deltanext == 0 && pos_before == b) {
3219 /* What was added is a constant string */
3220 if (mincount > 1) {
3221 SvGROW(last_str, (mincount * l) + 1);
3222 repeatcpy(SvPVX(last_str) + l,
3223 SvPVX_const(last_str), l, mincount - 1);
3224 SvCUR_set(last_str, SvCUR(last_str) * mincount);
3225 /* Add additional parts. */
3226 SvCUR_set(data->last_found,
3227 SvCUR(data->last_found) - l);
3228 sv_catsv(data->last_found, last_str);
3229 {
3230 SV * sv = data->last_found;
3231 MAGIC *mg =
3232 SvUTF8(sv) && SvMAGICAL(sv) ?
3233 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3234 if (mg && mg->mg_len >= 0)
3235 mg->mg_len += CHR_SVLEN(last_str);
3236 }
3237 data->last_end += l * (mincount - 1);
3238 }
3239 } else {
3240 /* start offset must point into the last copy */
3241 data->last_start_min += minnext * (mincount - 1);
3242 data->last_start_max += is_inf ? I32_MAX
3243 : (maxcount - 1) * (minnext + data->pos_delta);
3244 }
3245 }
3246 /* It is counted once already... */
3247 data->pos_min += minnext * (mincount - counted);
3248 data->pos_delta += - counted * deltanext +
3249 (minnext + deltanext) * maxcount - minnext * mincount;
3250 if (mincount != maxcount) {
3251 /* Cannot extend fixed substrings found inside
3252 the group. */
3253 SCAN_COMMIT(pRExC_state,data,minlenp);
3254 if (mincount && last_str) {
3255 SV * const sv = data->last_found;
3256 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3257 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3258
3259 if (mg)
3260 mg->mg_len = -1;
3261 sv_setsv(sv, last_str);
3262 data->last_end = data->pos_min;
3263 data->last_start_min =
3264 data->pos_min - CHR_SVLEN(last_str);
3265 data->last_start_max = is_inf
3266 ? I32_MAX
3267 : data->pos_min + data->pos_delta
3268 - CHR_SVLEN(last_str);
3269 }
3270 data->longest = &(data->longest_float);
3271 }
3272 SvREFCNT_dec(last_str);
3273 }
3274 if (data && (fl & SF_HAS_EVAL))
3275 data->flags |= SF_HAS_EVAL;
3276 optimize_curly_tail:
3277 if (OP(oscan) != CURLYX) {
3278 while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
3279 && NEXT_OFF(next))
3280 NEXT_OFF(oscan) += NEXT_OFF(next);
3281 }
3282 continue;
3283 default: /* REF and CLUMP only? */
3284 if (flags & SCF_DO_SUBSTR) {
3285 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
3286 data->longest = &(data->longest_float);
3287 }
3288 is_inf = is_inf_internal = 1;
3289 if (flags & SCF_DO_STCLASS_OR)
3290 cl_anything(pRExC_state, data->start_class);
3291 flags &= ~SCF_DO_STCLASS;
3292 break;
3293 }
3294 }
3295 else if (strchr((const char*)PL_simple,OP(scan))) {
3296 int value = 0;
3297
3298 if (flags & SCF_DO_SUBSTR) {
3299 SCAN_COMMIT(pRExC_state,data,minlenp);
3300 data->pos_min++;
3301 }
3302 min++;
3303 if (flags & SCF_DO_STCLASS) {
3304 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
3305
3306 /* Some of the logic below assumes that switching
3307 locale on will only add false positives. */
3308 switch (PL_regkind[OP(scan)]) {
3309 case SANY:
3310 default:
3311 do_default:
3312 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
3313 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3314 cl_anything(pRExC_state, data->start_class);
3315 break;
3316 case REG_ANY:
3317 if (OP(scan) == SANY)
3318 goto do_default;
3319 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
3320 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
3321 || (data->start_class->flags & ANYOF_CLASS));
3322 cl_anything(pRExC_state, data->start_class);
3323 }
3324 if (flags & SCF_DO_STCLASS_AND || !value)
3325 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
3326 break;
3327 case ANYOF:
3328 if (flags & SCF_DO_STCLASS_AND)
3329 cl_and(data->start_class,
3330 (struct regnode_charclass_class*)scan);
3331 else
3332 cl_or(pRExC_state, data->start_class,
3333 (struct regnode_charclass_class*)scan);
3334 break;
3335 case ALNUM:
3336 if (flags & SCF_DO_STCLASS_AND) {
3337 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3338 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3339 for (value = 0; value < 256; value++)
3340 if (!isALNUM(value))
3341 ANYOF_BITMAP_CLEAR(data->start_class, value);
3342 }
3343 }
3344 else {
3345 if (data->start_class->flags & ANYOF_LOCALE)
3346 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3347 else {
3348 for (value = 0; value < 256; value++)
3349 if (isALNUM(value))
3350 ANYOF_BITMAP_SET(data->start_class, value);
3351 }
3352 }
3353 break;
3354 case ALNUML:
3355 if (flags & SCF_DO_STCLASS_AND) {
3356 if (data->start_class->flags & ANYOF_LOCALE)
3357 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3358 }
3359 else {
3360 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3361 data->start_class->flags |= ANYOF_LOCALE;
3362 }
3363 break;
3364 case NALNUM:
3365 if (flags & SCF_DO_STCLASS_AND) {
3366 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3367 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
3368 for (value = 0; value < 256; value++)
3369 if (isALNUM(value))
3370 ANYOF_BITMAP_CLEAR(data->start_class, value);
3371 }
3372 }
3373 else {
3374 if (data->start_class->flags & ANYOF_LOCALE)
3375 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3376 else {
3377 for (value = 0; value < 256; value++)
3378 if (!isALNUM(value))
3379 ANYOF_BITMAP_SET(data->start_class, value);
3380 }
3381 }
3382 break;
3383 case NALNUML:
3384 if (flags & SCF_DO_STCLASS_AND) {
3385 if (data->start_class->flags & ANYOF_LOCALE)
3386 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
3387 }
3388 else {
3389 data->start_class->flags |= ANYOF_LOCALE;
3390 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3391 }
3392 break;
3393 case SPACE:
3394 if (flags & SCF_DO_STCLASS_AND) {
3395 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3396 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3397 for (value = 0; value < 256; value++)
3398 if (!isSPACE(value))
3399 ANYOF_BITMAP_CLEAR(data->start_class, value);
3400 }
3401 }
3402 else {
3403 if (data->start_class->flags & ANYOF_LOCALE)
3404 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3405 else {
3406 for (value = 0; value < 256; value++)
3407 if (isSPACE(value))
3408 ANYOF_BITMAP_SET(data->start_class, value);
3409 }
3410 }
3411 break;
3412 case SPACEL:
3413 if (flags & SCF_DO_STCLASS_AND) {
3414 if (data->start_class->flags & ANYOF_LOCALE)
3415 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3416 }
3417 else {
3418 data->start_class->flags |= ANYOF_LOCALE;
3419 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3420 }
3421 break;
3422 case NSPACE:
3423 if (flags & SCF_DO_STCLASS_AND) {
3424 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3425 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3426 for (value = 0; value < 256; value++)
3427 if (isSPACE(value))
3428 ANYOF_BITMAP_CLEAR(data->start_class, value);
3429 }
3430 }
3431 else {
3432 if (data->start_class->flags & ANYOF_LOCALE)
3433 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3434 else {
3435 for (value = 0; value < 256; value++)
3436 if (!isSPACE(value))
3437 ANYOF_BITMAP_SET(data->start_class, value);
3438 }
3439 }
3440 break;
3441 case NSPACEL:
3442 if (flags & SCF_DO_STCLASS_AND) {
3443 if (data->start_class->flags & ANYOF_LOCALE) {
3444 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3445 for (value = 0; value < 256; value++)
3446 if (!isSPACE(value))
3447 ANYOF_BITMAP_CLEAR(data->start_class, value);
3448 }
3449 }
3450 else {
3451 data->start_class->flags |= ANYOF_LOCALE;
3452 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3453 }
3454 break;
3455 case DIGIT:
3456 if (flags & SCF_DO_STCLASS_AND) {
3457 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
3458 for (value = 0; value < 256; value++)
3459 if (!isDIGIT(value))
3460 ANYOF_BITMAP_CLEAR(data->start_class, value);
3461 }
3462 else {
3463 if (data->start_class->flags & ANYOF_LOCALE)
3464 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
3465 else {
3466 for (value = 0; value < 256; value++)
3467 if (isDIGIT(value))
3468 ANYOF_BITMAP_SET(data->start_class, value);
3469 }
3470 }
3471 break;
3472 case NDIGIT:
3473 if (flags & SCF_DO_STCLASS_AND) {
3474 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
3475 for (value = 0; value < 256; value++)
3476 if (isDIGIT(value))
3477 ANYOF_BITMAP_CLEAR(data->start_class, value);
3478 }
3479 else {
3480 if (data->start_class->flags & ANYOF_LOCALE)
3481 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
3482 else {
3483 for (value = 0; value < 256; value++)
3484 if (!isDIGIT(value))
3485 ANYOF_BITMAP_SET(data->start_class, value);
3486 }
3487 }
3488 break;
3489 }
3490 if (flags & SCF_DO_STCLASS_OR)
3491 cl_and(data->start_class, and_withp);
3492 flags &= ~SCF_DO_STCLASS;
3493 }
3494 }
3495 else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
3496 data->flags |= (OP(scan) == MEOL
3497 ? SF_BEFORE_MEOL
3498 : SF_BEFORE_SEOL);
3499 }
3500 else if ( PL_regkind[OP(scan)] == BRANCHJ
3501 /* Lookbehind, or need to calculate parens/evals/stclass: */
3502 && (scan->flags || data || (flags & SCF_DO_STCLASS))
3503 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
3504 if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3505 || OP(scan) == UNLESSM )
3506 {
3507 /* Negative Lookahead/lookbehind
3508 In this case we can't do fixed string optimisation.
3509 */
3510
3511 I32 deltanext, minnext, fake = 0;
3512 regnode *nscan;
3513 struct regnode_charclass_class intrnl;
3514 int f = 0;
3515
3516 data_fake.flags = 0;
3517 if (data) {
3518 data_fake.whilem_c = data->whilem_c;
3519 data_fake.last_closep = data->last_closep;
3520 }
3521 else
3522 data_fake.last_closep = &fake;
3523 data_fake.pos_delta = delta;
3524 if ( flags & SCF_DO_STCLASS && !scan->flags
3525 && OP(scan) == IFMATCH ) { /* Lookahead */
3526 cl_init(pRExC_state, &intrnl);
3527 data_fake.start_class = &intrnl;
3528 f |= SCF_DO_STCLASS_AND;
3529 }
3530 if (flags & SCF_WHILEM_VISITED_POS)
3531 f |= SCF_WHILEM_VISITED_POS;
3532 next = regnext(scan);
3533 nscan = NEXTOPER(NEXTOPER(scan));
3534 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
3535 last, &data_fake, stopparen, recursed, NULL, f, depth+1);
3536 if (scan->flags) {
3537 if (deltanext) {
3538 FAIL("Variable length lookbehind not implemented");
3539 }
3540 else if (minnext > (I32)U8_MAX) {
3541 FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
3542 }
3543 scan->flags = (U8)minnext;
3544 }
3545 if (data) {
3546 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3547 pars++;
3548 if (data_fake.flags & SF_HAS_EVAL)
3549 data->flags |= SF_HAS_EVAL;
3550 data->whilem_c = data_fake.whilem_c;
3551 }
3552 if (f & SCF_DO_STCLASS_AND) {
3553 const int was = (data->start_class->flags & ANYOF_EOS);
3554
3555 cl_and(data->start_class, &intrnl);
3556 if (was)
3557 data->start_class->flags |= ANYOF_EOS;
3558 }
3559 }
3560#if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3561 else {
3562 /* Positive Lookahead/lookbehind
3563 In this case we can do fixed string optimisation,
3564 but we must be careful about it. Note in the case of
3565 lookbehind the positions will be offset by the minimum
3566 length of the pattern, something we won't know about
3567 until after the recurse.
3568 */
3569 I32 deltanext, fake = 0;
3570 regnode *nscan;
3571 struct regnode_charclass_class intrnl;
3572 int f = 0;
3573 /* We use SAVEFREEPV so that when the full compile
3574 is finished perl will clean up the allocated
3575 minlens when its all done. This was we don't
3576 have to worry about freeing them when we know
3577 they wont be used, which would be a pain.
3578 */
3579 I32 *minnextp;
3580 Newx( minnextp, 1, I32 );
3581 SAVEFREEPV(minnextp);
3582
3583 if (data) {
3584 StructCopy(data, &data_fake, scan_data_t);
3585 if ((flags & SCF_DO_SUBSTR) && data->last_found) {
3586 f |= SCF_DO_SUBSTR;
3587 if (scan->flags)
3588 SCAN_COMMIT(pRExC_state, &data_fake,minlenp);
3589 data_fake.last_found=newSVsv(data->last_found);
3590 }
3591 }
3592 else
3593 data_fake.last_closep = &fake;
3594 data_fake.flags = 0;
3595 data_fake.pos_delta = delta;
3596 if (is_inf)
3597 data_fake.flags |= SF_IS_INF;
3598 if ( flags & SCF_DO_STCLASS && !scan->flags
3599 && OP(scan) == IFMATCH ) { /* Lookahead */
3600 cl_init(pRExC_state, &intrnl);
3601 data_fake.start_class = &intrnl;
3602 f |= SCF_DO_STCLASS_AND;
3603 }
3604 if (flags & SCF_WHILEM_VISITED_POS)
3605 f |= SCF_WHILEM_VISITED_POS;
3606 next = regnext(scan);
3607 nscan = NEXTOPER(NEXTOPER(scan));
3608
3609 *minnextp = study_chunk(pRExC_state, &nscan, minnextp, &deltanext,
3610 last, &data_fake, stopparen, recursed, NULL, f,depth+1);
3611 if (scan->flags) {
3612 if (deltanext) {
3613 FAIL("Variable length lookbehind not implemented");
3614 }
3615 else if (*minnextp > (I32)U8_MAX) {
3616 FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
3617 }
3618 scan->flags = (U8)*minnextp;
3619 }
3620
3621 *minnextp += min;
3622
3623 if (f & SCF_DO_STCLASS_AND) {
3624 const int was = (data->start_class->flags & ANYOF_EOS);
3625
3626 cl_and(data->start_class, &intrnl);
3627 if (was)
3628 data->start_class->flags |= ANYOF_EOS;
3629 }
3630 if (data) {
3631 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3632 pars++;
3633 if (data_fake.flags & SF_HAS_EVAL)
3634 data->flags |= SF_HAS_EVAL;
3635 data->whilem_c = data_fake.whilem_c;
3636 if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
3637 if (RExC_rx->minlen<*minnextp)
3638 RExC_rx->minlen=*minnextp;
3639 SCAN_COMMIT(pRExC_state, &data_fake, minnextp);
3640 SvREFCNT_dec(data_fake.last_found);
3641
3642 if ( data_fake.minlen_fixed != minlenp )
3643 {
3644 data->offset_fixed= data_fake.offset_fixed;
3645 data->minlen_fixed= data_fake.minlen_fixed;
3646 data->lookbehind_fixed+= scan->flags;
3647 }
3648 if ( data_fake.minlen_float != minlenp )
3649 {
3650 data->minlen_float= data_fake.minlen_float;
3651 data->offset_float_min=data_fake.offset_float_min;
3652 data->offset_float_max=data_fake.offset_float_max;
3653 data->lookbehind_float+= scan->flags;
3654 }
3655 }
3656 }
3657
3658
3659 }
3660#endif
3661 }
3662 else if (OP(scan) == OPEN) {
3663 if (stopparen != (I32)ARG(scan))
3664 pars++;
3665 }
3666 else if (OP(scan) == CLOSE) {
3667 if (stopparen == (I32)ARG(scan)) {
3668 break;
3669 }
3670 if ((I32)ARG(scan) == is_par) {
3671 next = regnext(scan);
3672
3673 if ( next && (OP(next) != WHILEM) && next < last)
3674 is_par = 0; /* Disable optimization */
3675 }
3676 if (data)
3677 *(data->last_closep) = ARG(scan);
3678 }
3679 else if (OP(scan) == EVAL) {
3680 if (data)
3681 data->flags |= SF_HAS_EVAL;
3682 }
3683 else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
3684 if (flags & SCF_DO_SUBSTR) {
3685 SCAN_COMMIT(pRExC_state,data,minlenp);
3686 flags &= ~SCF_DO_SUBSTR;
3687 }
3688 if (data && OP(scan)==ACCEPT) {
3689 data->flags |= SCF_SEEN_ACCEPT;
3690 if (stopmin > min)
3691 stopmin = min;
3692 }
3693 }
3694 else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
3695 {
3696 if (flags & SCF_DO_SUBSTR) {
3697 SCAN_COMMIT(pRExC_state,data,minlenp);
3698 data->longest = &(data->longest_float);
3699 }
3700 is_inf = is_inf_internal = 1;
3701 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3702 cl_anything(pRExC_state, data->start_class);
3703 flags &= ~SCF_DO_STCLASS;
3704 }
3705 else if (OP(scan) == GPOS) {
3706 if (!(RExC_rx->extflags & RXf_GPOS_FLOAT) &&
3707 !(delta || is_inf || (data && data->pos_delta)))
3708 {
3709 if (!(RExC_rx->extflags & RXf_ANCH) && (flags & SCF_DO_SUBSTR))
3710 RExC_rx->extflags |= RXf_ANCH_GPOS;
3711 if (RExC_rx->gofs < (U32)min)
3712 RExC_rx->gofs = min;
3713 } else {
3714 RExC_rx->extflags |= RXf_GPOS_FLOAT;
3715 RExC_rx->gofs = 0;
3716 }
3717 }
3718#ifdef TRIE_STUDY_OPT
3719#ifdef FULL_TRIE_STUDY
3720 else if (PL_regkind[OP(scan)] == TRIE) {
3721 /* NOTE - There is similar code to this block above for handling
3722 BRANCH nodes on the initial study. If you change stuff here
3723 check there too. */
3724 regnode *trie_node= scan;
3725 regnode *tail= regnext(scan);
3726 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
3727 I32 max1 = 0, min1 = I32_MAX;
3728 struct regnode_charclass_class accum;
3729
3730 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
3731 SCAN_COMMIT(pRExC_state, data,minlenp); /* Cannot merge strings after this. */
3732 if (flags & SCF_DO_STCLASS)
3733 cl_init_zero(pRExC_state, &accum);
3734
3735 if (!trie->jump) {
3736 min1= trie->minlen;
3737 max1= trie->maxlen;
3738 } else {
3739 const regnode *nextbranch= NULL;
3740 U32 word;
3741
3742 for ( word=1 ; word <= trie->wordcount ; word++)
3743 {
3744 I32 deltanext=0, minnext=0, f = 0, fake;
3745 struct regnode_charclass_class this_class;
3746
3747 data_fake.flags = 0;
3748 if (data) {
3749 data_fake.whilem_c = data->whilem_c;
3750 data_fake.last_closep = data->last_closep;
3751 }
3752 else
3753 data_fake.last_closep = &fake;
3754 data_fake.pos_delta = delta;
3755 if (flags & SCF_DO_STCLASS) {
3756 cl_init(pRExC_state, &this_class);
3757 data_fake.start_class = &this_class;
3758 f = SCF_DO_STCLASS_AND;
3759 }
3760 if (flags & SCF_WHILEM_VISITED_POS)
3761 f |= SCF_WHILEM_VISITED_POS;
3762
3763 if (trie->jump[word]) {
3764 if (!nextbranch)
3765 nextbranch = trie_node + trie->jump[0];
3766 scan= trie_node + trie->jump[word];
3767 /* We go from the jump point to the branch that follows
3768 it. Note this means we need the vestigal unused branches
3769 even though they arent otherwise used.
3770 */
3771 minnext = study_chunk(pRExC_state, &scan, minlenp,
3772 &deltanext, (regnode *)nextbranch, &data_fake,
3773 stopparen, recursed, NULL, f,depth+1);
3774 }
3775 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
3776 nextbranch= regnext((regnode*)nextbranch);
3777
3778 if (min1 > (I32)(minnext + trie->minlen))
3779 min1 = minnext + trie->minlen;
3780 if (max1 < (I32)(minnext + deltanext + trie->maxlen))
3781 max1 = minnext + deltanext + trie->maxlen;
3782 if (deltanext == I32_MAX)
3783 is_inf = is_inf_internal = 1;
3784
3785 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3786 pars++;
3787 if (data_fake.flags & SCF_SEEN_ACCEPT) {
3788 if ( stopmin > min + min1)
3789 stopmin = min + min1;
3790 flags &= ~SCF_DO_SUBSTR;
3791 if (data)
3792 data->flags |= SCF_SEEN_ACCEPT;
3793 }
3794 if (data) {
3795 if (data_fake.flags & SF_HAS_EVAL)
3796 data->flags |= SF_HAS_EVAL;
3797 data->whilem_c = data_fake.whilem_c;
3798 }
3799 if (flags & SCF_DO_STCLASS)
3800 cl_or(pRExC_state, &accum, &this_class);
3801 }
3802 }
3803 if (flags & SCF_DO_SUBSTR) {
3804 data->pos_min += min1;
3805 data->pos_delta += max1 - min1;
3806 if (max1 != min1 || is_inf)
3807 data->longest = &(data->longest_float);
3808 }
3809 min += min1;
3810 delta += max1 - min1;
3811 if (flags & SCF_DO_STCLASS_OR) {
3812 cl_or(pRExC_state, data->start_class, &accum);
3813 if (min1) {
3814 cl_and(data->start_class, and_withp);
3815 flags &= ~SCF_DO_STCLASS;
3816 }
3817 }
3818 else if (flags & SCF_DO_STCLASS_AND) {
3819 if (min1) {
3820 cl_and(data->start_class, &accum);
3821 flags &= ~SCF_DO_STCLASS;
3822 }
3823 else {
3824 /* Switch to OR mode: cache the old value of
3825 * data->start_class */
3826 INIT_AND_WITHP;
3827 StructCopy(data->start_class, and_withp,
3828 struct regnode_charclass_class);
3829 flags &= ~SCF_DO_STCLASS_AND;
3830 StructCopy(&accum, data->start_class,
3831 struct regnode_charclass_class);
3832 flags |= SCF_DO_STCLASS_OR;
3833 data->start_class->flags |= ANYOF_EOS;
3834 }
3835 }
3836 scan= tail;
3837 continue;
3838 }
3839#else
3840 else if (PL_regkind[OP(scan)] == TRIE) {
3841 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
3842 U8*bang=NULL;
3843
3844 min += trie->minlen;
3845 delta += (trie->maxlen - trie->minlen);
3846 flags &= ~SCF_DO_STCLASS; /* xxx */
3847 if (flags & SCF_DO_SUBSTR) {
3848 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
3849 data->pos_min += trie->minlen;
3850 data->pos_delta += (trie->maxlen - trie->minlen);
3851 if (trie->maxlen != trie->minlen)
3852 data->longest = &(data->longest_float);
3853 }
3854 if (trie->jump) /* no more substrings -- for now /grr*/
3855 flags &= ~SCF_DO_SUBSTR;
3856 }
3857#endif /* old or new */
3858#endif /* TRIE_STUDY_OPT */
3859 /* Else: zero-length, ignore. */
3860 scan = regnext(scan);
3861 }
3862 if (frame) {
3863 last = frame->last;
3864 scan = frame->next;
3865 stopparen = frame->stop;
3866 frame = frame->prev;
3867 goto fake_study_recurse;
3868 }
3869
3870 finish:
3871 assert(!frame);
3872 DEBUG_STUDYDATA("pre-fin:",data,depth);
3873
3874 *scanp = scan;
3875 *deltap = is_inf_internal ? I32_MAX : delta;
3876 if (flags & SCF_DO_SUBSTR && is_inf)
3877 data->pos_delta = I32_MAX - data->pos_min;
3878 if (is_par > (I32)U8_MAX)
3879 is_par = 0;
3880 if (is_par && pars==1 && data) {
3881 data->flags |= SF_IN_PAR;
3882 data->flags &= ~SF_HAS_PAR;
3883 }
3884 else if (pars && data) {
3885 data->flags |= SF_HAS_PAR;
3886 data->flags &= ~SF_IN_PAR;
3887 }
3888 if (flags & SCF_DO_STCLASS_OR)
3889 cl_and(data->start_class, and_withp);
3890 if (flags & SCF_TRIE_RESTUDY)
3891 data->flags |= SCF_TRIE_RESTUDY;
3892
3893 DEBUG_STUDYDATA("post-fin:",data,depth);
3894
3895 return min < stopmin ? min : stopmin;
3896}
3897
3898STATIC U32
3899S_add_data(RExC_state_t *pRExC_state, U32 n, const char *s)
3900{
3901 U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0;
3902
3903 Renewc(RExC_rxi->data,
3904 sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1),
3905 char, struct reg_data);
3906 if(count)
3907 Renew(RExC_rxi->data->what, count + n, U8);
3908 else
3909 Newx(RExC_rxi->data->what, n, U8);
3910 RExC_rxi->data->count = count + n;
3911 Copy(s, RExC_rxi->data->what + count, n, U8);
3912 return count;
3913}
3914
3915/*XXX: todo make this not included in a non debugging perl */
3916#ifndef PERL_IN_XSUB_RE
3917void
3918Perl_reginitcolors(pTHX)
3919{
3920 dVAR;
3921 const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
3922 if (s) {
3923 char *t = savepv(s);
3924 int i = 0;
3925 PL_colors[0] = t;
3926 while (++i < 6) {
3927 t = strchr(t, '\t');
3928 if (t) {
3929 *t = '\0';
3930 PL_colors[i] = ++t;
3931 }
3932 else
3933 PL_colors[i] = t = (char *)"";
3934 }
3935 } else {
3936 int i = 0;
3937 while (i < 6)
3938 PL_colors[i++] = (char *)"";
3939 }
3940 PL_colorset = 1;
3941}
3942#endif
3943
3944
3945#ifdef TRIE_STUDY_OPT
3946#define CHECK_RESTUDY_GOTO \
3947 if ( \
3948 (data.flags & SCF_TRIE_RESTUDY) \
3949 && ! restudied++ \
3950 ) goto reStudy
3951#else
3952#define CHECK_RESTUDY_GOTO
3953#endif
3954
3955/*
3956 - pregcomp - compile a regular expression into internal code
3957 *
3958 * We can't allocate space until we know how big the compiled form will be,
3959 * but we can't compile it (and thus know how big it is) until we've got a
3960 * place to put the code. So we cheat: we compile it twice, once with code
3961 * generation turned off and size counting turned on, and once "for real".
3962 * This also means that we don't allocate space until we are sure that the
3963 * thing really will compile successfully, and we never have to move the
3964 * code and thus invalidate pointers into it. (Note that it has to be in
3965 * one piece because free() must be able to free it all.) [NB: not true in perl]
3966 *
3967 * Beware that the optimization-preparation code in here knows about some
3968 * of the structure of the compiled regexp. [I'll say.]
3969 */
3970
3971
3972
3973#ifndef PERL_IN_XSUB_RE
3974#define RE_ENGINE_PTR &PL_core_reg_engine
3975#else
3976extern const struct regexp_engine my_reg_engine;
3977#define RE_ENGINE_PTR &my_reg_engine
3978#endif
3979
3980#ifndef PERL_IN_XSUB_RE
3981regexp *
3982Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
3983{
3984 dVAR;
3985 HV * const table = GvHV(PL_hintgv);
3986 /* Dispatch a request to compile a regexp to correct
3987 regexp engine. */
3988 if (table) {
3989 SV **ptr= hv_fetchs(table, "regcomp", FALSE);
3990 GET_RE_DEBUG_FLAGS_DECL;
3991 if (ptr && SvIOK(*ptr) && SvIV(*ptr)) {
3992 const regexp_engine *eng=INT2PTR(regexp_engine*,SvIV(*ptr));
3993 DEBUG_COMPILE_r({
3994 PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
3995 SvIV(*ptr));
3996 });
3997 return CALLREGCOMP_ENG(eng, exp, xend, pm);
3998 }
3999 }
4000 return Perl_re_compile(aTHX_ exp, xend, pm);
4001}
4002#endif
4003
4004regexp *
4005Perl_re_compile(pTHX_ char *exp, char *xend, PMOP *pm)
4006{
4007 dVAR;
4008 register regexp *r;
4009 register regexp_internal *ri;
4010 regnode *scan;
4011 regnode *first;
4012 I32 flags;
4013 I32 minlen = 0;
4014 I32 sawplus = 0;
4015 I32 sawopen = 0;
4016 scan_data_t data;
4017 RExC_state_t RExC_state;
4018 RExC_state_t * const pRExC_state = &RExC_state;
4019#ifdef TRIE_STUDY_OPT
4020 int restudied= 0;
4021 RExC_state_t copyRExC_state;
4022#endif
4023 GET_RE_DEBUG_FLAGS_DECL;
4024 DEBUG_r(if (!PL_colorset) reginitcolors());
4025
4026 if (exp == NULL)
4027 FAIL("NULL regexp argument");
4028
4029 RExC_utf8 = pm->op_pmdynflags & PMdf_CMP_UTF8;
4030
4031 RExC_precomp = exp;
4032 DEBUG_COMPILE_r({
4033 SV *dsv= sv_newmortal();
4034 RE_PV_QUOTED_DECL(s, RExC_utf8,
4035 dsv, RExC_precomp, (xend - exp), 60);
4036 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
4037 PL_colors[4],PL_colors[5],s);
4038 });
4039 RExC_flags = pm->op_pmflags;
4040 RExC_sawback = 0;
4041
4042 RExC_seen = 0;
4043 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
4044 RExC_seen_evals = 0;
4045 RExC_extralen = 0;
4046
4047 /* First pass: determine size, legality. */
4048 RExC_parse = exp;
4049 RExC_start = exp;
4050 RExC_end = xend;
4051 RExC_naughty = 0;
4052 RExC_npar = 1;
4053 RExC_nestroot = 0;
4054 RExC_size = 0L;
4055 RExC_emit = &PL_regdummy;
4056 RExC_whilem_seen = 0;
4057 RExC_charnames = NULL;
4058 RExC_open_parens = NULL;
4059 RExC_close_parens = NULL;
4060 RExC_opend = NULL;
4061 RExC_paren_names = NULL;
4062#ifdef DEBUGGING
4063 RExC_paren_name_list = NULL;
4064#endif
4065 RExC_recurse = NULL;
4066 RExC_recurse_count = 0;
4067
4068#if 0 /* REGC() is (currently) a NOP at the first pass.
4069 * Clever compilers notice this and complain. --jhi */
4070 REGC((U8)REG_MAGIC, (char*)RExC_emit);
4071#endif
4072 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n"));
4073 if (reg(pRExC_state, 0, &flags,1) == NULL) {
4074 RExC_precomp = NULL;
4075 return(NULL);
4076 }
4077 DEBUG_PARSE_r({
4078 PerlIO_printf(Perl_debug_log,
4079 "Required size %"IVdf" nodes\n"
4080 "Starting second pass (creation)\n",
4081 (IV)RExC_size);
4082 RExC_lastnum=0;
4083 RExC_lastparse=NULL;
4084 });
4085 /* Small enough for pointer-storage convention?
4086 If extralen==0, this means that we will not need long jumps. */
4087 if (RExC_size >= 0x10000L && RExC_extralen)
4088 RExC_size += RExC_extralen;
4089 else
4090 RExC_extralen = 0;
4091 if (RExC_whilem_seen > 15)
4092 RExC_whilem_seen = 15;
4093
4094#ifdef DEBUGGING
4095 /* Make room for a sentinel value at the end of the program */
4096 RExC_size++;
4097#endif
4098
4099 /* Allocate space and zero-initialize. Note, the two step process
4100 of zeroing when in debug mode, thus anything assigned has to
4101 happen after that */
4102 Newxz(r, 1, regexp);
4103 Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
4104 char, regexp_internal);
4105 if ( r == NULL || ri == NULL )
4106 FAIL("Regexp out of space");
4107#ifdef DEBUGGING
4108 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
4109 Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode), char);
4110#else
4111 /* bulk initialize base fields with 0. */
4112 Zero(ri, sizeof(regexp_internal), char);
4113#endif
4114
4115 /* non-zero initialization begins here */
4116 RXi_SET( r, ri );
4117 r->engine= RE_ENGINE_PTR;
4118 r->refcnt = 1;
4119 r->prelen = xend - exp;
4120 r->precomp = savepvn(RExC_precomp, r->prelen);
4121 r->extflags = pm->op_pmflags & RXf_PMf_COMPILETIME;
4122 r->intflags = 0;
4123 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
4124
4125 if (RExC_seen & REG_SEEN_RECURSE) {
4126 Newxz(RExC_open_parens, RExC_npar,regnode *);
4127 SAVEFREEPV(RExC_open_parens);
4128 Newxz(RExC_close_parens,RExC_npar,regnode *);
4129 SAVEFREEPV(RExC_close_parens);
4130 }
4131
4132 /* Useful during FAIL. */
4133 Newxz(ri->offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
4134 if (ri->offsets) {
4135 ri->offsets[0] = RExC_size;
4136 }
4137 DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
4138 "%s %"UVuf" bytes for offset annotations.\n",
4139 ri->offsets ? "Got" : "Couldn't get",
4140 (UV)((2*RExC_size+1) * sizeof(U32))));
4141
4142 RExC_rx = r;
4143 RExC_rxi = ri;
4144
4145 /* Second pass: emit code. */
4146 RExC_flags = pm->op_pmflags; /* don't let top level (?i) bleed */
4147 RExC_parse = exp;
4148 RExC_end = xend;
4149 RExC_naughty = 0;
4150 RExC_npar = 1;
4151 RExC_emit_start = ri->program;
4152 RExC_emit = ri->program;
4153#ifdef DEBUGGING
4154 /* put a sentinal on the end of the program so we can check for
4155 overwrites */
4156 ri->program[RExC_size].type = 255;
4157#endif
4158 /* Store the count of eval-groups for security checks: */
4159 RExC_rx->seen_evals = RExC_seen_evals;
4160 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
4161 if (reg(pRExC_state, 0, &flags,1) == NULL)
4162 return(NULL);
4163
4164 /* XXXX To minimize changes to RE engine we always allocate
4165 3-units-long substrs field. */
4166 Newx(r->substrs, 1, struct reg_substr_data);
4167 if (RExC_recurse_count) {
4168 Newxz(RExC_recurse,RExC_recurse_count,regnode *);
4169 SAVEFREEPV(RExC_recurse);
4170 }
4171
4172reStudy:
4173 r->minlen = minlen = sawplus = sawopen = 0;
4174 Zero(r->substrs, 1, struct reg_substr_data);
4175
4176#ifdef TRIE_STUDY_OPT
4177 if ( restudied ) {
4178 U32 seen=RExC_seen;
4179 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
4180
4181 RExC_state = copyRExC_state;
4182 if (seen & REG_TOP_LEVEL_BRANCHES)
4183 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
4184 else
4185 RExC_seen &= ~REG_TOP_LEVEL_BRANCHES;
4186 if (data.last_found) {
4187 SvREFCNT_dec(data.longest_fixed);
4188 SvREFCNT_dec(data.longest_float);
4189 SvREFCNT_dec(data.last_found);
4190 }
4191 StructCopy(&zero_scan_data, &data, scan_data_t);
4192 } else {
4193 StructCopy(&zero_scan_data, &data, scan_data_t);
4194 copyRExC_state = RExC_state;
4195 }
4196#else
4197 StructCopy(&zero_scan_data, &data, scan_data_t);
4198#endif
4199
4200 /* Dig out information for optimizations. */
4201 r->extflags = pm->op_pmflags & RXf_PMf_COMPILETIME; /* Again? */
4202 pm->op_pmflags = RExC_flags;
4203 if (UTF)
4204 r->extflags |= RXf_UTF8; /* Unicode in it? */
4205 ri->regstclass = NULL;
4206 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
4207 r->intflags |= PREGf_NAUGHTY;
4208 scan = ri->program + 1; /* First BRANCH. */
4209
4210 /* testing for BRANCH here tells us whether there is "must appear"
4211 data in the pattern. If there is then we can use it for optimisations */
4212 if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES)) { /* Only one top-level choice. */
4213 I32 fake;
4214 STRLEN longest_float_length, longest_fixed_length;
4215 struct regnode_charclass_class ch_class; /* pointed to by data */
4216 int stclass_flag;
4217 I32 last_close = 0; /* pointed to by data */
4218
4219 first = scan;
4220 /* Skip introductions and multiplicators >= 1. */
4221 while ((OP(first) == OPEN && (sawopen = 1)) ||
4222 /* An OR of *one* alternative - should not happen now. */
4223 (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) ||
4224 /* for now we can't handle lookbehind IFMATCH*/
4225 (OP(first) == IFMATCH && !first->flags) ||
4226 (OP(first) == PLUS) ||
4227 (OP(first) == MINMOD) ||
4228 /* An {n,m} with n>0 */
4229 (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) )
4230 {
4231
4232 if (OP(first) == PLUS)
4233 sawplus = 1;
4234 else
4235 first += regarglen[OP(first)];
4236 if (OP(first) == IFMATCH) {
4237 first = NEXTOPER(first);
4238 first += EXTRA_STEP_2ARGS;
4239 } else /* XXX possible optimisation for /(?=)/ */
4240 first = NEXTOPER(first);
4241 }
4242
4243 /* Starting-point info. */
4244 again:
4245 DEBUG_PEEP("first:",first,0);
4246 /* Ignore EXACT as we deal with it later. */
4247 if (PL_regkind[OP(first)] == EXACT) {
4248 if (OP(first) == EXACT)
4249 NOOP; /* Empty, get anchored substr later. */
4250 else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
4251 ri->regstclass = first;
4252 }
4253#ifdef TRIE_STCLASS
4254 else if (PL_regkind[OP(first)] == TRIE &&
4255 ((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0)
4256 {
4257 regnode *trie_op;
4258 /* this can happen only on restudy */
4259 if ( OP(first) == TRIE ) {
4260 struct regnode_1 *trieop = (struct regnode_1 *)
4261 PerlMemShared_calloc(1, sizeof(struct regnode_1));
4262 StructCopy(first,trieop,struct regnode_1);
4263 trie_op=(regnode *)trieop;
4264 } else {
4265 struct regnode_charclass *trieop = (struct regnode_charclass *)
4266 PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
4267 StructCopy(first,trieop,struct regnode_charclass);
4268 trie_op=(regnode *)trieop;
4269 }
4270 OP(trie_op)+=2;
4271 make_trie_failtable(pRExC_state, (regnode *)first, trie_op, 0);
4272 ri->regstclass = trie_op;
4273 }
4274#endif
4275 else if (strchr((const char*)PL_simple,OP(first)))
4276 ri->regstclass = first;
4277 else if (PL_regkind[OP(first)] == BOUND ||
4278 PL_regkind[OP(first)] == NBOUND)
4279 ri->regstclass = first;
4280 else if (PL_regkind[OP(first)] == BOL) {
4281 r->extflags |= (OP(first) == MBOL
4282 ? RXf_ANCH_MBOL
4283 : (OP(first) == SBOL
4284 ? RXf_ANCH_SBOL
4285 : RXf_ANCH_BOL));
4286 first = NEXTOPER(first);
4287 goto again;
4288 }
4289 else if (OP(first) == GPOS) {
4290 r->extflags |= RXf_ANCH_GPOS;
4291 first = NEXTOPER(first);
4292 goto again;
4293 }
4294 else if ((!sawopen || !RExC_sawback) &&
4295 (OP(first) == STAR &&
4296 PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
4297 !(r->extflags & RXf_ANCH) && !(RExC_seen & REG_SEEN_EVAL))
4298 {
4299 /* turn .* into ^.* with an implied $*=1 */
4300 const int type =
4301 (OP(NEXTOPER(first)) == REG_ANY)
4302 ? RXf_ANCH_MBOL
4303 : RXf_ANCH_SBOL;
4304 r->extflags |= type;
4305 r->intflags |= PREGf_IMPLICIT;
4306 first = NEXTOPER(first);
4307 goto again;
4308 }
4309 if (sawplus && (!sawopen || !RExC_sawback)
4310 && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
4311 /* x+ must match at the 1st pos of run of x's */
4312 r->intflags |= PREGf_SKIP;
4313
4314 /* Scan is after the zeroth branch, first is atomic matcher. */
4315#ifdef TRIE_STUDY_OPT
4316 DEBUG_PARSE_r(
4317 if (!restudied)
4318 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
4319 (IV)(first - scan + 1))
4320 );
4321#else
4322 DEBUG_PARSE_r(
4323 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
4324 (IV)(first - scan + 1))
4325 );
4326#endif
4327
4328
4329 /*
4330 * If there's something expensive in the r.e., find the
4331 * longest literal string that must appear and make it the
4332 * regmust. Resolve ties in favor of later strings, since
4333 * the regstart check works with the beginning of the r.e.
4334 * and avoiding duplication strengthens checking. Not a
4335 * strong reason, but sufficient in the absence of others.
4336 * [Now we resolve ties in favor of the earlier string if
4337 * it happens that c_offset_min has been invalidated, since the
4338 * earlier string may buy us something the later one won't.]
4339 */
4340
4341 data.longest_fixed = newSVpvs("");
4342 data.longest_float = newSVpvs("");
4343 data.last_found = newSVpvs("");
4344 data.longest = &(data.longest_fixed);
4345 first = scan;
4346 if (!ri->regstclass) {
4347 cl_init(pRExC_state, &ch_class);
4348 data.start_class = &ch_class;
4349 stclass_flag = SCF_DO_STCLASS_AND;
4350 } else /* XXXX Check for BOUND? */
4351 stclass_flag = 0;
4352 data.last_closep = &last_close;
4353
4354 minlen = study_chunk(pRExC_state, &first, &minlen, &fake, scan + RExC_size, /* Up to end */
4355 &data, -1, NULL, NULL,
4356 SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
4357
4358
4359 CHECK_RESTUDY_GOTO;
4360
4361
4362 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
4363 && data.last_start_min == 0 && data.last_end > 0
4364 && !RExC_seen_zerolen
4365 && !(RExC_seen & REG_SEEN_VERBARG)
4366 && (!(RExC_seen & REG_SEEN_GPOS) || (r->extflags & RXf_ANCH_GPOS)))
4367 r->extflags |= RXf_CHECK_ALL;
4368 scan_commit(pRExC_state, &data,&minlen,0);
4369 SvREFCNT_dec(data.last_found);
4370
4371 /* Note that code very similar to this but for anchored string
4372 follows immediately below, changes may need to be made to both.
4373 Be careful.
4374 */
4375 longest_float_length = CHR_SVLEN(data.longest_float);
4376 if (longest_float_length
4377 || (data.flags & SF_FL_BEFORE_EOL
4378 && (!(data.flags & SF_FL_BEFORE_MEOL)
4379 || (RExC_flags & RXf_PMf_MULTILINE))))
4380 {
4381 I32 t,ml;
4382
4383 if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
4384 && data.offset_fixed == data.offset_float_min
4385 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
4386 goto remove_float; /* As in (a)+. */
4387
4388 /* copy the information about the longest float from the reg_scan_data
4389 over to the program. */
4390 if (SvUTF8(data.longest_float)) {
4391 r->float_utf8 = data.longest_float;
4392 r->float_substr = NULL;
4393 } else {
4394 r->float_substr = data.longest_float;
4395 r->float_utf8 = NULL;
4396 }
4397 /* float_end_shift is how many chars that must be matched that
4398 follow this item. We calculate it ahead of time as once the
4399 lookbehind offset is added in we lose the ability to correctly
4400 calculate it.*/
4401 ml = data.minlen_float ? *(data.minlen_float)
4402 : (I32)longest_float_length;
4403 r->float_end_shift = ml - data.offset_float_min
4404 - longest_float_length + (SvTAIL(data.longest_float) != 0)
4405 + data.lookbehind_float;
4406 r->float_min_offset = data.offset_float_min - data.lookbehind_float;
4407 r->float_max_offset = data.offset_float_max;
4408 if (data.offset_float_max < I32_MAX) /* Don't offset infinity */
4409 r->float_max_offset -= data.lookbehind_float;
4410
4411 t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
4412 && (!(data.flags & SF_FL_BEFORE_MEOL)
4413 || (RExC_flags & RXf_PMf_MULTILINE)));
4414 fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
4415 }
4416 else {
4417 remove_float:
4418 r->float_substr = r->float_utf8 = NULL;
4419 SvREFCNT_dec(data.longest_float);
4420 longest_float_length = 0;
4421 }
4422
4423 /* Note that code very similar to this but for floating string
4424 is immediately above, changes may need to be made to both.
4425 Be careful.
4426 */
4427 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
4428 if (longest_fixed_length
4429 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
4430 && (!(data.flags & SF_FIX_BEFORE_MEOL)
4431 || (RExC_flags & RXf_PMf_MULTILINE))))
4432 {
4433 I32 t,ml;
4434
4435 /* copy the information about the longest fixed
4436 from the reg_scan_data over to the program. */
4437 if (SvUTF8(data.longest_fixed)) {
4438 r->anchored_utf8 = data.longest_fixed;
4439 r->anchored_substr = NULL;
4440 } else {
4441 r->anchored_substr = data.longest_fixed;
4442 r->anchored_utf8 = NULL;
4443 }
4444 /* fixed_end_shift is how many chars that must be matched that
4445 follow this item. We calculate it ahead of time as once the
4446 lookbehind offset is added in we lose the ability to correctly
4447 calculate it.*/
4448 ml = data.minlen_fixed ? *(data.minlen_fixed)
4449 : (I32)longest_fixed_length;
4450 r->anchored_end_shift = ml - data.offset_fixed
4451 - longest_fixed_length + (SvTAIL(data.longest_fixed) != 0)
4452 + data.lookbehind_fixed;
4453 r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
4454
4455 t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
4456 && (!(data.flags & SF_FIX_BEFORE_MEOL)
4457 || (RExC_flags & RXf_PMf_MULTILINE)));
4458 fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
4459 }
4460 else {
4461 r->anchored_substr = r->anchored_utf8 = NULL;
4462 SvREFCNT_dec(data.longest_fixed);
4463 longest_fixed_length = 0;
4464 }
4465 if (ri->regstclass
4466 && (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY))
4467 ri->regstclass = NULL;
4468 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
4469 && stclass_flag
4470 && !(data.start_class->flags & ANYOF_EOS)
4471 && !cl_is_anything(data.start_class))
4472 {
4473 const U32 n = add_data(pRExC_state, 1, "f");
4474
4475 Newx(RExC_rxi->data->data[n], 1,
4476 struct regnode_charclass_class);
4477 StructCopy(data.start_class,
4478 (struct regnode_charclass_class*)RExC_rxi->data->data[n],
4479 struct regnode_charclass_class);
4480 ri->regstclass = (regnode*)RExC_rxi->data->data[n];
4481 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
4482 DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
4483 regprop(r, sv, (regnode*)data.start_class);
4484 PerlIO_printf(Perl_debug_log,
4485 "synthetic stclass \"%s\".\n",
4486 SvPVX_const(sv));});
4487 }
4488
4489 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
4490 if (longest_fixed_length > longest_float_length) {
4491 r->check_end_shift = r->anchored_end_shift;
4492 r->check_substr = r->anchored_substr;
4493 r->check_utf8 = r->anchored_utf8;
4494 r->check_offset_min = r->check_offset_max = r->anchored_offset;
4495 if (r->extflags & RXf_ANCH_SINGLE)
4496 r->extflags |= RXf_NOSCAN;
4497 }
4498 else {
4499 r->check_end_shift = r->float_end_shift;
4500 r->check_substr = r->float_substr;
4501 r->check_utf8 = r->float_utf8;
4502 r->check_offset_min = r->float_min_offset;
4503 r->check_offset_max = r->float_max_offset;
4504 }
4505 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
4506 This should be changed ASAP! */
4507 if ((r->check_substr || r->check_utf8) && !(r->extflags & RXf_ANCH_GPOS)) {
4508 r->extflags |= RXf_USE_INTUIT;
4509 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
4510 r->extflags |= RXf_INTUIT_TAIL;
4511 }
4512 /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
4513 if ( (STRLEN)minlen < longest_float_length )
4514 minlen= longest_float_length;
4515 if ( (STRLEN)minlen < longest_fixed_length )
4516 minlen= longest_fixed_length;
4517 */
4518 }
4519 else {
4520 /* Several toplevels. Best we can is to set minlen. */
4521 I32 fake;
4522 struct regnode_charclass_class ch_class;
4523 I32 last_close = 0;
4524
4525 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n"));
4526
4527 scan = ri->program + 1;
4528 cl_init(pRExC_state, &ch_class);
4529 data.start_class = &ch_class;
4530 data.last_closep = &last_close;
4531
4532
4533 minlen = study_chunk(pRExC_state, &scan, &minlen, &fake, scan + RExC_size,
4534 &data, -1, NULL, NULL, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
4535
4536 CHECK_RESTUDY_GOTO;
4537
4538 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
4539 = r->float_substr = r->float_utf8 = NULL;
4540 if (!(data.start_class->flags & ANYOF_EOS)
4541 && !cl_is_anything(data.start_class))
4542 {
4543 const U32 n = add_data(pRExC_state, 1, "f");
4544
4545 Newx(RExC_rxi->data->data[n], 1,
4546 struct regnode_charclass_class);
4547 StructCopy(data.start_class,
4548 (struct regnode_charclass_class*)RExC_rxi->data->data[n],
4549 struct regnode_charclass_class);
4550 ri->regstclass = (regnode*)RExC_rxi->data->data[n];
4551 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
4552 DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
4553 regprop(r, sv, (regnode*)data.start_class);
4554 PerlIO_printf(Perl_debug_log,
4555 "synthetic stclass \"%s\".\n",
4556 SvPVX_const(sv));});
4557 }
4558 }
4559
4560 /* Guard against an embedded (?=) or (?<=) with a longer minlen than
4561 the "real" pattern. */
4562 DEBUG_OPTIMISE_r({
4563 PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf"\n",
4564 (IV)minlen, (IV)r->minlen);
4565 });
4566 r->minlenret = minlen;
4567 if (r->minlen < minlen)
4568 r->minlen = minlen;
4569
4570 if (RExC_seen & REG_SEEN_GPOS)
4571 r->extflags |= RXf_GPOS_SEEN;
4572 if (RExC_seen & REG_SEEN_LOOKBEHIND)
4573 r->extflags |= RXf_LOOKBEHIND_SEEN;
4574 if (RExC_seen & REG_SEEN_EVAL)
4575 r->extflags |= RXf_EVAL_SEEN;
4576 if (RExC_seen & REG_SEEN_CANY)
4577 r->extflags |= RXf_CANY_SEEN;
4578 if (RExC_seen & REG_SEEN_VERBARG)
4579 r->intflags |= PREGf_VERBARG_SEEN;
4580 if (RExC_seen & REG_SEEN_CUTGROUP)
4581 r->intflags |= PREGf_CUTGROUP_SEEN;
4582 if (RExC_paren_names)
4583 r->paren_names = (HV*)SvREFCNT_inc(RExC_paren_names);
4584 else
4585 r->paren_names = NULL;
4586 if (r->prelen == 3 && strEQ("\\s+", r->precomp))
4587 r->extflags |= RXf_WHITE;
4588 else if (r->prelen == 1 && r->precomp[0] == '^')
4589 r->extflags |= RXf_START_ONLY;
4590
4591#ifdef DEBUGGING
4592 if (RExC_paren_names) {
4593 ri->name_list_idx = add_data( pRExC_state, 1, "p" );
4594 ri->data->data[ri->name_list_idx] = (void*)SvREFCNT_inc(RExC_paren_name_list);
4595 } else
4596#endif
4597 ri->name_list_idx = 0;
4598
4599 if (RExC_recurse_count) {
4600 for ( ; RExC_recurse_count ; RExC_recurse_count-- ) {
4601 const regnode *scan = RExC_recurse[RExC_recurse_count-1];
4602 ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan );
4603 }
4604 }
4605 Newxz(r->startp, RExC_npar, I32);
4606 Newxz(r->endp, RExC_npar, I32);
4607 /* assume we don't need to swap parens around before we match */
4608
4609 DEBUG_DUMP_r({
4610 PerlIO_printf(Perl_debug_log,"Final program:\n");
4611 regdump(r);
4612 });
4613 DEBUG_OFFSETS_r(if (ri->offsets) {
4614 const U32 len = ri->offsets[0];
4615 U32 i;
4616 GET_RE_DEBUG_FLAGS_DECL;
4617 PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)ri->offsets[0]);
4618 for (i = 1; i <= len; i++) {
4619 if (ri->offsets[i*2-1] || ri->offsets[i*2])
4620 PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
4621 (UV)i, (UV)ri->offsets[i*2-1], (UV)ri->offsets[i*2]);
4622 }
4623 PerlIO_printf(Perl_debug_log, "\n");
4624 });
4625 return(r);
4626}
4627
4628#undef CORE_ONLY_BLOCK
4629#undef RE_ENGINE_PTR
4630
4631#ifndef PERL_IN_XSUB_RE
4632SV*
4633Perl_reg_named_buff_get(pTHX_ SV* namesv, const REGEXP * const from_re, U32 flags)
4634{
4635 AV *retarray = NULL;
4636 SV *ret;
4637 if (flags & 1)
4638 retarray=newAV();
4639
4640 if (from_re || PL_curpm) {
4641 const REGEXP * const rx = from_re ? from_re : PM_GETRE(PL_curpm);
4642 if (rx && rx->paren_names) {
4643 HE *he_str = hv_fetch_ent( rx->paren_names, namesv, 0, 0 );
4644 if (he_str) {
4645 IV i;
4646 SV* sv_dat=HeVAL(he_str);
4647 I32 *nums=(I32*)SvPVX(sv_dat);
4648 for ( i=0; i<SvIVX(sv_dat); i++ ) {
4649 if ((I32)(rx->lastparen) >= nums[i] &&
4650 rx->endp[nums[i]] != -1)
4651 {
4652 ret = reg_numbered_buff_get(nums[i],rx,NULL,0);
4653 if (!retarray)
4654 return ret;
4655 } else {
4656 ret = newSVsv(&PL_sv_undef);
4657 }
4658 if (retarray) {
4659 SvREFCNT_inc(ret);
4660 av_push(retarray, ret);
4661 }
4662 }
4663 if (retarray)
4664 return (SV*)retarray;
4665 }
4666 }
4667 }
4668 return NULL;
4669}
4670
4671SV*
4672Perl_reg_numbered_buff_get(pTHX_ I32 paren, const REGEXP * const rx, SV* usesv, U32 flags)
4673{
4674 char *s = NULL;
4675 I32 i = 0;
4676 I32 s1, t1;
4677 SV *sv = usesv ? usesv : newSVpvs("");
4678 PERL_UNUSED_ARG(flags);
4679
4680 if (!rx->subbeg) {
4681 sv_setsv(sv,&PL_sv_undef);
4682 return sv;
4683 }
4684 else
4685 if (paren == -2 && rx->startp[0] != -1) {
4686 /* $` */
4687 i = rx->startp[0];
4688 s = rx->subbeg;
4689 }
4690 else
4691 if (paren == -1 && rx->endp[0] != -1) {
4692 /* $' */
4693 s = rx->subbeg + rx->endp[0];
4694 i = rx->sublen - rx->endp[0];
4695 }
4696 else
4697 if ( 0 <= paren && paren <= (I32)rx->nparens &&
4698 (s1 = rx->startp[paren]) != -1 &&
4699 (t1 = rx->endp[paren]) != -1)
4700 {
4701 /* $& $1 ... */
4702 i = t1 - s1;
4703 s = rx->subbeg + s1;
4704 } else {
4705 sv_setsv(sv,&PL_sv_undef);
4706 return sv;
4707 }
4708 assert(rx->sublen >= (s - rx->subbeg) + i );
4709 if (i >= 0) {
4710 const int oldtainted = PL_tainted;
4711 TAINT_NOT;
4712 sv_setpvn(sv, s, i);
4713 PL_tainted = oldtainted;
4714 if ( (rx->extflags & RXf_CANY_SEEN)
4715 ? (RX_MATCH_UTF8(rx)
4716 && (!i || is_utf8_string((U8*)s, i)))
4717 : (RX_MATCH_UTF8(rx)) )
4718 {
4719 SvUTF8_on(sv);
4720 }
4721 else
4722 SvUTF8_off(sv);
4723 if (PL_tainting) {
4724 if (RX_MATCH_TAINTED(rx)) {
4725 if (SvTYPE(sv) >= SVt_PVMG) {
4726 MAGIC* const mg = SvMAGIC(sv);
4727 MAGIC* mgt;
4728 PL_tainted = 1;
4729 SvMAGIC_set(sv, mg->mg_moremagic);
4730 SvTAINT(sv);
4731 if ((mgt = SvMAGIC(sv))) {
4732 mg->mg_moremagic = mgt;
4733 SvMAGIC_set(sv, mg);
4734 }
4735 } else {
4736 PL_tainted = 1;
4737 SvTAINT(sv);
4738 }
4739 } else
4740 SvTAINTED_off(sv);
4741 }
4742 } else {
4743 sv_setsv(sv,&PL_sv_undef);
4744 }
4745 return sv;
4746}
4747#endif
4748
4749/* Scans the name of a named buffer from the pattern.
4750 * If flags is REG_RSN_RETURN_NULL returns null.
4751 * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
4752 * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
4753 * to the parsed name as looked up in the RExC_paren_names hash.
4754 * If there is an error throws a vFAIL().. type exception.
4755 */
4756
4757#define REG_RSN_RETURN_NULL 0
4758#define REG_RSN_RETURN_NAME 1
4759#define REG_RSN_RETURN_DATA 2
4760
4761STATIC SV*
4762S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags) {
4763 char *name_start = RExC_parse;
4764
4765 if (isIDFIRST_lazy_if(RExC_parse, UTF)) {
4766 /* skip IDFIRST by using do...while */
4767 if (UTF)
4768 do {
4769 RExC_parse += UTF8SKIP(RExC_parse);
4770 } while (isALNUM_utf8((U8*)RExC_parse));
4771 else
4772 do {
4773 RExC_parse++;
4774 } while (isALNUM(*RExC_parse));
4775 }
4776
4777 if ( flags ) {
4778 SV* sv_name = sv_2mortal(Perl_newSVpvn(aTHX_ name_start,
4779 (int)(RExC_parse - name_start)));
4780 if (UTF)
4781 SvUTF8_on(sv_name);
4782 if ( flags == REG_RSN_RETURN_NAME)
4783 return sv_name;
4784 else if (flags==REG_RSN_RETURN_DATA) {
4785 HE *he_str = NULL;
4786 SV *sv_dat = NULL;
4787 if ( ! sv_name ) /* should not happen*/
4788 Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
4789 if (RExC_paren_names)
4790 he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
4791 if ( he_str )
4792 sv_dat = HeVAL(he_str);
4793 if ( ! sv_dat )
4794 vFAIL("Reference to nonexistent named group");
4795 return sv_dat;
4796 }
4797 else {
4798 Perl_croak(aTHX_ "panic: bad flag in reg_scan_name");
4799 }
4800 /* NOT REACHED */
4801 }
4802 return NULL;
4803}
4804
4805#define DEBUG_PARSE_MSG(funcname) DEBUG_PARSE_r({ \
4806 int rem=(int)(RExC_end - RExC_parse); \
4807 int cut; \
4808 int num; \
4809 int iscut=0; \
4810 if (rem>10) { \
4811 rem=10; \
4812 iscut=1; \
4813 } \
4814 cut=10-rem; \
4815 if (RExC_lastparse!=RExC_parse) \
4816 PerlIO_printf(Perl_debug_log," >%.*s%-*s", \
4817 rem, RExC_parse, \
4818 cut + 4, \
4819 iscut ? "..." : "<" \
4820 ); \
4821 else \
4822 PerlIO_printf(Perl_debug_log,"%16s",""); \
4823 \
4824 if (SIZE_ONLY) \
4825 num=RExC_size; \
4826 else \
4827 num=REG_NODE_NUM(RExC_emit); \
4828 if (RExC_lastnum!=num) \
4829 PerlIO_printf(Perl_debug_log,"|%4d",num); \
4830 else \
4831 PerlIO_printf(Perl_debug_log,"|%4s",""); \
4832 PerlIO_printf(Perl_debug_log,"|%*s%-4s", \
4833 (int)((depth*2)), "", \
4834 (funcname) \
4835 ); \
4836 RExC_lastnum=num; \
4837 RExC_lastparse=RExC_parse; \
4838})
4839
4840
4841
4842#define DEBUG_PARSE(funcname) DEBUG_PARSE_r({ \
4843 DEBUG_PARSE_MSG((funcname)); \
4844 PerlIO_printf(Perl_debug_log,"%4s","\n"); \
4845})
4846#define DEBUG_PARSE_FMT(funcname,fmt,args) DEBUG_PARSE_r({ \
4847 DEBUG_PARSE_MSG((funcname)); \
4848 PerlIO_printf(Perl_debug_log,fmt "\n",args); \
4849})
4850/*
4851 - reg - regular expression, i.e. main body or parenthesized thing
4852 *
4853 * Caller must absorb opening parenthesis.
4854 *
4855 * Combining parenthesis handling with the base level of regular expression
4856 * is a trifle forced, but the need to tie the tails of the branches to what
4857 * follows makes it hard to avoid.
4858 */
4859#define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
4860#ifdef DEBUGGING
4861#define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
4862#else
4863#define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
4864#endif
4865
4866/* this idea is borrowed from STR_WITH_LEN in handy.h */
4867#define CHECK_WORD(s,v,l) \
4868 (((sizeof(s)-1)==(l)) && (strnEQ(start_verb, (s ""), (sizeof(s)-1))))
4869
4870STATIC regnode *
4871S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
4872 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
4873{
4874 dVAR;
4875 register regnode *ret; /* Will be the head of the group. */
4876 register regnode *br;
4877 register regnode *lastbr;
4878 register regnode *ender = NULL;
4879 register I32 parno = 0;
4880 I32 flags;
4881 const I32 oregflags = RExC_flags;
4882 bool have_branch = 0;
4883 bool is_open = 0;
4884
4885 /* for (?g), (?gc), and (?o) warnings; warning
4886 about (?c) will warn about (?g) -- japhy */
4887
4888#define WASTED_O 0x01
4889#define WASTED_G 0x02
4890#define WASTED_C 0x04
4891#define WASTED_GC (0x02|0x04)
4892 I32 wastedflags = 0x00;
4893
4894 char * parse_start = RExC_parse; /* MJD */
4895 char * const oregcomp_parse = RExC_parse;
4896
4897 GET_RE_DEBUG_FLAGS_DECL;
4898 DEBUG_PARSE("reg ");
4899
4900
4901 *flagp = 0; /* Tentatively. */
4902
4903
4904 /* Make an OPEN node, if parenthesized. */
4905 if (paren) {
4906 if ( *RExC_parse == '*') { /* (*VERB:ARG) */
4907 char *start_verb = RExC_parse;
4908 STRLEN verb_len = 0;
4909 char *start_arg = NULL;
4910 unsigned char op = 0;
4911 int argok = 1;
4912 int internal_argval = 0; /* internal_argval is only useful if !argok */
4913 while ( *RExC_parse && *RExC_parse != ')' ) {
4914 if ( *RExC_parse == ':' ) {
4915 start_arg = RExC_parse + 1;
4916 break;
4917 }
4918 RExC_parse++;
4919 }
4920 ++start_verb;
4921 verb_len = RExC_parse - start_verb;
4922 if ( start_arg ) {
4923 RExC_parse++;
4924 while ( *RExC_parse && *RExC_parse != ')' )
4925 RExC_parse++;
4926 if ( *RExC_parse != ')' )
4927 vFAIL("Unterminated verb pattern argument");
4928 if ( RExC_parse == start_arg )
4929 start_arg = NULL;
4930 } else {
4931 if ( *RExC_parse != ')' )
4932 vFAIL("Unterminated verb pattern");
4933 }
4934
4935 switch ( *start_verb ) {
4936 case 'A': /* (*ACCEPT) */
4937 if ( CHECK_WORD("ACCEPT",start_verb,verb_len) ) {
4938 op = ACCEPT;
4939 internal_argval = RExC_nestroot;
4940 }
4941 break;
4942 case 'C': /* (*COMMIT) */
4943 if ( CHECK_WORD("COMMIT",start_verb,verb_len) )
4944 op = COMMIT;
4945 break;
4946 case 'F': /* (*FAIL) */
4947 if ( verb_len==1 || CHECK_WORD("FAIL",start_verb,verb_len) ) {
4948 op = OPFAIL;
4949 argok = 0;
4950 }
4951 break;
4952 case ':': /* (*:NAME) */
4953 case 'M': /* (*MARK:NAME) */
4954 if ( verb_len==0 || CHECK_WORD("MARK",start_verb,verb_len) ) {
4955 op = MARKPOINT;
4956 argok = -1;
4957 }
4958 break;
4959 case 'P': /* (*PRUNE) */
4960 if ( CHECK_WORD("PRUNE",start_verb,verb_len) )
4961 op = PRUNE;
4962 break;
4963 case 'S': /* (*SKIP) */
4964 if ( CHECK_WORD("SKIP",start_verb,verb_len) )
4965 op = SKIP;
4966 break;
4967 case 'T': /* (*THEN) */
4968 /* [19:06] <TimToady> :: is then */
4969 if ( CHECK_WORD("THEN",start_verb,verb_len) ) {
4970 op = CUTGROUP;
4971 RExC_seen |= REG_SEEN_CUTGROUP;
4972 }
4973 break;
4974 }
4975 if ( ! op ) {
4976 RExC_parse++;
4977 vFAIL3("Unknown verb pattern '%.*s'",
4978 verb_len, start_verb);
4979 }
4980 if ( argok ) {
4981 if ( start_arg && internal_argval ) {
4982 vFAIL3("Verb pattern '%.*s' may not have an argument",
4983 verb_len, start_verb);
4984 } else if ( argok < 0 && !start_arg ) {
4985 vFAIL3("Verb pattern '%.*s' has a mandatory argument",
4986 verb_len, start_verb);
4987 } else {
4988 ret = reganode(pRExC_state, op, internal_argval);
4989 if ( ! internal_argval && ! SIZE_ONLY ) {
4990 if (start_arg) {
4991 SV *sv = newSVpvn( start_arg, RExC_parse - start_arg);
4992 ARG(ret) = add_data( pRExC_state, 1, "S" );
4993 RExC_rxi->data->data[ARG(ret)]=(void*)sv;
4994 ret->flags = 0;
4995 } else {
4996 ret->flags = 1;
4997 }
4998 }
4999 }
5000 if (!internal_argval)
5001 RExC_seen |= REG_SEEN_VERBARG;
5002 } else if ( start_arg ) {
5003 vFAIL3("Verb pattern '%.*s' may not have an argument",
5004 verb_len, start_verb);
5005 } else {
5006 ret = reg_node(pRExC_state, op);
5007 }
5008 nextchar(pRExC_state);
5009 return ret;
5010 } else
5011 if (*RExC_parse == '?') { /* (?...) */
5012 bool is_logical = 0;
5013 const char * const seqstart = RExC_parse;
5014
5015 RExC_parse++;
5016 paren = *RExC_parse++;
5017 ret = NULL; /* For look-ahead/behind. */
5018 switch (paren) {
5019
5020 case 'P': /* (?P...) variants for those used to PCRE/Python */
5021 paren = *RExC_parse++;
5022 if ( paren == '<') /* (?P<...>) named capture */
5023 goto named_capture;
5024 else if (paren == '>') { /* (?P>name) named recursion */
5025 goto named_recursion;
5026 }
5027 else if (paren == '=') { /* (?P=...) named backref */
5028 /* this pretty much dupes the code for \k<NAME> in regatom(), if
5029 you change this make sure you change that */
5030 char* name_start = RExC_parse;
5031 U32 num = 0;
5032 SV *sv_dat = reg_scan_name(pRExC_state,
5033 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5034 if (RExC_parse == name_start || *RExC_parse != ')')
5035 vFAIL2("Sequence %.3s... not terminated",parse_start);
5036
5037 if (!SIZE_ONLY) {
5038 num = add_data( pRExC_state, 1, "S" );
5039 RExC_rxi->data->data[num]=(void*)sv_dat;
5040 SvREFCNT_inc(sv_dat);
5041 }
5042 RExC_sawback = 1;
5043 ret = reganode(pRExC_state,
5044 (U8)(FOLD ? (LOC ? NREFFL : NREFF) : NREF),
5045 num);
5046 *flagp |= HASWIDTH;
5047
5048 Set_Node_Offset(ret, parse_start+1);
5049 Set_Node_Cur_Length(ret); /* MJD */
5050
5051 nextchar(pRExC_state);
5052 return ret;
5053 }
5054 goto unknown;
5055 case '<': /* (?<...) */
5056 if (*RExC_parse == '!')
5057 paren = ',';
5058 else if (*RExC_parse != '=')
5059 named_capture:
5060 { /* (?<...>) */
5061 char *name_start;
5062 SV *svname;
5063 paren= '>';
5064 case '\'': /* (?'...') */
5065 name_start= RExC_parse;
5066 svname = reg_scan_name(pRExC_state,
5067 SIZE_ONLY ? /* reverse test from the others */
5068 REG_RSN_RETURN_NAME :
5069 REG_RSN_RETURN_NULL);
5070 if (RExC_parse == name_start)
5071 goto unknown;
5072 if (*RExC_parse != paren)
5073 vFAIL2("Sequence (?%c... not terminated",
5074 paren=='>' ? '<' : paren);
5075 if (SIZE_ONLY) {
5076 HE *he_str;
5077 SV *sv_dat = NULL;
5078 if (!svname) /* shouldnt happen */
5079 Perl_croak(aTHX_
5080 "panic: reg_scan_name returned NULL");
5081 if (!RExC_paren_names) {
5082 RExC_paren_names= newHV();
5083 sv_2mortal((SV*)RExC_paren_names);
5084#ifdef DEBUGGING
5085 RExC_paren_name_list= newAV();
5086 sv_2mortal((SV*)RExC_paren_name_list);
5087#endif
5088 }
5089 he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
5090 if ( he_str )
5091 sv_dat = HeVAL(he_str);
5092 if ( ! sv_dat ) {
5093 /* croak baby croak */
5094 Perl_croak(aTHX_
5095 "panic: paren_name hash element allocation failed");
5096 } else if ( SvPOK(sv_dat) ) {
5097 IV count=SvIV(sv_dat);
5098 I32 *pv=(I32*)SvGROW(sv_dat,SvCUR(sv_dat)+sizeof(I32)+1);
5099 SvCUR_set(sv_dat,SvCUR(sv_dat)+sizeof(I32));
5100 pv[count]=RExC_npar;
5101 SvIVX(sv_dat)++;
5102 } else {
5103 (void)SvUPGRADE(sv_dat,SVt_PVNV);
5104 sv_setpvn(sv_dat, (char *)&(RExC_npar), sizeof(I32));
5105 SvIOK_on(sv_dat);
5106 SvIVX(sv_dat)= 1;
5107 }
5108#ifdef DEBUGGING
5109 if (!av_store(RExC_paren_name_list, RExC_npar, SvREFCNT_inc(svname)))
5110 SvREFCNT_dec(svname);
5111#endif
5112
5113 /*sv_dump(sv_dat);*/
5114 }
5115 nextchar(pRExC_state);
5116 paren = 1;
5117 goto capturing_parens;
5118 }
5119 RExC_seen |= REG_SEEN_LOOKBEHIND;
5120 RExC_parse++;
5121 case '=': /* (?=...) */
5122 case '!': /* (?!...) */
5123 RExC_seen_zerolen++;
5124 if (*RExC_parse == ')') {
5125 ret=reg_node(pRExC_state, OPFAIL);
5126 nextchar(pRExC_state);
5127 return ret;
5128 }
5129 case ':': /* (?:...) */
5130 case '>': /* (?>...) */
5131 break;
5132 case '$': /* (?$...) */
5133 case '@': /* (?@...) */
5134 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
5135 break;
5136 case '#': /* (?#...) */
5137 while (*RExC_parse && *RExC_parse != ')')
5138 RExC_parse++;
5139 if (*RExC_parse != ')')
5140 FAIL("Sequence (?#... not terminated");
5141 nextchar(pRExC_state);
5142 *flagp = TRYAGAIN;
5143 return NULL;
5144 case '0' : /* (?0) */
5145 case 'R' : /* (?R) */
5146 if (*RExC_parse != ')')
5147 FAIL("Sequence (?R) not terminated");
5148 ret = reg_node(pRExC_state, GOSTART);
5149 nextchar(pRExC_state);
5150 return ret;
5151 /*notreached*/
5152 { /* named and numeric backreferences */
5153 I32 num;
5154 case '&': /* (?&NAME) */
5155 parse_start = RExC_parse - 1;
5156 named_recursion:
5157 {
5158 SV *sv_dat = reg_scan_name(pRExC_state,
5159 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5160 num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
5161 }
5162 goto gen_recurse_regop;
5163 /* NOT REACHED */
5164 case '+':
5165 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
5166 RExC_parse++;
5167 vFAIL("Illegal pattern");
5168 }
5169 goto parse_recursion;
5170 /* NOT REACHED*/
5171 case '-': /* (?-1) */
5172 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
5173 RExC_parse--; /* rewind to let it be handled later */
5174 goto parse_flags;
5175 }
5176 /*FALLTHROUGH */
5177 case '1': case '2': case '3': case '4': /* (?1) */
5178 case '5': case '6': case '7': case '8': case '9':
5179 RExC_parse--;
5180 parse_recursion:
5181 num = atoi(RExC_parse);
5182 parse_start = RExC_parse - 1; /* MJD */
5183 if (*RExC_parse == '-')
5184 RExC_parse++;
5185 while (isDIGIT(*RExC_parse))
5186 RExC_parse++;
5187 if (*RExC_parse!=')')
5188 vFAIL("Expecting close bracket");
5189
5190 gen_recurse_regop:
5191 if ( paren == '-' ) {
5192 /*
5193 Diagram of capture buffer numbering.
5194 Top line is the normal capture buffer numbers
5195 Botton line is the negative indexing as from
5196 the X (the (?-2))
5197
5198 + 1 2 3 4 5 X 6 7
5199 /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
5200 - 5 4 3 2 1 X x x
5201
5202 */
5203 num = RExC_npar + num;
5204 if (num < 1) {
5205 RExC_parse++;
5206 vFAIL("Reference to nonexistent group");
5207 }
5208 } else if ( paren == '+' ) {
5209 num = RExC_npar + num - 1;
5210 }
5211
5212 ret = reganode(pRExC_state, GOSUB, num);
5213 if (!SIZE_ONLY) {
5214 if (num > (I32)RExC_rx->nparens) {
5215 RExC_parse++;
5216 vFAIL("Reference to nonexistent group");
5217 }
5218 ARG2L_SET( ret, RExC_recurse_count++);
5219 RExC_emit++;
5220 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
5221 "Recurse #%"UVuf" to %"IVdf"\n", (UV)ARG(ret), (IV)ARG2L(ret)));
5222 } else {
5223 RExC_size++;
5224 }
5225 RExC_seen |= REG_SEEN_RECURSE;
5226 Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
5227 Set_Node_Offset(ret, parse_start); /* MJD */
5228
5229 nextchar(pRExC_state);
5230 return ret;
5231 } /* named and numeric backreferences */
5232 /* NOT REACHED */
5233
5234 case 'p': /* (?p...) */
5235 if (SIZE_ONLY && ckWARN2(WARN_DEPRECATED, WARN_REGEXP))
5236 vWARNdep(RExC_parse, "(?p{}) is deprecated - use (??{})");
5237 /* FALL THROUGH*/
5238 case '?': /* (??...) */
5239 is_logical = 1;
5240 if (*RExC_parse != '{')
5241 goto unknown;
5242 paren = *RExC_parse++;
5243 /* FALL THROUGH */
5244 case '{': /* (?{...}) */
5245 {
5246 I32 count = 1;
5247 U32 n = 0;
5248 char c;
5249 char *s = RExC_parse;
5250
5251 RExC_seen_zerolen++;
5252 RExC_seen |= REG_SEEN_EVAL;
5253 while (count && (c = *RExC_parse)) {
5254 if (c == '\\') {
5255 if (RExC_parse[1])
5256 RExC_parse++;
5257 }
5258 else if (c == '{')
5259 count++;
5260 else if (c == '}')
5261 count--;
5262 RExC_parse++;
5263 }
5264 if (*RExC_parse != ')') {
5265 RExC_parse = s;
5266 vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
5267 }
5268 if (!SIZE_ONLY) {
5269 PAD *pad;
5270 OP_4tree *sop, *rop;
5271 SV * const sv = newSVpvn(s, RExC_parse - 1 - s);
5272
5273 ENTER;
5274 Perl_save_re_context(aTHX);
5275 rop = sv_compile_2op(sv, &sop, "re", &pad);
5276 sop->op_private |= OPpREFCOUNTED;
5277 /* re_dup will OpREFCNT_inc */
5278 OpREFCNT_set(sop, 1);
5279 LEAVE;
5280
5281 n = add_data(pRExC_state, 3, "nop");
5282 RExC_rxi->data->data[n] = (void*)rop;
5283 RExC_rxi->data->data[n+1] = (void*)sop;
5284 RExC_rxi->data->data[n+2] = (void*)pad;
5285 SvREFCNT_dec(sv);
5286 }
5287 else { /* First pass */
5288 if (PL_reginterp_cnt < ++RExC_seen_evals
5289 && IN_PERL_RUNTIME)
5290 /* No compiled RE interpolated, has runtime
5291 components ===> unsafe. */
5292 FAIL("Eval-group not allowed at runtime, use re 'eval'");
5293 if (PL_tainting && PL_tainted)
5294 FAIL("Eval-group in insecure regular expression");
5295#if PERL_VERSION > 8
5296 if (IN_PERL_COMPILETIME)
5297 PL_cv_has_eval = 1;
5298#endif
5299 }
5300
5301 nextchar(pRExC_state);
5302 if (is_logical) {
5303 ret = reg_node(pRExC_state, LOGICAL);
5304 if (!SIZE_ONLY)
5305 ret->flags = 2;
5306 REGTAIL(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
5307 /* deal with the length of this later - MJD */
5308 return ret;
5309 }
5310 ret = reganode(pRExC_state, EVAL, n);
5311 Set_Node_Length(ret, RExC_parse - parse_start + 1);
5312 Set_Node_Offset(ret, parse_start);
5313 return ret;
5314 }
5315 case '(': /* (?(?{...})...) and (?(?=...)...) */
5316 {
5317 int is_define= 0;
5318 if (RExC_parse[0] == '?') { /* (?(?...)) */
5319 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
5320 || RExC_parse[1] == '<'
5321 || RExC_parse[1] == '{') { /* Lookahead or eval. */
5322 I32 flag;
5323
5324 ret = reg_node(pRExC_state, LOGICAL);
5325 if (!SIZE_ONLY)
5326 ret->flags = 1;
5327 REGTAIL(pRExC_state, ret, reg(pRExC_state, 1, &flag,depth+1));
5328 goto insert_if;
5329 }
5330 }
5331 else if ( RExC_parse[0] == '<' /* (?(<NAME>)...) */
5332 || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
5333 {
5334 char ch = RExC_parse[0] == '<' ? '>' : '\'';
5335 char *name_start= RExC_parse++;
5336 U32 num = 0;
5337 SV *sv_dat=reg_scan_name(pRExC_state,
5338 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5339 if (RExC_parse == name_start || *RExC_parse != ch)
5340 vFAIL2("Sequence (?(%c... not terminated",
5341 (ch == '>' ? '<' : ch));
5342 RExC_parse++;
5343 if (!SIZE_ONLY) {
5344 num = add_data( pRExC_state, 1, "S" );
5345 RExC_rxi->data->data[num]=(void*)sv_dat;
5346 SvREFCNT_inc(sv_dat);
5347 }
5348 ret = reganode(pRExC_state,NGROUPP,num);
5349 goto insert_if_check_paren;
5350 }
5351 else if (RExC_parse[0] == 'D' &&
5352 RExC_parse[1] == 'E' &&
5353 RExC_parse[2] == 'F' &&
5354 RExC_parse[3] == 'I' &&
5355 RExC_parse[4] == 'N' &&
5356 RExC_parse[5] == 'E')
5357 {
5358 ret = reganode(pRExC_state,DEFINEP,0);
5359 RExC_parse +=6 ;
5360 is_define = 1;
5361 goto insert_if_check_paren;
5362 }
5363 else if (RExC_parse[0] == 'R') {
5364 RExC_parse++;
5365 parno = 0;
5366 if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
5367 parno = atoi(RExC_parse++);
5368 while (isDIGIT(*RExC_parse))
5369 RExC_parse++;
5370 } else if (RExC_parse[0] == '&') {
5371 SV *sv_dat;
5372 RExC_parse++;
5373 sv_dat = reg_scan_name(pRExC_state,
5374 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5375 parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
5376 }
5377 ret = reganode(pRExC_state,INSUBP,parno);
5378 goto insert_if_check_paren;
5379 }
5380 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
5381 /* (?(1)...) */
5382 char c;
5383 parno = atoi(RExC_parse++);
5384
5385 while (isDIGIT(*RExC_parse))
5386 RExC_parse++;
5387 ret = reganode(pRExC_state, GROUPP, parno);
5388
5389 insert_if_check_paren:
5390 if ((c = *nextchar(pRExC_state)) != ')')
5391 vFAIL("Switch condition not recognized");
5392 insert_if:
5393 REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
5394 br = regbranch(pRExC_state, &flags, 1,depth+1);
5395 if (br == NULL)
5396 br = reganode(pRExC_state, LONGJMP, 0);
5397 else
5398 REGTAIL(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
5399 c = *nextchar(pRExC_state);
5400 if (flags&HASWIDTH)
5401 *flagp |= HASWIDTH;
5402 if (c == '|') {
5403 if (is_define)
5404 vFAIL("(?(DEFINE)....) does not allow branches");
5405 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
5406 regbranch(pRExC_state, &flags, 1,depth+1);
5407 REGTAIL(pRExC_state, ret, lastbr);
5408 if (flags&HASWIDTH)
5409 *flagp |= HASWIDTH;
5410 c = *nextchar(pRExC_state);
5411 }
5412 else
5413 lastbr = NULL;
5414 if (c != ')')
5415 vFAIL("Switch (?(condition)... contains too many branches");
5416 ender = reg_node(pRExC_state, TAIL);
5417 REGTAIL(pRExC_state, br, ender);
5418 if (lastbr) {
5419 REGTAIL(pRExC_state, lastbr, ender);
5420 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
5421 }
5422 else
5423 REGTAIL(pRExC_state, ret, ender);
5424 return ret;
5425 }
5426 else {
5427 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
5428 }
5429 }
5430 case 0:
5431 RExC_parse--; /* for vFAIL to print correctly */
5432 vFAIL("Sequence (? incomplete");
5433 break;
5434 default:
5435 --RExC_parse;
5436 parse_flags: /* (?i) */
5437 {
5438 U32 posflags = 0, negflags = 0;
5439 U32 *flagsp = &posflags;
5440
5441 while (*RExC_parse) {
5442 /* && strchr("iogcmsx", *RExC_parse) */
5443 /* (?g), (?gc) and (?o) are useless here
5444 and must be globally applied -- japhy */
5445 switch (*RExC_parse) {
5446 CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp);
5447 case 'o':
5448 case 'g':
5449 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
5450 const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G;
5451 if (! (wastedflags & wflagbit) ) {
5452 wastedflags |= wflagbit;
5453 vWARN5(
5454 RExC_parse + 1,
5455 "Useless (%s%c) - %suse /%c modifier",
5456 flagsp == &negflags ? "?-" : "?",
5457 *RExC_parse,
5458 flagsp == &negflags ? "don't " : "",
5459 *RExC_parse
5460 );
5461 }
5462 }
5463 break;
5464
5465 case 'c':
5466 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
5467 if (! (wastedflags & WASTED_C) ) {
5468 wastedflags |= WASTED_GC;
5469 vWARN3(
5470 RExC_parse + 1,
5471 "Useless (%sc) - %suse /gc modifier",
5472 flagsp == &negflags ? "?-" : "?",
5473 flagsp == &negflags ? "don't " : ""
5474 );
5475 }
5476 }
5477 break;
5478 case 'k':
5479 if (flagsp == &negflags) {
5480 if (SIZE_ONLY && ckWARN(WARN_REGEXP))
5481 vWARN(RExC_parse + 1,"Useless use of (?-k)");
5482 } else {
5483 *flagsp |= RXf_PMf_KEEPCOPY;
5484 }
5485 break;
5486 case '-':
5487 if (flagsp == &negflags)
5488 goto unknown;
5489 flagsp = &negflags;
5490 wastedflags = 0; /* reset so (?g-c) warns twice */
5491 break;
5492 case ':':
5493 paren = ':';
5494 /*FALLTHROUGH*/
5495 case ')':
5496 RExC_flags |= posflags;
5497 RExC_flags &= ~negflags;
5498 nextchar(pRExC_state);
5499 if (paren != ':') {
5500 *flagp = TRYAGAIN;
5501 return NULL;
5502 } else {
5503 ret = NULL;
5504 goto parse_rest;
5505 }
5506 /*NOTREACHED*/
5507 default:
5508 unknown:
5509 RExC_parse++;
5510 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
5511 /*NOTREACHED*/
5512 }
5513 ++RExC_parse;
5514 }
5515 }} /* one for the default block, one for the switch */
5516 }
5517 else { /* (...) */
5518 capturing_parens:
5519 parno = RExC_npar;
5520 RExC_npar++;
5521
5522 ret = reganode(pRExC_state, OPEN, parno);
5523 if (!SIZE_ONLY ){
5524 if (!RExC_nestroot)
5525 RExC_nestroot = parno;
5526 if (RExC_seen & REG_SEEN_RECURSE) {
5527 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
5528 "Setting open paren #%"IVdf" to %d\n",
5529 (IV)parno, REG_NODE_NUM(ret)));
5530 RExC_open_parens[parno-1]= ret;
5531 }
5532 }
5533 Set_Node_Length(ret, 1); /* MJD */
5534 Set_Node_Offset(ret, RExC_parse); /* MJD */
5535 is_open = 1;
5536 }
5537 }
5538 else /* ! paren */
5539 ret = NULL;
5540
5541 parse_rest:
5542 /* Pick up the branches, linking them together. */
5543 parse_start = RExC_parse; /* MJD */
5544 br = regbranch(pRExC_state, &flags, 1,depth+1);
5545 /* branch_len = (paren != 0); */
5546
5547 if (br == NULL)
5548 return(NULL);
5549 if (*RExC_parse == '|') {
5550 if (!SIZE_ONLY && RExC_extralen) {
5551 reginsert(pRExC_state, BRANCHJ, br, depth+1);
5552 }
5553 else { /* MJD */
5554 reginsert(pRExC_state, BRANCH, br, depth+1);
5555 Set_Node_Length(br, paren != 0);
5556 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
5557 }
5558 have_branch = 1;
5559 if (SIZE_ONLY)
5560 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
5561 }
5562 else if (paren == ':') {
5563 *flagp |= flags&SIMPLE;
5564 }
5565 if (is_open) { /* Starts with OPEN. */
5566 REGTAIL(pRExC_state, ret, br); /* OPEN -> first. */
5567 }
5568 else if (paren != '?') /* Not Conditional */
5569 ret = br;
5570 *flagp |= flags & (SPSTART | HASWIDTH);
5571 lastbr = br;
5572 while (*RExC_parse == '|') {
5573 if (!SIZE_ONLY && RExC_extralen) {
5574 ender = reganode(pRExC_state, LONGJMP,0);
5575 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
5576 }
5577 if (SIZE_ONLY)
5578 RExC_extralen += 2; /* Account for LONGJMP. */
5579 nextchar(pRExC_state);
5580 br = regbranch(pRExC_state, &flags, 0, depth+1);
5581
5582 if (br == NULL)
5583 return(NULL);
5584 REGTAIL(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
5585 lastbr = br;
5586 if (flags&HASWIDTH)
5587 *flagp |= HASWIDTH;
5588 *flagp |= flags&SPSTART;
5589 }
5590
5591 if (have_branch || paren != ':') {
5592 /* Make a closing node, and hook it on the end. */
5593 switch (paren) {
5594 case ':':
5595 ender = reg_node(pRExC_state, TAIL);
5596 break;
5597 case 1:
5598 ender = reganode(pRExC_state, CLOSE, parno);
5599 if (!SIZE_ONLY && RExC_seen & REG_SEEN_RECURSE) {
5600 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
5601 "Setting close paren #%"IVdf" to %d\n",
5602 (IV)parno, REG_NODE_NUM(ender)));
5603 RExC_close_parens[parno-1]= ender;
5604 if (RExC_nestroot == parno)
5605 RExC_nestroot = 0;
5606 }
5607 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
5608 Set_Node_Length(ender,1); /* MJD */
5609 break;
5610 case '<':
5611 case ',':
5612 case '=':
5613 case '!':
5614 *flagp &= ~HASWIDTH;
5615 /* FALL THROUGH */
5616 case '>':
5617 ender = reg_node(pRExC_state, SUCCEED);
5618 break;
5619 case 0:
5620 ender = reg_node(pRExC_state, END);
5621 if (!SIZE_ONLY) {
5622 assert(!RExC_opend); /* there can only be one! */
5623 RExC_opend = ender;
5624 }
5625 break;
5626 }
5627 REGTAIL(pRExC_state, lastbr, ender);
5628
5629 if (have_branch && !SIZE_ONLY) {
5630 if (depth==1)
5631 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
5632
5633 /* Hook the tails of the branches to the closing node. */
5634 for (br = ret; br; br = regnext(br)) {
5635 const U8 op = PL_regkind[OP(br)];
5636 if (op == BRANCH) {
5637 REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
5638 }
5639 else if (op == BRANCHJ) {
5640 REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
5641 }
5642 }
5643 }
5644 }
5645
5646 {
5647 const char *p;
5648 static const char parens[] = "=!<,>";
5649
5650 if (paren && (p = strchr(parens, paren))) {
5651 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
5652 int flag = (p - parens) > 1;
5653
5654 if (paren == '>')
5655 node = SUSPEND, flag = 0;
5656 reginsert(pRExC_state, node,ret, depth+1);
5657 Set_Node_Cur_Length(ret);
5658 Set_Node_Offset(ret, parse_start + 1);
5659 ret->flags = flag;
5660 REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
5661 }
5662 }
5663
5664 /* Check for proper termination. */
5665 if (paren) {
5666 RExC_flags = oregflags;
5667 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
5668 RExC_parse = oregcomp_parse;
5669 vFAIL("Unmatched (");
5670 }
5671 }
5672 else if (!paren && RExC_parse < RExC_end) {
5673 if (*RExC_parse == ')') {
5674 RExC_parse++;
5675 vFAIL("Unmatched )");
5676 }
5677 else
5678 FAIL("Junk on end of regexp"); /* "Can't happen". */
5679 /* NOTREACHED */
5680 }
5681
5682 return(ret);
5683}
5684
5685/*
5686 - regbranch - one alternative of an | operator
5687 *
5688 * Implements the concatenation operator.
5689 */
5690STATIC regnode *
5691S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
5692{
5693 dVAR;
5694 register regnode *ret;
5695 register regnode *chain = NULL;
5696 register regnode *latest;
5697 I32 flags = 0, c = 0;
5698 GET_RE_DEBUG_FLAGS_DECL;
5699 DEBUG_PARSE("brnc");
5700 if (first)
5701 ret = NULL;
5702 else {
5703 if (!SIZE_ONLY && RExC_extralen)
5704 ret = reganode(pRExC_state, BRANCHJ,0);
5705 else {
5706 ret = reg_node(pRExC_state, BRANCH);
5707 Set_Node_Length(ret, 1);
5708 }
5709 }
5710
5711 if (!first && SIZE_ONLY)
5712 RExC_extralen += 1; /* BRANCHJ */
5713
5714 *flagp = WORST; /* Tentatively. */
5715
5716 RExC_parse--;
5717 nextchar(pRExC_state);
5718 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
5719 flags &= ~TRYAGAIN;
5720 latest = regpiece(pRExC_state, &flags,depth+1);
5721 if (latest == NULL) {
5722 if (flags & TRYAGAIN)
5723 continue;
5724 return(NULL);
5725 }
5726 else if (ret == NULL)
5727 ret = latest;
5728 *flagp |= flags&HASWIDTH;
5729 if (chain == NULL) /* First piece. */
5730 *flagp |= flags&SPSTART;
5731 else {
5732 RExC_naughty++;
5733 REGTAIL(pRExC_state, chain, latest);
5734 }
5735 chain = latest;
5736 c++;
5737 }
5738 if (chain == NULL) { /* Loop ran zero times. */
5739 chain = reg_node(pRExC_state, NOTHING);
5740 if (ret == NULL)
5741 ret = chain;
5742 }
5743 if (c == 1) {
5744 *flagp |= flags&SIMPLE;
5745 }
5746
5747 return ret;
5748}
5749
5750/*
5751 - regpiece - something followed by possible [*+?]
5752 *
5753 * Note that the branching code sequences used for ? and the general cases
5754 * of * and + are somewhat optimized: they use the same NOTHING node as
5755 * both the endmarker for their branch list and the body of the last branch.
5756 * It might seem that this node could be dispensed with entirely, but the
5757 * endmarker role is not redundant.
5758 */
5759STATIC regnode *
5760S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
5761{
5762 dVAR;
5763 register regnode *ret;
5764 register char op;
5765 register char *next;
5766 I32 flags;
5767 const char * const origparse = RExC_parse;
5768 I32 min;
5769 I32 max = REG_INFTY;
5770 char *parse_start;
5771 const char *maxpos = NULL;
5772 GET_RE_DEBUG_FLAGS_DECL;
5773 DEBUG_PARSE("piec");
5774
5775 ret = regatom(pRExC_state, &flags,depth+1);
5776 if (ret == NULL) {
5777 if (flags & TRYAGAIN)
5778 *flagp |= TRYAGAIN;
5779 return(NULL);
5780 }
5781
5782 op = *RExC_parse;
5783
5784 if (op == '{' && regcurly(RExC_parse)) {
5785 maxpos = NULL;
5786 parse_start = RExC_parse; /* MJD */
5787 next = RExC_parse + 1;
5788 while (isDIGIT(*next) || *next == ',') {
5789 if (*next == ',') {
5790 if (maxpos)
5791 break;
5792 else
5793 maxpos = next;
5794 }
5795 next++;
5796 }
5797 if (*next == '}') { /* got one */
5798 if (!maxpos)
5799 maxpos = next;
5800 RExC_parse++;
5801 min = atoi(RExC_parse);
5802 if (*maxpos == ',')
5803 maxpos++;
5804 else
5805 maxpos = RExC_parse;
5806 max = atoi(maxpos);
5807 if (!max && *maxpos != '0')
5808 max = REG_INFTY; /* meaning "infinity" */
5809 else if (max >= REG_INFTY)
5810 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
5811 RExC_parse = next;
5812 nextchar(pRExC_state);
5813
5814 do_curly:
5815 if ((flags&SIMPLE)) {
5816 RExC_naughty += 2 + RExC_naughty / 2;
5817 reginsert(pRExC_state, CURLY, ret, depth+1);
5818 Set_Node_Offset(ret, parse_start+1); /* MJD */
5819 Set_Node_Cur_Length(ret);
5820 }
5821 else {
5822 regnode * const w = reg_node(pRExC_state, WHILEM);
5823
5824 w->flags = 0;
5825 REGTAIL(pRExC_state, ret, w);
5826 if (!SIZE_ONLY && RExC_extralen) {
5827 reginsert(pRExC_state, LONGJMP,ret, depth+1);
5828 reginsert(pRExC_state, NOTHING,ret, depth+1);
5829 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
5830 }
5831 reginsert(pRExC_state, CURLYX,ret, depth+1);
5832 /* MJD hk */
5833 Set_Node_Offset(ret, parse_start+1);
5834 Set_Node_Length(ret,
5835 op == '{' ? (RExC_parse - parse_start) : 1);
5836
5837 if (!SIZE_ONLY && RExC_extralen)
5838 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
5839 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
5840 if (SIZE_ONLY)
5841 RExC_whilem_seen++, RExC_extralen += 3;
5842 RExC_naughty += 4 + RExC_naughty; /* compound interest */
5843 }
5844 ret->flags = 0;
5845
5846 if (min > 0)
5847 *flagp = WORST;
5848 if (max > 0)
5849 *flagp |= HASWIDTH;
5850 if (max && max < min)
5851 vFAIL("Can't do {n,m} with n > m");
5852 if (!SIZE_ONLY) {
5853 ARG1_SET(ret, (U16)min);
5854 ARG2_SET(ret, (U16)max);
5855 }
5856
5857 goto nest_check;
5858 }
5859 }
5860
5861 if (!ISMULT1(op)) {
5862 *flagp = flags;
5863 return(ret);
5864 }
5865
5866#if 0 /* Now runtime fix should be reliable. */
5867
5868 /* if this is reinstated, don't forget to put this back into perldiag:
5869
5870 =item Regexp *+ operand could be empty at {#} in regex m/%s/
5871
5872 (F) The part of the regexp subject to either the * or + quantifier
5873 could match an empty string. The {#} shows in the regular
5874 expression about where the problem was discovered.
5875
5876 */
5877
5878 if (!(flags&HASWIDTH) && op != '?')
5879 vFAIL("Regexp *+ operand could be empty");
5880#endif
5881
5882 parse_start = RExC_parse;
5883 nextchar(pRExC_state);
5884
5885 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
5886
5887 if (op == '*' && (flags&SIMPLE)) {
5888 reginsert(pRExC_state, STAR, ret, depth+1);
5889 ret->flags = 0;
5890 RExC_naughty += 4;
5891 }
5892 else if (op == '*') {
5893 min = 0;
5894 goto do_curly;
5895 }
5896 else if (op == '+' && (flags&SIMPLE)) {
5897 reginsert(pRExC_state, PLUS, ret, depth+1);
5898 ret->flags = 0;
5899 RExC_naughty += 3;
5900 }
5901 else if (op == '+') {
5902 min = 1;
5903 goto do_curly;
5904 }
5905 else if (op == '?') {
5906 min = 0; max = 1;
5907 goto do_curly;
5908 }
5909 nest_check:
5910 if (!SIZE_ONLY && !(flags&HASWIDTH) && max > REG_INFTY/3 && ckWARN(WARN_REGEXP)) {
5911 vWARN3(RExC_parse,
5912 "%.*s matches null string many times",
5913 (int)(RExC_parse >= origparse ? RExC_parse - origparse : 0),
5914 origparse);
5915 }
5916
5917 if (RExC_parse < RExC_end && *RExC_parse == '?') {
5918 nextchar(pRExC_state);
5919 reginsert(pRExC_state, MINMOD, ret, depth+1);
5920 REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
5921 }
5922#ifndef REG_ALLOW_MINMOD_SUSPEND
5923 else
5924#endif
5925 if (RExC_parse < RExC_end && *RExC_parse == '+') {
5926 regnode *ender;
5927 nextchar(pRExC_state);
5928 ender = reg_node(pRExC_state, SUCCEED);
5929 REGTAIL(pRExC_state, ret, ender);
5930 reginsert(pRExC_state, SUSPEND, ret, depth+1);
5931 ret->flags = 0;
5932 ender = reg_node(pRExC_state, TAIL);
5933 REGTAIL(pRExC_state, ret, ender);
5934 /*ret= ender;*/
5935 }
5936
5937 if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) {
5938 RExC_parse++;
5939 vFAIL("Nested quantifiers");
5940 }
5941
5942 return(ret);
5943}
5944
5945
5946/* reg_namedseq(pRExC_state,UVp)
5947
5948 This is expected to be called by a parser routine that has
5949 recognized'\N' and needs to handle the rest. RExC_parse is
5950 expected to point at the first char following the N at the time
5951 of the call.
5952
5953 If valuep is non-null then it is assumed that we are parsing inside
5954 of a charclass definition and the first codepoint in the resolved
5955 string is returned via *valuep and the routine will return NULL.
5956 In this mode if a multichar string is returned from the charnames
5957 handler a warning will be issued, and only the first char in the
5958 sequence will be examined. If the string returned is zero length
5959 then the value of *valuep is undefined and NON-NULL will
5960 be returned to indicate failure. (This will NOT be a valid pointer
5961 to a regnode.)
5962
5963 If value is null then it is assumed that we are parsing normal text
5964 and inserts a new EXACT node into the program containing the resolved
5965 string and returns a pointer to the new node. If the string is
5966 zerolength a NOTHING node is emitted.
5967
5968 On success RExC_parse is set to the char following the endbrace.
5969 Parsing failures will generate a fatal errorvia vFAIL(...)
5970
5971 NOTE: We cache all results from the charnames handler locally in
5972 the RExC_charnames hash (created on first use) to prevent a charnames
5973 handler from playing silly-buggers and returning a short string and
5974 then a long string for a given pattern. Since the regexp program
5975 size is calculated during an initial parse this would result
5976 in a buffer overrun so we cache to prevent the charname result from
5977 changing during the course of the parse.
5978
5979 */
5980STATIC regnode *
5981S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep)
5982{
5983 char * name; /* start of the content of the name */
5984 char * endbrace; /* endbrace following the name */
5985 SV *sv_str = NULL;
5986 SV *sv_name = NULL;
5987 STRLEN len; /* this has various purposes throughout the code */
5988 bool cached = 0; /* if this is true then we shouldn't refcount dev sv_str */
5989 regnode *ret = NULL;
5990
5991 if (*RExC_parse != '{') {
5992 vFAIL("Missing braces on \\N{}");
5993 }
5994 name = RExC_parse+1;
5995 endbrace = strchr(RExC_parse, '}');
5996 if ( ! endbrace ) {
5997 RExC_parse++;
5998 vFAIL("Missing right brace on \\N{}");
5999 }
6000 RExC_parse = endbrace + 1;
6001
6002
6003 /* RExC_parse points at the beginning brace,
6004 endbrace points at the last */
6005 if ( name[0]=='U' && name[1]=='+' ) {
6006 /* its a "unicode hex" notation {U+89AB} */
6007 I32 fl = PERL_SCAN_ALLOW_UNDERSCORES
6008 | PERL_SCAN_DISALLOW_PREFIX
6009 | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
6010 UV cp;
6011 len = (STRLEN)(endbrace - name - 2);
6012 cp = grok_hex(name + 2, &len, &fl, NULL);
6013 if ( len != (STRLEN)(endbrace - name - 2) ) {
6014 cp = 0xFFFD;
6015 }
6016 if (cp > 0xff)
6017 RExC_utf8 = 1;
6018 if ( valuep ) {
6019 *valuep = cp;
6020 return NULL;
6021 }
6022 sv_str= Perl_newSVpvf_nocontext("%c",(int)cp);
6023 } else {
6024 /* fetch the charnames handler for this scope */
6025 HV * const table = GvHV(PL_hintgv);
6026 SV **cvp= table ?
6027 hv_fetchs(table, "charnames", FALSE) :
6028 NULL;
6029 SV *cv= cvp ? *cvp : NULL;
6030 HE *he_str;
6031 int count;
6032 /* create an SV with the name as argument */
6033 sv_name = newSVpvn(name, endbrace - name);
6034
6035 if (!table || !(PL_hints & HINT_LOCALIZE_HH)) {
6036 vFAIL2("Constant(\\N{%s}) unknown: "
6037 "(possibly a missing \"use charnames ...\")",
6038 SvPVX(sv_name));
6039 }
6040 if (!cvp || !SvOK(*cvp)) { /* when $^H{charnames} = undef; */
6041 vFAIL2("Constant(\\N{%s}): "
6042 "$^H{charnames} is not defined",SvPVX(sv_name));
6043 }
6044
6045
6046
6047 if (!RExC_charnames) {
6048 /* make sure our cache is allocated */
6049 RExC_charnames = newHV();
6050 sv_2mortal((SV*)RExC_charnames);
6051 }
6052 /* see if we have looked this one up before */
6053 he_str = hv_fetch_ent( RExC_charnames, sv_name, 0, 0 );
6054 if ( he_str ) {
6055 sv_str = HeVAL(he_str);
6056 cached = 1;
6057 } else {
6058 dSP ;
6059
6060 ENTER ;
6061 SAVETMPS ;
6062 PUSHMARK(SP) ;
6063
6064 XPUSHs(sv_name);
6065
6066 PUTBACK ;
6067
6068 count= call_sv(cv, G_SCALAR);
6069
6070 if (count == 1) { /* XXXX is this right? dmq */
6071 sv_str = POPs;
6072 SvREFCNT_inc_simple_void(sv_str);
6073 }
6074
6075 SPAGAIN ;
6076 PUTBACK ;
6077 FREETMPS ;
6078 LEAVE ;
6079
6080 if ( !sv_str || !SvOK(sv_str) ) {
6081 vFAIL2("Constant(\\N{%s}): Call to &{$^H{charnames}} "
6082 "did not return a defined value",SvPVX(sv_name));
6083 }
6084 if (hv_store_ent( RExC_charnames, sv_name, sv_str, 0))
6085 cached = 1;
6086 }
6087 }
6088 if (valuep) {
6089 char *p = SvPV(sv_str, len);
6090 if (len) {
6091 STRLEN numlen = 1;
6092 if ( SvUTF8(sv_str) ) {
6093 *valuep = utf8_to_uvchr((U8*)p, &numlen);
6094 if (*valuep > 0x7F)
6095 RExC_utf8 = 1;
6096 /* XXXX
6097 We have to turn on utf8 for high bit chars otherwise
6098 we get failures with
6099
6100 "ss" =~ /[\N{LATIN SMALL LETTER SHARP S}]/i
6101 "SS" =~ /[\N{LATIN SMALL LETTER SHARP S}]/i
6102
6103 This is different from what \x{} would do with the same
6104 codepoint, where the condition is > 0xFF.
6105 - dmq
6106 */
6107
6108
6109 } else {
6110 *valuep = (UV)*p;
6111 /* warn if we havent used the whole string? */
6112 }
6113 if (numlen<len && SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6114 vWARN2(RExC_parse,
6115 "Ignoring excess chars from \\N{%s} in character class",
6116 SvPVX(sv_name)
6117 );
6118 }
6119 } else if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6120 vWARN2(RExC_parse,
6121 "Ignoring zero length \\N{%s} in character class",
6122 SvPVX(sv_name)
6123 );
6124 }
6125 if (sv_name)
6126 SvREFCNT_dec(sv_name);
6127 if (!cached)
6128 SvREFCNT_dec(sv_str);
6129 return len ? NULL : (regnode *)&len;
6130 } else if(SvCUR(sv_str)) {
6131
6132 char *s;
6133 char *p, *pend;
6134 STRLEN charlen = 1;
6135 char * parse_start = name-3; /* needed for the offsets */
6136 GET_RE_DEBUG_FLAGS_DECL; /* needed for the offsets */
6137
6138 ret = reg_node(pRExC_state,
6139 (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
6140 s= STRING(ret);
6141
6142 if ( RExC_utf8 && !SvUTF8(sv_str) ) {
6143 sv_utf8_upgrade(sv_str);
6144 } else if ( !RExC_utf8 && SvUTF8(sv_str) ) {
6145 RExC_utf8= 1;
6146 }
6147
6148 p = SvPV(sv_str, len);
6149 pend = p + len;
6150 /* len is the length written, charlen is the size the char read */
6151 for ( len = 0; p < pend; p += charlen ) {
6152 if (UTF) {
6153 UV uvc = utf8_to_uvchr((U8*)p, &charlen);
6154 if (FOLD) {
6155 STRLEN foldlen,numlen;
6156 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
6157 uvc = toFOLD_uni(uvc, tmpbuf, &foldlen);
6158 /* Emit all the Unicode characters. */
6159
6160 for (foldbuf = tmpbuf;
6161 foldlen;
6162 foldlen -= numlen)
6163 {
6164 uvc = utf8_to_uvchr(foldbuf, &numlen);
6165 if (numlen > 0) {
6166 const STRLEN unilen = reguni(pRExC_state, uvc, s);
6167 s += unilen;
6168 len += unilen;
6169 /* In EBCDIC the numlen
6170 * and unilen can differ. */
6171 foldbuf += numlen;
6172 if (numlen >= foldlen)
6173 break;
6174 }
6175 else
6176 break; /* "Can't happen." */
6177 }
6178 } else {
6179 const STRLEN unilen = reguni(pRExC_state, uvc, s);
6180 if (unilen > 0) {
6181 s += unilen;
6182 len += unilen;
6183 }
6184 }
6185 } else {
6186 len++;
6187 REGC(*p, s++);
6188 }
6189 }
6190 if (SIZE_ONLY) {
6191 RExC_size += STR_SZ(len);
6192 } else {
6193 STR_LEN(ret) = len;
6194 RExC_emit += STR_SZ(len);
6195 }
6196 Set_Node_Cur_Length(ret); /* MJD */
6197 RExC_parse--;
6198 nextchar(pRExC_state);
6199 } else {
6200 ret = reg_node(pRExC_state,NOTHING);
6201 }
6202 if (!cached) {
6203 SvREFCNT_dec(sv_str);
6204 }
6205 if (sv_name) {
6206 SvREFCNT_dec(sv_name);
6207 }
6208 return ret;
6209
6210}
6211
6212
6213/*
6214 * reg_recode
6215 *
6216 * It returns the code point in utf8 for the value in *encp.
6217 * value: a code value in the source encoding
6218 * encp: a pointer to an Encode object
6219 *
6220 * If the result from Encode is not a single character,
6221 * it returns U+FFFD (Replacement character) and sets *encp to NULL.
6222 */
6223STATIC UV
6224S_reg_recode(pTHX_ const char value, SV **encp)
6225{
6226 STRLEN numlen = 1;
6227 SV * const sv = sv_2mortal(newSVpvn(&value, numlen));
6228 const char * const s = encp && *encp ? sv_recode_to_utf8(sv, *encp)
6229 : SvPVX(sv);
6230 const STRLEN newlen = SvCUR(sv);
6231 UV uv = UNICODE_REPLACEMENT;
6232
6233 if (newlen)
6234 uv = SvUTF8(sv)
6235 ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT)
6236 : *(U8*)s;
6237
6238 if (!newlen || numlen != newlen) {
6239 uv = UNICODE_REPLACEMENT;
6240 if (encp)
6241 *encp = NULL;
6242 }
6243 return uv;
6244}
6245
6246
6247/*
6248 - regatom - the lowest level
6249
6250 Try to identify anything special at the start of the pattern. If there
6251 is, then handle it as required. This may involve generating a single regop,
6252 such as for an assertion; or it may involve recursing, such as to
6253 handle a () structure.
6254
6255 If the string doesn't start with something special then we gobble up
6256 as much literal text as we can.
6257
6258 Once we have been able to handle whatever type of thing started the
6259 sequence, we return.
6260
6261 Note: we have to be careful with escapes, as they can be both literal
6262 and special, and in the case of \10 and friends can either, depending
6263 on context. Specifically there are two seperate switches for handling
6264 escape sequences, with the one for handling literal escapes requiring
6265 a dummy entry for all of the special escapes that are actually handled
6266 by the other.
6267*/
6268
6269STATIC regnode *
6270S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
6271{
6272 dVAR;
6273 register regnode *ret = NULL;
6274 I32 flags;
6275 char *parse_start = RExC_parse;
6276 GET_RE_DEBUG_FLAGS_DECL;
6277 DEBUG_PARSE("atom");
6278 *flagp = WORST; /* Tentatively. */
6279
6280
6281tryagain:
6282 switch (*RExC_parse) {
6283 case '^':
6284 RExC_seen_zerolen++;
6285 nextchar(pRExC_state);
6286 if (RExC_flags & RXf_PMf_MULTILINE)
6287 ret = reg_node(pRExC_state, MBOL);
6288 else if (RExC_flags & RXf_PMf_SINGLELINE)
6289 ret = reg_node(pRExC_state, SBOL);
6290 else
6291 ret = reg_node(pRExC_state, BOL);
6292 Set_Node_Length(ret, 1); /* MJD */
6293 break;
6294 case '$':
6295 nextchar(pRExC_state);
6296 if (*RExC_parse)
6297 RExC_seen_zerolen++;
6298 if (RExC_flags & RXf_PMf_MULTILINE)
6299 ret = reg_node(pRExC_state, MEOL);
6300 else if (RExC_flags & RXf_PMf_SINGLELINE)
6301 ret = reg_node(pRExC_state, SEOL);
6302 else
6303 ret = reg_node(pRExC_state, EOL);
6304 Set_Node_Length(ret, 1); /* MJD */
6305 break;
6306 case '.':
6307 nextchar(pRExC_state);
6308 if (RExC_flags & RXf_PMf_SINGLELINE)
6309 ret = reg_node(pRExC_state, SANY);
6310 else
6311 ret = reg_node(pRExC_state, REG_ANY);
6312 *flagp |= HASWIDTH|SIMPLE;
6313 RExC_naughty++;
6314 Set_Node_Length(ret, 1); /* MJD */
6315 break;
6316 case '[':
6317 {
6318 char * const oregcomp_parse = ++RExC_parse;
6319 ret = regclass(pRExC_state,depth+1);
6320 if (*RExC_parse != ']') {
6321 RExC_parse = oregcomp_parse;
6322 vFAIL("Unmatched [");
6323 }
6324 nextchar(pRExC_state);
6325 *flagp |= HASWIDTH|SIMPLE;
6326 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
6327 break;
6328 }
6329 case '(':
6330 nextchar(pRExC_state);
6331 ret = reg(pRExC_state, 1, &flags,depth+1);
6332 if (ret == NULL) {
6333 if (flags & TRYAGAIN) {
6334 if (RExC_parse == RExC_end) {
6335 /* Make parent create an empty node if needed. */
6336 *flagp |= TRYAGAIN;
6337 return(NULL);
6338 }
6339 goto tryagain;
6340 }
6341 return(NULL);
6342 }
6343 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE);
6344 break;
6345 case '|':
6346 case ')':
6347 if (flags & TRYAGAIN) {
6348 *flagp |= TRYAGAIN;
6349 return NULL;
6350 }
6351 vFAIL("Internal urp");
6352 /* Supposed to be caught earlier. */
6353 break;
6354 case '{':
6355 if (!regcurly(RExC_parse)) {
6356 RExC_parse++;
6357 goto defchar;
6358 }
6359 /* FALL THROUGH */
6360 case '?':
6361 case '+':
6362 case '*':
6363 RExC_parse++;
6364 vFAIL("Quantifier follows nothing");
6365 break;
6366 case '\\':
6367 /* Special Escapes
6368
6369 This switch handles escape sequences that resolve to some kind
6370 of special regop and not to literal text. Escape sequnces that
6371 resolve to literal text are handled below in the switch marked
6372 "Literal Escapes".
6373
6374 Every entry in this switch *must* have a corresponding entry
6375 in the literal escape switch. However, the opposite is not
6376 required, as the default for this switch is to jump to the
6377 literal text handling code.
6378 */
6379 switch (*++RExC_parse) {
6380 /* Special Escapes */
6381 case 'A':
6382 RExC_seen_zerolen++;
6383 ret = reg_node(pRExC_state, SBOL);
6384 *flagp |= SIMPLE;
6385 goto finish_meta_pat;
6386 case 'G':
6387 ret = reg_node(pRExC_state, GPOS);
6388 RExC_seen |= REG_SEEN_GPOS;
6389 *flagp |= SIMPLE;
6390 goto finish_meta_pat;
6391 case 'K':
6392 RExC_seen_zerolen++;
6393 ret = reg_node(pRExC_state, KEEPS);
6394 *flagp |= SIMPLE;
6395 goto finish_meta_pat;
6396 case 'Z':
6397 ret = reg_node(pRExC_state, SEOL);
6398 *flagp |= SIMPLE;
6399 RExC_seen_zerolen++; /* Do not optimize RE away */
6400 goto finish_meta_pat;
6401 case 'z':
6402 ret = reg_node(pRExC_state, EOS);
6403 *flagp |= SIMPLE;
6404 RExC_seen_zerolen++; /* Do not optimize RE away */
6405 goto finish_meta_pat;
6406 case 'C':
6407 ret = reg_node(pRExC_state, CANY);
6408 RExC_seen |= REG_SEEN_CANY;
6409 *flagp |= HASWIDTH|SIMPLE;
6410 goto finish_meta_pat;
6411 case 'X':
6412 ret = reg_node(pRExC_state, CLUMP);
6413 *flagp |= HASWIDTH;
6414 goto finish_meta_pat;
6415 case 'w':
6416 ret = reg_node(pRExC_state, (U8)(LOC ? ALNUML : ALNUM));
6417 *flagp |= HASWIDTH|SIMPLE;
6418 goto finish_meta_pat;
6419 case 'W':
6420 ret = reg_node(pRExC_state, (U8)(LOC ? NALNUML : NALNUM));
6421 *flagp |= HASWIDTH|SIMPLE;
6422 goto finish_meta_pat;
6423 case 'b':
6424 RExC_seen_zerolen++;
6425 RExC_seen |= REG_SEEN_LOOKBEHIND;
6426 ret = reg_node(pRExC_state, (U8)(LOC ? BOUNDL : BOUND));
6427 *flagp |= SIMPLE;
6428 goto finish_meta_pat;
6429 case 'B':
6430 RExC_seen_zerolen++;
6431 RExC_seen |= REG_SEEN_LOOKBEHIND;
6432 ret = reg_node(pRExC_state, (U8)(LOC ? NBOUNDL : NBOUND));
6433 *flagp |= SIMPLE;
6434 goto finish_meta_pat;
6435 case 's':
6436 ret = reg_node(pRExC_state, (U8)(LOC ? SPACEL : SPACE));
6437 *flagp |= HASWIDTH|SIMPLE;
6438 goto finish_meta_pat;
6439 case 'S':
6440 ret = reg_node(pRExC_state, (U8)(LOC ? NSPACEL : NSPACE));
6441 *flagp |= HASWIDTH|SIMPLE;
6442 goto finish_meta_pat;
6443 case 'd':
6444 ret = reg_node(pRExC_state, DIGIT);
6445 *flagp |= HASWIDTH|SIMPLE;
6446 goto finish_meta_pat;
6447 case 'D':
6448 ret = reg_node(pRExC_state, NDIGIT);
6449 *flagp |= HASWIDTH|SIMPLE;
6450 goto finish_meta_pat;
6451 case 'v':
6452 ret = reganode(pRExC_state, PRUNE, 0);
6453 ret->flags = 1;
6454 *flagp |= SIMPLE;
6455 goto finish_meta_pat;
6456 case 'V':
6457 ret = reganode(pRExC_state, SKIP, 0);
6458 ret->flags = 1;
6459 *flagp |= SIMPLE;
6460 finish_meta_pat:
6461 nextchar(pRExC_state);
6462 Set_Node_Length(ret, 2); /* MJD */
6463 break;
6464 case 'p':
6465 case 'P':
6466 {
6467 char* const oldregxend = RExC_end;
6468 char* parse_start = RExC_parse - 2;
6469
6470 if (RExC_parse[1] == '{') {
6471 /* a lovely hack--pretend we saw [\pX] instead */
6472 RExC_end = strchr(RExC_parse, '}');
6473 if (!RExC_end) {
6474 const U8 c = (U8)*RExC_parse;
6475 RExC_parse += 2;
6476 RExC_end = oldregxend;
6477 vFAIL2("Missing right brace on \\%c{}", c);
6478 }
6479 RExC_end++;
6480 }
6481 else {
6482 RExC_end = RExC_parse + 2;
6483 if (RExC_end > oldregxend)
6484 RExC_end = oldregxend;
6485 }
6486 RExC_parse--;
6487
6488 ret = regclass(pRExC_state,depth+1);
6489
6490 RExC_end = oldregxend;
6491 RExC_parse--;
6492
6493 Set_Node_Offset(ret, parse_start + 2);
6494 Set_Node_Cur_Length(ret);
6495 nextchar(pRExC_state);
6496 *flagp |= HASWIDTH|SIMPLE;
6497 }
6498 break;
6499 case 'N':
6500 /* Handle \N{NAME} here and not below because it can be
6501 multicharacter. join_exact() will join them up later on.
6502 Also this makes sure that things like /\N{BLAH}+/ and
6503 \N{BLAH} being multi char Just Happen. dmq*/
6504 ++RExC_parse;
6505 ret= reg_namedseq(pRExC_state, NULL);
6506 break;
6507 case 'k': /* Handle \k<NAME> and \k'NAME' */
6508 parse_named_seq:
6509 {
6510 char ch= RExC_parse[1];
6511 if (ch != '<' && ch != '\'' && ch != '{') {
6512 RExC_parse++;
6513 vFAIL2("Sequence %.2s... not terminated",parse_start);
6514 } else {
6515 /* this pretty much dupes the code for (?P=...) in reg(), if
6516 you change this make sure you change that */
6517 char* name_start = (RExC_parse += 2);
6518 U32 num = 0;
6519 SV *sv_dat = reg_scan_name(pRExC_state,
6520 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
6521 ch= (ch == '<') ? '>' : (ch == '{') ? '}' : '\'';
6522 if (RExC_parse == name_start || *RExC_parse != ch)
6523 vFAIL2("Sequence %.3s... not terminated",parse_start);
6524
6525 if (!SIZE_ONLY) {
6526 num = add_data( pRExC_state, 1, "S" );
6527 RExC_rxi->data->data[num]=(void*)sv_dat;
6528 SvREFCNT_inc(sv_dat);
6529 }
6530
6531 RExC_sawback = 1;
6532 ret = reganode(pRExC_state,
6533 (U8)(FOLD ? (LOC ? NREFFL : NREFF) : NREF),
6534 num);
6535 *flagp |= HASWIDTH;
6536
6537 /* override incorrect value set in reganode MJD */
6538 Set_Node_Offset(ret, parse_start+1);
6539 Set_Node_Cur_Length(ret); /* MJD */
6540 nextchar(pRExC_state);
6541
6542 }
6543 break;
6544 }
6545 case 'g':
6546 case '1': case '2': case '3': case '4':
6547 case '5': case '6': case '7': case '8': case '9':
6548 {
6549 I32 num;
6550 bool isg = *RExC_parse == 'g';
6551 bool isrel = 0;
6552 bool hasbrace = 0;
6553 if (isg) {
6554 RExC_parse++;
6555 if (*RExC_parse == '{') {
6556 RExC_parse++;
6557 hasbrace = 1;
6558 }
6559 if (*RExC_parse == '-') {
6560 RExC_parse++;
6561 isrel = 1;
6562 }
6563 if (hasbrace && !isDIGIT(*RExC_parse)) {
6564 if (isrel) RExC_parse--;
6565 RExC_parse -= 2;
6566 goto parse_named_seq;
6567 } }
6568 num = atoi(RExC_parse);
6569 if (isrel) {
6570 num = RExC_npar - num;
6571 if (num < 1)
6572 vFAIL("Reference to nonexistent or unclosed group");
6573 }
6574 if (!isg && num > 9 && num >= RExC_npar)
6575 goto defchar;
6576 else {
6577 char * const parse_start = RExC_parse - 1; /* MJD */
6578 while (isDIGIT(*RExC_parse))
6579 RExC_parse++;
6580 if (parse_start == RExC_parse - 1)
6581 vFAIL("Unterminated \\g... pattern");
6582 if (hasbrace) {
6583 if (*RExC_parse != '}')
6584 vFAIL("Unterminated \\g{...} pattern");
6585 RExC_parse++;
6586 }
6587 if (!SIZE_ONLY) {
6588 if (num > (I32)RExC_rx->nparens)
6589 vFAIL("Reference to nonexistent group");
6590 }
6591 RExC_sawback = 1;
6592 ret = reganode(pRExC_state,
6593 (U8)(FOLD ? (LOC ? REFFL : REFF) : REF),
6594 num);
6595 *flagp |= HASWIDTH;
6596
6597 /* override incorrect value set in reganode MJD */
6598 Set_Node_Offset(ret, parse_start+1);
6599 Set_Node_Cur_Length(ret); /* MJD */
6600 RExC_parse--;
6601 nextchar(pRExC_state);
6602 }
6603 }
6604 break;
6605 case '\0':
6606 if (RExC_parse >= RExC_end)
6607 FAIL("Trailing \\");
6608 /* FALL THROUGH */
6609 default:
6610 /* Do not generate "unrecognized" warnings here, we fall
6611 back into the quick-grab loop below */
6612 parse_start--;
6613 goto defchar;
6614 }
6615 break;
6616
6617 case '#':
6618 if (RExC_flags & RXf_PMf_EXTENDED) {
6619 while (RExC_parse < RExC_end && *RExC_parse != '\n')
6620 RExC_parse++;
6621 if (RExC_parse < RExC_end)
6622 goto tryagain;
6623 }
6624 /* FALL THROUGH */
6625
6626 default: {
6627 register STRLEN len;
6628 register UV ender;
6629 register char *p;
6630 char *s;
6631 STRLEN foldlen;
6632 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
6633
6634 parse_start = RExC_parse - 1;
6635
6636 RExC_parse++;
6637
6638 defchar:
6639 ender = 0;
6640 ret = reg_node(pRExC_state,
6641 (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
6642 s = STRING(ret);
6643 for (len = 0, p = RExC_parse - 1;
6644 len < 127 && p < RExC_end;
6645 len++)
6646 {
6647 char * const oldp = p;
6648
6649 if (RExC_flags & RXf_PMf_EXTENDED)
6650 p = regwhite(p, RExC_end);
6651 switch (*p) {
6652 case '^':
6653 case '$':
6654 case '.':
6655 case '[':
6656 case '(':
6657 case ')':
6658 case '|':
6659 goto loopdone;
6660 case '\\':
6661 /* Literal Escapes Switch
6662
6663 This switch is meant to handle escape sequences that
6664 resolve to a literal character.
6665
6666 Every escape sequence that represents something
6667 else, like an assertion or a char class, is handled
6668 in the switch marked 'Special Escapes' above in this
6669 routine, but also has an entry here as anything that
6670 isn't explicitly mentioned here will be treated as
6671 an unescaped equivalent literal.
6672 */
6673
6674 switch (*++p) {
6675 /* These are all the special escapes. */
6676 case 'A': /* Start assertion */
6677 case 'b': case 'B': /* Word-boundary assertion*/
6678 case 'C': /* Single char !DANGEROUS! */
6679 case 'd': case 'D': /* digit class */
6680 case 'g': case 'G': /* generic-backref, pos assertion */
6681 case 'k': case 'K': /* named backref, keep marker */
6682 case 'N': /* named char sequence */
6683 case 'p': case 'P': /* unicode property */
6684 case 's': case 'S': /* space class */
6685 case 'v': case 'V': /* (*PRUNE) and (*SKIP) */
6686 case 'w': case 'W': /* word class */
6687 case 'X': /* eXtended Unicode "combining character sequence" */
6688 case 'z': case 'Z': /* End of line/string assertion */
6689 --p;
6690 goto loopdone;
6691
6692 /* Anything after here is an escape that resolves to a
6693 literal. (Except digits, which may or may not)
6694 */
6695 case 'n':
6696 ender = '\n';
6697 p++;
6698 break;
6699 case 'r':
6700 ender = '\r';
6701 p++;
6702 break;
6703 case 't':
6704 ender = '\t';
6705 p++;
6706 break;
6707 case 'f':
6708 ender = '\f';
6709 p++;
6710 break;
6711 case 'e':
6712 ender = ASCII_TO_NATIVE('\033');
6713 p++;
6714 break;
6715 case 'a':
6716 ender = ASCII_TO_NATIVE('\007');
6717 p++;
6718 break;
6719 case 'x':
6720 if (*++p == '{') {
6721 char* const e = strchr(p, '}');
6722
6723 if (!e) {
6724 RExC_parse = p + 1;
6725 vFAIL("Missing right brace on \\x{}");
6726 }
6727 else {
6728 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
6729 | PERL_SCAN_DISALLOW_PREFIX;
6730 STRLEN numlen = e - p - 1;
6731 ender = grok_hex(p + 1, &numlen, &flags, NULL);
6732 if (ender > 0xff)
6733 RExC_utf8 = 1;
6734 p = e + 1;
6735 }
6736 }
6737 else {
6738 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
6739 STRLEN numlen = 2;
6740 ender = grok_hex(p, &numlen, &flags, NULL);
6741 p += numlen;
6742 }
6743 if (PL_encoding && ender < 0x100)
6744 goto recode_encoding;
6745 break;
6746 case 'c':
6747 p++;
6748 ender = UCHARAT(p++);
6749 ender = toCTRL(ender);
6750 break;
6751 case '0': case '1': case '2': case '3':case '4':
6752 case '5': case '6': case '7': case '8':case '9':
6753 if (*p == '0' ||
6754 (isDIGIT(p[1]) && atoi(p) >= RExC_npar) ) {
6755 I32 flags = 0;
6756 STRLEN numlen = 3;
6757 ender = grok_oct(p, &numlen, &flags, NULL);
6758 p += numlen;
6759 }
6760 else {
6761 --p;
6762 goto loopdone;
6763 }
6764 if (PL_encoding && ender < 0x100)
6765 goto recode_encoding;
6766 break;
6767 recode_encoding:
6768 {
6769 SV* enc = PL_encoding;
6770 ender = reg_recode((const char)(U8)ender, &enc);
6771 if (!enc && SIZE_ONLY && ckWARN(WARN_REGEXP))
6772 vWARN(p, "Invalid escape in the specified encoding");
6773 RExC_utf8 = 1;
6774 }
6775 break;
6776 case '\0':
6777 if (p >= RExC_end)
6778 FAIL("Trailing \\");
6779 /* FALL THROUGH */
6780 default:
6781 if (!SIZE_ONLY&& isALPHA(*p) && ckWARN(WARN_REGEXP))
6782 vWARN2(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
6783 goto normal_default;
6784 }
6785 break;
6786 default:
6787 normal_default:
6788 if (UTF8_IS_START(*p) && UTF) {
6789 STRLEN numlen;
6790 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
6791 &numlen, UTF8_ALLOW_DEFAULT);
6792 p += numlen;
6793 }
6794 else
6795 ender = *p++;
6796 break;
6797 }
6798 if (RExC_flags & RXf_PMf_EXTENDED)
6799 p = regwhite(p, RExC_end);
6800 if (UTF && FOLD) {
6801 /* Prime the casefolded buffer. */
6802 ender = toFOLD_uni(ender, tmpbuf, &foldlen);
6803 }
6804 if (ISMULT2(p)) { /* Back off on ?+*. */
6805 if (len)
6806 p = oldp;
6807 else if (UTF) {
6808 if (FOLD) {
6809 /* Emit all the Unicode characters. */
6810 STRLEN numlen;
6811 for (foldbuf = tmpbuf;
6812 foldlen;
6813 foldlen -= numlen) {
6814 ender = utf8_to_uvchr(foldbuf, &numlen);
6815 if (numlen > 0) {
6816 const STRLEN unilen = reguni(pRExC_state, ender, s);
6817 s += unilen;
6818 len += unilen;
6819 /* In EBCDIC the numlen
6820 * and unilen can differ. */
6821 foldbuf += numlen;
6822 if (numlen >= foldlen)
6823 break;
6824 }
6825 else
6826 break; /* "Can't happen." */
6827 }
6828 }
6829 else {
6830 const STRLEN unilen = reguni(pRExC_state, ender, s);
6831 if (unilen > 0) {
6832 s += unilen;
6833 len += unilen;
6834 }
6835 }
6836 }
6837 else {
6838 len++;
6839 REGC((char)ender, s++);
6840 }
6841 break;
6842 }
6843 if (UTF) {
6844 if (FOLD) {
6845 /* Emit all the Unicode characters. */
6846 STRLEN numlen;
6847 for (foldbuf = tmpbuf;
6848 foldlen;
6849 foldlen -= numlen) {
6850 ender = utf8_to_uvchr(foldbuf, &numlen);
6851 if (numlen > 0) {
6852 const STRLEN unilen = reguni(pRExC_state, ender, s);
6853 len += unilen;
6854 s += unilen;
6855 /* In EBCDIC the numlen
6856 * and unilen can differ. */
6857 foldbuf += numlen;
6858 if (numlen >= foldlen)
6859 break;
6860 }
6861 else
6862 break;
6863 }
6864 }
6865 else {
6866 const STRLEN unilen = reguni(pRExC_state, ender, s);
6867 if (unilen > 0) {
6868 s += unilen;
6869 len += unilen;
6870 }
6871 }
6872 len--;
6873 }
6874 else
6875 REGC((char)ender, s++);
6876 }
6877 loopdone:
6878 RExC_parse = p - 1;
6879 Set_Node_Cur_Length(ret); /* MJD */
6880 nextchar(pRExC_state);
6881 {
6882 /* len is STRLEN which is unsigned, need to copy to signed */
6883 IV iv = len;
6884 if (iv < 0)
6885 vFAIL("Internal disaster");
6886 }
6887 if (len > 0)
6888 *flagp |= HASWIDTH;
6889 if (len == 1 && UNI_IS_INVARIANT(ender))
6890 *flagp |= SIMPLE;
6891
6892 if (SIZE_ONLY)
6893 RExC_size += STR_SZ(len);
6894 else {
6895 STR_LEN(ret) = len;
6896 RExC_emit += STR_SZ(len);
6897 }
6898 }
6899 break;
6900 }
6901
6902 return(ret);
6903}
6904
6905STATIC char *
6906S_regwhite(char *p, const char *e)
6907{
6908 while (p < e) {
6909 if (isSPACE(*p))
6910 ++p;
6911 else if (*p == '#') {
6912 do {
6913 p++;
6914 } while (p < e && *p != '\n');
6915 }
6916 else
6917 break;
6918 }
6919 return p;
6920}
6921
6922/* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
6923 Character classes ([:foo:]) can also be negated ([:^foo:]).
6924 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
6925 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
6926 but trigger failures because they are currently unimplemented. */
6927
6928#define POSIXCC_DONE(c) ((c) == ':')
6929#define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
6930#define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
6931
6932STATIC I32
6933S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
6934{
6935 dVAR;
6936 I32 namedclass = OOB_NAMEDCLASS;
6937
6938 if (value == '[' && RExC_parse + 1 < RExC_end &&
6939 /* I smell either [: or [= or [. -- POSIX has been here, right? */
6940 POSIXCC(UCHARAT(RExC_parse))) {
6941 const char c = UCHARAT(RExC_parse);
6942 char* const s = RExC_parse++;
6943
6944 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
6945 RExC_parse++;
6946 if (RExC_parse == RExC_end)
6947 /* Grandfather lone [:, [=, [. */
6948 RExC_parse = s;
6949 else {
6950 const char* const t = RExC_parse++; /* skip over the c */
6951 assert(*t == c);
6952
6953 if (UCHARAT(RExC_parse) == ']') {
6954 const char *posixcc = s + 1;
6955 RExC_parse++; /* skip over the ending ] */
6956
6957 if (*s == ':') {
6958 const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
6959 const I32 skip = t - posixcc;
6960
6961 /* Initially switch on the length of the name. */
6962 switch (skip) {
6963 case 4:
6964 if (memEQ(posixcc, "word", 4)) /* this is not POSIX, this is the Perl \w */
6965 namedclass = complement ? ANYOF_NALNUM : ANYOF_ALNUM;
6966 break;
6967 case 5:
6968 /* Names all of length 5. */
6969 /* alnum alpha ascii blank cntrl digit graph lower
6970 print punct space upper */
6971 /* Offset 4 gives the best switch position. */
6972 switch (posixcc[4]) {
6973 case 'a':
6974 if (memEQ(posixcc, "alph", 4)) /* alpha */
6975 namedclass = complement ? ANYOF_NALPHA : ANYOF_ALPHA;
6976 break;
6977 case 'e':
6978 if (memEQ(posixcc, "spac", 4)) /* space */
6979 namedclass = complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
6980 break;
6981 case 'h':
6982 if (memEQ(posixcc, "grap", 4)) /* graph */
6983 namedclass = complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
6984 break;
6985 case 'i':
6986 if (memEQ(posixcc, "asci", 4)) /* ascii */
6987 namedclass = complement ? ANYOF_NASCII : ANYOF_ASCII;
6988 break;
6989 case 'k':
6990 if (memEQ(posixcc, "blan", 4)) /* blank */
6991 namedclass = complement ? ANYOF_NBLANK : ANYOF_BLANK;
6992 break;
6993 case 'l':
6994 if (memEQ(posixcc, "cntr", 4)) /* cntrl */
6995 namedclass = complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
6996 break;
6997 case 'm':
6998 if (memEQ(posixcc, "alnu", 4)) /* alnum */
6999 namedclass = complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
7000 break;
7001 case 'r':
7002 if (memEQ(posixcc, "lowe", 4)) /* lower */
7003 namedclass = complement ? ANYOF_NLOWER : ANYOF_LOWER;
7004 else if (memEQ(posixcc, "uppe", 4)) /* upper */
7005 namedclass = complement ? ANYOF_NUPPER : ANYOF_UPPER;
7006 break;
7007 case 't':
7008 if (memEQ(posixcc, "digi", 4)) /* digit */
7009 namedclass = complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
7010 else if (memEQ(posixcc, "prin", 4)) /* print */
7011 namedclass = complement ? ANYOF_NPRINT : ANYOF_PRINT;
7012 else if (memEQ(posixcc, "punc", 4)) /* punct */
7013 namedclass = complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
7014 break;
7015 }
7016 break;
7017 case 6:
7018 if (memEQ(posixcc, "xdigit", 6))
7019 namedclass = complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
7020 break;
7021 }
7022
7023 if (namedclass == OOB_NAMEDCLASS)
7024 Simple_vFAIL3("POSIX class [:%.*s:] unknown",
7025 t - s - 1, s + 1);
7026 assert (posixcc[skip] == ':');
7027 assert (posixcc[skip+1] == ']');
7028 } else if (!SIZE_ONLY) {
7029 /* [[=foo=]] and [[.foo.]] are still future. */
7030
7031 /* adjust RExC_parse so the warning shows after
7032 the class closes */
7033 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
7034 RExC_parse++;
7035 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
7036 }
7037 } else {
7038 /* Maternal grandfather:
7039 * "[:" ending in ":" but not in ":]" */
7040 RExC_parse = s;
7041 }
7042 }
7043 }
7044
7045 return namedclass;
7046}
7047
7048STATIC void
7049S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
7050{
7051 dVAR;
7052 if (POSIXCC(UCHARAT(RExC_parse))) {
7053 const char *s = RExC_parse;
7054 const char c = *s++;
7055
7056 while (isALNUM(*s))
7057 s++;
7058 if (*s && c == *s && s[1] == ']') {
7059 if (ckWARN(WARN_REGEXP))
7060 vWARN3(s+2,
7061 "POSIX syntax [%c %c] belongs inside character classes",
7062 c, c);
7063
7064 /* [[=foo=]] and [[.foo.]] are still future. */
7065 if (POSIXCC_NOTYET(c)) {
7066 /* adjust RExC_parse so the error shows after
7067 the class closes */
7068 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
7069 NOOP;
7070 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
7071 }
7072 }
7073 }
7074}
7075
7076
7077#define _C_C_T_(NAME,TEST,WORD) \
7078ANYOF_##NAME: \
7079 if (LOC) \
7080 ANYOF_CLASS_SET(ret, ANYOF_##NAME); \
7081 else { \
7082 for (value = 0; value < 256; value++) \
7083 if (TEST) \
7084 ANYOF_BITMAP_SET(ret, value); \
7085 } \
7086 yesno = '+'; \
7087 what = WORD; \
7088 break; \
7089case ANYOF_N##NAME: \
7090 if (LOC) \
7091 ANYOF_CLASS_SET(ret, ANYOF_N##NAME); \
7092 else { \
7093 for (value = 0; value < 256; value++) \
7094 if (!TEST) \
7095 ANYOF_BITMAP_SET(ret, value); \
7096 } \
7097 yesno = '!'; \
7098 what = WORD; \
7099 break
7100
7101
7102/*
7103 parse a class specification and produce either an ANYOF node that
7104 matches the pattern or if the pattern matches a single char only and
7105 that char is < 256 and we are case insensitive then we produce an
7106 EXACT node instead.
7107*/
7108
7109STATIC regnode *
7110S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth)
7111{
7112 dVAR;
7113 register UV value = 0;
7114 register UV nextvalue;
7115 register IV prevvalue = OOB_UNICODE;
7116 register IV range = 0;
7117 register regnode *ret;
7118 STRLEN numlen;
7119 IV namedclass;
7120 char *rangebegin = NULL;
7121 bool need_class = 0;
7122 SV *listsv = NULL;
7123 UV n;
7124 bool optimize_invert = TRUE;
7125 AV* unicode_alternate = NULL;
7126#ifdef EBCDIC
7127 UV literal_endpoint = 0;
7128#endif
7129 UV stored = 0; /* number of chars stored in the class */
7130
7131 regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
7132 case we need to change the emitted regop to an EXACT. */
7133 const char * orig_parse = RExC_parse;
7134 GET_RE_DEBUG_FLAGS_DECL;
7135#ifndef DEBUGGING
7136 PERL_UNUSED_ARG(depth);
7137#endif
7138
7139 DEBUG_PARSE("clas");
7140
7141 /* Assume we are going to generate an ANYOF node. */
7142 ret = reganode(pRExC_state, ANYOF, 0);
7143
7144 if (!SIZE_ONLY)
7145 ANYOF_FLAGS(ret) = 0;
7146
7147 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
7148 RExC_naughty++;
7149 RExC_parse++;
7150 if (!SIZE_ONLY)
7151 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
7152 }
7153
7154 if (SIZE_ONLY) {
7155 RExC_size += ANYOF_SKIP;
7156 listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
7157 }
7158 else {
7159 RExC_emit += ANYOF_SKIP;
7160 if (FOLD)
7161 ANYOF_FLAGS(ret) |= ANYOF_FOLD;
7162 if (LOC)
7163 ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
7164 ANYOF_BITMAP_ZERO(ret);
7165 listsv = newSVpvs("# comment\n");
7166 }
7167
7168 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
7169
7170 if (!SIZE_ONLY && POSIXCC(nextvalue))
7171 checkposixcc(pRExC_state);
7172
7173 /* allow 1st char to be ] (allowing it to be - is dealt with later) */
7174 if (UCHARAT(RExC_parse) == ']')
7175 goto charclassloop;
7176
7177parseit:
7178 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
7179
7180 charclassloop:
7181
7182 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
7183
7184 if (!range)
7185 rangebegin = RExC_parse;
7186 if (UTF) {
7187 value = utf8n_to_uvchr((U8*)RExC_parse,
7188 RExC_end - RExC_parse,
7189 &numlen, UTF8_ALLOW_DEFAULT);
7190 RExC_parse += numlen;
7191 }
7192 else
7193 value = UCHARAT(RExC_parse++);
7194
7195 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
7196 if (value == '[' && POSIXCC(nextvalue))
7197 namedclass = regpposixcc(pRExC_state, value);
7198 else if (value == '\\') {
7199 if (UTF) {
7200 value = utf8n_to_uvchr((U8*)RExC_parse,
7201 RExC_end - RExC_parse,
7202 &numlen, UTF8_ALLOW_DEFAULT);
7203 RExC_parse += numlen;
7204 }
7205 else
7206 value = UCHARAT(RExC_parse++);
7207 /* Some compilers cannot handle switching on 64-bit integer
7208 * values, therefore value cannot be an UV. Yes, this will
7209 * be a problem later if we want switch on Unicode.
7210 * A similar issue a little bit later when switching on
7211 * namedclass. --jhi */
7212 switch ((I32)value) {
7213 case 'w': namedclass = ANYOF_ALNUM; break;
7214 case 'W': namedclass = ANYOF_NALNUM; break;
7215 case 's': namedclass = ANYOF_SPACE; break;
7216 case 'S': namedclass = ANYOF_NSPACE; break;
7217 case 'd': namedclass = ANYOF_DIGIT; break;
7218 case 'D': namedclass = ANYOF_NDIGIT; break;
7219 case 'N': /* Handle \N{NAME} in class */
7220 {
7221 /* We only pay attention to the first char of
7222 multichar strings being returned. I kinda wonder
7223 if this makes sense as it does change the behaviour
7224 from earlier versions, OTOH that behaviour was broken
7225 as well. */
7226 UV v; /* value is register so we cant & it /grrr */
7227 if (reg_namedseq(pRExC_state, &v)) {
7228 goto parseit;
7229 }
7230 value= v;
7231 }
7232 break;
7233 case 'p':
7234 case 'P':
7235 {
7236 char *e;
7237 if (RExC_parse >= RExC_end)
7238 vFAIL2("Empty \\%c{}", (U8)value);
7239 if (*RExC_parse == '{') {
7240 const U8 c = (U8)value;
7241 e = strchr(RExC_parse++, '}');
7242 if (!e)
7243 vFAIL2("Missing right brace on \\%c{}", c);
7244 while (isSPACE(UCHARAT(RExC_parse)))
7245 RExC_parse++;
7246 if (e == RExC_parse)
7247 vFAIL2("Empty \\%c{}", c);
7248 n = e - RExC_parse;
7249 while (isSPACE(UCHARAT(RExC_parse + n - 1)))
7250 n--;
7251 }
7252 else {
7253 e = RExC_parse;
7254 n = 1;
7255 }
7256 if (!SIZE_ONLY) {
7257 if (UCHARAT(RExC_parse) == '^') {
7258 RExC_parse++;
7259 n--;
7260 value = value == 'p' ? 'P' : 'p'; /* toggle */
7261 while (isSPACE(UCHARAT(RExC_parse))) {
7262 RExC_parse++;
7263 n--;
7264 }
7265 }
7266 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%.*s\n",
7267 (value=='p' ? '+' : '!'), (int)n, RExC_parse);
7268 }
7269 RExC_parse = e + 1;
7270 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
7271 namedclass = ANYOF_MAX; /* no official name, but it's named */
7272 }
7273 break;
7274 case 'n': value = '\n'; break;
7275 case 'r': value = '\r'; break;
7276 case 't': value = '\t'; break;
7277 case 'f': value = '\f'; break;
7278 case 'b': value = '\b'; break;
7279 case 'e': value = ASCII_TO_NATIVE('\033');break;
7280 case 'a': value = ASCII_TO_NATIVE('\007');break;
7281 case 'x':
7282 if (*RExC_parse == '{') {
7283 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
7284 | PERL_SCAN_DISALLOW_PREFIX;
7285 char * const e = strchr(RExC_parse++, '}');
7286 if (!e)
7287 vFAIL("Missing right brace on \\x{}");
7288
7289 numlen = e - RExC_parse;
7290 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
7291 RExC_parse = e + 1;
7292 }
7293 else {
7294 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
7295 numlen = 2;
7296 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
7297 RExC_parse += numlen;
7298 }
7299 if (PL_encoding && value < 0x100)
7300 goto recode_encoding;
7301 break;
7302 case 'c':
7303 value = UCHARAT(RExC_parse++);
7304 value = toCTRL(value);
7305 break;
7306 case '0': case '1': case '2': case '3': case '4':
7307 case '5': case '6': case '7': case '8': case '9':
7308 {
7309 I32 flags = 0;
7310 numlen = 3;
7311 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
7312 RExC_parse += numlen;
7313 if (PL_encoding && value < 0x100)
7314 goto recode_encoding;
7315 break;
7316 }
7317 recode_encoding:
7318 {
7319 SV* enc = PL_encoding;
7320 value = reg_recode((const char)(U8)value, &enc);
7321 if (!enc && SIZE_ONLY && ckWARN(WARN_REGEXP))
7322 vWARN(RExC_parse,
7323 "Invalid escape in the specified encoding");
7324 break;
7325 }
7326 default:
7327 if (!SIZE_ONLY && isALPHA(value) && ckWARN(WARN_REGEXP))
7328 vWARN2(RExC_parse,
7329 "Unrecognized escape \\%c in character class passed through",
7330 (int)value);
7331 break;
7332 }
7333 } /* end of \blah */
7334#ifdef EBCDIC
7335 else
7336 literal_endpoint++;
7337#endif
7338
7339 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
7340
7341 if (!SIZE_ONLY && !need_class)
7342 ANYOF_CLASS_ZERO(ret);
7343
7344 need_class = 1;
7345
7346 /* a bad range like a-\d, a-[:digit:] ? */
7347 if (range) {
7348 if (!SIZE_ONLY) {
7349 if (ckWARN(WARN_REGEXP)) {
7350 const int w =
7351 RExC_parse >= rangebegin ?
7352 RExC_parse - rangebegin : 0;
7353 vWARN4(RExC_parse,
7354 "False [] range \"%*.*s\"",
7355 w, w, rangebegin);
7356 }
7357 if (prevvalue < 256) {
7358 ANYOF_BITMAP_SET(ret, prevvalue);
7359 ANYOF_BITMAP_SET(ret, '-');
7360 }
7361 else {
7362 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
7363 Perl_sv_catpvf(aTHX_ listsv,
7364 "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
7365 }
7366 }
7367
7368 range = 0; /* this was not a true range */
7369 }
7370
7371
7372
7373 if (!SIZE_ONLY) {
7374 const char *what = NULL;
7375 char yesno = 0;
7376
7377 if (namedclass > OOB_NAMEDCLASS)
7378 optimize_invert = FALSE;
7379 /* Possible truncation here but in some 64-bit environments
7380 * the compiler gets heartburn about switch on 64-bit values.
7381 * A similar issue a little earlier when switching on value.
7382 * --jhi */
7383 switch ((I32)namedclass) {
7384 case _C_C_T_(ALNUM, isALNUM(value), "Word");
7385 case _C_C_T_(ALNUMC, isALNUMC(value), "Alnum");
7386 case _C_C_T_(ALPHA, isALPHA(value), "Alpha");
7387 case _C_C_T_(BLANK, isBLANK(value), "Blank");
7388 case _C_C_T_(CNTRL, isCNTRL(value), "Cntrl");
7389 case _C_C_T_(GRAPH, isGRAPH(value), "Graph");
7390 case _C_C_T_(LOWER, isLOWER(value), "Lower");
7391 case _C_C_T_(PRINT, isPRINT(value), "Print");
7392 case _C_C_T_(PSXSPC, isPSXSPC(value), "Space");
7393 case _C_C_T_(PUNCT, isPUNCT(value), "Punct");
7394 case _C_C_T_(SPACE, isSPACE(value), "SpacePerl");
7395 case _C_C_T_(UPPER, isUPPER(value), "Upper");
7396 case _C_C_T_(XDIGIT, isXDIGIT(value), "XDigit");
7397 case ANYOF_ASCII:
7398 if (LOC)
7399 ANYOF_CLASS_SET(ret, ANYOF_ASCII);
7400 else {
7401#ifndef EBCDIC
7402 for (value = 0; value < 128; value++)
7403 ANYOF_BITMAP_SET(ret, value);
7404#else /* EBCDIC */
7405 for (value = 0; value < 256; value++) {
7406 if (isASCII(value))
7407 ANYOF_BITMAP_SET(ret, value);
7408 }
7409#endif /* EBCDIC */
7410 }
7411 yesno = '+';
7412 what = "ASCII";
7413 break;
7414 case ANYOF_NASCII:
7415 if (LOC)
7416 ANYOF_CLASS_SET(ret, ANYOF_NASCII);
7417 else {
7418#ifndef EBCDIC
7419 for (value = 128; value < 256; value++)
7420 ANYOF_BITMAP_SET(ret, value);
7421#else /* EBCDIC */
7422 for (value = 0; value < 256; value++) {
7423 if (!isASCII(value))
7424 ANYOF_BITMAP_SET(ret, value);
7425 }
7426#endif /* EBCDIC */
7427 }
7428 yesno = '!';
7429 what = "ASCII";
7430 break;
7431 case ANYOF_DIGIT:
7432 if (LOC)
7433 ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
7434 else {
7435 /* consecutive digits assumed */
7436 for (value = '0'; value <= '9'; value++)
7437 ANYOF_BITMAP_SET(ret, value);
7438 }
7439 yesno = '+';
7440 what = "Digit";
7441 break;
7442 case ANYOF_NDIGIT:
7443 if (LOC)
7444 ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
7445 else {
7446 /* consecutive digits assumed */
7447 for (value = 0; value < '0'; value++)
7448 ANYOF_BITMAP_SET(ret, value);
7449 for (value = '9' + 1; value < 256; value++)
7450 ANYOF_BITMAP_SET(ret, value);
7451 }
7452 yesno = '!';
7453 what = "Digit";
7454 break;
7455 case ANYOF_MAX:
7456 /* this is to handle \p and \P */
7457 break;
7458 default:
7459 vFAIL("Invalid [::] class");
7460 break;
7461 }
7462 if (what) {
7463 /* Strings such as "+utf8::isWord\n" */
7464 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::Is%s\n", yesno, what);
7465 }
7466 if (LOC)
7467 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
7468 continue;
7469 }
7470 } /* end of namedclass \blah */
7471
7472 if (range) {
7473 if (prevvalue > (IV)value) /* b-a */ {
7474 const int w = RExC_parse - rangebegin;
7475 Simple_vFAIL4("Invalid [] range \"%*.*s\"", w, w, rangebegin);
7476 range = 0; /* not a valid range */
7477 }
7478 }
7479 else {
7480 prevvalue = value; /* save the beginning of the range */
7481 if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
7482 RExC_parse[1] != ']') {
7483 RExC_parse++;
7484
7485 /* a bad range like \w-, [:word:]- ? */
7486 if (namedclass > OOB_NAMEDCLASS) {
7487 if (ckWARN(WARN_REGEXP)) {
7488 const int w =
7489 RExC_parse >= rangebegin ?
7490 RExC_parse - rangebegin : 0;
7491 vWARN4(RExC_parse,
7492 "False [] range \"%*.*s\"",
7493 w, w, rangebegin);
7494 }
7495 if (!SIZE_ONLY)
7496 ANYOF_BITMAP_SET(ret, '-');
7497 } else
7498 range = 1; /* yeah, it's a range! */
7499 continue; /* but do it the next time */
7500 }
7501 }
7502
7503 /* now is the next time */
7504 /*stored += (value - prevvalue + 1);*/
7505 if (!SIZE_ONLY) {
7506 if (prevvalue < 256) {
7507 const IV ceilvalue = value < 256 ? value : 255;
7508 IV i;
7509#ifdef EBCDIC
7510 /* In EBCDIC [\x89-\x91] should include
7511 * the \x8e but [i-j] should not. */
7512 if (literal_endpoint == 2 &&
7513 ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
7514 (isUPPER(prevvalue) && isUPPER(ceilvalue))))
7515 {
7516 if (isLOWER(prevvalue)) {
7517 for (i = prevvalue; i <= ceilvalue; i++)
7518 if (isLOWER(i))
7519 ANYOF_BITMAP_SET(ret, i);
7520 } else {
7521 for (i = prevvalue; i <= ceilvalue; i++)
7522 if (isUPPER(i))
7523 ANYOF_BITMAP_SET(ret, i);
7524 }
7525 }
7526 else
7527#endif
7528 for (i = prevvalue; i <= ceilvalue; i++) {
7529 if (!ANYOF_BITMAP_TEST(ret,i)) {
7530 stored++;
7531 ANYOF_BITMAP_SET(ret, i);
7532 }
7533 }
7534 }
7535 if (value > 255 || UTF) {
7536 const UV prevnatvalue = NATIVE_TO_UNI(prevvalue);
7537 const UV natvalue = NATIVE_TO_UNI(value);
7538 stored+=2; /* can't optimize this class */
7539 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
7540 if (prevnatvalue < natvalue) { /* what about > ? */
7541 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
7542 prevnatvalue, natvalue);
7543 }
7544 else if (prevnatvalue == natvalue) {
7545 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue);
7546 if (FOLD) {
7547 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
7548 STRLEN foldlen;
7549 const UV f = to_uni_fold(natvalue, foldbuf, &foldlen);
7550
7551#ifdef EBCDIC /* RD t/uni/fold ff and 6b */
7552 if (RExC_precomp[0] == ':' &&
7553 RExC_precomp[1] == '[' &&
7554 (f == 0xDF || f == 0x92)) {
7555 f = NATIVE_TO_UNI(f);
7556 }
7557#endif
7558 /* If folding and foldable and a single
7559 * character, insert also the folded version
7560 * to the charclass. */
7561 if (f != value) {
7562#ifdef EBCDIC /* RD tunifold ligatures s,t fb05, fb06 */
7563 if ((RExC_precomp[0] == ':' &&
7564 RExC_precomp[1] == '[' &&
7565 (f == 0xA2 &&
7566 (value == 0xFB05 || value == 0xFB06))) ?
7567 foldlen == ((STRLEN)UNISKIP(f) - 1) :
7568 foldlen == (STRLEN)UNISKIP(f) )
7569#else
7570 if (foldlen == (STRLEN)UNISKIP(f))
7571#endif
7572 Perl_sv_catpvf(aTHX_ listsv,
7573 "%04"UVxf"\n", f);
7574 else {
7575 /* Any multicharacter foldings
7576 * require the following transform:
7577 * [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst)
7578 * where E folds into "pq" and F folds
7579 * into "rst", all other characters
7580 * fold to single characters. We save
7581 * away these multicharacter foldings,
7582 * to be later saved as part of the
7583 * additional "s" data. */
7584 SV *sv;
7585
7586 if (!unicode_alternate)
7587 unicode_alternate = newAV();
7588 sv = newSVpvn((char*)foldbuf, foldlen);
7589 SvUTF8_on(sv);
7590 av_push(unicode_alternate, sv);
7591 }
7592 }
7593
7594 /* If folding and the value is one of the Greek
7595 * sigmas insert a few more sigmas to make the
7596 * folding rules of the sigmas to work right.
7597 * Note that not all the possible combinations
7598 * are handled here: some of them are handled
7599 * by the standard folding rules, and some of
7600 * them (literal or EXACTF cases) are handled
7601 * during runtime in regexec.c:S_find_byclass(). */
7602 if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) {
7603 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
7604 (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA);
7605 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
7606 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
7607 }
7608 else if (value == UNICODE_GREEK_CAPITAL_LETTER_SIGMA)
7609 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
7610 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
7611 }
7612 }
7613 }
7614#ifdef EBCDIC
7615 literal_endpoint = 0;
7616#endif
7617 }
7618
7619 range = 0; /* this range (if it was one) is done now */
7620 }
7621
7622 if (need_class) {
7623 ANYOF_FLAGS(ret) |= ANYOF_LARGE;
7624 if (SIZE_ONLY)
7625 RExC_size += ANYOF_CLASS_ADD_SKIP;
7626 else
7627 RExC_emit += ANYOF_CLASS_ADD_SKIP;
7628 }
7629
7630
7631 if (SIZE_ONLY)
7632 return ret;
7633 /****** !SIZE_ONLY AFTER HERE *********/
7634
7635 if( stored == 1 && value < 256
7636 && !( ANYOF_FLAGS(ret) & ( ANYOF_FLAGS_ALL ^ ANYOF_FOLD ) )
7637 ) {
7638 /* optimize single char class to an EXACT node
7639 but *only* when its not a UTF/high char */
7640 const char * cur_parse= RExC_parse;
7641 RExC_emit = (regnode *)orig_emit;
7642 RExC_parse = (char *)orig_parse;
7643 ret = reg_node(pRExC_state,
7644 (U8)((ANYOF_FLAGS(ret) & ANYOF_FOLD) ? EXACTF : EXACT));
7645 RExC_parse = (char *)cur_parse;
7646 *STRING(ret)= (char)value;
7647 STR_LEN(ret)= 1;
7648 RExC_emit += STR_SZ(1);
7649 return ret;
7650 }
7651 /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
7652 if ( /* If the only flag is folding (plus possibly inversion). */
7653 ((ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD)
7654 ) {
7655 for (value = 0; value < 256; ++value) {
7656 if (ANYOF_BITMAP_TEST(ret, value)) {
7657 UV fold = PL_fold[value];
7658
7659 if (fold != value)
7660 ANYOF_BITMAP_SET(ret, fold);
7661 }
7662 }
7663 ANYOF_FLAGS(ret) &= ~ANYOF_FOLD;
7664 }
7665
7666 /* optimize inverted simple patterns (e.g. [^a-z]) */
7667 if (optimize_invert &&
7668 /* If the only flag is inversion. */
7669 (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
7670 for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
7671 ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL;
7672 ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
7673 }
7674 {
7675 AV * const av = newAV();
7676 SV *rv;
7677 /* The 0th element stores the character class description
7678 * in its textual form: used later (regexec.c:Perl_regclass_swash())
7679 * to initialize the appropriate swash (which gets stored in
7680 * the 1st element), and also useful for dumping the regnode.
7681 * The 2nd element stores the multicharacter foldings,
7682 * used later (regexec.c:S_reginclass()). */
7683 av_store(av, 0, listsv);
7684 av_store(av, 1, NULL);
7685 av_store(av, 2, (SV*)unicode_alternate);
7686 rv = newRV_noinc((SV*)av);
7687 n = add_data(pRExC_state, 1, "s");
7688 RExC_rxi->data->data[n] = (void*)rv;
7689 ARG_SET(ret, n);
7690 }
7691 return ret;
7692}
7693#undef _C_C_T_
7694
7695
7696STATIC char*
7697S_nextchar(pTHX_ RExC_state_t *pRExC_state)
7698{
7699 char* const retval = RExC_parse++;
7700
7701 for (;;) {
7702 if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
7703 RExC_parse[2] == '#') {
7704 while (*RExC_parse != ')') {
7705 if (RExC_parse == RExC_end)
7706 FAIL("Sequence (?#... not terminated");
7707 RExC_parse++;
7708 }
7709 RExC_parse++;
7710 continue;
7711 }
7712 if (RExC_flags & RXf_PMf_EXTENDED) {
7713 if (isSPACE(*RExC_parse)) {
7714 RExC_parse++;
7715 continue;
7716 }
7717 else if (*RExC_parse == '#') {
7718 while (RExC_parse < RExC_end)
7719 if (*RExC_parse++ == '\n') break;
7720 continue;
7721 }
7722 }
7723 return retval;
7724 }
7725}
7726
7727/*
7728- reg_node - emit a node
7729*/
7730STATIC regnode * /* Location. */
7731S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
7732{
7733 dVAR;
7734 register regnode *ptr;
7735 regnode * const ret = RExC_emit;
7736 GET_RE_DEBUG_FLAGS_DECL;
7737
7738 if (SIZE_ONLY) {
7739 SIZE_ALIGN(RExC_size);
7740 RExC_size += 1;
7741 return(ret);
7742 }
7743#ifdef DEBUGGING
7744 if (OP(RExC_emit) == 255)
7745 Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %s: %d ",
7746 reg_name[op], OP(RExC_emit));
7747#endif
7748 NODE_ALIGN_FILL(ret);
7749 ptr = ret;
7750 FILL_ADVANCE_NODE(ptr, op);
7751 if (RExC_offsets) { /* MJD */
7752 MJD_OFFSET_DEBUG(("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
7753 "reg_node", __LINE__,
7754 reg_name[op],
7755 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
7756 ? "Overwriting end of array!\n" : "OK",
7757 (UV)(RExC_emit - RExC_emit_start),
7758 (UV)(RExC_parse - RExC_start),
7759 (UV)RExC_offsets[0]));
7760 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
7761 }
7762
7763 RExC_emit = ptr;
7764 return(ret);
7765}
7766
7767/*
7768- reganode - emit a node with an argument
7769*/
7770STATIC regnode * /* Location. */
7771S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
7772{
7773 dVAR;
7774 register regnode *ptr;
7775 regnode * const ret = RExC_emit;
7776 GET_RE_DEBUG_FLAGS_DECL;
7777
7778 if (SIZE_ONLY) {
7779 SIZE_ALIGN(RExC_size);
7780 RExC_size += 2;
7781 /*
7782 We can't do this:
7783
7784 assert(2==regarglen[op]+1);
7785
7786 Anything larger than this has to allocate the extra amount.
7787 If we changed this to be:
7788
7789 RExC_size += (1 + regarglen[op]);
7790
7791 then it wouldn't matter. Its not clear what side effect
7792 might come from that so its not done so far.
7793 -- dmq
7794 */
7795 return(ret);
7796 }
7797#ifdef DEBUGGING
7798 if (OP(RExC_emit) == 255)
7799 Perl_croak(aTHX_ "panic: reganode overwriting end of allocated program space");
7800#endif
7801 NODE_ALIGN_FILL(ret);
7802 ptr = ret;
7803 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
7804 if (RExC_offsets) { /* MJD */
7805 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
7806 "reganode",
7807 __LINE__,
7808 reg_name[op],
7809 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ?
7810 "Overwriting end of array!\n" : "OK",
7811 (UV)(RExC_emit - RExC_emit_start),
7812 (UV)(RExC_parse - RExC_start),
7813 (UV)RExC_offsets[0]));
7814 Set_Cur_Node_Offset;
7815 }
7816
7817 RExC_emit = ptr;
7818 return(ret);
7819}
7820
7821/*
7822- reguni - emit (if appropriate) a Unicode character
7823*/
7824STATIC STRLEN
7825S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
7826{
7827 dVAR;
7828 return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
7829}
7830
7831/*
7832- reginsert - insert an operator in front of already-emitted operand
7833*
7834* Means relocating the operand.
7835*/
7836STATIC void
7837S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
7838{
7839 dVAR;
7840 register regnode *src;
7841 register regnode *dst;
7842 register regnode *place;
7843 const int offset = regarglen[(U8)op];
7844 const int size = NODE_STEP_REGNODE + offset;
7845 GET_RE_DEBUG_FLAGS_DECL;
7846/* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
7847 DEBUG_PARSE_FMT("inst"," - %s",reg_name[op]);
7848 if (SIZE_ONLY) {
7849 RExC_size += size;
7850 return;
7851 }
7852
7853 src = RExC_emit;
7854 RExC_emit += size;
7855 dst = RExC_emit;
7856 if (RExC_open_parens) {
7857 int paren;
7858 DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);
7859 for ( paren=0 ; paren < RExC_npar ; paren++ ) {
7860 if ( RExC_open_parens[paren] >= opnd ) {
7861 DEBUG_PARSE_FMT("open"," - %d",size);
7862 RExC_open_parens[paren] += size;
7863 } else {
7864 DEBUG_PARSE_FMT("open"," - %s","ok");
7865 }
7866 if ( RExC_close_parens[paren] >= opnd ) {
7867 DEBUG_PARSE_FMT("close"," - %d",size);
7868 RExC_close_parens[paren] += size;
7869 } else {
7870 DEBUG_PARSE_FMT("close"," - %s","ok");
7871 }
7872 }
7873 }
7874
7875 while (src > opnd) {
7876 StructCopy(--src, --dst, regnode);
7877 if (RExC_offsets) { /* MJD 20010112 */
7878 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
7879 "reg_insert",
7880 __LINE__,
7881 reg_name[op],
7882 (UV)(dst - RExC_emit_start) > RExC_offsets[0]
7883 ? "Overwriting end of array!\n" : "OK",
7884 (UV)(src - RExC_emit_start),
7885 (UV)(dst - RExC_emit_start),
7886 (UV)RExC_offsets[0]));
7887 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
7888 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
7889 }
7890 }
7891
7892
7893 place = opnd; /* Op node, where operand used to be. */
7894 if (RExC_offsets) { /* MJD */
7895 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
7896 "reginsert",
7897 __LINE__,
7898 reg_name[op],
7899 (UV)(place - RExC_emit_start) > RExC_offsets[0]
7900 ? "Overwriting end of array!\n" : "OK",
7901 (UV)(place - RExC_emit_start),
7902 (UV)(RExC_parse - RExC_start),
7903 (UV)RExC_offsets[0]));
7904 Set_Node_Offset(place, RExC_parse);
7905 Set_Node_Length(place, 1);
7906 }
7907 src = NEXTOPER(place);
7908 FILL_ADVANCE_NODE(place, op);
7909 Zero(src, offset, regnode);
7910}
7911
7912/*
7913- regtail - set the next-pointer at the end of a node chain of p to val.
7914- SEE ALSO: regtail_study
7915*/
7916/* TODO: All three parms should be const */
7917STATIC void
7918S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
7919{
7920 dVAR;
7921 register regnode *scan;
7922 GET_RE_DEBUG_FLAGS_DECL;
7923#ifndef DEBUGGING
7924 PERL_UNUSED_ARG(depth);
7925#endif
7926
7927 if (SIZE_ONLY)
7928 return;
7929
7930 /* Find last node. */
7931 scan = p;
7932 for (;;) {
7933 regnode * const temp = regnext(scan);
7934 DEBUG_PARSE_r({
7935 SV * const mysv=sv_newmortal();
7936 DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
7937 regprop(RExC_rx, mysv, scan);
7938 PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n",
7939 SvPV_nolen_const(mysv), REG_NODE_NUM(scan),
7940 (temp == NULL ? "->" : ""),
7941 (temp == NULL ? reg_name[OP(val)] : "")
7942 );
7943 });
7944 if (temp == NULL)
7945 break;
7946 scan = temp;
7947 }
7948
7949 if (reg_off_by_arg[OP(scan)]) {
7950 ARG_SET(scan, val - scan);
7951 }
7952 else {
7953 NEXT_OFF(scan) = val - scan;
7954 }
7955}
7956
7957#ifdef DEBUGGING
7958/*
7959- regtail_study - set the next-pointer at the end of a node chain of p to val.
7960- Look for optimizable sequences at the same time.
7961- currently only looks for EXACT chains.
7962
7963This is expermental code. The idea is to use this routine to perform
7964in place optimizations on branches and groups as they are constructed,
7965with the long term intention of removing optimization from study_chunk so
7966that it is purely analytical.
7967
7968Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
7969to control which is which.
7970
7971*/
7972/* TODO: All four parms should be const */
7973
7974STATIC U8
7975S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
7976{
7977 dVAR;
7978 register regnode *scan;
7979 U8 exact = PSEUDO;
7980#ifdef EXPERIMENTAL_INPLACESCAN
7981 I32 min = 0;
7982#endif
7983
7984 GET_RE_DEBUG_FLAGS_DECL;
7985
7986
7987 if (SIZE_ONLY)
7988 return exact;
7989
7990 /* Find last node. */
7991
7992 scan = p;
7993 for (;;) {
7994 regnode * const temp = regnext(scan);
7995#ifdef EXPERIMENTAL_INPLACESCAN
7996 if (PL_regkind[OP(scan)] == EXACT)
7997 if (join_exact(pRExC_state,scan,&min,1,val,depth+1))
7998 return EXACT;
7999#endif
8000 if ( exact ) {
8001 switch (OP(scan)) {
8002 case EXACT:
8003 case EXACTF:
8004 case EXACTFL:
8005 if( exact == PSEUDO )
8006 exact= OP(scan);
8007 else if ( exact != OP(scan) )
8008 exact= 0;
8009 case NOTHING:
8010 break;
8011 default:
8012 exact= 0;
8013 }
8014 }
8015 DEBUG_PARSE_r({
8016 SV * const mysv=sv_newmortal();
8017 DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
8018 regprop(RExC_rx, mysv, scan);
8019 PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n",
8020 SvPV_nolen_const(mysv),
8021 REG_NODE_NUM(scan),
8022 reg_name[exact]);
8023 });
8024 if (temp == NULL)
8025 break;
8026 scan = temp;
8027 }
8028 DEBUG_PARSE_r({
8029 SV * const mysv_val=sv_newmortal();
8030 DEBUG_PARSE_MSG("");
8031 regprop(RExC_rx, mysv_val, val);
8032 PerlIO_printf(Perl_debug_log, "~ attach to %s (%"IVdf") offset to %"IVdf"\n",
8033 SvPV_nolen_const(mysv_val),
8034 (IV)REG_NODE_NUM(val),
8035 (IV)(val - scan)
8036 );
8037 });
8038 if (reg_off_by_arg[OP(scan)]) {
8039 ARG_SET(scan, val - scan);
8040 }
8041 else {
8042 NEXT_OFF(scan) = val - scan;
8043 }
8044
8045 return exact;
8046}
8047#endif
8048
8049/*
8050 - regcurly - a little FSA that accepts {\d+,?\d*}
8051 */
8052STATIC I32
8053S_regcurly(register const char *s)
8054{
8055 if (*s++ != '{')
8056 return FALSE;
8057 if (!isDIGIT(*s))
8058 return FALSE;
8059 while (isDIGIT(*s))
8060 s++;
8061 if (*s == ',')
8062 s++;
8063 while (isDIGIT(*s))
8064 s++;
8065 if (*s != '}')
8066 return FALSE;
8067 return TRUE;
8068}
8069
8070
8071/*
8072 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
8073 */
8074void
8075Perl_regdump(pTHX_ const regexp *r)
8076{
8077#ifdef DEBUGGING
8078 dVAR;
8079 SV * const sv = sv_newmortal();
8080 SV *dsv= sv_newmortal();
8081 RXi_GET_DECL(r,ri);
8082
8083 (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
8084
8085 /* Header fields of interest. */
8086 if (r->anchored_substr) {
8087 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
8088 RE_SV_DUMPLEN(r->anchored_substr), 30);
8089 PerlIO_printf(Perl_debug_log,
8090 "anchored %s%s at %"IVdf" ",
8091 s, RE_SV_TAIL(r->anchored_substr),
8092 (IV)r->anchored_offset);
8093 } else if (r->anchored_utf8) {
8094 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
8095 RE_SV_DUMPLEN(r->anchored_utf8), 30);
8096 PerlIO_printf(Perl_debug_log,
8097 "anchored utf8 %s%s at %"IVdf" ",
8098 s, RE_SV_TAIL(r->anchored_utf8),
8099 (IV)r->anchored_offset);
8100 }
8101 if (r->float_substr) {
8102 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
8103 RE_SV_DUMPLEN(r->float_substr), 30);
8104 PerlIO_printf(Perl_debug_log,
8105 "floating %s%s at %"IVdf"..%"UVuf" ",
8106 s, RE_SV_TAIL(r->float_substr),
8107 (IV)r->float_min_offset, (UV)r->float_max_offset);
8108 } else if (r->float_utf8) {
8109 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
8110 RE_SV_DUMPLEN(r->float_utf8), 30);
8111 PerlIO_printf(Perl_debug_log,
8112 "floating utf8 %s%s at %"IVdf"..%"UVuf" ",
8113 s, RE_SV_TAIL(r->float_utf8),
8114 (IV)r->float_min_offset, (UV)r->float_max_offset);
8115 }
8116 if (r->check_substr || r->check_utf8)
8117 PerlIO_printf(Perl_debug_log,
8118 (const char *)
8119 (r->check_substr == r->float_substr
8120 && r->check_utf8 == r->float_utf8
8121 ? "(checking floating" : "(checking anchored"));
8122 if (r->extflags & RXf_NOSCAN)
8123 PerlIO_printf(Perl_debug_log, " noscan");
8124 if (r->extflags & RXf_CHECK_ALL)
8125 PerlIO_printf(Perl_debug_log, " isall");
8126 if (r->check_substr || r->check_utf8)
8127 PerlIO_printf(Perl_debug_log, ") ");
8128
8129 if (ri->regstclass) {
8130 regprop(r, sv, ri->regstclass);
8131 PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
8132 }
8133 if (r->extflags & RXf_ANCH) {
8134 PerlIO_printf(Perl_debug_log, "anchored");
8135 if (r->extflags & RXf_ANCH_BOL)
8136 PerlIO_printf(Perl_debug_log, "(BOL)");
8137 if (r->extflags & RXf_ANCH_MBOL)
8138 PerlIO_printf(Perl_debug_log, "(MBOL)");
8139 if (r->extflags & RXf_ANCH_SBOL)
8140 PerlIO_printf(Perl_debug_log, "(SBOL)");
8141 if (r->extflags & RXf_ANCH_GPOS)
8142 PerlIO_printf(Perl_debug_log, "(GPOS)");
8143 PerlIO_putc(Perl_debug_log, ' ');
8144 }
8145 if (r->extflags & RXf_GPOS_SEEN)
8146 PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", (UV)r->gofs);
8147 if (r->intflags & PREGf_SKIP)
8148 PerlIO_printf(Perl_debug_log, "plus ");
8149 if (r->intflags & PREGf_IMPLICIT)
8150 PerlIO_printf(Perl_debug_log, "implicit ");
8151 PerlIO_printf(Perl_debug_log, "minlen %"IVdf" ", (IV)r->minlen);
8152 if (r->extflags & RXf_EVAL_SEEN)
8153 PerlIO_printf(Perl_debug_log, "with eval ");
8154 PerlIO_printf(Perl_debug_log, "\n");
8155#else
8156 PERL_UNUSED_CONTEXT;
8157 PERL_UNUSED_ARG(r);
8158#endif /* DEBUGGING */
8159}
8160
8161/*
8162- regprop - printable representation of opcode
8163*/
8164void
8165Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
8166{
8167#ifdef DEBUGGING
8168 dVAR;
8169 register int k;
8170 RXi_GET_DECL(prog,progi);
8171 GET_RE_DEBUG_FLAGS_DECL;
8172
8173
8174 sv_setpvn(sv, "", 0);
8175
8176 if (OP(o) > REGNODE_MAX) /* regnode.type is unsigned */
8177 /* It would be nice to FAIL() here, but this may be called from
8178 regexec.c, and it would be hard to supply pRExC_state. */
8179 Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(o), (int)REGNODE_MAX);
8180 sv_catpv(sv, reg_name[OP(o)]); /* Take off const! */
8181
8182 k = PL_regkind[OP(o)];
8183
8184 if (k == EXACT) {
8185 SV * const dsv = sv_2mortal(newSVpvs(""));
8186 /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
8187 * is a crude hack but it may be the best for now since
8188 * we have no flag "this EXACTish node was UTF-8"
8189 * --jhi */
8190 const char * const s =
8191 pv_pretty(dsv, STRING(o), STR_LEN(o), 60,
8192 PL_colors[0], PL_colors[1],
8193 PERL_PV_ESCAPE_UNI_DETECT |
8194 PERL_PV_PRETTY_ELIPSES |
8195 PERL_PV_PRETTY_LTGT
8196 );
8197 Perl_sv_catpvf(aTHX_ sv, " %s", s );
8198 } else if (k == TRIE) {
8199 /* print the details of the trie in dumpuntil instead, as
8200 * progi->data isn't available here */
8201 const char op = OP(o);
8202 const U32 n = ARG(o);
8203 const reg_ac_data * const ac = IS_TRIE_AC(op) ?
8204 (reg_ac_data *)progi->data->data[n] :
8205 NULL;
8206 const reg_trie_data * const trie
8207 = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
8208
8209 Perl_sv_catpvf(aTHX_ sv, "-%s",reg_name[o->flags]);
8210 DEBUG_TRIE_COMPILE_r(
8211 Perl_sv_catpvf(aTHX_ sv,
8212 "<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
8213 (UV)trie->startstate,
8214 (IV)trie->statecount-1, /* -1 because of the unused 0 element */
8215 (UV)trie->wordcount,
8216 (UV)trie->minlen,
8217 (UV)trie->maxlen,
8218 (UV)TRIE_CHARCOUNT(trie),
8219 (UV)trie->uniquecharcount
8220 )
8221 );
8222 if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
8223 int i;
8224 int rangestart = -1;
8225 U8* bitmap = IS_ANYOF_TRIE(op) ? (U8*)ANYOF_BITMAP(o) : (U8*)TRIE_BITMAP(trie);
8226 Perl_sv_catpvf(aTHX_ sv, "[");
8227 for (i = 0; i <= 256; i++) {
8228 if (i < 256 && BITMAP_TEST(bitmap,i)) {
8229 if (rangestart == -1)
8230 rangestart = i;
8231 } else if (rangestart != -1) {
8232 if (i <= rangestart + 3)
8233 for (; rangestart < i; rangestart++)
8234 put_byte(sv, rangestart);
8235 else {
8236 put_byte(sv, rangestart);
8237 sv_catpvs(sv, "-");
8238 put_byte(sv, i - 1);
8239 }
8240 rangestart = -1;
8241 }
8242 }
8243 Perl_sv_catpvf(aTHX_ sv, "]");
8244 }
8245
8246 } else if (k == CURLY) {
8247 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
8248 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
8249 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
8250 }
8251 else if (k == WHILEM && o->flags) /* Ordinal/of */
8252 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
8253 else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP || OP(o)==ACCEPT) {
8254 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
8255 if ( prog->paren_names ) {
8256 if ( k != REF || OP(o) < NREF) {
8257 AV *list= (AV *)progi->data->data[progi->name_list_idx];
8258 SV **name= av_fetch(list, ARG(o), 0 );
8259 if (name)
8260 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
8261 }
8262 else {
8263 AV *list= (AV *)progi->data->data[ progi->name_list_idx ];
8264 SV *sv_dat=(SV*)progi->data->data[ ARG( o ) ];
8265 I32 *nums=(I32*)SvPVX(sv_dat);
8266 SV **name= av_fetch(list, nums[0], 0 );
8267 I32 n;
8268 if (name) {
8269 for ( n=0; n<SvIVX(sv_dat); n++ ) {
8270 Perl_sv_catpvf(aTHX_ sv, "%s%"IVdf,
8271 (n ? "," : ""), (IV)nums[n]);
8272 }
8273 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
8274 }
8275 }
8276 }
8277 } else if (k == GOSUB)
8278 Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o)); /* Paren and offset */
8279 else if (k == VERB) {
8280 if (!o->flags)
8281 Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
8282 SVfARG((SV*)progi->data->data[ ARG( o ) ]));
8283 } else if (k == LOGICAL)
8284 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
8285 else if (k == ANYOF) {
8286 int i, rangestart = -1;
8287 const U8 flags = ANYOF_FLAGS(o);
8288
8289 /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
8290 static const char * const anyofs[] = {
8291 "\\w",
8292 "\\W",
8293 "\\s",
8294 "\\S",
8295 "\\d",
8296 "\\D",
8297 "[:alnum:]",
8298 "[:^alnum:]",
8299 "[:alpha:]",
8300 "[:^alpha:]",
8301 "[:ascii:]",
8302 "[:^ascii:]",
8303 "[:ctrl:]",
8304 "[:^ctrl:]",
8305 "[:graph:]",
8306 "[:^graph:]",
8307 "[:lower:]",
8308 "[:^lower:]",
8309 "[:print:]",
8310 "[:^print:]",
8311 "[:punct:]",
8312 "[:^punct:]",
8313 "[:upper:]",
8314 "[:^upper:]",
8315 "[:xdigit:]",
8316 "[:^xdigit:]",
8317 "[:space:]",
8318 "[:^space:]",
8319 "[:blank:]",
8320 "[:^blank:]"
8321 };
8322
8323 if (flags & ANYOF_LOCALE)
8324 sv_catpvs(sv, "{loc}");
8325 if (flags & ANYOF_FOLD)
8326 sv_catpvs(sv, "{i}");
8327 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
8328 if (flags & ANYOF_INVERT)
8329 sv_catpvs(sv, "^");
8330 for (i = 0; i <= 256; i++) {
8331 if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
8332 if (rangestart == -1)
8333 rangestart = i;
8334 } else if (rangestart != -1) {
8335 if (i <= rangestart + 3)
8336 for (; rangestart < i; rangestart++)
8337 put_byte(sv, rangestart);
8338 else {
8339 put_byte(sv, rangestart);
8340 sv_catpvs(sv, "-");
8341 put_byte(sv, i - 1);
8342 }
8343 rangestart = -1;
8344 }
8345 }
8346
8347 if (o->flags & ANYOF_CLASS)
8348 for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++)
8349 if (ANYOF_CLASS_TEST(o,i))
8350 sv_catpv(sv, anyofs[i]);
8351
8352 if (flags & ANYOF_UNICODE)
8353 sv_catpvs(sv, "{unicode}");
8354 else if (flags & ANYOF_UNICODE_ALL)
8355 sv_catpvs(sv, "{unicode_all}");
8356
8357 {
8358 SV *lv;
8359 SV * const sw = regclass_swash(prog, o, FALSE, &lv, 0);
8360
8361 if (lv) {
8362 if (sw) {
8363 U8 s[UTF8_MAXBYTES_CASE+1];
8364
8365 for (i = 0; i <= 256; i++) { /* just the first 256 */
8366 uvchr_to_utf8(s, i);
8367
8368 if (i < 256 && swash_fetch(sw, s, TRUE)) {
8369 if (rangestart == -1)
8370 rangestart = i;
8371 } else if (rangestart != -1) {
8372 if (i <= rangestart + 3)
8373 for (; rangestart < i; rangestart++) {
8374 const U8 * const e = uvchr_to_utf8(s,rangestart);
8375 U8 *p;
8376 for(p = s; p < e; p++)
8377 put_byte(sv, *p);
8378 }
8379 else {
8380 const U8 *e = uvchr_to_utf8(s,rangestart);
8381 U8 *p;
8382 for (p = s; p < e; p++)
8383 put_byte(sv, *p);
8384 sv_catpvs(sv, "-");
8385 e = uvchr_to_utf8(s, i-1);
8386 for (p = s; p < e; p++)
8387 put_byte(sv, *p);
8388 }
8389 rangestart = -1;
8390 }
8391 }
8392
8393 sv_catpvs(sv, "..."); /* et cetera */
8394 }
8395
8396 {
8397 char *s = savesvpv(lv);
8398 char * const origs = s;
8399
8400 while (*s && *s != '\n')
8401 s++;
8402
8403 if (*s == '\n') {
8404 const char * const t = ++s;
8405
8406 while (*s) {
8407 if (*s == '\n')
8408 *s = ' ';
8409 s++;
8410 }
8411 if (s[-1] == ' ')
8412 s[-1] = 0;
8413
8414 sv_catpv(sv, t);
8415 }
8416
8417 Safefree(origs);
8418 }
8419 }
8420 }
8421
8422 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
8423 }
8424 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
8425 Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
8426#else
8427 PERL_UNUSED_CONTEXT;
8428 PERL_UNUSED_ARG(sv);
8429 PERL_UNUSED_ARG(o);
8430 PERL_UNUSED_ARG(prog);
8431#endif /* DEBUGGING */
8432}
8433
8434SV *
8435Perl_re_intuit_string(pTHX_ regexp *prog)
8436{ /* Assume that RE_INTUIT is set */
8437 dVAR;
8438 GET_RE_DEBUG_FLAGS_DECL;
8439 PERL_UNUSED_CONTEXT;
8440
8441 DEBUG_COMPILE_r(
8442 {
8443 const char * const s = SvPV_nolen_const(prog->check_substr
8444 ? prog->check_substr : prog->check_utf8);
8445
8446 if (!PL_colorset) reginitcolors();
8447 PerlIO_printf(Perl_debug_log,
8448 "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
8449 PL_colors[4],
8450 prog->check_substr ? "" : "utf8 ",
8451 PL_colors[5],PL_colors[0],
8452 s,
8453 PL_colors[1],
8454 (strlen(s) > 60 ? "..." : ""));
8455 } );
8456
8457 return prog->check_substr ? prog->check_substr : prog->check_utf8;
8458}
8459
8460/*
8461 pregfree()
8462
8463 handles refcounting and freeing the perl core regexp structure. When
8464 it is necessary to actually free the structure the first thing it
8465 does is call the 'free' method of the regexp_engine associated to to
8466 the regexp, allowing the handling of the void *pprivate; member
8467 first. (This routine is not overridable by extensions, which is why
8468 the extensions free is called first.)
8469
8470 See regdupe and regdupe_internal if you change anything here.
8471*/
8472#ifndef PERL_IN_XSUB_RE
8473void
8474Perl_pregfree(pTHX_ struct regexp *r)
8475{
8476 dVAR;
8477 GET_RE_DEBUG_FLAGS_DECL;
8478
8479 if (!r || (--r->refcnt > 0))
8480 return;
8481
8482 CALLREGFREE_PVT(r); /* free the private data */
8483
8484 /* gcov results gave these as non-null 100% of the time, so there's no
8485 optimisation in checking them before calling Safefree */
8486 Safefree(r->precomp);
8487 RX_MATCH_COPY_FREE(r);
8488#ifdef PERL_OLD_COPY_ON_WRITE
8489 if (r->saved_copy)
8490 SvREFCNT_dec(r->saved_copy);
8491#endif
8492 if (r->substrs) {
8493 if (r->anchored_substr)
8494 SvREFCNT_dec(r->anchored_substr);
8495 if (r->anchored_utf8)
8496 SvREFCNT_dec(r->anchored_utf8);
8497 if (r->float_substr)
8498 SvREFCNT_dec(r->float_substr);
8499 if (r->float_utf8)
8500 SvREFCNT_dec(r->float_utf8);
8501 Safefree(r->substrs);
8502 }
8503 if (r->paren_names)
8504 SvREFCNT_dec(r->paren_names);
8505
8506 Safefree(r->startp);
8507 Safefree(r->endp);
8508 Safefree(r);
8509}
8510#endif
8511
8512/* regfree_internal()
8513
8514 Free the private data in a regexp. This is overloadable by
8515 extensions. Perl takes care of the regexp structure in pregfree(),
8516 this covers the *pprivate pointer which technically perldoesnt
8517 know about, however of course we have to handle the
8518 regexp_internal structure when no extension is in use.
8519
8520 Note this is called before freeing anything in the regexp
8521 structure.
8522 */
8523
8524void
8525Perl_regfree_internal(pTHX_ struct regexp *r)
8526{
8527 dVAR;
8528 RXi_GET_DECL(r,ri);
8529 GET_RE_DEBUG_FLAGS_DECL;
8530
8531 DEBUG_COMPILE_r({
8532 if (!PL_colorset)
8533 reginitcolors();
8534 {
8535 SV *dsv= sv_newmortal();
8536 RE_PV_QUOTED_DECL(s, (r->extflags & RXf_UTF8),
8537 dsv, r->precomp, r->prelen, 60);
8538 PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
8539 PL_colors[4],PL_colors[5],s);
8540 }
8541 });
8542
8543 Safefree(ri->offsets); /* 20010421 MJD */
8544 if (ri->data) {
8545 int n = ri->data->count;
8546 PAD* new_comppad = NULL;
8547 PAD* old_comppad;
8548 PADOFFSET refcnt;
8549
8550 while (--n >= 0) {
8551 /* If you add a ->what type here, update the comment in regcomp.h */
8552 switch (ri->data->what[n]) {
8553 case 's':
8554 case 'S':
8555 case 'u':
8556 SvREFCNT_dec((SV*)ri->data->data[n]);
8557 break;
8558 case 'f':
8559 Safefree(ri->data->data[n]);
8560 break;
8561 case 'p':
8562 new_comppad = (AV*)ri->data->data[n];
8563 break;
8564 case 'o':
8565 if (new_comppad == NULL)
8566 Perl_croak(aTHX_ "panic: pregfree comppad");
8567 PAD_SAVE_LOCAL(old_comppad,
8568 /* Watch out for global destruction's random ordering. */
8569 (SvTYPE(new_comppad) == SVt_PVAV) ? new_comppad : NULL
8570 );
8571 OP_REFCNT_LOCK;
8572 refcnt = OpREFCNT_dec((OP_4tree*)ri->data->data[n]);
8573 OP_REFCNT_UNLOCK;
8574 if (!refcnt)
8575 op_free((OP_4tree*)ri->data->data[n]);
8576
8577 PAD_RESTORE_LOCAL(old_comppad);
8578 SvREFCNT_dec((SV*)new_comppad);
8579 new_comppad = NULL;
8580 break;
8581 case 'n':
8582 break;
8583 case 'T':
8584 { /* Aho Corasick add-on structure for a trie node.
8585 Used in stclass optimization only */
8586 U32 refcount;
8587 reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
8588 OP_REFCNT_LOCK;
8589 refcount = --aho->refcount;
8590 OP_REFCNT_UNLOCK;
8591 if ( !refcount ) {
8592 PerlMemShared_free(aho->states);
8593 PerlMemShared_free(aho->fail);
8594 /* do this last!!!! */
8595 PerlMemShared_free(ri->data->data[n]);
8596 PerlMemShared_free(ri->regstclass);
8597 }
8598 }
8599 break;
8600 case 't':
8601 {
8602 /* trie structure. */
8603 U32 refcount;
8604 reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
8605 OP_REFCNT_LOCK;
8606 refcount = --trie->refcount;
8607 OP_REFCNT_UNLOCK;
8608 if ( !refcount ) {
8609 PerlMemShared_free(trie->charmap);
8610 PerlMemShared_free(trie->states);
8611 PerlMemShared_free(trie->trans);
8612 if (trie->bitmap)
8613 PerlMemShared_free(trie->bitmap);
8614 if (trie->wordlen)
8615 PerlMemShared_free(trie->wordlen);
8616 if (trie->jump)
8617 PerlMemShared_free(trie->jump);
8618 if (trie->nextword)
8619 PerlMemShared_free(trie->nextword);
8620 /* do this last!!!! */
8621 PerlMemShared_free(ri->data->data[n]);
8622 }
8623 }
8624 break;
8625 default:
8626 Perl_croak(aTHX_ "panic: regfree data code '%c'", ri->data->what[n]);
8627 }
8628 }
8629 Safefree(ri->data->what);
8630 Safefree(ri->data);
8631 }
8632 if (ri->swap) {
8633 Safefree(ri->swap->startp);
8634 Safefree(ri->swap->endp);
8635 Safefree(ri->swap);
8636 }
8637 Safefree(ri);
8638}
8639
8640#define sv_dup_inc(s,t) SvREFCNT_inc(sv_dup(s,t))
8641#define av_dup_inc(s,t) (AV*)SvREFCNT_inc(sv_dup((SV*)s,t))
8642#define hv_dup_inc(s,t) (HV*)SvREFCNT_inc(sv_dup((SV*)s,t))
8643#define SAVEPVN(p,n) ((p) ? savepvn(p,n) : NULL)
8644
8645/*
8646 regdupe - duplicate a regexp.
8647
8648 This routine is called by sv.c's re_dup and is expected to clone a
8649 given regexp structure. It is a no-op when not under USE_ITHREADS.
8650 (Originally this *was* re_dup() for change history see sv.c)
8651
8652 After all of the core data stored in struct regexp is duplicated
8653 the regexp_engine.dupe method is used to copy any private data
8654 stored in the *pprivate pointer. This allows extensions to handle
8655 any duplication it needs to do.
8656
8657 See pregfree() and regfree_internal() if you change anything here.
8658*/
8659#if defined(USE_ITHREADS)
8660#ifndef PERL_IN_XSUB_RE
8661regexp *
8662Perl_re_dup(pTHX_ const regexp *r, CLONE_PARAMS *param)
8663{
8664 dVAR;
8665 regexp *ret;
8666 int i, npar;
8667 struct reg_substr_datum *s;
8668
8669 if (!r)
8670 return (REGEXP *)NULL;
8671
8672 if ((ret = (REGEXP *)ptr_table_fetch(PL_ptr_table, r)))
8673 return ret;
8674
8675
8676 npar = r->nparens+1;
8677 Newxz(ret, 1, regexp);
8678 Newx(ret->startp, npar, I32);
8679 Copy(r->startp, ret->startp, npar, I32);
8680 Newx(ret->endp, npar, I32);
8681 Copy(r->endp, ret->endp, npar, I32);
8682
8683 if (r->substrs) {
8684 Newx(ret->substrs, 1, struct reg_substr_data);
8685 for (s = ret->substrs->data, i = 0; i < 3; i++, s++) {
8686 s->min_offset = r->substrs->data[i].min_offset;
8687 s->max_offset = r->substrs->data[i].max_offset;
8688 s->end_shift = r->substrs->data[i].end_shift;
8689 s->substr = sv_dup_inc(r->substrs->data[i].substr, param);
8690 s->utf8_substr = sv_dup_inc(r->substrs->data[i].utf8_substr, param);
8691 }
8692 } else
8693 ret->substrs = NULL;
8694
8695 ret->precomp = SAVEPVN(r->precomp, r->prelen);
8696 ret->refcnt = r->refcnt;
8697 ret->minlen = r->minlen;
8698 ret->minlenret = r->minlenret;
8699 ret->prelen = r->prelen;
8700 ret->nparens = r->nparens;
8701 ret->lastparen = r->lastparen;
8702 ret->lastcloseparen = r->lastcloseparen;
8703 ret->intflags = r->intflags;
8704 ret->extflags = r->extflags;
8705
8706 ret->sublen = r->sublen;
8707
8708 ret->engine = r->engine;
8709
8710 ret->paren_names = hv_dup_inc(r->paren_names, param);
8711
8712 if (RX_MATCH_COPIED(ret))
8713 ret->subbeg = SAVEPVN(r->subbeg, r->sublen);
8714 else
8715 ret->subbeg = NULL;
8716#ifdef PERL_OLD_COPY_ON_WRITE
8717 ret->saved_copy = NULL;
8718#endif
8719
8720 ret->pprivate = r->pprivate;
8721 if (ret->pprivate)
8722 RXi_SET(ret,CALLREGDUPE_PVT(ret,param));
8723
8724 ptr_table_store(PL_ptr_table, r, ret);
8725 return ret;
8726}
8727#endif /* PERL_IN_XSUB_RE */
8728
8729/*
8730 regdupe_internal()
8731
8732 This is the internal complement to regdupe() which is used to copy
8733 the structure pointed to by the *pprivate pointer in the regexp.
8734 This is the core version of the extension overridable cloning hook.
8735 The regexp structure being duplicated will be copied by perl prior
8736 to this and will be provided as the regexp *r argument, however
8737 with the /old/ structures pprivate pointer value. Thus this routine
8738 may override any copying normally done by perl.
8739
8740 It returns a pointer to the new regexp_internal structure.
8741*/
8742
8743void *
8744Perl_regdupe_internal(pTHX_ const regexp *r, CLONE_PARAMS *param)
8745{
8746 dVAR;
8747 regexp_internal *reti;
8748 int len, npar;
8749 RXi_GET_DECL(r,ri);
8750
8751 npar = r->nparens+1;
8752 len = ri->offsets[0];
8753
8754 Newxc(reti, sizeof(regexp_internal) + (len+1)*sizeof(regnode), char, regexp_internal);
8755 Copy(ri->program, reti->program, len+1, regnode);
8756
8757 if(ri->swap) {
8758 Newx(reti->swap, 1, regexp_paren_ofs);
8759 /* no need to copy these */
8760 Newx(reti->swap->startp, npar, I32);
8761 Newx(reti->swap->endp, npar, I32);
8762 } else {
8763 reti->swap = NULL;
8764 }
8765
8766
8767 reti->regstclass = NULL;
8768 if (ri->data) {
8769 struct reg_data *d;
8770 const int count = ri->data->count;
8771 int i;
8772
8773 Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
8774 char, struct reg_data);
8775 Newx(d->what, count, U8);
8776
8777 d->count = count;
8778 for (i = 0; i < count; i++) {
8779 d->what[i] = ri->data->what[i];
8780 switch (d->what[i]) {
8781 /* legal options are one of: sSfpontTu
8782 see also regcomp.h and pregfree() */
8783 case 's':
8784 case 'S':
8785 case 'p': /* actually an AV, but the dup function is identical. */
8786 case 'u': /* actually an HV, but the dup function is identical. */
8787 d->data[i] = sv_dup_inc((SV *)ri->data->data[i], param);
8788 break;
8789 case 'f':
8790 /* This is cheating. */
8791 Newx(d->data[i], 1, struct regnode_charclass_class);
8792 StructCopy(ri->data->data[i], d->data[i],
8793 struct regnode_charclass_class);
8794 reti->regstclass = (regnode*)d->data[i];
8795 break;
8796 case 'o':
8797 /* Compiled op trees are readonly and in shared memory,
8798 and can thus be shared without duplication. */
8799 OP_REFCNT_LOCK;
8800 d->data[i] = (void*)OpREFCNT_inc((OP*)ri->data->data[i]);
8801 OP_REFCNT_UNLOCK;
8802 break;
8803 case 'T':
8804 /* Trie stclasses are readonly and can thus be shared
8805 * without duplication. We free the stclass in pregfree
8806 * when the corresponding reg_ac_data struct is freed.
8807 */
8808 reti->regstclass= ri->regstclass;
8809 /* Fall through */
8810 case 't':
8811 OP_REFCNT_LOCK;
8812 ((reg_trie_data*)ri->data->data[i])->refcount++;
8813 OP_REFCNT_UNLOCK;
8814 /* Fall through */
8815 case 'n':
8816 d->data[i] = ri->data->data[i];
8817 break;
8818 default:
8819 Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'", ri->data->what[i]);
8820 }
8821 }
8822
8823 reti->data = d;
8824 }
8825 else
8826 reti->data = NULL;
8827
8828 reti->name_list_idx = ri->name_list_idx;
8829
8830 Newx(reti->offsets, 2*len+1, U32);
8831 Copy(ri->offsets, reti->offsets, 2*len+1, U32);
8832
8833 return (void*)reti;
8834}
8835
8836#endif /* USE_ITHREADS */
8837
8838/*
8839 reg_stringify()
8840
8841 converts a regexp embedded in a MAGIC struct to its stringified form,
8842 caching the converted form in the struct and returns the cached
8843 string.
8844
8845 If lp is nonnull then it is used to return the length of the
8846 resulting string
8847
8848 If flags is nonnull and the returned string contains UTF8 then
8849 (*flags & 1) will be true.
8850
8851 If haseval is nonnull then it is used to return whether the pattern
8852 contains evals.
8853
8854 Normally called via macro:
8855
8856 CALLREG_STRINGIFY(mg,&len,&utf8);
8857
8858 And internally with
8859
8860 CALLREG_AS_STR(mg,&lp,&flags,&haseval)
8861
8862 See sv_2pv_flags() in sv.c for an example of internal usage.
8863
8864 */
8865#ifndef PERL_IN_XSUB_RE
8866char *
8867Perl_reg_stringify(pTHX_ MAGIC *mg, STRLEN *lp, U32 *flags, I32 *haseval ) {
8868 dVAR;
8869 const regexp * const re = (regexp *)mg->mg_obj;
8870
8871 if (!mg->mg_ptr) {
8872 const char *fptr = STD_PAT_MODS; /*"msix"*/
8873 char reflags[7];
8874 char ch;
8875 bool hask = ((re->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
8876 bool hasm = ((re->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD);
8877 U16 reganch = (U16)((re->extflags & RXf_PMf_STD_PMMOD) >> 12);
8878 bool need_newline = 0;
8879 int left = 0;
8880 int right = 4 + hask;
8881 if (hask)
8882 reflags[left++] = KEEPCOPY_PAT_MOD; /*'k'*/
8883 while((ch = *fptr++)) {
8884 if(reganch & 1) {
8885 reflags[left++] = ch;
8886 }
8887 else {
8888 reflags[right--] = ch;
8889 }
8890 reganch >>= 1;
8891 }
8892 if(hasm) {
8893 reflags[left] = '-';
8894 left = 5 + hask;
8895 }
8896 /* printf("[%*.7s]\n",left,reflags); */
8897 mg->mg_len = re->prelen + 4 + left;
8898 /*
8899 * If /x was used, we have to worry about a regex ending with a
8900 * comment later being embedded within another regex. If so, we don't
8901 * want this regex's "commentization" to leak out to the right part of
8902 * the enclosing regex, we must cap it with a newline.
8903 *
8904 * So, if /x was used, we scan backwards from the end of the regex. If
8905 * we find a '#' before we find a newline, we need to add a newline
8906 * ourself. If we find a '\n' first (or if we don't find '#' or '\n'),
8907 * we don't need to add anything. -jfriedl
8908 */
8909 if (PMf_EXTENDED & re->extflags) {
8910 const char *endptr = re->precomp + re->prelen;
8911 while (endptr >= re->precomp) {
8912 const char c = *(endptr--);
8913 if (c == '\n')
8914 break; /* don't need another */
8915 if (c == '#') {
8916 /* we end while in a comment, so we need a newline */
8917 mg->mg_len++; /* save space for it */
8918 need_newline = 1; /* note to add it */
8919 break;
8920 }
8921 }
8922 }
8923
8924 Newx(mg->mg_ptr, mg->mg_len + 1 + left, char);
8925 mg->mg_ptr[0] = '(';
8926 mg->mg_ptr[1] = '?';
8927 Copy(reflags, mg->mg_ptr+2, left, char);
8928 *(mg->mg_ptr+left+2) = ':';
8929 Copy(re->precomp, mg->mg_ptr+3+left, re->prelen, char);
8930 if (need_newline)
8931 mg->mg_ptr[mg->mg_len - 2] = '\n';
8932 mg->mg_ptr[mg->mg_len - 1] = ')';
8933 mg->mg_ptr[mg->mg_len] = 0;
8934 }
8935 if (haseval)
8936 *haseval = re->seen_evals;
8937 if (flags)
8938 *flags = ((re->extflags & RXf_UTF8) ? 1 : 0);
8939
8940 if (lp)
8941 *lp = mg->mg_len;
8942 return mg->mg_ptr;
8943}
8944
8945/*
8946 - regnext - dig the "next" pointer out of a node
8947 */
8948regnode *
8949Perl_regnext(pTHX_ register regnode *p)
8950{
8951 dVAR;
8952 register I32 offset;
8953
8954 if (!p)
8955 return(NULL);
8956
8957 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
8958 if (offset == 0)
8959 return(NULL);
8960
8961 return(p+offset);
8962}
8963#endif
8964
8965STATIC void
8966S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
8967{
8968 va_list args;
8969 STRLEN l1 = strlen(pat1);
8970 STRLEN l2 = strlen(pat2);
8971 char buf[512];
8972 SV *msv;
8973 const char *message;
8974
8975 if (l1 > 510)
8976 l1 = 510;
8977 if (l1 + l2 > 510)
8978 l2 = 510 - l1;
8979 Copy(pat1, buf, l1 , char);
8980 Copy(pat2, buf + l1, l2 , char);
8981 buf[l1 + l2] = '\n';
8982 buf[l1 + l2 + 1] = '\0';
8983#ifdef I_STDARG
8984 /* ANSI variant takes additional second argument */
8985 va_start(args, pat2);
8986#else
8987 va_start(args);
8988#endif
8989 msv = vmess(buf, &args);
8990 va_end(args);
8991 message = SvPV_const(msv,l1);
8992 if (l1 > 512)
8993 l1 = 512;
8994 Copy(message, buf, l1 , char);
8995 buf[l1-1] = '\0'; /* Overwrite \n */
8996 Perl_croak(aTHX_ "%s", buf);
8997}
8998
8999/* XXX Here's a total kludge. But we need to re-enter for swash routines. */
9000
9001#ifndef PERL_IN_XSUB_RE
9002void
9003Perl_save_re_context(pTHX)
9004{
9005 dVAR;
9006
9007 struct re_save_state *state;
9008
9009 SAVEVPTR(PL_curcop);
9010 SSGROW(SAVESTACK_ALLOC_FOR_RE_SAVE_STATE + 1);
9011
9012 state = (struct re_save_state *)(PL_savestack + PL_savestack_ix);
9013 PL_savestack_ix += SAVESTACK_ALLOC_FOR_RE_SAVE_STATE;
9014 SSPUSHINT(SAVEt_RE_STATE);
9015
9016 Copy(&PL_reg_state, state, 1, struct re_save_state);
9017
9018 PL_reg_start_tmp = 0;
9019 PL_reg_start_tmpl = 0;
9020 PL_reg_oldsaved = NULL;
9021 PL_reg_oldsavedlen = 0;
9022 PL_reg_maxiter = 0;
9023 PL_reg_leftiter = 0;
9024 PL_reg_poscache = NULL;
9025 PL_reg_poscache_size = 0;
9026#ifdef PERL_OLD_COPY_ON_WRITE
9027 PL_nrs = NULL;
9028#endif
9029
9030 /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
9031 if (PL_curpm) {
9032 const REGEXP * const rx = PM_GETRE(PL_curpm);
9033 if (rx) {
9034 U32 i;
9035 for (i = 1; i <= rx->nparens; i++) {
9036 char digits[TYPE_CHARS(long)];
9037 const STRLEN len = my_snprintf(digits, sizeof(digits), "%lu", (long)i);
9038 GV *const *const gvp
9039 = (GV**)hv_fetch(PL_defstash, digits, len, 0);
9040
9041 if (gvp) {
9042 GV * const gv = *gvp;
9043 if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
9044 save_scalar(gv);
9045 }
9046 }
9047 }
9048 }
9049}
9050#endif
9051
9052static void
9053clear_re(pTHX_ void *r)
9054{
9055 dVAR;
9056 ReREFCNT_dec((regexp *)r);
9057}
9058
9059#ifdef DEBUGGING
9060
9061STATIC void
9062S_put_byte(pTHX_ SV *sv, int c)
9063{
9064 if (isCNTRL(c) || c == 255 || !isPRINT(c))
9065 Perl_sv_catpvf(aTHX_ sv, "\\%o", c);
9066 else if (c == '-' || c == ']' || c == '\\' || c == '^')
9067 Perl_sv_catpvf(aTHX_ sv, "\\%c", c);
9068 else
9069 Perl_sv_catpvf(aTHX_ sv, "%c", c);
9070}
9071
9072
9073#define CLEAR_OPTSTART \
9074 if (optstart) STMT_START { \
9075 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, " (%"IVdf" nodes)\n", (IV)(node - optstart))); \
9076 optstart=NULL; \
9077 } STMT_END
9078
9079#define DUMPUNTIL(b,e) CLEAR_OPTSTART; node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
9080
9081STATIC const regnode *
9082S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
9083 const regnode *last, const regnode *plast,
9084 SV* sv, I32 indent, U32 depth)
9085{
9086 dVAR;
9087 register U8 op = PSEUDO; /* Arbitrary non-END op. */
9088 register const regnode *next;
9089 const regnode *optstart= NULL;
9090
9091 RXi_GET_DECL(r,ri);
9092 GET_RE_DEBUG_FLAGS_DECL;
9093
9094#ifdef DEBUG_DUMPUNTIL
9095 PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
9096 last ? last-start : 0,plast ? plast-start : 0);
9097#endif
9098
9099 if (plast && plast < last)
9100 last= plast;
9101
9102 while (PL_regkind[op] != END && (!last || node < last)) {
9103 /* While that wasn't END last time... */
9104 NODE_ALIGN(node);
9105 op = OP(node);
9106 if (op == CLOSE || op == WHILEM)
9107 indent--;
9108 next = regnext((regnode *)node);
9109
9110 /* Where, what. */
9111 if (OP(node) == OPTIMIZED) {
9112 if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
9113 optstart = node;
9114 else
9115 goto after_print;
9116 } else
9117 CLEAR_OPTSTART;
9118
9119 regprop(r, sv, node);
9120 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
9121 (int)(2*indent + 1), "", SvPVX_const(sv));
9122
9123 if (OP(node) != OPTIMIZED) {
9124 if (next == NULL) /* Next ptr. */
9125 PerlIO_printf(Perl_debug_log, " (0)");
9126 else if (PL_regkind[(U8)op] == BRANCH && PL_regkind[OP(next)] != BRANCH )
9127 PerlIO_printf(Perl_debug_log, " (FAIL)");
9128 else
9129 PerlIO_printf(Perl_debug_log, " (%"IVdf")", (IV)(next - start));
9130 (void)PerlIO_putc(Perl_debug_log, '\n');
9131 }
9132
9133 after_print:
9134 if (PL_regkind[(U8)op] == BRANCHJ) {
9135 assert(next);
9136 {
9137 register const regnode *nnode = (OP(next) == LONGJMP
9138 ? regnext((regnode *)next)
9139 : next);
9140 if (last && nnode > last)
9141 nnode = last;
9142 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
9143 }
9144 }
9145 else if (PL_regkind[(U8)op] == BRANCH) {
9146 assert(next);
9147 DUMPUNTIL(NEXTOPER(node), next);
9148 }
9149 else if ( PL_regkind[(U8)op] == TRIE ) {
9150 const regnode *this_trie = node;
9151 const char op = OP(node);
9152 const U32 n = ARG(node);
9153 const reg_ac_data * const ac = op>=AHOCORASICK ?
9154 (reg_ac_data *)ri->data->data[n] :
9155 NULL;
9156 const reg_trie_data * const trie =
9157 (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
9158#ifdef DEBUGGING
9159 AV *const trie_words = (AV *) ri->data->data[n + TRIE_WORDS_OFFSET];
9160#endif
9161 const regnode *nextbranch= NULL;
9162 I32 word_idx;
9163 sv_setpvn(sv, "", 0);
9164 for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
9165 SV ** const elem_ptr = av_fetch(trie_words,word_idx,0);
9166
9167 PerlIO_printf(Perl_debug_log, "%*s%s ",
9168 (int)(2*(indent+3)), "",
9169 elem_ptr ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr), SvCUR(*elem_ptr), 60,
9170 PL_colors[0], PL_colors[1],
9171 (SvUTF8(*elem_ptr) ? PERL_PV_ESCAPE_UNI : 0) |
9172 PERL_PV_PRETTY_ELIPSES |
9173 PERL_PV_PRETTY_LTGT
9174 )
9175 : "???"
9176 );
9177 if (trie->jump) {
9178 U16 dist= trie->jump[word_idx+1];
9179 PerlIO_printf(Perl_debug_log, "(%"UVuf")\n",
9180 (UV)((dist ? this_trie + dist : next) - start));
9181 if (dist) {
9182 if (!nextbranch)
9183 nextbranch= this_trie + trie->jump[0];
9184 DUMPUNTIL(this_trie + dist, nextbranch);
9185 }
9186 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
9187 nextbranch= regnext((regnode *)nextbranch);
9188 } else {
9189 PerlIO_printf(Perl_debug_log, "\n");
9190 }
9191 }
9192 if (last && next > last)
9193 node= last;
9194 else
9195 node= next;
9196 }
9197 else if ( op == CURLY ) { /* "next" might be very big: optimizer */
9198 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
9199 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
9200 }
9201 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
9202 assert(next);
9203 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
9204 }
9205 else if ( op == PLUS || op == STAR) {
9206 DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
9207 }
9208 else if (op == ANYOF) {
9209 /* arglen 1 + class block */
9210 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_LARGE)
9211 ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
9212 node = NEXTOPER(node);
9213 }
9214 else if (PL_regkind[(U8)op] == EXACT) {
9215 /* Literal string, where present. */
9216 node += NODE_SZ_STR(node) - 1;
9217 node = NEXTOPER(node);
9218 }
9219 else {
9220 node = NEXTOPER(node);
9221 node += regarglen[(U8)op];
9222 }
9223 if (op == CURLYX || op == OPEN)
9224 indent++;
9225 }
9226 CLEAR_OPTSTART;
9227#ifdef DEBUG_DUMPUNTIL
9228 PerlIO_printf(Perl_debug_log, "--- %d\n", (int)indent);
9229#endif
9230 return node;
9231}
9232
9233#endif /* DEBUGGING */
9234
9235/*
9236 * Local variables:
9237 * c-indentation-style: bsd
9238 * c-basic-offset: 4
9239 * indent-tabs-mode: t
9240 * End:
9241 *
9242 * ex: set ts=8 sts=4 sw=4 noet:
9243 */