This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
perlcall: subjunctive, not indicative
[perl5.git] / regcomp.c
... / ...
CommitLineData
1/* regcomp.c
2 */
3
4/*
5 * 'A fair jaw-cracker dwarf-language must be.' --Samwise Gamgee
6 *
7 * [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
8 */
9
10/* This file contains functions for compiling a regular expression. See
11 * also regexec.c which funnily enough, contains functions for executing
12 * a regular expression.
13 *
14 * This file is also copied at build time to ext/re/re_comp.c, where
15 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
16 * This causes the main functions to be compiled under new names and with
17 * debugging support added, which makes "use re 'debug'" work.
18 */
19
20/* NOTE: this is derived from Henry Spencer's regexp code, and should not
21 * confused with the original package (see point 3 below). Thanks, Henry!
22 */
23
24/* Additional note: this code is very heavily munged from Henry's version
25 * in places. In some spots I've traded clarity for efficiency, so don't
26 * blame Henry for some of the lack of readability.
27 */
28
29/* The names of the functions have been changed from regcomp and
30 * regexec to pregcomp and pregexec in order to avoid conflicts
31 * with the POSIX routines of the same names.
32*/
33
34#ifdef PERL_EXT_RE_BUILD
35#include "re_top.h"
36#endif
37
38/*
39 * pregcomp and pregexec -- regsub and regerror are not used in perl
40 *
41 * Copyright (c) 1986 by University of Toronto.
42 * Written by Henry Spencer. Not derived from licensed software.
43 *
44 * Permission is granted to anyone to use this software for any
45 * purpose on any computer system, and to redistribute it freely,
46 * subject to the following restrictions:
47 *
48 * 1. The author is not responsible for the consequences of use of
49 * this software, no matter how awful, even if they arise
50 * from defects in it.
51 *
52 * 2. The origin of this software must not be misrepresented, either
53 * by explicit claim or by omission.
54 *
55 * 3. Altered versions must be plainly marked as such, and must not
56 * be misrepresented as being the original software.
57 *
58 *
59 **** Alterations to Henry's code are...
60 ****
61 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
62 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
63 **** by Larry Wall and others
64 ****
65 **** You may distribute under the terms of either the GNU General Public
66 **** License or the Artistic License, as specified in the README file.
67
68 *
69 * Beware that some of this code is subtly aware of the way operator
70 * precedence is structured in regular expressions. Serious changes in
71 * regular-expression syntax might require a total rethink.
72 */
73#include "EXTERN.h"
74#define PERL_IN_REGCOMP_C
75#include "perl.h"
76
77#ifndef PERL_IN_XSUB_RE
78# include "INTERN.h"
79#endif
80
81#define REG_COMP_C
82#ifdef PERL_IN_XSUB_RE
83# include "re_comp.h"
84#else
85# include "regcomp.h"
86#endif
87
88#include "dquote_static.c"
89
90#ifdef op
91#undef op
92#endif /* op */
93
94#ifdef MSDOS
95# if defined(BUGGY_MSC6)
96 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
97# pragma optimize("a",off)
98 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
99# pragma optimize("w",on )
100# endif /* BUGGY_MSC6 */
101#endif /* MSDOS */
102
103#ifndef STATIC
104#define STATIC static
105#endif
106
107typedef struct RExC_state_t {
108 U32 flags; /* are we folding, multilining? */
109 char *precomp; /* uncompiled string. */
110 REGEXP *rx_sv; /* The SV that is the regexp. */
111 regexp *rx; /* perl core regexp structure */
112 regexp_internal *rxi; /* internal data for regexp object pprivate field */
113 char *start; /* Start of input for compile */
114 char *end; /* End of input for compile */
115 char *parse; /* Input-scan pointer. */
116 I32 whilem_seen; /* number of WHILEM in this expr */
117 regnode *emit_start; /* Start of emitted-code area */
118 regnode *emit_bound; /* First regnode outside of the allocated space */
119 regnode *emit; /* Code-emit pointer; &regdummy = don't = compiling */
120 I32 naughty; /* How bad is this pattern? */
121 I32 sawback; /* Did we see \1, ...? */
122 U32 seen;
123 I32 size; /* Code size. */
124 I32 npar; /* Capture buffer count, (OPEN). */
125 I32 cpar; /* Capture buffer count, (CLOSE). */
126 I32 nestroot; /* root parens we are in - used by accept */
127 I32 extralen;
128 I32 seen_zerolen;
129 I32 seen_evals;
130 regnode **open_parens; /* pointers to open parens */
131 regnode **close_parens; /* pointers to close parens */
132 regnode *opend; /* END node in program */
133 I32 utf8; /* whether the pattern is utf8 or not */
134 I32 orig_utf8; /* whether the pattern was originally in utf8 */
135 /* XXX use this for future optimisation of case
136 * where pattern must be upgraded to utf8. */
137 HV *paren_names; /* Paren names */
138
139 regnode **recurse; /* Recurse regops */
140 I32 recurse_count; /* Number of recurse regops */
141#if ADD_TO_REGEXEC
142 char *starttry; /* -Dr: where regtry was called. */
143#define RExC_starttry (pRExC_state->starttry)
144#endif
145#ifdef DEBUGGING
146 const char *lastparse;
147 I32 lastnum;
148 AV *paren_name_list; /* idx -> name */
149#define RExC_lastparse (pRExC_state->lastparse)
150#define RExC_lastnum (pRExC_state->lastnum)
151#define RExC_paren_name_list (pRExC_state->paren_name_list)
152#endif
153} RExC_state_t;
154
155#define RExC_flags (pRExC_state->flags)
156#define RExC_precomp (pRExC_state->precomp)
157#define RExC_rx_sv (pRExC_state->rx_sv)
158#define RExC_rx (pRExC_state->rx)
159#define RExC_rxi (pRExC_state->rxi)
160#define RExC_start (pRExC_state->start)
161#define RExC_end (pRExC_state->end)
162#define RExC_parse (pRExC_state->parse)
163#define RExC_whilem_seen (pRExC_state->whilem_seen)
164#ifdef RE_TRACK_PATTERN_OFFSETS
165#define RExC_offsets (pRExC_state->rxi->u.offsets) /* I am not like the others */
166#endif
167#define RExC_emit (pRExC_state->emit)
168#define RExC_emit_start (pRExC_state->emit_start)
169#define RExC_emit_bound (pRExC_state->emit_bound)
170#define RExC_naughty (pRExC_state->naughty)
171#define RExC_sawback (pRExC_state->sawback)
172#define RExC_seen (pRExC_state->seen)
173#define RExC_size (pRExC_state->size)
174#define RExC_npar (pRExC_state->npar)
175#define RExC_nestroot (pRExC_state->nestroot)
176#define RExC_extralen (pRExC_state->extralen)
177#define RExC_seen_zerolen (pRExC_state->seen_zerolen)
178#define RExC_seen_evals (pRExC_state->seen_evals)
179#define RExC_utf8 (pRExC_state->utf8)
180#define RExC_orig_utf8 (pRExC_state->orig_utf8)
181#define RExC_open_parens (pRExC_state->open_parens)
182#define RExC_close_parens (pRExC_state->close_parens)
183#define RExC_opend (pRExC_state->opend)
184#define RExC_paren_names (pRExC_state->paren_names)
185#define RExC_recurse (pRExC_state->recurse)
186#define RExC_recurse_count (pRExC_state->recurse_count)
187
188
189#define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
190#define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
191 ((*s) == '{' && regcurly(s)))
192
193#ifdef SPSTART
194#undef SPSTART /* dratted cpp namespace... */
195#endif
196/*
197 * Flags to be passed up and down.
198 */
199#define WORST 0 /* Worst case. */
200#define HASWIDTH 0x01 /* Known to match non-null strings. */
201
202/* Simple enough to be STAR/PLUS operand, in an EXACT node must be a single
203 * character, and if utf8, must be invariant. Note that this is not the same thing as REGNODE_SIMPLE */
204#define SIMPLE 0x02
205#define SPSTART 0x04 /* Starts with * or +. */
206#define TRYAGAIN 0x08 /* Weeded out a declaration. */
207#define POSTPONED 0x10 /* (?1),(?&name), (??{...}) or similar */
208
209#define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
210
211/* whether trie related optimizations are enabled */
212#if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
213#define TRIE_STUDY_OPT
214#define FULL_TRIE_STUDY
215#define TRIE_STCLASS
216#endif
217
218
219
220#define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
221#define PBITVAL(paren) (1 << ((paren) & 7))
222#define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
223#define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
224#define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
225
226/* If not already in utf8, do a longjmp back to the beginning */
227#define UTF8_LONGJMP 42 /* Choose a value not likely to ever conflict */
228#define REQUIRE_UTF8 STMT_START { \
229 if (! UTF) JMPENV_JUMP(UTF8_LONGJMP); \
230 } STMT_END
231
232/* About scan_data_t.
233
234 During optimisation we recurse through the regexp program performing
235 various inplace (keyhole style) optimisations. In addition study_chunk
236 and scan_commit populate this data structure with information about
237 what strings MUST appear in the pattern. We look for the longest
238 string that must appear at a fixed location, and we look for the
239 longest string that may appear at a floating location. So for instance
240 in the pattern:
241
242 /FOO[xX]A.*B[xX]BAR/
243
244 Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
245 strings (because they follow a .* construct). study_chunk will identify
246 both FOO and BAR as being the longest fixed and floating strings respectively.
247
248 The strings can be composites, for instance
249
250 /(f)(o)(o)/
251
252 will result in a composite fixed substring 'foo'.
253
254 For each string some basic information is maintained:
255
256 - offset or min_offset
257 This is the position the string must appear at, or not before.
258 It also implicitly (when combined with minlenp) tells us how many
259 characters must match before the string we are searching for.
260 Likewise when combined with minlenp and the length of the string it
261 tells us how many characters must appear after the string we have
262 found.
263
264 - max_offset
265 Only used for floating strings. This is the rightmost point that
266 the string can appear at. If set to I32 max it indicates that the
267 string can occur infinitely far to the right.
268
269 - minlenp
270 A pointer to the minimum length of the pattern that the string
271 was found inside. This is important as in the case of positive
272 lookahead or positive lookbehind we can have multiple patterns
273 involved. Consider
274
275 /(?=FOO).*F/
276
277 The minimum length of the pattern overall is 3, the minimum length
278 of the lookahead part is 3, but the minimum length of the part that
279 will actually match is 1. So 'FOO's minimum length is 3, but the
280 minimum length for the F is 1. This is important as the minimum length
281 is used to determine offsets in front of and behind the string being
282 looked for. Since strings can be composites this is the length of the
283 pattern at the time it was committed with a scan_commit. Note that
284 the length is calculated by study_chunk, so that the minimum lengths
285 are not known until the full pattern has been compiled, thus the
286 pointer to the value.
287
288 - lookbehind
289
290 In the case of lookbehind the string being searched for can be
291 offset past the start point of the final matching string.
292 If this value was just blithely removed from the min_offset it would
293 invalidate some of the calculations for how many chars must match
294 before or after (as they are derived from min_offset and minlen and
295 the length of the string being searched for).
296 When the final pattern is compiled and the data is moved from the
297 scan_data_t structure into the regexp structure the information
298 about lookbehind is factored in, with the information that would
299 have been lost precalculated in the end_shift field for the
300 associated string.
301
302 The fields pos_min and pos_delta are used to store the minimum offset
303 and the delta to the maximum offset at the current point in the pattern.
304
305*/
306
307typedef struct scan_data_t {
308 /*I32 len_min; unused */
309 /*I32 len_delta; unused */
310 I32 pos_min;
311 I32 pos_delta;
312 SV *last_found;
313 I32 last_end; /* min value, <0 unless valid. */
314 I32 last_start_min;
315 I32 last_start_max;
316 SV **longest; /* Either &l_fixed, or &l_float. */
317 SV *longest_fixed; /* longest fixed string found in pattern */
318 I32 offset_fixed; /* offset where it starts */
319 I32 *minlen_fixed; /* pointer to the minlen relevant to the string */
320 I32 lookbehind_fixed; /* is the position of the string modfied by LB */
321 SV *longest_float; /* longest floating string found in pattern */
322 I32 offset_float_min; /* earliest point in string it can appear */
323 I32 offset_float_max; /* latest point in string it can appear */
324 I32 *minlen_float; /* pointer to the minlen relevant to the string */
325 I32 lookbehind_float; /* is the position of the string modified by LB */
326 I32 flags;
327 I32 whilem_c;
328 I32 *last_closep;
329 struct regnode_charclass_class *start_class;
330} scan_data_t;
331
332/*
333 * Forward declarations for pregcomp()'s friends.
334 */
335
336static const scan_data_t zero_scan_data =
337 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
338
339#define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
340#define SF_BEFORE_SEOL 0x0001
341#define SF_BEFORE_MEOL 0x0002
342#define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
343#define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
344
345#ifdef NO_UNARY_PLUS
346# define SF_FIX_SHIFT_EOL (0+2)
347# define SF_FL_SHIFT_EOL (0+4)
348#else
349# define SF_FIX_SHIFT_EOL (+2)
350# define SF_FL_SHIFT_EOL (+4)
351#endif
352
353#define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
354#define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
355
356#define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
357#define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
358#define SF_IS_INF 0x0040
359#define SF_HAS_PAR 0x0080
360#define SF_IN_PAR 0x0100
361#define SF_HAS_EVAL 0x0200
362#define SCF_DO_SUBSTR 0x0400
363#define SCF_DO_STCLASS_AND 0x0800
364#define SCF_DO_STCLASS_OR 0x1000
365#define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
366#define SCF_WHILEM_VISITED_POS 0x2000
367
368#define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */
369#define SCF_SEEN_ACCEPT 0x8000
370
371#define UTF cBOOL(RExC_utf8)
372#define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET)
373#define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET)
374#define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_DEPENDS_CHARSET)
375#define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags) >= REGEX_UNICODE_CHARSET)
376#define ASCII_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_RESTRICTED_CHARSET)
377
378#define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
379
380#define OOB_UNICODE 12345678
381#define OOB_NAMEDCLASS -1
382
383#define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
384#define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
385
386
387/* length of regex to show in messages that don't mark a position within */
388#define RegexLengthToShowInErrorMessages 127
389
390/*
391 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
392 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
393 * op/pragma/warn/regcomp.
394 */
395#define MARKER1 "<-- HERE" /* marker as it appears in the description */
396#define MARKER2 " <-- HERE " /* marker as it appears within the regex */
397
398#define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
399
400/*
401 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
402 * arg. Show regex, up to a maximum length. If it's too long, chop and add
403 * "...".
404 */
405#define _FAIL(code) STMT_START { \
406 const char *ellipses = ""; \
407 IV len = RExC_end - RExC_precomp; \
408 \
409 if (!SIZE_ONLY) \
410 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
411 if (len > RegexLengthToShowInErrorMessages) { \
412 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
413 len = RegexLengthToShowInErrorMessages - 10; \
414 ellipses = "..."; \
415 } \
416 code; \
417} STMT_END
418
419#define FAIL(msg) _FAIL( \
420 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
421 msg, (int)len, RExC_precomp, ellipses))
422
423#define FAIL2(msg,arg) _FAIL( \
424 Perl_croak(aTHX_ msg " in regex m/%.*s%s/", \
425 arg, (int)len, RExC_precomp, ellipses))
426
427/*
428 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
429 */
430#define Simple_vFAIL(m) STMT_START { \
431 const IV offset = RExC_parse - RExC_precomp; \
432 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
433 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
434} STMT_END
435
436/*
437 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
438 */
439#define vFAIL(m) STMT_START { \
440 if (!SIZE_ONLY) \
441 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
442 Simple_vFAIL(m); \
443} STMT_END
444
445/*
446 * Like Simple_vFAIL(), but accepts two arguments.
447 */
448#define Simple_vFAIL2(m,a1) STMT_START { \
449 const IV offset = RExC_parse - RExC_precomp; \
450 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
451 (int)offset, RExC_precomp, RExC_precomp + offset); \
452} STMT_END
453
454/*
455 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
456 */
457#define vFAIL2(m,a1) STMT_START { \
458 if (!SIZE_ONLY) \
459 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
460 Simple_vFAIL2(m, a1); \
461} STMT_END
462
463
464/*
465 * Like Simple_vFAIL(), but accepts three arguments.
466 */
467#define Simple_vFAIL3(m, a1, a2) STMT_START { \
468 const IV offset = RExC_parse - RExC_precomp; \
469 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
470 (int)offset, RExC_precomp, RExC_precomp + offset); \
471} STMT_END
472
473/*
474 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
475 */
476#define vFAIL3(m,a1,a2) STMT_START { \
477 if (!SIZE_ONLY) \
478 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
479 Simple_vFAIL3(m, a1, a2); \
480} STMT_END
481
482/*
483 * Like Simple_vFAIL(), but accepts four arguments.
484 */
485#define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
486 const IV offset = RExC_parse - RExC_precomp; \
487 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
488 (int)offset, RExC_precomp, RExC_precomp + offset); \
489} STMT_END
490
491#define ckWARNreg(loc,m) STMT_START { \
492 const IV offset = loc - RExC_precomp; \
493 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
494 (int)offset, RExC_precomp, RExC_precomp + offset); \
495} STMT_END
496
497#define ckWARNregdep(loc,m) STMT_START { \
498 const IV offset = loc - RExC_precomp; \
499 Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
500 m REPORT_LOCATION, \
501 (int)offset, RExC_precomp, RExC_precomp + offset); \
502} STMT_END
503
504#define ckWARN2reg(loc, m, a1) STMT_START { \
505 const IV offset = loc - RExC_precomp; \
506 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
507 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
508} STMT_END
509
510#define vWARN3(loc, m, a1, a2) STMT_START { \
511 const IV offset = loc - RExC_precomp; \
512 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
513 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
514} STMT_END
515
516#define ckWARN3reg(loc, m, a1, a2) STMT_START { \
517 const IV offset = loc - RExC_precomp; \
518 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
519 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
520} STMT_END
521
522#define vWARN4(loc, m, a1, a2, a3) STMT_START { \
523 const IV offset = loc - RExC_precomp; \
524 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
525 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
526} STMT_END
527
528#define ckWARN4reg(loc, m, a1, a2, a3) STMT_START { \
529 const IV offset = loc - RExC_precomp; \
530 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
531 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
532} STMT_END
533
534#define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
535 const IV offset = loc - RExC_precomp; \
536 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
537 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
538} STMT_END
539
540
541/* Allow for side effects in s */
542#define REGC(c,s) STMT_START { \
543 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
544} STMT_END
545
546/* Macros for recording node offsets. 20001227 mjd@plover.com
547 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
548 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
549 * Element 0 holds the number n.
550 * Position is 1 indexed.
551 */
552#ifndef RE_TRACK_PATTERN_OFFSETS
553#define Set_Node_Offset_To_R(node,byte)
554#define Set_Node_Offset(node,byte)
555#define Set_Cur_Node_Offset
556#define Set_Node_Length_To_R(node,len)
557#define Set_Node_Length(node,len)
558#define Set_Node_Cur_Length(node)
559#define Node_Offset(n)
560#define Node_Length(n)
561#define Set_Node_Offset_Length(node,offset,len)
562#define ProgLen(ri) ri->u.proglen
563#define SetProgLen(ri,x) ri->u.proglen = x
564#else
565#define ProgLen(ri) ri->u.offsets[0]
566#define SetProgLen(ri,x) ri->u.offsets[0] = x
567#define Set_Node_Offset_To_R(node,byte) STMT_START { \
568 if (! SIZE_ONLY) { \
569 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
570 __LINE__, (int)(node), (int)(byte))); \
571 if((node) < 0) { \
572 Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
573 } else { \
574 RExC_offsets[2*(node)-1] = (byte); \
575 } \
576 } \
577} STMT_END
578
579#define Set_Node_Offset(node,byte) \
580 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
581#define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
582
583#define Set_Node_Length_To_R(node,len) STMT_START { \
584 if (! SIZE_ONLY) { \
585 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
586 __LINE__, (int)(node), (int)(len))); \
587 if((node) < 0) { \
588 Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
589 } else { \
590 RExC_offsets[2*(node)] = (len); \
591 } \
592 } \
593} STMT_END
594
595#define Set_Node_Length(node,len) \
596 Set_Node_Length_To_R((node)-RExC_emit_start, len)
597#define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
598#define Set_Node_Cur_Length(node) \
599 Set_Node_Length(node, RExC_parse - parse_start)
600
601/* Get offsets and lengths */
602#define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
603#define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
604
605#define Set_Node_Offset_Length(node,offset,len) STMT_START { \
606 Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
607 Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
608} STMT_END
609#endif
610
611#if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
612#define EXPERIMENTAL_INPLACESCAN
613#endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/
614
615#define DEBUG_STUDYDATA(str,data,depth) \
616DEBUG_OPTIMISE_MORE_r(if(data){ \
617 PerlIO_printf(Perl_debug_log, \
618 "%*s" str "Pos:%"IVdf"/%"IVdf \
619 " Flags: 0x%"UVXf" Whilem_c: %"IVdf" Lcp: %"IVdf" %s", \
620 (int)(depth)*2, "", \
621 (IV)((data)->pos_min), \
622 (IV)((data)->pos_delta), \
623 (UV)((data)->flags), \
624 (IV)((data)->whilem_c), \
625 (IV)((data)->last_closep ? *((data)->last_closep) : -1), \
626 is_inf ? "INF " : "" \
627 ); \
628 if ((data)->last_found) \
629 PerlIO_printf(Perl_debug_log, \
630 "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
631 " %sFloat: '%s' @ %"IVdf"/%"IVdf"", \
632 SvPVX_const((data)->last_found), \
633 (IV)((data)->last_end), \
634 (IV)((data)->last_start_min), \
635 (IV)((data)->last_start_max), \
636 ((data)->longest && \
637 (data)->longest==&((data)->longest_fixed)) ? "*" : "", \
638 SvPVX_const((data)->longest_fixed), \
639 (IV)((data)->offset_fixed), \
640 ((data)->longest && \
641 (data)->longest==&((data)->longest_float)) ? "*" : "", \
642 SvPVX_const((data)->longest_float), \
643 (IV)((data)->offset_float_min), \
644 (IV)((data)->offset_float_max) \
645 ); \
646 PerlIO_printf(Perl_debug_log,"\n"); \
647});
648
649static void clear_re(pTHX_ void *r);
650
651/* Mark that we cannot extend a found fixed substring at this point.
652 Update the longest found anchored substring and the longest found
653 floating substrings if needed. */
654
655STATIC void
656S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *minlenp, int is_inf)
657{
658 const STRLEN l = CHR_SVLEN(data->last_found);
659 const STRLEN old_l = CHR_SVLEN(*data->longest);
660 GET_RE_DEBUG_FLAGS_DECL;
661
662 PERL_ARGS_ASSERT_SCAN_COMMIT;
663
664 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
665 SvSetMagicSV(*data->longest, data->last_found);
666 if (*data->longest == data->longest_fixed) {
667 data->offset_fixed = l ? data->last_start_min : data->pos_min;
668 if (data->flags & SF_BEFORE_EOL)
669 data->flags
670 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
671 else
672 data->flags &= ~SF_FIX_BEFORE_EOL;
673 data->minlen_fixed=minlenp;
674 data->lookbehind_fixed=0;
675 }
676 else { /* *data->longest == data->longest_float */
677 data->offset_float_min = l ? data->last_start_min : data->pos_min;
678 data->offset_float_max = (l
679 ? data->last_start_max
680 : data->pos_min + data->pos_delta);
681 if (is_inf || (U32)data->offset_float_max > (U32)I32_MAX)
682 data->offset_float_max = I32_MAX;
683 if (data->flags & SF_BEFORE_EOL)
684 data->flags
685 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
686 else
687 data->flags &= ~SF_FL_BEFORE_EOL;
688 data->minlen_float=minlenp;
689 data->lookbehind_float=0;
690 }
691 }
692 SvCUR_set(data->last_found, 0);
693 {
694 SV * const sv = data->last_found;
695 if (SvUTF8(sv) && SvMAGICAL(sv)) {
696 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
697 if (mg)
698 mg->mg_len = 0;
699 }
700 }
701 data->last_end = -1;
702 data->flags &= ~SF_BEFORE_EOL;
703 DEBUG_STUDYDATA("commit: ",data,0);
704}
705
706/* Can match anything (initialization) */
707STATIC void
708S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
709{
710 PERL_ARGS_ASSERT_CL_ANYTHING;
711
712 ANYOF_CLASS_ZERO(cl);
713 ANYOF_BITMAP_SETALL(cl);
714 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL|ANYOF_LOC_NONBITMAP_FOLD|ANYOF_NON_UTF8_LATIN1_ALL;
715 if (LOC)
716 cl->flags |= ANYOF_LOCALE;
717}
718
719/* Can match anything (initialization) */
720STATIC int
721S_cl_is_anything(const struct regnode_charclass_class *cl)
722{
723 int value;
724
725 PERL_ARGS_ASSERT_CL_IS_ANYTHING;
726
727 for (value = 0; value <= ANYOF_MAX; value += 2)
728 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
729 return 1;
730 if (!(cl->flags & ANYOF_UNICODE_ALL))
731 return 0;
732 if (!ANYOF_BITMAP_TESTALLSET((const void*)cl))
733 return 0;
734 return 1;
735}
736
737/* Can match anything (initialization) */
738STATIC void
739S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
740{
741 PERL_ARGS_ASSERT_CL_INIT;
742
743 Zero(cl, 1, struct regnode_charclass_class);
744 cl->type = ANYOF;
745 cl_anything(pRExC_state, cl);
746}
747
748STATIC void
749S_cl_init_zero(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
750{
751 PERL_ARGS_ASSERT_CL_INIT_ZERO;
752
753 Zero(cl, 1, struct regnode_charclass_class);
754 cl->type = ANYOF;
755 cl_anything(pRExC_state, cl);
756 if (LOC)
757 cl->flags |= ANYOF_LOCALE;
758}
759
760/* 'And' a given class with another one. Can create false positives */
761/* We assume that cl is not inverted */
762STATIC void
763S_cl_and(struct regnode_charclass_class *cl,
764 const struct regnode_charclass_class *and_with)
765{
766 PERL_ARGS_ASSERT_CL_AND;
767
768 assert(and_with->type == ANYOF);
769
770 if (!(ANYOF_CLASS_TEST_ANY_SET(and_with))
771 && !(ANYOF_CLASS_TEST_ANY_SET(cl))
772 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
773 && !(and_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
774 && !(cl->flags & ANYOF_LOC_NONBITMAP_FOLD)) {
775 int i;
776
777 if (and_with->flags & ANYOF_INVERT)
778 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
779 cl->bitmap[i] &= ~and_with->bitmap[i];
780 else
781 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
782 cl->bitmap[i] &= and_with->bitmap[i];
783 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
784 if (!(and_with->flags & ANYOF_EOS))
785 cl->flags &= ~ANYOF_EOS;
786
787 if (!(and_with->flags & ANYOF_LOC_NONBITMAP_FOLD))
788 cl->flags &= ~ANYOF_LOC_NONBITMAP_FOLD;
789 if (!(and_with->flags & ANYOF_NON_UTF8_LATIN1_ALL))
790 cl->flags &= ~ANYOF_NON_UTF8_LATIN1_ALL;
791
792 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_NONBITMAP &&
793 !(and_with->flags & ANYOF_INVERT)) {
794 cl->flags &= ~ANYOF_UNICODE_ALL;
795 cl->flags |= and_with->flags & ANYOF_NONBITMAP; /* field is 2 bits; use
796 only the one(s)
797 actually set */
798 ARG_SET(cl, ARG(and_with));
799 }
800 if (!(and_with->flags & ANYOF_UNICODE_ALL) &&
801 !(and_with->flags & ANYOF_INVERT))
802 cl->flags &= ~ANYOF_UNICODE_ALL;
803 if (!(and_with->flags & (ANYOF_NONBITMAP|ANYOF_UNICODE_ALL)) &&
804 !(and_with->flags & ANYOF_INVERT))
805 cl->flags &= ~ANYOF_NONBITMAP;
806}
807
808/* 'OR' a given class with another one. Can create false positives */
809/* We assume that cl is not inverted */
810STATIC void
811S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
812{
813 PERL_ARGS_ASSERT_CL_OR;
814
815 if (or_with->flags & ANYOF_INVERT) {
816 /* We do not use
817 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
818 * <= (B1 | !B2) | (CL1 | !CL2)
819 * which is wasteful if CL2 is small, but we ignore CL2:
820 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
821 * XXXX Can we handle case-fold? Unclear:
822 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
823 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
824 */
825 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
826 && !(or_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
827 && !(cl->flags & ANYOF_LOC_NONBITMAP_FOLD) ) {
828 int i;
829
830 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
831 cl->bitmap[i] |= ~or_with->bitmap[i];
832 } /* XXXX: logic is complicated otherwise */
833 else {
834 cl_anything(pRExC_state, cl);
835 }
836 } else {
837 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
838 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
839 && (!(or_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
840 || (cl->flags & ANYOF_LOC_NONBITMAP_FOLD)) ) {
841 int i;
842
843 /* OR char bitmap and class bitmap separately */
844 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
845 cl->bitmap[i] |= or_with->bitmap[i];
846 if (ANYOF_CLASS_TEST_ANY_SET(or_with)) {
847 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
848 cl->classflags[i] |= or_with->classflags[i];
849 cl->flags |= ANYOF_CLASS;
850 }
851 }
852 else { /* XXXX: logic is complicated, leave it along for a moment. */
853 cl_anything(pRExC_state, cl);
854 }
855 }
856 if (or_with->flags & ANYOF_EOS)
857 cl->flags |= ANYOF_EOS;
858 if (!(or_with->flags & ANYOF_NON_UTF8_LATIN1_ALL))
859 cl->flags |= ANYOF_NON_UTF8_LATIN1_ALL;
860
861 if (or_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
862 cl->flags |= ANYOF_LOC_NONBITMAP_FOLD;
863
864 /* If both nodes match something outside the bitmap, but what they match
865 * outside is not the same pointer, and hence not easily compared, give up
866 * and allow the start class to match everything outside the bitmap */
867 if (cl->flags & ANYOF_NONBITMAP && or_with->flags & ANYOF_NONBITMAP &&
868 ARG(cl) != ARG(or_with)) {
869 cl->flags |= ANYOF_UNICODE_ALL;
870 }
871
872 if (or_with->flags & ANYOF_UNICODE_ALL) {
873 cl->flags |= ANYOF_UNICODE_ALL;
874 }
875}
876
877#define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
878#define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
879#define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
880#define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
881
882
883#ifdef DEBUGGING
884/*
885 dump_trie(trie,widecharmap,revcharmap)
886 dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
887 dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
888
889 These routines dump out a trie in a somewhat readable format.
890 The _interim_ variants are used for debugging the interim
891 tables that are used to generate the final compressed
892 representation which is what dump_trie expects.
893
894 Part of the reason for their existence is to provide a form
895 of documentation as to how the different representations function.
896
897*/
898
899/*
900 Dumps the final compressed table form of the trie to Perl_debug_log.
901 Used for debugging make_trie().
902*/
903
904STATIC void
905S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
906 AV *revcharmap, U32 depth)
907{
908 U32 state;
909 SV *sv=sv_newmortal();
910 int colwidth= widecharmap ? 6 : 4;
911 U16 word;
912 GET_RE_DEBUG_FLAGS_DECL;
913
914 PERL_ARGS_ASSERT_DUMP_TRIE;
915
916 PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
917 (int)depth * 2 + 2,"",
918 "Match","Base","Ofs" );
919
920 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
921 SV ** const tmp = av_fetch( revcharmap, state, 0);
922 if ( tmp ) {
923 PerlIO_printf( Perl_debug_log, "%*s",
924 colwidth,
925 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
926 PL_colors[0], PL_colors[1],
927 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
928 PERL_PV_ESCAPE_FIRSTCHAR
929 )
930 );
931 }
932 }
933 PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
934 (int)depth * 2 + 2,"");
935
936 for( state = 0 ; state < trie->uniquecharcount ; state++ )
937 PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
938 PerlIO_printf( Perl_debug_log, "\n");
939
940 for( state = 1 ; state < trie->statecount ; state++ ) {
941 const U32 base = trie->states[ state ].trans.base;
942
943 PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
944
945 if ( trie->states[ state ].wordnum ) {
946 PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
947 } else {
948 PerlIO_printf( Perl_debug_log, "%6s", "" );
949 }
950
951 PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
952
953 if ( base ) {
954 U32 ofs = 0;
955
956 while( ( base + ofs < trie->uniquecharcount ) ||
957 ( base + ofs - trie->uniquecharcount < trie->lasttrans
958 && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
959 ofs++;
960
961 PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
962
963 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
964 if ( ( base + ofs >= trie->uniquecharcount ) &&
965 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
966 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
967 {
968 PerlIO_printf( Perl_debug_log, "%*"UVXf,
969 colwidth,
970 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
971 } else {
972 PerlIO_printf( Perl_debug_log, "%*s",colwidth," ." );
973 }
974 }
975
976 PerlIO_printf( Perl_debug_log, "]");
977
978 }
979 PerlIO_printf( Perl_debug_log, "\n" );
980 }
981 PerlIO_printf(Perl_debug_log, "%*sword_info N:(prev,len)=", (int)depth*2, "");
982 for (word=1; word <= trie->wordcount; word++) {
983 PerlIO_printf(Perl_debug_log, " %d:(%d,%d)",
984 (int)word, (int)(trie->wordinfo[word].prev),
985 (int)(trie->wordinfo[word].len));
986 }
987 PerlIO_printf(Perl_debug_log, "\n" );
988}
989/*
990 Dumps a fully constructed but uncompressed trie in list form.
991 List tries normally only are used for construction when the number of
992 possible chars (trie->uniquecharcount) is very high.
993 Used for debugging make_trie().
994*/
995STATIC void
996S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
997 HV *widecharmap, AV *revcharmap, U32 next_alloc,
998 U32 depth)
999{
1000 U32 state;
1001 SV *sv=sv_newmortal();
1002 int colwidth= widecharmap ? 6 : 4;
1003 GET_RE_DEBUG_FLAGS_DECL;
1004
1005 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
1006
1007 /* print out the table precompression. */
1008 PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
1009 (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
1010 "------:-----+-----------------\n" );
1011
1012 for( state=1 ; state < next_alloc ; state ++ ) {
1013 U16 charid;
1014
1015 PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
1016 (int)depth * 2 + 2,"", (UV)state );
1017 if ( ! trie->states[ state ].wordnum ) {
1018 PerlIO_printf( Perl_debug_log, "%5s| ","");
1019 } else {
1020 PerlIO_printf( Perl_debug_log, "W%4x| ",
1021 trie->states[ state ].wordnum
1022 );
1023 }
1024 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
1025 SV ** const tmp = av_fetch( revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
1026 if ( tmp ) {
1027 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
1028 colwidth,
1029 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1030 PL_colors[0], PL_colors[1],
1031 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1032 PERL_PV_ESCAPE_FIRSTCHAR
1033 ) ,
1034 TRIE_LIST_ITEM(state,charid).forid,
1035 (UV)TRIE_LIST_ITEM(state,charid).newstate
1036 );
1037 if (!(charid % 10))
1038 PerlIO_printf(Perl_debug_log, "\n%*s| ",
1039 (int)((depth * 2) + 14), "");
1040 }
1041 }
1042 PerlIO_printf( Perl_debug_log, "\n");
1043 }
1044}
1045
1046/*
1047 Dumps a fully constructed but uncompressed trie in table form.
1048 This is the normal DFA style state transition table, with a few
1049 twists to facilitate compression later.
1050 Used for debugging make_trie().
1051*/
1052STATIC void
1053S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
1054 HV *widecharmap, AV *revcharmap, U32 next_alloc,
1055 U32 depth)
1056{
1057 U32 state;
1058 U16 charid;
1059 SV *sv=sv_newmortal();
1060 int colwidth= widecharmap ? 6 : 4;
1061 GET_RE_DEBUG_FLAGS_DECL;
1062
1063 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
1064
1065 /*
1066 print out the table precompression so that we can do a visual check
1067 that they are identical.
1068 */
1069
1070 PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
1071
1072 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1073 SV ** const tmp = av_fetch( revcharmap, charid, 0);
1074 if ( tmp ) {
1075 PerlIO_printf( Perl_debug_log, "%*s",
1076 colwidth,
1077 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1078 PL_colors[0], PL_colors[1],
1079 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1080 PERL_PV_ESCAPE_FIRSTCHAR
1081 )
1082 );
1083 }
1084 }
1085
1086 PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
1087
1088 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
1089 PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
1090 }
1091
1092 PerlIO_printf( Perl_debug_log, "\n" );
1093
1094 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1095
1096 PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
1097 (int)depth * 2 + 2,"",
1098 (UV)TRIE_NODENUM( state ) );
1099
1100 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1101 UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1102 if (v)
1103 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1104 else
1105 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
1106 }
1107 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1108 PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
1109 } else {
1110 PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
1111 trie->states[ TRIE_NODENUM( state ) ].wordnum );
1112 }
1113 }
1114}
1115
1116#endif
1117
1118
1119/* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1120 startbranch: the first branch in the whole branch sequence
1121 first : start branch of sequence of branch-exact nodes.
1122 May be the same as startbranch
1123 last : Thing following the last branch.
1124 May be the same as tail.
1125 tail : item following the branch sequence
1126 count : words in the sequence
1127 flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/
1128 depth : indent depth
1129
1130Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
1131
1132A trie is an N'ary tree where the branches are determined by digital
1133decomposition of the key. IE, at the root node you look up the 1st character and
1134follow that branch repeat until you find the end of the branches. Nodes can be
1135marked as "accepting" meaning they represent a complete word. Eg:
1136
1137 /he|she|his|hers/
1138
1139would convert into the following structure. Numbers represent states, letters
1140following numbers represent valid transitions on the letter from that state, if
1141the number is in square brackets it represents an accepting state, otherwise it
1142will be in parenthesis.
1143
1144 +-h->+-e->[3]-+-r->(8)-+-s->[9]
1145 | |
1146 | (2)
1147 | |
1148 (1) +-i->(6)-+-s->[7]
1149 |
1150 +-s->(3)-+-h->(4)-+-e->[5]
1151
1152 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1153
1154This shows that when matching against the string 'hers' we will begin at state 1
1155read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1156then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1157is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1158single traverse. We store a mapping from accepting to state to which word was
1159matched, and then when we have multiple possibilities we try to complete the
1160rest of the regex in the order in which they occured in the alternation.
1161
1162The only prior NFA like behaviour that would be changed by the TRIE support is
1163the silent ignoring of duplicate alternations which are of the form:
1164
1165 / (DUPE|DUPE) X? (?{ ... }) Y /x
1166
1167Thus EVAL blocks following a trie may be called a different number of times with
1168and without the optimisation. With the optimisations dupes will be silently
1169ignored. This inconsistent behaviour of EVAL type nodes is well established as
1170the following demonstrates:
1171
1172 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1173
1174which prints out 'word' three times, but
1175
1176 'words'=~/(word|word|word)(?{ print $1 })S/
1177
1178which doesnt print it out at all. This is due to other optimisations kicking in.
1179
1180Example of what happens on a structural level:
1181
1182The regexp /(ac|ad|ab)+/ will produce the following debug output:
1183
1184 1: CURLYM[1] {1,32767}(18)
1185 5: BRANCH(8)
1186 6: EXACT <ac>(16)
1187 8: BRANCH(11)
1188 9: EXACT <ad>(16)
1189 11: BRANCH(14)
1190 12: EXACT <ab>(16)
1191 16: SUCCEED(0)
1192 17: NOTHING(18)
1193 18: END(0)
1194
1195This would be optimizable with startbranch=5, first=5, last=16, tail=16
1196and should turn into:
1197
1198 1: CURLYM[1] {1,32767}(18)
1199 5: TRIE(16)
1200 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1201 <ac>
1202 <ad>
1203 <ab>
1204 16: SUCCEED(0)
1205 17: NOTHING(18)
1206 18: END(0)
1207
1208Cases where tail != last would be like /(?foo|bar)baz/:
1209
1210 1: BRANCH(4)
1211 2: EXACT <foo>(8)
1212 4: BRANCH(7)
1213 5: EXACT <bar>(8)
1214 7: TAIL(8)
1215 8: EXACT <baz>(10)
1216 10: END(0)
1217
1218which would be optimizable with startbranch=1, first=1, last=7, tail=8
1219and would end up looking like:
1220
1221 1: TRIE(8)
1222 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1223 <foo>
1224 <bar>
1225 7: TAIL(8)
1226 8: EXACT <baz>(10)
1227 10: END(0)
1228
1229 d = uvuni_to_utf8_flags(d, uv, 0);
1230
1231is the recommended Unicode-aware way of saying
1232
1233 *(d++) = uv;
1234*/
1235
1236#define TRIE_STORE_REVCHAR \
1237 STMT_START { \
1238 if (UTF) { \
1239 SV *zlopp = newSV(2); \
1240 unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp); \
1241 unsigned const char *const kapow = uvuni_to_utf8(flrbbbbb, uvc & 0xFF); \
1242 SvCUR_set(zlopp, kapow - flrbbbbb); \
1243 SvPOK_on(zlopp); \
1244 SvUTF8_on(zlopp); \
1245 av_push(revcharmap, zlopp); \
1246 } else { \
1247 char ooooff = (char)uvc; \
1248 av_push(revcharmap, newSVpvn(&ooooff, 1)); \
1249 } \
1250 } STMT_END
1251
1252#define TRIE_READ_CHAR STMT_START { \
1253 wordlen++; \
1254 if ( UTF ) { \
1255 if ( folder ) { \
1256 if ( foldlen > 0 ) { \
1257 uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags ); \
1258 foldlen -= len; \
1259 scan += len; \
1260 len = 0; \
1261 } else { \
1262 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1263 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
1264 foldlen -= UNISKIP( uvc ); \
1265 scan = foldbuf + UNISKIP( uvc ); \
1266 } \
1267 } else { \
1268 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1269 } \
1270 } else { \
1271 uvc = (U32)*uc; \
1272 len = 1; \
1273 } \
1274} STMT_END
1275
1276
1277
1278#define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
1279 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
1280 U32 ging = TRIE_LIST_LEN( state ) *= 2; \
1281 Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
1282 } \
1283 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
1284 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
1285 TRIE_LIST_CUR( state )++; \
1286} STMT_END
1287
1288#define TRIE_LIST_NEW(state) STMT_START { \
1289 Newxz( trie->states[ state ].trans.list, \
1290 4, reg_trie_trans_le ); \
1291 TRIE_LIST_CUR( state ) = 1; \
1292 TRIE_LIST_LEN( state ) = 4; \
1293} STMT_END
1294
1295#define TRIE_HANDLE_WORD(state) STMT_START { \
1296 U16 dupe= trie->states[ state ].wordnum; \
1297 regnode * const noper_next = regnext( noper ); \
1298 \
1299 DEBUG_r({ \
1300 /* store the word for dumping */ \
1301 SV* tmp; \
1302 if (OP(noper) != NOTHING) \
1303 tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF); \
1304 else \
1305 tmp = newSVpvn_utf8( "", 0, UTF ); \
1306 av_push( trie_words, tmp ); \
1307 }); \
1308 \
1309 curword++; \
1310 trie->wordinfo[curword].prev = 0; \
1311 trie->wordinfo[curword].len = wordlen; \
1312 trie->wordinfo[curword].accept = state; \
1313 \
1314 if ( noper_next < tail ) { \
1315 if (!trie->jump) \
1316 trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, sizeof(U16) ); \
1317 trie->jump[curword] = (U16)(noper_next - convert); \
1318 if (!jumper) \
1319 jumper = noper_next; \
1320 if (!nextbranch) \
1321 nextbranch= regnext(cur); \
1322 } \
1323 \
1324 if ( dupe ) { \
1325 /* It's a dupe. Pre-insert into the wordinfo[].prev */\
1326 /* chain, so that when the bits of chain are later */\
1327 /* linked together, the dups appear in the chain */\
1328 trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
1329 trie->wordinfo[dupe].prev = curword; \
1330 } else { \
1331 /* we haven't inserted this word yet. */ \
1332 trie->states[ state ].wordnum = curword; \
1333 } \
1334} STMT_END
1335
1336
1337#define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
1338 ( ( base + charid >= ucharcount \
1339 && base + charid < ubound \
1340 && state == trie->trans[ base - ucharcount + charid ].check \
1341 && trie->trans[ base - ucharcount + charid ].next ) \
1342 ? trie->trans[ base - ucharcount + charid ].next \
1343 : ( state==1 ? special : 0 ) \
1344 )
1345
1346#define MADE_TRIE 1
1347#define MADE_JUMP_TRIE 2
1348#define MADE_EXACT_TRIE 4
1349
1350STATIC I32
1351S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth)
1352{
1353 dVAR;
1354 /* first pass, loop through and scan words */
1355 reg_trie_data *trie;
1356 HV *widecharmap = NULL;
1357 AV *revcharmap = newAV();
1358 regnode *cur;
1359 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1360 STRLEN len = 0;
1361 UV uvc = 0;
1362 U16 curword = 0;
1363 U32 next_alloc = 0;
1364 regnode *jumper = NULL;
1365 regnode *nextbranch = NULL;
1366 regnode *convert = NULL;
1367 U32 *prev_states; /* temp array mapping each state to previous one */
1368 /* we just use folder as a flag in utf8 */
1369 const U8 * folder = NULL;
1370
1371#ifdef DEBUGGING
1372 const U32 data_slot = add_data( pRExC_state, 4, "tuuu" );
1373 AV *trie_words = NULL;
1374 /* along with revcharmap, this only used during construction but both are
1375 * useful during debugging so we store them in the struct when debugging.
1376 */
1377#else
1378 const U32 data_slot = add_data( pRExC_state, 2, "tu" );
1379 STRLEN trie_charcount=0;
1380#endif
1381 SV *re_trie_maxbuff;
1382 GET_RE_DEBUG_FLAGS_DECL;
1383
1384 PERL_ARGS_ASSERT_MAKE_TRIE;
1385#ifndef DEBUGGING
1386 PERL_UNUSED_ARG(depth);
1387#endif
1388
1389 switch (flags) {
1390 case EXACTFU: folder = PL_fold_latin1; break;
1391 case EXACTF: folder = PL_fold; break;
1392 case EXACTFL: folder = PL_fold_locale; break;
1393 }
1394
1395 trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
1396 trie->refcount = 1;
1397 trie->startstate = 1;
1398 trie->wordcount = word_count;
1399 RExC_rxi->data->data[ data_slot ] = (void*)trie;
1400 trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
1401 if (!(UTF && folder))
1402 trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
1403 trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
1404 trie->wordcount+1, sizeof(reg_trie_wordinfo));
1405
1406 DEBUG_r({
1407 trie_words = newAV();
1408 });
1409
1410 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
1411 if (!SvIOK(re_trie_maxbuff)) {
1412 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
1413 }
1414 DEBUG_OPTIMISE_r({
1415 PerlIO_printf( Perl_debug_log,
1416 "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
1417 (int)depth * 2 + 2, "",
1418 REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
1419 REG_NODE_NUM(last), REG_NODE_NUM(tail),
1420 (int)depth);
1421 });
1422
1423 /* Find the node we are going to overwrite */
1424 if ( first == startbranch && OP( last ) != BRANCH ) {
1425 /* whole branch chain */
1426 convert = first;
1427 } else {
1428 /* branch sub-chain */
1429 convert = NEXTOPER( first );
1430 }
1431
1432 /* -- First loop and Setup --
1433
1434 We first traverse the branches and scan each word to determine if it
1435 contains widechars, and how many unique chars there are, this is
1436 important as we have to build a table with at least as many columns as we
1437 have unique chars.
1438
1439 We use an array of integers to represent the character codes 0..255
1440 (trie->charmap) and we use a an HV* to store Unicode characters. We use the
1441 native representation of the character value as the key and IV's for the
1442 coded index.
1443
1444 *TODO* If we keep track of how many times each character is used we can
1445 remap the columns so that the table compression later on is more
1446 efficient in terms of memory by ensuring the most common value is in the
1447 middle and the least common are on the outside. IMO this would be better
1448 than a most to least common mapping as theres a decent chance the most
1449 common letter will share a node with the least common, meaning the node
1450 will not be compressible. With a middle is most common approach the worst
1451 case is when we have the least common nodes twice.
1452
1453 */
1454
1455 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1456 regnode * const noper = NEXTOPER( cur );
1457 const U8 *uc = (U8*)STRING( noper );
1458 const U8 * const e = uc + STR_LEN( noper );
1459 STRLEN foldlen = 0;
1460 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1461 const U8 *scan = (U8*)NULL;
1462 U32 wordlen = 0; /* required init */
1463 STRLEN chars = 0;
1464 bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the bitmap?*/
1465
1466 if (OP(noper) == NOTHING) {
1467 trie->minlen= 0;
1468 continue;
1469 }
1470 if ( set_bit ) /* bitmap only alloced when !(UTF&&Folding) */
1471 TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
1472 regardless of encoding */
1473
1474 for ( ; uc < e ; uc += len ) {
1475 TRIE_CHARCOUNT(trie)++;
1476 TRIE_READ_CHAR;
1477 chars++;
1478 if ( uvc < 256 ) {
1479 if ( !trie->charmap[ uvc ] ) {
1480 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
1481 if ( folder )
1482 trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
1483 TRIE_STORE_REVCHAR;
1484 }
1485 if ( set_bit ) {
1486 /* store the codepoint in the bitmap, and its folded
1487 * equivalent. */
1488 TRIE_BITMAP_SET(trie,uvc);
1489
1490 /* store the folded codepoint */
1491 if ( folder ) TRIE_BITMAP_SET(trie,folder[ uvc ]);
1492
1493 if ( !UTF ) {
1494 /* store first byte of utf8 representation of
1495 variant codepoints */
1496 if (! UNI_IS_INVARIANT(uvc)) {
1497 TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc));
1498 }
1499 }
1500 set_bit = 0; /* We've done our bit :-) */
1501 }
1502 } else {
1503 SV** svpp;
1504 if ( !widecharmap )
1505 widecharmap = newHV();
1506
1507 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
1508
1509 if ( !svpp )
1510 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
1511
1512 if ( !SvTRUE( *svpp ) ) {
1513 sv_setiv( *svpp, ++trie->uniquecharcount );
1514 TRIE_STORE_REVCHAR;
1515 }
1516 }
1517 }
1518 if( cur == first ) {
1519 trie->minlen=chars;
1520 trie->maxlen=chars;
1521 } else if (chars < trie->minlen) {
1522 trie->minlen=chars;
1523 } else if (chars > trie->maxlen) {
1524 trie->maxlen=chars;
1525 }
1526
1527 } /* end first pass */
1528 DEBUG_TRIE_COMPILE_r(
1529 PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1530 (int)depth * 2 + 2,"",
1531 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
1532 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
1533 (int)trie->minlen, (int)trie->maxlen )
1534 );
1535
1536 /*
1537 We now know what we are dealing with in terms of unique chars and
1538 string sizes so we can calculate how much memory a naive
1539 representation using a flat table will take. If it's over a reasonable
1540 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
1541 conservative but potentially much slower representation using an array
1542 of lists.
1543
1544 At the end we convert both representations into the same compressed
1545 form that will be used in regexec.c for matching with. The latter
1546 is a form that cannot be used to construct with but has memory
1547 properties similar to the list form and access properties similar
1548 to the table form making it both suitable for fast searches and
1549 small enough that its feasable to store for the duration of a program.
1550
1551 See the comment in the code where the compressed table is produced
1552 inplace from the flat tabe representation for an explanation of how
1553 the compression works.
1554
1555 */
1556
1557
1558 Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
1559 prev_states[1] = 0;
1560
1561 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
1562 /*
1563 Second Pass -- Array Of Lists Representation
1564
1565 Each state will be represented by a list of charid:state records
1566 (reg_trie_trans_le) the first such element holds the CUR and LEN
1567 points of the allocated array. (See defines above).
1568
1569 We build the initial structure using the lists, and then convert
1570 it into the compressed table form which allows faster lookups
1571 (but cant be modified once converted).
1572 */
1573
1574 STRLEN transcount = 1;
1575
1576 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1577 "%*sCompiling trie using list compiler\n",
1578 (int)depth * 2 + 2, ""));
1579
1580 trie->states = (reg_trie_state *)
1581 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1582 sizeof(reg_trie_state) );
1583 TRIE_LIST_NEW(1);
1584 next_alloc = 2;
1585
1586 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1587
1588 regnode * const noper = NEXTOPER( cur );
1589 U8 *uc = (U8*)STRING( noper );
1590 const U8 * const e = uc + STR_LEN( noper );
1591 U32 state = 1; /* required init */
1592 U16 charid = 0; /* sanity init */
1593 U8 *scan = (U8*)NULL; /* sanity init */
1594 STRLEN foldlen = 0; /* required init */
1595 U32 wordlen = 0; /* required init */
1596 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1597
1598 if (OP(noper) != NOTHING) {
1599 for ( ; uc < e ; uc += len ) {
1600
1601 TRIE_READ_CHAR;
1602
1603 if ( uvc < 256 ) {
1604 charid = trie->charmap[ uvc ];
1605 } else {
1606 SV** const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
1607 if ( !svpp ) {
1608 charid = 0;
1609 } else {
1610 charid=(U16)SvIV( *svpp );
1611 }
1612 }
1613 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1614 if ( charid ) {
1615
1616 U16 check;
1617 U32 newstate = 0;
1618
1619 charid--;
1620 if ( !trie->states[ state ].trans.list ) {
1621 TRIE_LIST_NEW( state );
1622 }
1623 for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1624 if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1625 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1626 break;
1627 }
1628 }
1629 if ( ! newstate ) {
1630 newstate = next_alloc++;
1631 prev_states[newstate] = state;
1632 TRIE_LIST_PUSH( state, charid, newstate );
1633 transcount++;
1634 }
1635 state = newstate;
1636 } else {
1637 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1638 }
1639 }
1640 }
1641 TRIE_HANDLE_WORD(state);
1642
1643 } /* end second pass */
1644
1645 /* next alloc is the NEXT state to be allocated */
1646 trie->statecount = next_alloc;
1647 trie->states = (reg_trie_state *)
1648 PerlMemShared_realloc( trie->states,
1649 next_alloc
1650 * sizeof(reg_trie_state) );
1651
1652 /* and now dump it out before we compress it */
1653 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
1654 revcharmap, next_alloc,
1655 depth+1)
1656 );
1657
1658 trie->trans = (reg_trie_trans *)
1659 PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
1660 {
1661 U32 state;
1662 U32 tp = 0;
1663 U32 zp = 0;
1664
1665
1666 for( state=1 ; state < next_alloc ; state ++ ) {
1667 U32 base=0;
1668
1669 /*
1670 DEBUG_TRIE_COMPILE_MORE_r(
1671 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1672 );
1673 */
1674
1675 if (trie->states[state].trans.list) {
1676 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1677 U16 maxid=minid;
1678 U16 idx;
1679
1680 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1681 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1682 if ( forid < minid ) {
1683 minid=forid;
1684 } else if ( forid > maxid ) {
1685 maxid=forid;
1686 }
1687 }
1688 if ( transcount < tp + maxid - minid + 1) {
1689 transcount *= 2;
1690 trie->trans = (reg_trie_trans *)
1691 PerlMemShared_realloc( trie->trans,
1692 transcount
1693 * sizeof(reg_trie_trans) );
1694 Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1695 }
1696 base = trie->uniquecharcount + tp - minid;
1697 if ( maxid == minid ) {
1698 U32 set = 0;
1699 for ( ; zp < tp ; zp++ ) {
1700 if ( ! trie->trans[ zp ].next ) {
1701 base = trie->uniquecharcount + zp - minid;
1702 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1703 trie->trans[ zp ].check = state;
1704 set = 1;
1705 break;
1706 }
1707 }
1708 if ( !set ) {
1709 trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1710 trie->trans[ tp ].check = state;
1711 tp++;
1712 zp = tp;
1713 }
1714 } else {
1715 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1716 const U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
1717 trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1718 trie->trans[ tid ].check = state;
1719 }
1720 tp += ( maxid - minid + 1 );
1721 }
1722 Safefree(trie->states[ state ].trans.list);
1723 }
1724 /*
1725 DEBUG_TRIE_COMPILE_MORE_r(
1726 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1727 );
1728 */
1729 trie->states[ state ].trans.base=base;
1730 }
1731 trie->lasttrans = tp + 1;
1732 }
1733 } else {
1734 /*
1735 Second Pass -- Flat Table Representation.
1736
1737 we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1738 We know that we will need Charcount+1 trans at most to store the data
1739 (one row per char at worst case) So we preallocate both structures
1740 assuming worst case.
1741
1742 We then construct the trie using only the .next slots of the entry
1743 structs.
1744
1745 We use the .check field of the first entry of the node temporarily to
1746 make compression both faster and easier by keeping track of how many non
1747 zero fields are in the node.
1748
1749 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1750 transition.
1751
1752 There are two terms at use here: state as a TRIE_NODEIDX() which is a
1753 number representing the first entry of the node, and state as a
1754 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1755 TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1756 are 2 entrys per node. eg:
1757
1758 A B A B
1759 1. 2 4 1. 3 7
1760 2. 0 3 3. 0 5
1761 3. 0 0 5. 0 0
1762 4. 0 0 7. 0 0
1763
1764 The table is internally in the right hand, idx form. However as we also
1765 have to deal with the states array which is indexed by nodenum we have to
1766 use TRIE_NODENUM() to convert.
1767
1768 */
1769 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1770 "%*sCompiling trie using table compiler\n",
1771 (int)depth * 2 + 2, ""));
1772
1773 trie->trans = (reg_trie_trans *)
1774 PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
1775 * trie->uniquecharcount + 1,
1776 sizeof(reg_trie_trans) );
1777 trie->states = (reg_trie_state *)
1778 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1779 sizeof(reg_trie_state) );
1780 next_alloc = trie->uniquecharcount + 1;
1781
1782
1783 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1784
1785 regnode * const noper = NEXTOPER( cur );
1786 const U8 *uc = (U8*)STRING( noper );
1787 const U8 * const e = uc + STR_LEN( noper );
1788
1789 U32 state = 1; /* required init */
1790
1791 U16 charid = 0; /* sanity init */
1792 U32 accept_state = 0; /* sanity init */
1793 U8 *scan = (U8*)NULL; /* sanity init */
1794
1795 STRLEN foldlen = 0; /* required init */
1796 U32 wordlen = 0; /* required init */
1797 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1798
1799 if ( OP(noper) != NOTHING ) {
1800 for ( ; uc < e ; uc += len ) {
1801
1802 TRIE_READ_CHAR;
1803
1804 if ( uvc < 256 ) {
1805 charid = trie->charmap[ uvc ];
1806 } else {
1807 SV* const * const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
1808 charid = svpp ? (U16)SvIV(*svpp) : 0;
1809 }
1810 if ( charid ) {
1811 charid--;
1812 if ( !trie->trans[ state + charid ].next ) {
1813 trie->trans[ state + charid ].next = next_alloc;
1814 trie->trans[ state ].check++;
1815 prev_states[TRIE_NODENUM(next_alloc)]
1816 = TRIE_NODENUM(state);
1817 next_alloc += trie->uniquecharcount;
1818 }
1819 state = trie->trans[ state + charid ].next;
1820 } else {
1821 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1822 }
1823 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1824 }
1825 }
1826 accept_state = TRIE_NODENUM( state );
1827 TRIE_HANDLE_WORD(accept_state);
1828
1829 } /* end second pass */
1830
1831 /* and now dump it out before we compress it */
1832 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
1833 revcharmap,
1834 next_alloc, depth+1));
1835
1836 {
1837 /*
1838 * Inplace compress the table.*
1839
1840 For sparse data sets the table constructed by the trie algorithm will
1841 be mostly 0/FAIL transitions or to put it another way mostly empty.
1842 (Note that leaf nodes will not contain any transitions.)
1843
1844 This algorithm compresses the tables by eliminating most such
1845 transitions, at the cost of a modest bit of extra work during lookup:
1846
1847 - Each states[] entry contains a .base field which indicates the
1848 index in the state[] array wheres its transition data is stored.
1849
1850 - If .base is 0 there are no valid transitions from that node.
1851
1852 - If .base is nonzero then charid is added to it to find an entry in
1853 the trans array.
1854
1855 -If trans[states[state].base+charid].check!=state then the
1856 transition is taken to be a 0/Fail transition. Thus if there are fail
1857 transitions at the front of the node then the .base offset will point
1858 somewhere inside the previous nodes data (or maybe even into a node
1859 even earlier), but the .check field determines if the transition is
1860 valid.
1861
1862 XXX - wrong maybe?
1863 The following process inplace converts the table to the compressed
1864 table: We first do not compress the root node 1,and mark all its
1865 .check pointers as 1 and set its .base pointer as 1 as well. This
1866 allows us to do a DFA construction from the compressed table later,
1867 and ensures that any .base pointers we calculate later are greater
1868 than 0.
1869
1870 - We set 'pos' to indicate the first entry of the second node.
1871
1872 - We then iterate over the columns of the node, finding the first and
1873 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
1874 and set the .check pointers accordingly, and advance pos
1875 appropriately and repreat for the next node. Note that when we copy
1876 the next pointers we have to convert them from the original
1877 NODEIDX form to NODENUM form as the former is not valid post
1878 compression.
1879
1880 - If a node has no transitions used we mark its base as 0 and do not
1881 advance the pos pointer.
1882
1883 - If a node only has one transition we use a second pointer into the
1884 structure to fill in allocated fail transitions from other states.
1885 This pointer is independent of the main pointer and scans forward
1886 looking for null transitions that are allocated to a state. When it
1887 finds one it writes the single transition into the "hole". If the
1888 pointer doesnt find one the single transition is appended as normal.
1889
1890 - Once compressed we can Renew/realloc the structures to release the
1891 excess space.
1892
1893 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
1894 specifically Fig 3.47 and the associated pseudocode.
1895
1896 demq
1897 */
1898 const U32 laststate = TRIE_NODENUM( next_alloc );
1899 U32 state, charid;
1900 U32 pos = 0, zp=0;
1901 trie->statecount = laststate;
1902
1903 for ( state = 1 ; state < laststate ; state++ ) {
1904 U8 flag = 0;
1905 const U32 stateidx = TRIE_NODEIDX( state );
1906 const U32 o_used = trie->trans[ stateidx ].check;
1907 U32 used = trie->trans[ stateidx ].check;
1908 trie->trans[ stateidx ].check = 0;
1909
1910 for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
1911 if ( flag || trie->trans[ stateidx + charid ].next ) {
1912 if ( trie->trans[ stateidx + charid ].next ) {
1913 if (o_used == 1) {
1914 for ( ; zp < pos ; zp++ ) {
1915 if ( ! trie->trans[ zp ].next ) {
1916 break;
1917 }
1918 }
1919 trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
1920 trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1921 trie->trans[ zp ].check = state;
1922 if ( ++zp > pos ) pos = zp;
1923 break;
1924 }
1925 used--;
1926 }
1927 if ( !flag ) {
1928 flag = 1;
1929 trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
1930 }
1931 trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1932 trie->trans[ pos ].check = state;
1933 pos++;
1934 }
1935 }
1936 }
1937 trie->lasttrans = pos + 1;
1938 trie->states = (reg_trie_state *)
1939 PerlMemShared_realloc( trie->states, laststate
1940 * sizeof(reg_trie_state) );
1941 DEBUG_TRIE_COMPILE_MORE_r(
1942 PerlIO_printf( Perl_debug_log,
1943 "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
1944 (int)depth * 2 + 2,"",
1945 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
1946 (IV)next_alloc,
1947 (IV)pos,
1948 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
1949 );
1950
1951 } /* end table compress */
1952 }
1953 DEBUG_TRIE_COMPILE_MORE_r(
1954 PerlIO_printf(Perl_debug_log, "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
1955 (int)depth * 2 + 2, "",
1956 (UV)trie->statecount,
1957 (UV)trie->lasttrans)
1958 );
1959 /* resize the trans array to remove unused space */
1960 trie->trans = (reg_trie_trans *)
1961 PerlMemShared_realloc( trie->trans, trie->lasttrans
1962 * sizeof(reg_trie_trans) );
1963
1964 { /* Modify the program and insert the new TRIE node */
1965 U8 nodetype =(U8)(flags & 0xFF);
1966 char *str=NULL;
1967
1968#ifdef DEBUGGING
1969 regnode *optimize = NULL;
1970#ifdef RE_TRACK_PATTERN_OFFSETS
1971
1972 U32 mjd_offset = 0;
1973 U32 mjd_nodelen = 0;
1974#endif /* RE_TRACK_PATTERN_OFFSETS */
1975#endif /* DEBUGGING */
1976 /*
1977 This means we convert either the first branch or the first Exact,
1978 depending on whether the thing following (in 'last') is a branch
1979 or not and whther first is the startbranch (ie is it a sub part of
1980 the alternation or is it the whole thing.)
1981 Assuming its a sub part we convert the EXACT otherwise we convert
1982 the whole branch sequence, including the first.
1983 */
1984 /* Find the node we are going to overwrite */
1985 if ( first != startbranch || OP( last ) == BRANCH ) {
1986 /* branch sub-chain */
1987 NEXT_OFF( first ) = (U16)(last - first);
1988#ifdef RE_TRACK_PATTERN_OFFSETS
1989 DEBUG_r({
1990 mjd_offset= Node_Offset((convert));
1991 mjd_nodelen= Node_Length((convert));
1992 });
1993#endif
1994 /* whole branch chain */
1995 }
1996#ifdef RE_TRACK_PATTERN_OFFSETS
1997 else {
1998 DEBUG_r({
1999 const regnode *nop = NEXTOPER( convert );
2000 mjd_offset= Node_Offset((nop));
2001 mjd_nodelen= Node_Length((nop));
2002 });
2003 }
2004 DEBUG_OPTIMISE_r(
2005 PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
2006 (int)depth * 2 + 2, "",
2007 (UV)mjd_offset, (UV)mjd_nodelen)
2008 );
2009#endif
2010 /* But first we check to see if there is a common prefix we can
2011 split out as an EXACT and put in front of the TRIE node. */
2012 trie->startstate= 1;
2013 if ( trie->bitmap && !widecharmap && !trie->jump ) {
2014 U32 state;
2015 for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
2016 U32 ofs = 0;
2017 I32 idx = -1;
2018 U32 count = 0;
2019 const U32 base = trie->states[ state ].trans.base;
2020
2021 if ( trie->states[state].wordnum )
2022 count = 1;
2023
2024 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
2025 if ( ( base + ofs >= trie->uniquecharcount ) &&
2026 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
2027 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
2028 {
2029 if ( ++count > 1 ) {
2030 SV **tmp = av_fetch( revcharmap, ofs, 0);
2031 const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
2032 if ( state == 1 ) break;
2033 if ( count == 2 ) {
2034 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
2035 DEBUG_OPTIMISE_r(
2036 PerlIO_printf(Perl_debug_log,
2037 "%*sNew Start State=%"UVuf" Class: [",
2038 (int)depth * 2 + 2, "",
2039 (UV)state));
2040 if (idx >= 0) {
2041 SV ** const tmp = av_fetch( revcharmap, idx, 0);
2042 const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
2043
2044 TRIE_BITMAP_SET(trie,*ch);
2045 if ( folder )
2046 TRIE_BITMAP_SET(trie, folder[ *ch ]);
2047 DEBUG_OPTIMISE_r(
2048 PerlIO_printf(Perl_debug_log, "%s", (char*)ch)
2049 );
2050 }
2051 }
2052 TRIE_BITMAP_SET(trie,*ch);
2053 if ( folder )
2054 TRIE_BITMAP_SET(trie,folder[ *ch ]);
2055 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
2056 }
2057 idx = ofs;
2058 }
2059 }
2060 if ( count == 1 ) {
2061 SV **tmp = av_fetch( revcharmap, idx, 0);
2062 STRLEN len;
2063 char *ch = SvPV( *tmp, len );
2064 DEBUG_OPTIMISE_r({
2065 SV *sv=sv_newmortal();
2066 PerlIO_printf( Perl_debug_log,
2067 "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
2068 (int)depth * 2 + 2, "",
2069 (UV)state, (UV)idx,
2070 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
2071 PL_colors[0], PL_colors[1],
2072 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2073 PERL_PV_ESCAPE_FIRSTCHAR
2074 )
2075 );
2076 });
2077 if ( state==1 ) {
2078 OP( convert ) = nodetype;
2079 str=STRING(convert);
2080 STR_LEN(convert)=0;
2081 }
2082 STR_LEN(convert) += len;
2083 while (len--)
2084 *str++ = *ch++;
2085 } else {
2086#ifdef DEBUGGING
2087 if (state>1)
2088 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
2089#endif
2090 break;
2091 }
2092 }
2093 trie->prefixlen = (state-1);
2094 if (str) {
2095 regnode *n = convert+NODE_SZ_STR(convert);
2096 NEXT_OFF(convert) = NODE_SZ_STR(convert);
2097 trie->startstate = state;
2098 trie->minlen -= (state - 1);
2099 trie->maxlen -= (state - 1);
2100#ifdef DEBUGGING
2101 /* At least the UNICOS C compiler choked on this
2102 * being argument to DEBUG_r(), so let's just have
2103 * it right here. */
2104 if (
2105#ifdef PERL_EXT_RE_BUILD
2106 1
2107#else
2108 DEBUG_r_TEST
2109#endif
2110 ) {
2111 regnode *fix = convert;
2112 U32 word = trie->wordcount;
2113 mjd_nodelen++;
2114 Set_Node_Offset_Length(convert, mjd_offset, state - 1);
2115 while( ++fix < n ) {
2116 Set_Node_Offset_Length(fix, 0, 0);
2117 }
2118 while (word--) {
2119 SV ** const tmp = av_fetch( trie_words, word, 0 );
2120 if (tmp) {
2121 if ( STR_LEN(convert) <= SvCUR(*tmp) )
2122 sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
2123 else
2124 sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
2125 }
2126 }
2127 }
2128#endif
2129 if (trie->maxlen) {
2130 convert = n;
2131 } else {
2132 NEXT_OFF(convert) = (U16)(tail - convert);
2133 DEBUG_r(optimize= n);
2134 }
2135 }
2136 }
2137 if (!jumper)
2138 jumper = last;
2139 if ( trie->maxlen ) {
2140 NEXT_OFF( convert ) = (U16)(tail - convert);
2141 ARG_SET( convert, data_slot );
2142 /* Store the offset to the first unabsorbed branch in
2143 jump[0], which is otherwise unused by the jump logic.
2144 We use this when dumping a trie and during optimisation. */
2145 if (trie->jump)
2146 trie->jump[0] = (U16)(nextbranch - convert);
2147
2148 /* If the start state is not accepting (meaning there is no empty string/NOTHING)
2149 * and there is a bitmap
2150 * and the first "jump target" node we found leaves enough room
2151 * then convert the TRIE node into a TRIEC node, with the bitmap
2152 * embedded inline in the opcode - this is hypothetically faster.
2153 */
2154 if ( !trie->states[trie->startstate].wordnum
2155 && trie->bitmap
2156 && ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
2157 {
2158 OP( convert ) = TRIEC;
2159 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
2160 PerlMemShared_free(trie->bitmap);
2161 trie->bitmap= NULL;
2162 } else
2163 OP( convert ) = TRIE;
2164
2165 /* store the type in the flags */
2166 convert->flags = nodetype;
2167 DEBUG_r({
2168 optimize = convert
2169 + NODE_STEP_REGNODE
2170 + regarglen[ OP( convert ) ];
2171 });
2172 /* XXX We really should free up the resource in trie now,
2173 as we won't use them - (which resources?) dmq */
2174 }
2175 /* needed for dumping*/
2176 DEBUG_r(if (optimize) {
2177 regnode *opt = convert;
2178
2179 while ( ++opt < optimize) {
2180 Set_Node_Offset_Length(opt,0,0);
2181 }
2182 /*
2183 Try to clean up some of the debris left after the
2184 optimisation.
2185 */
2186 while( optimize < jumper ) {
2187 mjd_nodelen += Node_Length((optimize));
2188 OP( optimize ) = OPTIMIZED;
2189 Set_Node_Offset_Length(optimize,0,0);
2190 optimize++;
2191 }
2192 Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
2193 });
2194 } /* end node insert */
2195
2196 /* Finish populating the prev field of the wordinfo array. Walk back
2197 * from each accept state until we find another accept state, and if
2198 * so, point the first word's .prev field at the second word. If the
2199 * second already has a .prev field set, stop now. This will be the
2200 * case either if we've already processed that word's accept state,
2201 * or that state had multiple words, and the overspill words were
2202 * already linked up earlier.
2203 */
2204 {
2205 U16 word;
2206 U32 state;
2207 U16 prev;
2208
2209 for (word=1; word <= trie->wordcount; word++) {
2210 prev = 0;
2211 if (trie->wordinfo[word].prev)
2212 continue;
2213 state = trie->wordinfo[word].accept;
2214 while (state) {
2215 state = prev_states[state];
2216 if (!state)
2217 break;
2218 prev = trie->states[state].wordnum;
2219 if (prev)
2220 break;
2221 }
2222 trie->wordinfo[word].prev = prev;
2223 }
2224 Safefree(prev_states);
2225 }
2226
2227
2228 /* and now dump out the compressed format */
2229 DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
2230
2231 RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
2232#ifdef DEBUGGING
2233 RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
2234 RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
2235#else
2236 SvREFCNT_dec(revcharmap);
2237#endif
2238 return trie->jump
2239 ? MADE_JUMP_TRIE
2240 : trie->startstate>1
2241 ? MADE_EXACT_TRIE
2242 : MADE_TRIE;
2243}
2244
2245STATIC void
2246S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode *stclass, U32 depth)
2247{
2248/* The Trie is constructed and compressed now so we can build a fail array if it's needed
2249
2250 This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
2251 "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
2252 ISBN 0-201-10088-6
2253
2254 We find the fail state for each state in the trie, this state is the longest proper
2255 suffix of the current state's 'word' that is also a proper prefix of another word in our
2256 trie. State 1 represents the word '' and is thus the default fail state. This allows
2257 the DFA not to have to restart after its tried and failed a word at a given point, it
2258 simply continues as though it had been matching the other word in the first place.
2259 Consider
2260 'abcdgu'=~/abcdefg|cdgu/
2261 When we get to 'd' we are still matching the first word, we would encounter 'g' which would
2262 fail, which would bring us to the state representing 'd' in the second word where we would
2263 try 'g' and succeed, proceeding to match 'cdgu'.
2264 */
2265 /* add a fail transition */
2266 const U32 trie_offset = ARG(source);
2267 reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
2268 U32 *q;
2269 const U32 ucharcount = trie->uniquecharcount;
2270 const U32 numstates = trie->statecount;
2271 const U32 ubound = trie->lasttrans + ucharcount;
2272 U32 q_read = 0;
2273 U32 q_write = 0;
2274 U32 charid;
2275 U32 base = trie->states[ 1 ].trans.base;
2276 U32 *fail;
2277 reg_ac_data *aho;
2278 const U32 data_slot = add_data( pRExC_state, 1, "T" );
2279 GET_RE_DEBUG_FLAGS_DECL;
2280
2281 PERL_ARGS_ASSERT_MAKE_TRIE_FAILTABLE;
2282#ifndef DEBUGGING
2283 PERL_UNUSED_ARG(depth);
2284#endif
2285
2286
2287 ARG_SET( stclass, data_slot );
2288 aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
2289 RExC_rxi->data->data[ data_slot ] = (void*)aho;
2290 aho->trie=trie_offset;
2291 aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
2292 Copy( trie->states, aho->states, numstates, reg_trie_state );
2293 Newxz( q, numstates, U32);
2294 aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
2295 aho->refcount = 1;
2296 fail = aho->fail;
2297 /* initialize fail[0..1] to be 1 so that we always have
2298 a valid final fail state */
2299 fail[ 0 ] = fail[ 1 ] = 1;
2300
2301 for ( charid = 0; charid < ucharcount ; charid++ ) {
2302 const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
2303 if ( newstate ) {
2304 q[ q_write ] = newstate;
2305 /* set to point at the root */
2306 fail[ q[ q_write++ ] ]=1;
2307 }
2308 }
2309 while ( q_read < q_write) {
2310 const U32 cur = q[ q_read++ % numstates ];
2311 base = trie->states[ cur ].trans.base;
2312
2313 for ( charid = 0 ; charid < ucharcount ; charid++ ) {
2314 const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
2315 if (ch_state) {
2316 U32 fail_state = cur;
2317 U32 fail_base;
2318 do {
2319 fail_state = fail[ fail_state ];
2320 fail_base = aho->states[ fail_state ].trans.base;
2321 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
2322
2323 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
2324 fail[ ch_state ] = fail_state;
2325 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
2326 {
2327 aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum;
2328 }
2329 q[ q_write++ % numstates] = ch_state;
2330 }
2331 }
2332 }
2333 /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
2334 when we fail in state 1, this allows us to use the
2335 charclass scan to find a valid start char. This is based on the principle
2336 that theres a good chance the string being searched contains lots of stuff
2337 that cant be a start char.
2338 */
2339 fail[ 0 ] = fail[ 1 ] = 0;
2340 DEBUG_TRIE_COMPILE_r({
2341 PerlIO_printf(Perl_debug_log,
2342 "%*sStclass Failtable (%"UVuf" states): 0",
2343 (int)(depth * 2), "", (UV)numstates
2344 );
2345 for( q_read=1; q_read<numstates; q_read++ ) {
2346 PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
2347 }
2348 PerlIO_printf(Perl_debug_log, "\n");
2349 });
2350 Safefree(q);
2351 /*RExC_seen |= REG_SEEN_TRIEDFA;*/
2352}
2353
2354
2355/*
2356 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
2357 * These need to be revisited when a newer toolchain becomes available.
2358 */
2359#if defined(__sparc64__) && defined(__GNUC__)
2360# if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
2361# undef SPARC64_GCC_WORKAROUND
2362# define SPARC64_GCC_WORKAROUND 1
2363# endif
2364#endif
2365
2366#define DEBUG_PEEP(str,scan,depth) \
2367 DEBUG_OPTIMISE_r({if (scan){ \
2368 SV * const mysv=sv_newmortal(); \
2369 regnode *Next = regnext(scan); \
2370 regprop(RExC_rx, mysv, scan); \
2371 PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
2372 (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
2373 Next ? (REG_NODE_NUM(Next)) : 0 ); \
2374 }});
2375
2376
2377
2378
2379
2380#define JOIN_EXACT(scan,min,flags) \
2381 if (PL_regkind[OP(scan)] == EXACT) \
2382 join_exact(pRExC_state,(scan),(min),(flags),NULL,depth+1)
2383
2384STATIC U32
2385S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, I32 *min, U32 flags,regnode *val, U32 depth) {
2386 /* Merge several consecutive EXACTish nodes into one. */
2387 regnode *n = regnext(scan);
2388 U32 stringok = 1;
2389 regnode *next = scan + NODE_SZ_STR(scan);
2390 U32 merged = 0;
2391 U32 stopnow = 0;
2392#ifdef DEBUGGING
2393 regnode *stop = scan;
2394 GET_RE_DEBUG_FLAGS_DECL;
2395#else
2396 PERL_UNUSED_ARG(depth);
2397#endif
2398
2399 PERL_ARGS_ASSERT_JOIN_EXACT;
2400#ifndef EXPERIMENTAL_INPLACESCAN
2401 PERL_UNUSED_ARG(flags);
2402 PERL_UNUSED_ARG(val);
2403#endif
2404 DEBUG_PEEP("join",scan,depth);
2405
2406 /* Skip NOTHING, merge EXACT*. */
2407 while (n &&
2408 ( PL_regkind[OP(n)] == NOTHING ||
2409 (stringok && (OP(n) == OP(scan))))
2410 && NEXT_OFF(n)
2411 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
2412
2413 if (OP(n) == TAIL || n > next)
2414 stringok = 0;
2415 if (PL_regkind[OP(n)] == NOTHING) {
2416 DEBUG_PEEP("skip:",n,depth);
2417 NEXT_OFF(scan) += NEXT_OFF(n);
2418 next = n + NODE_STEP_REGNODE;
2419#ifdef DEBUGGING
2420 if (stringok)
2421 stop = n;
2422#endif
2423 n = regnext(n);
2424 }
2425 else if (stringok) {
2426 const unsigned int oldl = STR_LEN(scan);
2427 regnode * const nnext = regnext(n);
2428
2429 DEBUG_PEEP("merg",n,depth);
2430
2431 merged++;
2432 if (oldl + STR_LEN(n) > U8_MAX)
2433 break;
2434 NEXT_OFF(scan) += NEXT_OFF(n);
2435 STR_LEN(scan) += STR_LEN(n);
2436 next = n + NODE_SZ_STR(n);
2437 /* Now we can overwrite *n : */
2438 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
2439#ifdef DEBUGGING
2440 stop = next - 1;
2441#endif
2442 n = nnext;
2443 if (stopnow) break;
2444 }
2445
2446#ifdef EXPERIMENTAL_INPLACESCAN
2447 if (flags && !NEXT_OFF(n)) {
2448 DEBUG_PEEP("atch", val, depth);
2449 if (reg_off_by_arg[OP(n)]) {
2450 ARG_SET(n, val - n);
2451 }
2452 else {
2453 NEXT_OFF(n) = val - n;
2454 }
2455 stopnow = 1;
2456 }
2457#endif
2458 }
2459#define GREEK_SMALL_LETTER_IOTA_WITH_DIALYTIKA_AND_TONOS 0x0390
2460#define IOTA_D_T GREEK_SMALL_LETTER_IOTA_WITH_DIALYTIKA_AND_TONOS
2461#define GREEK_SMALL_LETTER_UPSILON_WITH_DIALYTIKA_AND_TONOS 0x03B0
2462#define UPSILON_D_T GREEK_SMALL_LETTER_UPSILON_WITH_DIALYTIKA_AND_TONOS
2463
2464 if (UTF
2465 && ( OP(scan) == EXACTF || OP(scan) == EXACTFU)
2466 && ( STR_LEN(scan) >= 6 ) )
2467 {
2468 /*
2469 Two problematic code points in Unicode casefolding of EXACT nodes:
2470
2471 U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
2472 U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
2473
2474 which casefold to
2475
2476 Unicode UTF-8
2477
2478 U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
2479 U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
2480
2481 This means that in case-insensitive matching (or "loose matching",
2482 as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
2483 length of the above casefolded versions) can match a target string
2484 of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
2485 This would rather mess up the minimum length computation.
2486
2487 What we'll do is to look for the tail four bytes, and then peek
2488 at the preceding two bytes to see whether we need to decrease
2489 the minimum length by four (six minus two).
2490
2491 Thanks to the design of UTF-8, there cannot be false matches:
2492 A sequence of valid UTF-8 bytes cannot be a subsequence of
2493 another valid sequence of UTF-8 bytes.
2494
2495 */
2496 char * const s0 = STRING(scan), *s, *t;
2497 char * const s1 = s0 + STR_LEN(scan) - 1;
2498 char * const s2 = s1 - 4;
2499#ifdef EBCDIC /* RD tunifold greek 0390 and 03B0 */
2500 const char t0[] = "\xaf\x49\xaf\x42";
2501#else
2502 const char t0[] = "\xcc\x88\xcc\x81";
2503#endif
2504 const char * const t1 = t0 + 3;
2505
2506 for (s = s0 + 2;
2507 s < s2 && (t = ninstr(s, s1, t0, t1));
2508 s = t + 4) {
2509#ifdef EBCDIC
2510 if (((U8)t[-1] == 0x68 && (U8)t[-2] == 0xB4) ||
2511 ((U8)t[-1] == 0x46 && (U8)t[-2] == 0xB5))
2512#else
2513 if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
2514 ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
2515#endif
2516 *min -= 4;
2517 }
2518 }
2519
2520#ifdef DEBUGGING
2521 /* Allow dumping */
2522 n = scan + NODE_SZ_STR(scan);
2523 while (n <= stop) {
2524 if (PL_regkind[OP(n)] != NOTHING || OP(n) == NOTHING) {
2525 OP(n) = OPTIMIZED;
2526 NEXT_OFF(n) = 0;
2527 }
2528 n++;
2529 }
2530#endif
2531 DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
2532 return stopnow;
2533}
2534
2535/* REx optimizer. Converts nodes into quicker variants "in place".
2536 Finds fixed substrings. */
2537
2538/* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
2539 to the position after last scanned or to NULL. */
2540
2541#define INIT_AND_WITHP \
2542 assert(!and_withp); \
2543 Newx(and_withp,1,struct regnode_charclass_class); \
2544 SAVEFREEPV(and_withp)
2545
2546/* this is a chain of data about sub patterns we are processing that
2547 need to be handled separately/specially in study_chunk. Its so
2548 we can simulate recursion without losing state. */
2549struct scan_frame;
2550typedef struct scan_frame {
2551 regnode *last; /* last node to process in this frame */
2552 regnode *next; /* next node to process when last is reached */
2553 struct scan_frame *prev; /*previous frame*/
2554 I32 stop; /* what stopparen do we use */
2555} scan_frame;
2556
2557
2558#define SCAN_COMMIT(s, data, m) scan_commit(s, data, m, is_inf)
2559
2560#define CASE_SYNST_FNC(nAmE) \
2561case nAmE: \
2562 if (flags & SCF_DO_STCLASS_AND) { \
2563 for (value = 0; value < 256; value++) \
2564 if (!is_ ## nAmE ## _cp(value)) \
2565 ANYOF_BITMAP_CLEAR(data->start_class, value); \
2566 } \
2567 else { \
2568 for (value = 0; value < 256; value++) \
2569 if (is_ ## nAmE ## _cp(value)) \
2570 ANYOF_BITMAP_SET(data->start_class, value); \
2571 } \
2572 break; \
2573case N ## nAmE: \
2574 if (flags & SCF_DO_STCLASS_AND) { \
2575 for (value = 0; value < 256; value++) \
2576 if (is_ ## nAmE ## _cp(value)) \
2577 ANYOF_BITMAP_CLEAR(data->start_class, value); \
2578 } \
2579 else { \
2580 for (value = 0; value < 256; value++) \
2581 if (!is_ ## nAmE ## _cp(value)) \
2582 ANYOF_BITMAP_SET(data->start_class, value); \
2583 } \
2584 break
2585
2586
2587
2588STATIC I32
2589S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
2590 I32 *minlenp, I32 *deltap,
2591 regnode *last,
2592 scan_data_t *data,
2593 I32 stopparen,
2594 U8* recursed,
2595 struct regnode_charclass_class *and_withp,
2596 U32 flags, U32 depth)
2597 /* scanp: Start here (read-write). */
2598 /* deltap: Write maxlen-minlen here. */
2599 /* last: Stop before this one. */
2600 /* data: string data about the pattern */
2601 /* stopparen: treat close N as END */
2602 /* recursed: which subroutines have we recursed into */
2603 /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
2604{
2605 dVAR;
2606 I32 min = 0, pars = 0, code;
2607 regnode *scan = *scanp, *next;
2608 I32 delta = 0;
2609 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
2610 int is_inf_internal = 0; /* The studied chunk is infinite */
2611 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
2612 scan_data_t data_fake;
2613 SV *re_trie_maxbuff = NULL;
2614 regnode *first_non_open = scan;
2615 I32 stopmin = I32_MAX;
2616 scan_frame *frame = NULL;
2617 GET_RE_DEBUG_FLAGS_DECL;
2618
2619 PERL_ARGS_ASSERT_STUDY_CHUNK;
2620
2621#ifdef DEBUGGING
2622 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
2623#endif
2624
2625 if ( depth == 0 ) {
2626 while (first_non_open && OP(first_non_open) == OPEN)
2627 first_non_open=regnext(first_non_open);
2628 }
2629
2630
2631 fake_study_recurse:
2632 while ( scan && OP(scan) != END && scan < last ){
2633 /* Peephole optimizer: */
2634 DEBUG_STUDYDATA("Peep:", data,depth);
2635 DEBUG_PEEP("Peep",scan,depth);
2636 JOIN_EXACT(scan,&min,0);
2637
2638 /* Follow the next-chain of the current node and optimize
2639 away all the NOTHINGs from it. */
2640 if (OP(scan) != CURLYX) {
2641 const int max = (reg_off_by_arg[OP(scan)]
2642 ? I32_MAX
2643 /* I32 may be smaller than U16 on CRAYs! */
2644 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
2645 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
2646 int noff;
2647 regnode *n = scan;
2648
2649 /* Skip NOTHING and LONGJMP. */
2650 while ((n = regnext(n))
2651 && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
2652 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
2653 && off + noff < max)
2654 off += noff;
2655 if (reg_off_by_arg[OP(scan)])
2656 ARG(scan) = off;
2657 else
2658 NEXT_OFF(scan) = off;
2659 }
2660
2661
2662
2663 /* The principal pseudo-switch. Cannot be a switch, since we
2664 look into several different things. */
2665 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
2666 || OP(scan) == IFTHEN) {
2667 next = regnext(scan);
2668 code = OP(scan);
2669 /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
2670
2671 if (OP(next) == code || code == IFTHEN) {
2672 /* NOTE - There is similar code to this block below for handling
2673 TRIE nodes on a re-study. If you change stuff here check there
2674 too. */
2675 I32 max1 = 0, min1 = I32_MAX, num = 0;
2676 struct regnode_charclass_class accum;
2677 regnode * const startbranch=scan;
2678
2679 if (flags & SCF_DO_SUBSTR)
2680 SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
2681 if (flags & SCF_DO_STCLASS)
2682 cl_init_zero(pRExC_state, &accum);
2683
2684 while (OP(scan) == code) {
2685 I32 deltanext, minnext, f = 0, fake;
2686 struct regnode_charclass_class this_class;
2687
2688 num++;
2689 data_fake.flags = 0;
2690 if (data) {
2691 data_fake.whilem_c = data->whilem_c;
2692 data_fake.last_closep = data->last_closep;
2693 }
2694 else
2695 data_fake.last_closep = &fake;
2696
2697 data_fake.pos_delta = delta;
2698 next = regnext(scan);
2699 scan = NEXTOPER(scan);
2700 if (code != BRANCH)
2701 scan = NEXTOPER(scan);
2702 if (flags & SCF_DO_STCLASS) {
2703 cl_init(pRExC_state, &this_class);
2704 data_fake.start_class = &this_class;
2705 f = SCF_DO_STCLASS_AND;
2706 }
2707 if (flags & SCF_WHILEM_VISITED_POS)
2708 f |= SCF_WHILEM_VISITED_POS;
2709
2710 /* we suppose the run is continuous, last=next...*/
2711 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
2712 next, &data_fake,
2713 stopparen, recursed, NULL, f,depth+1);
2714 if (min1 > minnext)
2715 min1 = minnext;
2716 if (max1 < minnext + deltanext)
2717 max1 = minnext + deltanext;
2718 if (deltanext == I32_MAX)
2719 is_inf = is_inf_internal = 1;
2720 scan = next;
2721 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
2722 pars++;
2723 if (data_fake.flags & SCF_SEEN_ACCEPT) {
2724 if ( stopmin > minnext)
2725 stopmin = min + min1;
2726 flags &= ~SCF_DO_SUBSTR;
2727 if (data)
2728 data->flags |= SCF_SEEN_ACCEPT;
2729 }
2730 if (data) {
2731 if (data_fake.flags & SF_HAS_EVAL)
2732 data->flags |= SF_HAS_EVAL;
2733 data->whilem_c = data_fake.whilem_c;
2734 }
2735 if (flags & SCF_DO_STCLASS)
2736 cl_or(pRExC_state, &accum, &this_class);
2737 }
2738 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
2739 min1 = 0;
2740 if (flags & SCF_DO_SUBSTR) {
2741 data->pos_min += min1;
2742 data->pos_delta += max1 - min1;
2743 if (max1 != min1 || is_inf)
2744 data->longest = &(data->longest_float);
2745 }
2746 min += min1;
2747 delta += max1 - min1;
2748 if (flags & SCF_DO_STCLASS_OR) {
2749 cl_or(pRExC_state, data->start_class, &accum);
2750 if (min1) {
2751 cl_and(data->start_class, and_withp);
2752 flags &= ~SCF_DO_STCLASS;
2753 }
2754 }
2755 else if (flags & SCF_DO_STCLASS_AND) {
2756 if (min1) {
2757 cl_and(data->start_class, &accum);
2758 flags &= ~SCF_DO_STCLASS;
2759 }
2760 else {
2761 /* Switch to OR mode: cache the old value of
2762 * data->start_class */
2763 INIT_AND_WITHP;
2764 StructCopy(data->start_class, and_withp,
2765 struct regnode_charclass_class);
2766 flags &= ~SCF_DO_STCLASS_AND;
2767 StructCopy(&accum, data->start_class,
2768 struct regnode_charclass_class);
2769 flags |= SCF_DO_STCLASS_OR;
2770 data->start_class->flags |= ANYOF_EOS;
2771 }
2772 }
2773
2774 if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) {
2775 /* demq.
2776
2777 Assuming this was/is a branch we are dealing with: 'scan' now
2778 points at the item that follows the branch sequence, whatever
2779 it is. We now start at the beginning of the sequence and look
2780 for subsequences of
2781
2782 BRANCH->EXACT=>x1
2783 BRANCH->EXACT=>x2
2784 tail
2785
2786 which would be constructed from a pattern like /A|LIST|OF|WORDS/
2787
2788 If we can find such a subsequence we need to turn the first
2789 element into a trie and then add the subsequent branch exact
2790 strings to the trie.
2791
2792 We have two cases
2793
2794 1. patterns where the whole set of branches can be converted.
2795
2796 2. patterns where only a subset can be converted.
2797
2798 In case 1 we can replace the whole set with a single regop
2799 for the trie. In case 2 we need to keep the start and end
2800 branches so
2801
2802 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
2803 becomes BRANCH TRIE; BRANCH X;
2804
2805 There is an additional case, that being where there is a
2806 common prefix, which gets split out into an EXACT like node
2807 preceding the TRIE node.
2808
2809 If x(1..n)==tail then we can do a simple trie, if not we make
2810 a "jump" trie, such that when we match the appropriate word
2811 we "jump" to the appropriate tail node. Essentially we turn
2812 a nested if into a case structure of sorts.
2813
2814 */
2815
2816 int made=0;
2817 if (!re_trie_maxbuff) {
2818 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
2819 if (!SvIOK(re_trie_maxbuff))
2820 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2821 }
2822 if ( SvIV(re_trie_maxbuff)>=0 ) {
2823 regnode *cur;
2824 regnode *first = (regnode *)NULL;
2825 regnode *last = (regnode *)NULL;
2826 regnode *tail = scan;
2827 U8 optype = 0;
2828 U32 count=0;
2829
2830#ifdef DEBUGGING
2831 SV * const mysv = sv_newmortal(); /* for dumping */
2832#endif
2833 /* var tail is used because there may be a TAIL
2834 regop in the way. Ie, the exacts will point to the
2835 thing following the TAIL, but the last branch will
2836 point at the TAIL. So we advance tail. If we
2837 have nested (?:) we may have to move through several
2838 tails.
2839 */
2840
2841 while ( OP( tail ) == TAIL ) {
2842 /* this is the TAIL generated by (?:) */
2843 tail = regnext( tail );
2844 }
2845
2846
2847 DEBUG_OPTIMISE_r({
2848 regprop(RExC_rx, mysv, tail );
2849 PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
2850 (int)depth * 2 + 2, "",
2851 "Looking for TRIE'able sequences. Tail node is: ",
2852 SvPV_nolen_const( mysv )
2853 );
2854 });
2855
2856 /*
2857
2858 step through the branches, cur represents each
2859 branch, noper is the first thing to be matched
2860 as part of that branch and noper_next is the
2861 regnext() of that node. if noper is an EXACT
2862 and noper_next is the same as scan (our current
2863 position in the regex) then the EXACT branch is
2864 a possible optimization target. Once we have
2865 two or more consecutive such branches we can
2866 create a trie of the EXACT's contents and stich
2867 it in place. If the sequence represents all of
2868 the branches we eliminate the whole thing and
2869 replace it with a single TRIE. If it is a
2870 subsequence then we need to stitch it in. This
2871 means the first branch has to remain, and needs
2872 to be repointed at the item on the branch chain
2873 following the last branch optimized. This could
2874 be either a BRANCH, in which case the
2875 subsequence is internal, or it could be the
2876 item following the branch sequence in which
2877 case the subsequence is at the end.
2878
2879 */
2880
2881 /* dont use tail as the end marker for this traverse */
2882 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
2883 regnode * const noper = NEXTOPER( cur );
2884#if defined(DEBUGGING) || defined(NOJUMPTRIE)
2885 regnode * const noper_next = regnext( noper );
2886#endif
2887
2888 DEBUG_OPTIMISE_r({
2889 regprop(RExC_rx, mysv, cur);
2890 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
2891 (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
2892
2893 regprop(RExC_rx, mysv, noper);
2894 PerlIO_printf( Perl_debug_log, " -> %s",
2895 SvPV_nolen_const(mysv));
2896
2897 if ( noper_next ) {
2898 regprop(RExC_rx, mysv, noper_next );
2899 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
2900 SvPV_nolen_const(mysv));
2901 }
2902 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d)\n",
2903 REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur) );
2904 });
2905 if ( (((first && optype!=NOTHING) ? OP( noper ) == optype
2906 : PL_regkind[ OP( noper ) ] == EXACT )
2907 || OP(noper) == NOTHING )
2908#ifdef NOJUMPTRIE
2909 && noper_next == tail
2910#endif
2911 && count < U16_MAX)
2912 {
2913 count++;
2914 if ( !first || optype == NOTHING ) {
2915 if (!first) first = cur;
2916 optype = OP( noper );
2917 } else {
2918 last = cur;
2919 }
2920 } else {
2921/*
2922 Currently we do not believe that the trie logic can
2923 handle case insensitive matching properly when the
2924 pattern is not unicode (thus forcing unicode semantics).
2925
2926 If/when this is fixed the following define can be swapped
2927 in below to fully enable trie logic.
2928
2929#define TRIE_TYPE_IS_SAFE 1
2930
2931*/
2932#define TRIE_TYPE_IS_SAFE (UTF || optype==EXACT)
2933
2934 if ( last && TRIE_TYPE_IS_SAFE ) {
2935 make_trie( pRExC_state,
2936 startbranch, first, cur, tail, count,
2937 optype, depth+1 );
2938 }
2939 if ( PL_regkind[ OP( noper ) ] == EXACT
2940#ifdef NOJUMPTRIE
2941 && noper_next == tail
2942#endif
2943 ){
2944 count = 1;
2945 first = cur;
2946 optype = OP( noper );
2947 } else {
2948 count = 0;
2949 first = NULL;
2950 optype = 0;
2951 }
2952 last = NULL;
2953 }
2954 }
2955 DEBUG_OPTIMISE_r({
2956 regprop(RExC_rx, mysv, cur);
2957 PerlIO_printf( Perl_debug_log,
2958 "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
2959 "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
2960
2961 });
2962
2963 if ( last && TRIE_TYPE_IS_SAFE ) {
2964 made= make_trie( pRExC_state, startbranch, first, scan, tail, count, optype, depth+1 );
2965#ifdef TRIE_STUDY_OPT
2966 if ( ((made == MADE_EXACT_TRIE &&
2967 startbranch == first)
2968 || ( first_non_open == first )) &&
2969 depth==0 ) {
2970 flags |= SCF_TRIE_RESTUDY;
2971 if ( startbranch == first
2972 && scan == tail )
2973 {
2974 RExC_seen &=~REG_TOP_LEVEL_BRANCHES;
2975 }
2976 }
2977#endif
2978 }
2979 }
2980
2981 } /* do trie */
2982
2983 }
2984 else if ( code == BRANCHJ ) { /* single branch is optimized. */
2985 scan = NEXTOPER(NEXTOPER(scan));
2986 } else /* single branch is optimized. */
2987 scan = NEXTOPER(scan);
2988 continue;
2989 } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
2990 scan_frame *newframe = NULL;
2991 I32 paren;
2992 regnode *start;
2993 regnode *end;
2994
2995 if (OP(scan) != SUSPEND) {
2996 /* set the pointer */
2997 if (OP(scan) == GOSUB) {
2998 paren = ARG(scan);
2999 RExC_recurse[ARG2L(scan)] = scan;
3000 start = RExC_open_parens[paren-1];
3001 end = RExC_close_parens[paren-1];
3002 } else {
3003 paren = 0;
3004 start = RExC_rxi->program + 1;
3005 end = RExC_opend;
3006 }
3007 if (!recursed) {
3008 Newxz(recursed, (((RExC_npar)>>3) +1), U8);
3009 SAVEFREEPV(recursed);
3010 }
3011 if (!PAREN_TEST(recursed,paren+1)) {
3012 PAREN_SET(recursed,paren+1);
3013 Newx(newframe,1,scan_frame);
3014 } else {
3015 if (flags & SCF_DO_SUBSTR) {
3016 SCAN_COMMIT(pRExC_state,data,minlenp);
3017 data->longest = &(data->longest_float);
3018 }
3019 is_inf = is_inf_internal = 1;
3020 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3021 cl_anything(pRExC_state, data->start_class);
3022 flags &= ~SCF_DO_STCLASS;
3023 }
3024 } else {
3025 Newx(newframe,1,scan_frame);
3026 paren = stopparen;
3027 start = scan+2;
3028 end = regnext(scan);
3029 }
3030 if (newframe) {
3031 assert(start);
3032 assert(end);
3033 SAVEFREEPV(newframe);
3034 newframe->next = regnext(scan);
3035 newframe->last = last;
3036 newframe->stop = stopparen;
3037 newframe->prev = frame;
3038
3039 frame = newframe;
3040 scan = start;
3041 stopparen = paren;
3042 last = end;
3043
3044 continue;
3045 }
3046 }
3047 else if (OP(scan) == EXACT) {
3048 I32 l = STR_LEN(scan);
3049 UV uc;
3050 if (UTF) {
3051 const U8 * const s = (U8*)STRING(scan);
3052 l = utf8_length(s, s + l);
3053 uc = utf8_to_uvchr(s, NULL);
3054 } else {
3055 uc = *((U8*)STRING(scan));
3056 }
3057 min += l;
3058 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
3059 /* The code below prefers earlier match for fixed
3060 offset, later match for variable offset. */
3061 if (data->last_end == -1) { /* Update the start info. */
3062 data->last_start_min = data->pos_min;
3063 data->last_start_max = is_inf
3064 ? I32_MAX : data->pos_min + data->pos_delta;
3065 }
3066 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
3067 if (UTF)
3068 SvUTF8_on(data->last_found);
3069 {
3070 SV * const sv = data->last_found;
3071 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3072 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3073 if (mg && mg->mg_len >= 0)
3074 mg->mg_len += utf8_length((U8*)STRING(scan),
3075 (U8*)STRING(scan)+STR_LEN(scan));
3076 }
3077 data->last_end = data->pos_min + l;
3078 data->pos_min += l; /* As in the first entry. */
3079 data->flags &= ~SF_BEFORE_EOL;
3080 }
3081 if (flags & SCF_DO_STCLASS_AND) {
3082 /* Check whether it is compatible with what we know already! */
3083 int compat = 1;
3084
3085
3086 /* If compatible, we or it in below. It is compatible if is
3087 * in the bitmp and either 1) its bit or its fold is set, or 2)
3088 * it's for a locale. Even if there isn't unicode semantics
3089 * here, at runtime there may be because of matching against a
3090 * utf8 string, so accept a possible false positive for
3091 * latin1-range folds */
3092 if (uc >= 0x100 ||
3093 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
3094 && !ANYOF_BITMAP_TEST(data->start_class, uc)
3095 && (!(data->start_class->flags & ANYOF_LOC_NONBITMAP_FOLD)
3096 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc])))
3097 )
3098 compat = 0;
3099 ANYOF_CLASS_ZERO(data->start_class);
3100 ANYOF_BITMAP_ZERO(data->start_class);
3101 if (compat)
3102 ANYOF_BITMAP_SET(data->start_class, uc);
3103 data->start_class->flags &= ~ANYOF_EOS;
3104 if (uc < 0x100)
3105 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
3106 }
3107 else if (flags & SCF_DO_STCLASS_OR) {
3108 /* false positive possible if the class is case-folded */
3109 if (uc < 0x100)
3110 ANYOF_BITMAP_SET(data->start_class, uc);
3111 else
3112 data->start_class->flags |= ANYOF_UNICODE_ALL;
3113 data->start_class->flags &= ~ANYOF_EOS;
3114 cl_and(data->start_class, and_withp);
3115 }
3116 flags &= ~SCF_DO_STCLASS;
3117 }
3118 else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
3119 I32 l = STR_LEN(scan);
3120 UV uc = *((U8*)STRING(scan));
3121
3122 /* Search for fixed substrings supports EXACT only. */
3123 if (flags & SCF_DO_SUBSTR) {
3124 assert(data);
3125 SCAN_COMMIT(pRExC_state, data, minlenp);
3126 }
3127 if (UTF) {
3128 const U8 * const s = (U8 *)STRING(scan);
3129 l = utf8_length(s, s + l);
3130 uc = utf8_to_uvchr(s, NULL);
3131 }
3132 min += l;
3133 if (flags & SCF_DO_SUBSTR)
3134 data->pos_min += l;
3135 if (flags & SCF_DO_STCLASS_AND) {
3136 /* Check whether it is compatible with what we know already! */
3137 int compat = 1;
3138 if (uc >= 0x100 ||
3139 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
3140 && !ANYOF_BITMAP_TEST(data->start_class, uc)
3141 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc])))
3142 {
3143 compat = 0;
3144 }
3145 ANYOF_CLASS_ZERO(data->start_class);
3146 ANYOF_BITMAP_ZERO(data->start_class);
3147 if (compat) {
3148 ANYOF_BITMAP_SET(data->start_class, uc);
3149 data->start_class->flags &= ~ANYOF_EOS;
3150 data->start_class->flags |= ANYOF_LOC_NONBITMAP_FOLD;
3151 if (OP(scan) == EXACTFL) {
3152 data->start_class->flags |= ANYOF_LOCALE;
3153 }
3154 else {
3155
3156 /* Also set the other member of the fold pair. In case
3157 * that unicode semantics is called for at runtime, use
3158 * the full latin1 fold. (Can't do this for locale,
3159 * because not known until runtime */
3160 ANYOF_BITMAP_SET(data->start_class, PL_fold_latin1[uc]);
3161 }
3162 }
3163 }
3164 else if (flags & SCF_DO_STCLASS_OR) {
3165 if (data->start_class->flags & ANYOF_LOC_NONBITMAP_FOLD) {
3166 /* false positive possible if the class is case-folded.
3167 Assume that the locale settings are the same... */
3168 if (uc < 0x100) {
3169 ANYOF_BITMAP_SET(data->start_class, uc);
3170 if (OP(scan) != EXACTFL) {
3171
3172 /* And set the other member of the fold pair, but
3173 * can't do that in locale because not known until
3174 * run-time */
3175 ANYOF_BITMAP_SET(data->start_class,
3176 PL_fold_latin1[uc]);
3177 }
3178 }
3179 data->start_class->flags &= ~ANYOF_EOS;
3180 }
3181 cl_and(data->start_class, and_withp);
3182 }
3183 flags &= ~SCF_DO_STCLASS;
3184 }
3185 else if (REGNODE_VARIES(OP(scan))) {
3186 I32 mincount, maxcount, minnext, deltanext, fl = 0;
3187 I32 f = flags, pos_before = 0;
3188 regnode * const oscan = scan;
3189 struct regnode_charclass_class this_class;
3190 struct regnode_charclass_class *oclass = NULL;
3191 I32 next_is_eval = 0;
3192
3193 switch (PL_regkind[OP(scan)]) {
3194 case WHILEM: /* End of (?:...)* . */
3195 scan = NEXTOPER(scan);
3196 goto finish;
3197 case PLUS:
3198 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
3199 next = NEXTOPER(scan);
3200 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
3201 mincount = 1;
3202 maxcount = REG_INFTY;
3203 next = regnext(scan);
3204 scan = NEXTOPER(scan);
3205 goto do_curly;
3206 }
3207 }
3208 if (flags & SCF_DO_SUBSTR)
3209 data->pos_min++;
3210 min++;
3211 /* Fall through. */
3212 case STAR:
3213 if (flags & SCF_DO_STCLASS) {
3214 mincount = 0;
3215 maxcount = REG_INFTY;
3216 next = regnext(scan);
3217 scan = NEXTOPER(scan);
3218 goto do_curly;
3219 }
3220 is_inf = is_inf_internal = 1;
3221 scan = regnext(scan);
3222 if (flags & SCF_DO_SUBSTR) {
3223 SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */
3224 data->longest = &(data->longest_float);
3225 }
3226 goto optimize_curly_tail;
3227 case CURLY:
3228 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
3229 && (scan->flags == stopparen))
3230 {
3231 mincount = 1;
3232 maxcount = 1;
3233 } else {
3234 mincount = ARG1(scan);
3235 maxcount = ARG2(scan);
3236 }
3237 next = regnext(scan);
3238 if (OP(scan) == CURLYX) {
3239 I32 lp = (data ? *(data->last_closep) : 0);
3240 scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
3241 }
3242 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
3243 next_is_eval = (OP(scan) == EVAL);
3244 do_curly:
3245 if (flags & SCF_DO_SUBSTR) {
3246 if (mincount == 0) SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */
3247 pos_before = data->pos_min;
3248 }
3249 if (data) {
3250 fl = data->flags;
3251 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
3252 if (is_inf)
3253 data->flags |= SF_IS_INF;
3254 }
3255 if (flags & SCF_DO_STCLASS) {
3256 cl_init(pRExC_state, &this_class);
3257 oclass = data->start_class;
3258 data->start_class = &this_class;
3259 f |= SCF_DO_STCLASS_AND;
3260 f &= ~SCF_DO_STCLASS_OR;
3261 }
3262 /* Exclude from super-linear cache processing any {n,m}
3263 regops for which the combination of input pos and regex
3264 pos is not enough information to determine if a match
3265 will be possible.
3266
3267 For example, in the regex /foo(bar\s*){4,8}baz/ with the
3268 regex pos at the \s*, the prospects for a match depend not
3269 only on the input position but also on how many (bar\s*)
3270 repeats into the {4,8} we are. */
3271 if ((mincount > 1) || (maxcount > 1 && maxcount != REG_INFTY))
3272 f &= ~SCF_WHILEM_VISITED_POS;
3273
3274 /* This will finish on WHILEM, setting scan, or on NULL: */
3275 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
3276 last, data, stopparen, recursed, NULL,
3277 (mincount == 0
3278 ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
3279
3280 if (flags & SCF_DO_STCLASS)
3281 data->start_class = oclass;
3282 if (mincount == 0 || minnext == 0) {
3283 if (flags & SCF_DO_STCLASS_OR) {
3284 cl_or(pRExC_state, data->start_class, &this_class);
3285 }
3286 else if (flags & SCF_DO_STCLASS_AND) {
3287 /* Switch to OR mode: cache the old value of
3288 * data->start_class */
3289 INIT_AND_WITHP;
3290 StructCopy(data->start_class, and_withp,
3291 struct regnode_charclass_class);
3292 flags &= ~SCF_DO_STCLASS_AND;
3293 StructCopy(&this_class, data->start_class,
3294 struct regnode_charclass_class);
3295 flags |= SCF_DO_STCLASS_OR;
3296 data->start_class->flags |= ANYOF_EOS;
3297 }
3298 } else { /* Non-zero len */
3299 if (flags & SCF_DO_STCLASS_OR) {
3300 cl_or(pRExC_state, data->start_class, &this_class);
3301 cl_and(data->start_class, and_withp);
3302 }
3303 else if (flags & SCF_DO_STCLASS_AND)
3304 cl_and(data->start_class, &this_class);
3305 flags &= ~SCF_DO_STCLASS;
3306 }
3307 if (!scan) /* It was not CURLYX, but CURLY. */
3308 scan = next;
3309 if ( /* ? quantifier ok, except for (?{ ... }) */
3310 (next_is_eval || !(mincount == 0 && maxcount == 1))
3311 && (minnext == 0) && (deltanext == 0)
3312 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
3313 && maxcount <= REG_INFTY/3) /* Complement check for big count */
3314 {
3315 ckWARNreg(RExC_parse,
3316 "Quantifier unexpected on zero-length expression");
3317 }
3318
3319 min += minnext * mincount;
3320 is_inf_internal |= ((maxcount == REG_INFTY
3321 && (minnext + deltanext) > 0)
3322 || deltanext == I32_MAX);
3323 is_inf |= is_inf_internal;
3324 delta += (minnext + deltanext) * maxcount - minnext * mincount;
3325
3326 /* Try powerful optimization CURLYX => CURLYN. */
3327 if ( OP(oscan) == CURLYX && data
3328 && data->flags & SF_IN_PAR
3329 && !(data->flags & SF_HAS_EVAL)
3330 && !deltanext && minnext == 1 ) {
3331 /* Try to optimize to CURLYN. */
3332 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
3333 regnode * const nxt1 = nxt;
3334#ifdef DEBUGGING
3335 regnode *nxt2;
3336#endif
3337
3338 /* Skip open. */
3339 nxt = regnext(nxt);
3340 if (!REGNODE_SIMPLE(OP(nxt))
3341 && !(PL_regkind[OP(nxt)] == EXACT
3342 && STR_LEN(nxt) == 1))
3343 goto nogo;
3344#ifdef DEBUGGING
3345 nxt2 = nxt;
3346#endif
3347 nxt = regnext(nxt);
3348 if (OP(nxt) != CLOSE)
3349 goto nogo;
3350 if (RExC_open_parens) {
3351 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3352 RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
3353 }
3354 /* Now we know that nxt2 is the only contents: */
3355 oscan->flags = (U8)ARG(nxt);
3356 OP(oscan) = CURLYN;
3357 OP(nxt1) = NOTHING; /* was OPEN. */
3358
3359#ifdef DEBUGGING
3360 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3361 NEXT_OFF(nxt1+ 1) = 0; /* just for consistency. */
3362 NEXT_OFF(nxt2) = 0; /* just for consistency with CURLY. */
3363 OP(nxt) = OPTIMIZED; /* was CLOSE. */
3364 OP(nxt + 1) = OPTIMIZED; /* was count. */
3365 NEXT_OFF(nxt+ 1) = 0; /* just for consistency. */
3366#endif
3367 }
3368 nogo:
3369
3370 /* Try optimization CURLYX => CURLYM. */
3371 if ( OP(oscan) == CURLYX && data
3372 && !(data->flags & SF_HAS_PAR)
3373 && !(data->flags & SF_HAS_EVAL)
3374 && !deltanext /* atom is fixed width */
3375 && minnext != 0 /* CURLYM can't handle zero width */
3376 ) {
3377 /* XXXX How to optimize if data == 0? */
3378 /* Optimize to a simpler form. */
3379 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
3380 regnode *nxt2;
3381
3382 OP(oscan) = CURLYM;
3383 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
3384 && (OP(nxt2) != WHILEM))
3385 nxt = nxt2;
3386 OP(nxt2) = SUCCEED; /* Whas WHILEM */
3387 /* Need to optimize away parenths. */
3388 if ((data->flags & SF_IN_PAR) && OP(nxt) == CLOSE) {
3389 /* Set the parenth number. */
3390 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
3391
3392 oscan->flags = (U8)ARG(nxt);
3393 if (RExC_open_parens) {
3394 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3395 RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
3396 }
3397 OP(nxt1) = OPTIMIZED; /* was OPEN. */
3398 OP(nxt) = OPTIMIZED; /* was CLOSE. */
3399
3400#ifdef DEBUGGING
3401 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3402 OP(nxt + 1) = OPTIMIZED; /* was count. */
3403 NEXT_OFF(nxt1 + 1) = 0; /* just for consistency. */
3404 NEXT_OFF(nxt + 1) = 0; /* just for consistency. */
3405#endif
3406#if 0
3407 while ( nxt1 && (OP(nxt1) != WHILEM)) {
3408 regnode *nnxt = regnext(nxt1);
3409 if (nnxt == nxt) {
3410 if (reg_off_by_arg[OP(nxt1)])
3411 ARG_SET(nxt1, nxt2 - nxt1);
3412 else if (nxt2 - nxt1 < U16_MAX)
3413 NEXT_OFF(nxt1) = nxt2 - nxt1;
3414 else
3415 OP(nxt) = NOTHING; /* Cannot beautify */
3416 }
3417 nxt1 = nnxt;
3418 }
3419#endif
3420 /* Optimize again: */
3421 study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
3422 NULL, stopparen, recursed, NULL, 0,depth+1);
3423 }
3424 else
3425 oscan->flags = 0;
3426 }
3427 else if ((OP(oscan) == CURLYX)
3428 && (flags & SCF_WHILEM_VISITED_POS)
3429 /* See the comment on a similar expression above.
3430 However, this time it's not a subexpression
3431 we care about, but the expression itself. */
3432 && (maxcount == REG_INFTY)
3433 && data && ++data->whilem_c < 16) {
3434 /* This stays as CURLYX, we can put the count/of pair. */
3435 /* Find WHILEM (as in regexec.c) */
3436 regnode *nxt = oscan + NEXT_OFF(oscan);
3437
3438 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
3439 nxt += ARG(nxt);
3440 PREVOPER(nxt)->flags = (U8)(data->whilem_c
3441 | (RExC_whilem_seen << 4)); /* On WHILEM */
3442 }
3443 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
3444 pars++;
3445 if (flags & SCF_DO_SUBSTR) {
3446 SV *last_str = NULL;
3447 int counted = mincount != 0;
3448
3449 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
3450#if defined(SPARC64_GCC_WORKAROUND)
3451 I32 b = 0;
3452 STRLEN l = 0;
3453 const char *s = NULL;
3454 I32 old = 0;
3455
3456 if (pos_before >= data->last_start_min)
3457 b = pos_before;
3458 else
3459 b = data->last_start_min;
3460
3461 l = 0;
3462 s = SvPV_const(data->last_found, l);
3463 old = b - data->last_start_min;
3464
3465#else
3466 I32 b = pos_before >= data->last_start_min
3467 ? pos_before : data->last_start_min;
3468 STRLEN l;
3469 const char * const s = SvPV_const(data->last_found, l);
3470 I32 old = b - data->last_start_min;
3471#endif
3472
3473 if (UTF)
3474 old = utf8_hop((U8*)s, old) - (U8*)s;
3475 l -= old;
3476 /* Get the added string: */
3477 last_str = newSVpvn_utf8(s + old, l, UTF);
3478 if (deltanext == 0 && pos_before == b) {
3479 /* What was added is a constant string */
3480 if (mincount > 1) {
3481 SvGROW(last_str, (mincount * l) + 1);
3482 repeatcpy(SvPVX(last_str) + l,
3483 SvPVX_const(last_str), l, mincount - 1);
3484 SvCUR_set(last_str, SvCUR(last_str) * mincount);
3485 /* Add additional parts. */
3486 SvCUR_set(data->last_found,
3487 SvCUR(data->last_found) - l);
3488 sv_catsv(data->last_found, last_str);
3489 {
3490 SV * sv = data->last_found;
3491 MAGIC *mg =
3492 SvUTF8(sv) && SvMAGICAL(sv) ?
3493 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3494 if (mg && mg->mg_len >= 0)
3495 mg->mg_len += CHR_SVLEN(last_str) - l;
3496 }
3497 data->last_end += l * (mincount - 1);
3498 }
3499 } else {
3500 /* start offset must point into the last copy */
3501 data->last_start_min += minnext * (mincount - 1);
3502 data->last_start_max += is_inf ? I32_MAX
3503 : (maxcount - 1) * (minnext + data->pos_delta);
3504 }
3505 }
3506 /* It is counted once already... */
3507 data->pos_min += minnext * (mincount - counted);
3508 data->pos_delta += - counted * deltanext +
3509 (minnext + deltanext) * maxcount - minnext * mincount;
3510 if (mincount != maxcount) {
3511 /* Cannot extend fixed substrings found inside
3512 the group. */
3513 SCAN_COMMIT(pRExC_state,data,minlenp);
3514 if (mincount && last_str) {
3515 SV * const sv = data->last_found;
3516 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3517 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3518
3519 if (mg)
3520 mg->mg_len = -1;
3521 sv_setsv(sv, last_str);
3522 data->last_end = data->pos_min;
3523 data->last_start_min =
3524 data->pos_min - CHR_SVLEN(last_str);
3525 data->last_start_max = is_inf
3526 ? I32_MAX
3527 : data->pos_min + data->pos_delta
3528 - CHR_SVLEN(last_str);
3529 }
3530 data->longest = &(data->longest_float);
3531 }
3532 SvREFCNT_dec(last_str);
3533 }
3534 if (data && (fl & SF_HAS_EVAL))
3535 data->flags |= SF_HAS_EVAL;
3536 optimize_curly_tail:
3537 if (OP(oscan) != CURLYX) {
3538 while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
3539 && NEXT_OFF(next))
3540 NEXT_OFF(oscan) += NEXT_OFF(next);
3541 }
3542 continue;
3543 default: /* REF, ANYOFV, and CLUMP only? */
3544 if (flags & SCF_DO_SUBSTR) {
3545 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
3546 data->longest = &(data->longest_float);
3547 }
3548 is_inf = is_inf_internal = 1;
3549 if (flags & SCF_DO_STCLASS_OR)
3550 cl_anything(pRExC_state, data->start_class);
3551 flags &= ~SCF_DO_STCLASS;
3552 break;
3553 }
3554 }
3555 else if (OP(scan) == LNBREAK) {
3556 if (flags & SCF_DO_STCLASS) {
3557 int value = 0;
3558 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
3559 if (flags & SCF_DO_STCLASS_AND) {
3560 for (value = 0; value < 256; value++)
3561 if (!is_VERTWS_cp(value))
3562 ANYOF_BITMAP_CLEAR(data->start_class, value);
3563 }
3564 else {
3565 for (value = 0; value < 256; value++)
3566 if (is_VERTWS_cp(value))
3567 ANYOF_BITMAP_SET(data->start_class, value);
3568 }
3569 if (flags & SCF_DO_STCLASS_OR)
3570 cl_and(data->start_class, and_withp);
3571 flags &= ~SCF_DO_STCLASS;
3572 }
3573 min += 1;
3574 delta += 1;
3575 if (flags & SCF_DO_SUBSTR) {
3576 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
3577 data->pos_min += 1;
3578 data->pos_delta += 1;
3579 data->longest = &(data->longest_float);
3580 }
3581 }
3582 else if (OP(scan) == FOLDCHAR) {
3583 int d = ARG(scan) == LATIN_SMALL_LETTER_SHARP_S ? 1 : 2;
3584 flags &= ~SCF_DO_STCLASS;
3585 min += 1;
3586 delta += d;
3587 if (flags & SCF_DO_SUBSTR) {
3588 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
3589 data->pos_min += 1;
3590 data->pos_delta += d;
3591 data->longest = &(data->longest_float);
3592 }
3593 }
3594 else if (REGNODE_SIMPLE(OP(scan))) {
3595 int value = 0;
3596
3597 if (flags & SCF_DO_SUBSTR) {
3598 SCAN_COMMIT(pRExC_state,data,minlenp);
3599 data->pos_min++;
3600 }
3601 min++;
3602 if (flags & SCF_DO_STCLASS) {
3603 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
3604
3605 /* Some of the logic below assumes that switching
3606 locale on will only add false positives. */
3607 switch (PL_regkind[OP(scan)]) {
3608 case SANY:
3609 default:
3610 do_default:
3611 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
3612 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3613 cl_anything(pRExC_state, data->start_class);
3614 break;
3615 case REG_ANY:
3616 if (OP(scan) == SANY)
3617 goto do_default;
3618 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
3619 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
3620 || ANYOF_CLASS_TEST_ANY_SET(data->start_class));
3621 cl_anything(pRExC_state, data->start_class);
3622 }
3623 if (flags & SCF_DO_STCLASS_AND || !value)
3624 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
3625 break;
3626 case ANYOF:
3627 if (flags & SCF_DO_STCLASS_AND)
3628 cl_and(data->start_class,
3629 (struct regnode_charclass_class*)scan);
3630 else
3631 cl_or(pRExC_state, data->start_class,
3632 (struct regnode_charclass_class*)scan);
3633 break;
3634 case ALNUM:
3635 if (flags & SCF_DO_STCLASS_AND) {
3636 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3637 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3638 if (OP(scan) == ALNUMU) {
3639 for (value = 0; value < 256; value++) {
3640 if (!isWORDCHAR_L1(value)) {
3641 ANYOF_BITMAP_CLEAR(data->start_class, value);
3642 }
3643 }
3644 } else {
3645 for (value = 0; value < 256; value++) {
3646 if (!isALNUM(value)) {
3647 ANYOF_BITMAP_CLEAR(data->start_class, value);
3648 }
3649 }
3650 }
3651 }
3652 }
3653 else {
3654 if (data->start_class->flags & ANYOF_LOCALE)
3655 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3656 else if (OP(scan) == ALNUMU) {
3657 for (value = 0; value < 256; value++) {
3658 if (isWORDCHAR_L1(value)) {
3659 ANYOF_BITMAP_SET(data->start_class, value);
3660 }
3661 }
3662 } else {
3663 for (value = 0; value < 256; value++) {
3664 if (isALNUM(value)) {
3665 ANYOF_BITMAP_SET(data->start_class, value);
3666 }
3667 }
3668 }
3669 }
3670 break;
3671 case NALNUM:
3672 if (flags & SCF_DO_STCLASS_AND) {
3673 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3674 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
3675 if (OP(scan) == NALNUMU) {
3676 for (value = 0; value < 256; value++) {
3677 if (isWORDCHAR_L1(value)) {
3678 ANYOF_BITMAP_CLEAR(data->start_class, value);
3679 }
3680 }
3681 } else {
3682 for (value = 0; value < 256; value++) {
3683 if (isALNUM(value)) {
3684 ANYOF_BITMAP_CLEAR(data->start_class, value);
3685 }
3686 }
3687 }
3688 }
3689 }
3690 else {
3691 if (data->start_class->flags & ANYOF_LOCALE)
3692 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3693 else {
3694 if (OP(scan) == NALNUMU) {
3695 for (value = 0; value < 256; value++) {
3696 if (! isWORDCHAR_L1(value)) {
3697 ANYOF_BITMAP_SET(data->start_class, value);
3698 }
3699 }
3700 } else {
3701 for (value = 0; value < 256; value++) {
3702 if (! isALNUM(value)) {
3703 ANYOF_BITMAP_SET(data->start_class, value);
3704 }
3705 }
3706 }
3707 }
3708 }
3709 break;
3710 case SPACE:
3711 if (flags & SCF_DO_STCLASS_AND) {
3712 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3713 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3714 if (OP(scan) == SPACEU) {
3715 for (value = 0; value < 256; value++) {
3716 if (!isSPACE_L1(value)) {
3717 ANYOF_BITMAP_CLEAR(data->start_class, value);
3718 }
3719 }
3720 } else {
3721 for (value = 0; value < 256; value++) {
3722 if (!isSPACE(value)) {
3723 ANYOF_BITMAP_CLEAR(data->start_class, value);
3724 }
3725 }
3726 }
3727 }
3728 }
3729 else {
3730 if (data->start_class->flags & ANYOF_LOCALE) {
3731 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3732 }
3733 else if (OP(scan) == SPACEU) {
3734 for (value = 0; value < 256; value++) {
3735 if (isSPACE_L1(value)) {
3736 ANYOF_BITMAP_SET(data->start_class, value);
3737 }
3738 }
3739 } else {
3740 for (value = 0; value < 256; value++) {
3741 if (isSPACE(value)) {
3742 ANYOF_BITMAP_SET(data->start_class, value);
3743 }
3744 }
3745 }
3746 }
3747 break;
3748 case NSPACE:
3749 if (flags & SCF_DO_STCLASS_AND) {
3750 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3751 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3752 if (OP(scan) == NSPACEU) {
3753 for (value = 0; value < 256; value++) {
3754 if (isSPACE_L1(value)) {
3755 ANYOF_BITMAP_CLEAR(data->start_class, value);
3756 }
3757 }
3758 } else {
3759 for (value = 0; value < 256; value++) {
3760 if (isSPACE(value)) {
3761 ANYOF_BITMAP_CLEAR(data->start_class, value);
3762 }
3763 }
3764 }
3765 }
3766 }
3767 else {
3768 if (data->start_class->flags & ANYOF_LOCALE)
3769 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3770 else if (OP(scan) == NSPACEU) {
3771 for (value = 0; value < 256; value++) {
3772 if (!isSPACE_L1(value)) {
3773 ANYOF_BITMAP_SET(data->start_class, value);
3774 }
3775 }
3776 }
3777 else {
3778 for (value = 0; value < 256; value++) {
3779 if (!isSPACE(value)) {
3780 ANYOF_BITMAP_SET(data->start_class, value);
3781 }
3782 }
3783 }
3784 }
3785 break;
3786 case DIGIT:
3787 if (flags & SCF_DO_STCLASS_AND) {
3788 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
3789 for (value = 0; value < 256; value++)
3790 if (!isDIGIT(value))
3791 ANYOF_BITMAP_CLEAR(data->start_class, value);
3792 }
3793 else {
3794 if (data->start_class->flags & ANYOF_LOCALE)
3795 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
3796 else {
3797 for (value = 0; value < 256; value++)
3798 if (isDIGIT(value))
3799 ANYOF_BITMAP_SET(data->start_class, value);
3800 }
3801 }
3802 break;
3803 case NDIGIT:
3804 if (flags & SCF_DO_STCLASS_AND) {
3805 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
3806 for (value = 0; value < 256; value++)
3807 if (isDIGIT(value))
3808 ANYOF_BITMAP_CLEAR(data->start_class, value);
3809 }
3810 else {
3811 if (data->start_class->flags & ANYOF_LOCALE)
3812 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
3813 else {
3814 for (value = 0; value < 256; value++)
3815 if (!isDIGIT(value))
3816 ANYOF_BITMAP_SET(data->start_class, value);
3817 }
3818 }
3819 break;
3820 CASE_SYNST_FNC(VERTWS);
3821 CASE_SYNST_FNC(HORIZWS);
3822
3823 }
3824 if (flags & SCF_DO_STCLASS_OR)
3825 cl_and(data->start_class, and_withp);
3826 flags &= ~SCF_DO_STCLASS;
3827 }
3828 }
3829 else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
3830 data->flags |= (OP(scan) == MEOL
3831 ? SF_BEFORE_MEOL
3832 : SF_BEFORE_SEOL);
3833 }
3834 else if ( PL_regkind[OP(scan)] == BRANCHJ
3835 /* Lookbehind, or need to calculate parens/evals/stclass: */
3836 && (scan->flags || data || (flags & SCF_DO_STCLASS))
3837 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
3838 if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3839 || OP(scan) == UNLESSM )
3840 {
3841 /* Negative Lookahead/lookbehind
3842 In this case we can't do fixed string optimisation.
3843 */
3844
3845 I32 deltanext, minnext, fake = 0;
3846 regnode *nscan;
3847 struct regnode_charclass_class intrnl;
3848 int f = 0;
3849
3850 data_fake.flags = 0;
3851 if (data) {
3852 data_fake.whilem_c = data->whilem_c;
3853 data_fake.last_closep = data->last_closep;
3854 }
3855 else
3856 data_fake.last_closep = &fake;
3857 data_fake.pos_delta = delta;
3858 if ( flags & SCF_DO_STCLASS && !scan->flags
3859 && OP(scan) == IFMATCH ) { /* Lookahead */
3860 cl_init(pRExC_state, &intrnl);
3861 data_fake.start_class = &intrnl;
3862 f |= SCF_DO_STCLASS_AND;
3863 }
3864 if (flags & SCF_WHILEM_VISITED_POS)
3865 f |= SCF_WHILEM_VISITED_POS;
3866 next = regnext(scan);
3867 nscan = NEXTOPER(NEXTOPER(scan));
3868 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
3869 last, &data_fake, stopparen, recursed, NULL, f, depth+1);
3870 if (scan->flags) {
3871 if (deltanext) {
3872 FAIL("Variable length lookbehind not implemented");
3873 }
3874 else if (minnext > (I32)U8_MAX) {
3875 FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
3876 }
3877 scan->flags = (U8)minnext;
3878 }
3879 if (data) {
3880 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3881 pars++;
3882 if (data_fake.flags & SF_HAS_EVAL)
3883 data->flags |= SF_HAS_EVAL;
3884 data->whilem_c = data_fake.whilem_c;
3885 }
3886 if (f & SCF_DO_STCLASS_AND) {
3887 if (flags & SCF_DO_STCLASS_OR) {
3888 /* OR before, AND after: ideally we would recurse with
3889 * data_fake to get the AND applied by study of the
3890 * remainder of the pattern, and then derecurse;
3891 * *** HACK *** for now just treat as "no information".
3892 * See [perl #56690].
3893 */
3894 cl_init(pRExC_state, data->start_class);
3895 } else {
3896 /* AND before and after: combine and continue */
3897 const int was = (data->start_class->flags & ANYOF_EOS);
3898
3899 cl_and(data->start_class, &intrnl);
3900 if (was)
3901 data->start_class->flags |= ANYOF_EOS;
3902 }
3903 }
3904 }
3905#if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3906 else {
3907 /* Positive Lookahead/lookbehind
3908 In this case we can do fixed string optimisation,
3909 but we must be careful about it. Note in the case of
3910 lookbehind the positions will be offset by the minimum
3911 length of the pattern, something we won't know about
3912 until after the recurse.
3913 */
3914 I32 deltanext, fake = 0;
3915 regnode *nscan;
3916 struct regnode_charclass_class intrnl;
3917 int f = 0;
3918 /* We use SAVEFREEPV so that when the full compile
3919 is finished perl will clean up the allocated
3920 minlens when it's all done. This way we don't
3921 have to worry about freeing them when we know
3922 they wont be used, which would be a pain.
3923 */
3924 I32 *minnextp;
3925 Newx( minnextp, 1, I32 );
3926 SAVEFREEPV(minnextp);
3927
3928 if (data) {
3929 StructCopy(data, &data_fake, scan_data_t);
3930 if ((flags & SCF_DO_SUBSTR) && data->last_found) {
3931 f |= SCF_DO_SUBSTR;
3932 if (scan->flags)
3933 SCAN_COMMIT(pRExC_state, &data_fake,minlenp);
3934 data_fake.last_found=newSVsv(data->last_found);
3935 }
3936 }
3937 else
3938 data_fake.last_closep = &fake;
3939 data_fake.flags = 0;
3940 data_fake.pos_delta = delta;
3941 if (is_inf)
3942 data_fake.flags |= SF_IS_INF;
3943 if ( flags & SCF_DO_STCLASS && !scan->flags
3944 && OP(scan) == IFMATCH ) { /* Lookahead */
3945 cl_init(pRExC_state, &intrnl);
3946 data_fake.start_class = &intrnl;
3947 f |= SCF_DO_STCLASS_AND;
3948 }
3949 if (flags & SCF_WHILEM_VISITED_POS)
3950 f |= SCF_WHILEM_VISITED_POS;
3951 next = regnext(scan);
3952 nscan = NEXTOPER(NEXTOPER(scan));
3953
3954 *minnextp = study_chunk(pRExC_state, &nscan, minnextp, &deltanext,
3955 last, &data_fake, stopparen, recursed, NULL, f,depth+1);
3956 if (scan->flags) {
3957 if (deltanext) {
3958 FAIL("Variable length lookbehind not implemented");
3959 }
3960 else if (*minnextp > (I32)U8_MAX) {
3961 FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
3962 }
3963 scan->flags = (U8)*minnextp;
3964 }
3965
3966 *minnextp += min;
3967
3968 if (f & SCF_DO_STCLASS_AND) {
3969 const int was = (data->start_class->flags & ANYOF_EOS);
3970
3971 cl_and(data->start_class, &intrnl);
3972 if (was)
3973 data->start_class->flags |= ANYOF_EOS;
3974 }
3975 if (data) {
3976 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3977 pars++;
3978 if (data_fake.flags & SF_HAS_EVAL)
3979 data->flags |= SF_HAS_EVAL;
3980 data->whilem_c = data_fake.whilem_c;
3981 if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
3982 if (RExC_rx->minlen<*minnextp)
3983 RExC_rx->minlen=*minnextp;
3984 SCAN_COMMIT(pRExC_state, &data_fake, minnextp);
3985 SvREFCNT_dec(data_fake.last_found);
3986
3987 if ( data_fake.minlen_fixed != minlenp )
3988 {
3989 data->offset_fixed= data_fake.offset_fixed;
3990 data->minlen_fixed= data_fake.minlen_fixed;
3991 data->lookbehind_fixed+= scan->flags;
3992 }
3993 if ( data_fake.minlen_float != minlenp )
3994 {
3995 data->minlen_float= data_fake.minlen_float;
3996 data->offset_float_min=data_fake.offset_float_min;
3997 data->offset_float_max=data_fake.offset_float_max;
3998 data->lookbehind_float+= scan->flags;
3999 }
4000 }
4001 }
4002
4003
4004 }
4005#endif
4006 }
4007 else if (OP(scan) == OPEN) {
4008 if (stopparen != (I32)ARG(scan))
4009 pars++;
4010 }
4011 else if (OP(scan) == CLOSE) {
4012 if (stopparen == (I32)ARG(scan)) {
4013 break;
4014 }
4015 if ((I32)ARG(scan) == is_par) {
4016 next = regnext(scan);
4017
4018 if ( next && (OP(next) != WHILEM) && next < last)
4019 is_par = 0; /* Disable optimization */
4020 }
4021 if (data)
4022 *(data->last_closep) = ARG(scan);
4023 }
4024 else if (OP(scan) == EVAL) {
4025 if (data)
4026 data->flags |= SF_HAS_EVAL;
4027 }
4028 else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
4029 if (flags & SCF_DO_SUBSTR) {
4030 SCAN_COMMIT(pRExC_state,data,minlenp);
4031 flags &= ~SCF_DO_SUBSTR;
4032 }
4033 if (data && OP(scan)==ACCEPT) {
4034 data->flags |= SCF_SEEN_ACCEPT;
4035 if (stopmin > min)
4036 stopmin = min;
4037 }
4038 }
4039 else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
4040 {
4041 if (flags & SCF_DO_SUBSTR) {
4042 SCAN_COMMIT(pRExC_state,data,minlenp);
4043 data->longest = &(data->longest_float);
4044 }
4045 is_inf = is_inf_internal = 1;
4046 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
4047 cl_anything(pRExC_state, data->start_class);
4048 flags &= ~SCF_DO_STCLASS;
4049 }
4050 else if (OP(scan) == GPOS) {
4051 if (!(RExC_rx->extflags & RXf_GPOS_FLOAT) &&
4052 !(delta || is_inf || (data && data->pos_delta)))
4053 {
4054 if (!(RExC_rx->extflags & RXf_ANCH) && (flags & SCF_DO_SUBSTR))
4055 RExC_rx->extflags |= RXf_ANCH_GPOS;
4056 if (RExC_rx->gofs < (U32)min)
4057 RExC_rx->gofs = min;
4058 } else {
4059 RExC_rx->extflags |= RXf_GPOS_FLOAT;
4060 RExC_rx->gofs = 0;
4061 }
4062 }
4063#ifdef TRIE_STUDY_OPT
4064#ifdef FULL_TRIE_STUDY
4065 else if (PL_regkind[OP(scan)] == TRIE) {
4066 /* NOTE - There is similar code to this block above for handling
4067 BRANCH nodes on the initial study. If you change stuff here
4068 check there too. */
4069 regnode *trie_node= scan;
4070 regnode *tail= regnext(scan);
4071 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
4072 I32 max1 = 0, min1 = I32_MAX;
4073 struct regnode_charclass_class accum;
4074
4075 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
4076 SCAN_COMMIT(pRExC_state, data,minlenp); /* Cannot merge strings after this. */
4077 if (flags & SCF_DO_STCLASS)
4078 cl_init_zero(pRExC_state, &accum);
4079
4080 if (!trie->jump) {
4081 min1= trie->minlen;
4082 max1= trie->maxlen;
4083 } else {
4084 const regnode *nextbranch= NULL;
4085 U32 word;
4086
4087 for ( word=1 ; word <= trie->wordcount ; word++)
4088 {
4089 I32 deltanext=0, minnext=0, f = 0, fake;
4090 struct regnode_charclass_class this_class;
4091
4092 data_fake.flags = 0;
4093 if (data) {
4094 data_fake.whilem_c = data->whilem_c;
4095 data_fake.last_closep = data->last_closep;
4096 }
4097 else
4098 data_fake.last_closep = &fake;
4099 data_fake.pos_delta = delta;
4100 if (flags & SCF_DO_STCLASS) {
4101 cl_init(pRExC_state, &this_class);
4102 data_fake.start_class = &this_class;
4103 f = SCF_DO_STCLASS_AND;
4104 }
4105 if (flags & SCF_WHILEM_VISITED_POS)
4106 f |= SCF_WHILEM_VISITED_POS;
4107
4108 if (trie->jump[word]) {
4109 if (!nextbranch)
4110 nextbranch = trie_node + trie->jump[0];
4111 scan= trie_node + trie->jump[word];
4112 /* We go from the jump point to the branch that follows
4113 it. Note this means we need the vestigal unused branches
4114 even though they arent otherwise used.
4115 */
4116 minnext = study_chunk(pRExC_state, &scan, minlenp,
4117 &deltanext, (regnode *)nextbranch, &data_fake,
4118 stopparen, recursed, NULL, f,depth+1);
4119 }
4120 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
4121 nextbranch= regnext((regnode*)nextbranch);
4122
4123 if (min1 > (I32)(minnext + trie->minlen))
4124 min1 = minnext + trie->minlen;
4125 if (max1 < (I32)(minnext + deltanext + trie->maxlen))
4126 max1 = minnext + deltanext + trie->maxlen;
4127 if (deltanext == I32_MAX)
4128 is_inf = is_inf_internal = 1;
4129
4130 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4131 pars++;
4132 if (data_fake.flags & SCF_SEEN_ACCEPT) {
4133 if ( stopmin > min + min1)
4134 stopmin = min + min1;
4135 flags &= ~SCF_DO_SUBSTR;
4136 if (data)
4137 data->flags |= SCF_SEEN_ACCEPT;
4138 }
4139 if (data) {
4140 if (data_fake.flags & SF_HAS_EVAL)
4141 data->flags |= SF_HAS_EVAL;
4142 data->whilem_c = data_fake.whilem_c;
4143 }
4144 if (flags & SCF_DO_STCLASS)
4145 cl_or(pRExC_state, &accum, &this_class);
4146 }
4147 }
4148 if (flags & SCF_DO_SUBSTR) {
4149 data->pos_min += min1;
4150 data->pos_delta += max1 - min1;
4151 if (max1 != min1 || is_inf)
4152 data->longest = &(data->longest_float);
4153 }
4154 min += min1;
4155 delta += max1 - min1;
4156 if (flags & SCF_DO_STCLASS_OR) {
4157 cl_or(pRExC_state, data->start_class, &accum);
4158 if (min1) {
4159 cl_and(data->start_class, and_withp);
4160 flags &= ~SCF_DO_STCLASS;
4161 }
4162 }
4163 else if (flags & SCF_DO_STCLASS_AND) {
4164 if (min1) {
4165 cl_and(data->start_class, &accum);
4166 flags &= ~SCF_DO_STCLASS;
4167 }
4168 else {
4169 /* Switch to OR mode: cache the old value of
4170 * data->start_class */
4171 INIT_AND_WITHP;
4172 StructCopy(data->start_class, and_withp,
4173 struct regnode_charclass_class);
4174 flags &= ~SCF_DO_STCLASS_AND;
4175 StructCopy(&accum, data->start_class,
4176 struct regnode_charclass_class);
4177 flags |= SCF_DO_STCLASS_OR;
4178 data->start_class->flags |= ANYOF_EOS;
4179 }
4180 }
4181 scan= tail;
4182 continue;
4183 }
4184#else
4185 else if (PL_regkind[OP(scan)] == TRIE) {
4186 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
4187 U8*bang=NULL;
4188
4189 min += trie->minlen;
4190 delta += (trie->maxlen - trie->minlen);
4191 flags &= ~SCF_DO_STCLASS; /* xxx */
4192 if (flags & SCF_DO_SUBSTR) {
4193 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
4194 data->pos_min += trie->minlen;
4195 data->pos_delta += (trie->maxlen - trie->minlen);
4196 if (trie->maxlen != trie->minlen)
4197 data->longest = &(data->longest_float);
4198 }
4199 if (trie->jump) /* no more substrings -- for now /grr*/
4200 flags &= ~SCF_DO_SUBSTR;
4201 }
4202#endif /* old or new */
4203#endif /* TRIE_STUDY_OPT */
4204
4205 /* Else: zero-length, ignore. */
4206 scan = regnext(scan);
4207 }
4208 if (frame) {
4209 last = frame->last;
4210 scan = frame->next;
4211 stopparen = frame->stop;
4212 frame = frame->prev;
4213 goto fake_study_recurse;
4214 }
4215
4216 finish:
4217 assert(!frame);
4218 DEBUG_STUDYDATA("pre-fin:",data,depth);
4219
4220 *scanp = scan;
4221 *deltap = is_inf_internal ? I32_MAX : delta;
4222 if (flags & SCF_DO_SUBSTR && is_inf)
4223 data->pos_delta = I32_MAX - data->pos_min;
4224 if (is_par > (I32)U8_MAX)
4225 is_par = 0;
4226 if (is_par && pars==1 && data) {
4227 data->flags |= SF_IN_PAR;
4228 data->flags &= ~SF_HAS_PAR;
4229 }
4230 else if (pars && data) {
4231 data->flags |= SF_HAS_PAR;
4232 data->flags &= ~SF_IN_PAR;
4233 }
4234 if (flags & SCF_DO_STCLASS_OR)
4235 cl_and(data->start_class, and_withp);
4236 if (flags & SCF_TRIE_RESTUDY)
4237 data->flags |= SCF_TRIE_RESTUDY;
4238
4239 DEBUG_STUDYDATA("post-fin:",data,depth);
4240
4241 return min < stopmin ? min : stopmin;
4242}
4243
4244STATIC U32
4245S_add_data(RExC_state_t *pRExC_state, U32 n, const char *s)
4246{
4247 U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0;
4248
4249 PERL_ARGS_ASSERT_ADD_DATA;
4250
4251 Renewc(RExC_rxi->data,
4252 sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1),
4253 char, struct reg_data);
4254 if(count)
4255 Renew(RExC_rxi->data->what, count + n, U8);
4256 else
4257 Newx(RExC_rxi->data->what, n, U8);
4258 RExC_rxi->data->count = count + n;
4259 Copy(s, RExC_rxi->data->what + count, n, U8);
4260 return count;
4261}
4262
4263/*XXX: todo make this not included in a non debugging perl */
4264#ifndef PERL_IN_XSUB_RE
4265void
4266Perl_reginitcolors(pTHX)
4267{
4268 dVAR;
4269 const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
4270 if (s) {
4271 char *t = savepv(s);
4272 int i = 0;
4273 PL_colors[0] = t;
4274 while (++i < 6) {
4275 t = strchr(t, '\t');
4276 if (t) {
4277 *t = '\0';
4278 PL_colors[i] = ++t;
4279 }
4280 else
4281 PL_colors[i] = t = (char *)"";
4282 }
4283 } else {
4284 int i = 0;
4285 while (i < 6)
4286 PL_colors[i++] = (char *)"";
4287 }
4288 PL_colorset = 1;
4289}
4290#endif
4291
4292
4293#ifdef TRIE_STUDY_OPT
4294#define CHECK_RESTUDY_GOTO \
4295 if ( \
4296 (data.flags & SCF_TRIE_RESTUDY) \
4297 && ! restudied++ \
4298 ) goto reStudy
4299#else
4300#define CHECK_RESTUDY_GOTO
4301#endif
4302
4303/*
4304 - pregcomp - compile a regular expression into internal code
4305 *
4306 * We can't allocate space until we know how big the compiled form will be,
4307 * but we can't compile it (and thus know how big it is) until we've got a
4308 * place to put the code. So we cheat: we compile it twice, once with code
4309 * generation turned off and size counting turned on, and once "for real".
4310 * This also means that we don't allocate space until we are sure that the
4311 * thing really will compile successfully, and we never have to move the
4312 * code and thus invalidate pointers into it. (Note that it has to be in
4313 * one piece because free() must be able to free it all.) [NB: not true in perl]
4314 *
4315 * Beware that the optimization-preparation code in here knows about some
4316 * of the structure of the compiled regexp. [I'll say.]
4317 */
4318
4319
4320
4321#ifndef PERL_IN_XSUB_RE
4322#define RE_ENGINE_PTR &PL_core_reg_engine
4323#else
4324extern const struct regexp_engine my_reg_engine;
4325#define RE_ENGINE_PTR &my_reg_engine
4326#endif
4327
4328#ifndef PERL_IN_XSUB_RE
4329REGEXP *
4330Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags)
4331{
4332 dVAR;
4333 HV * const table = GvHV(PL_hintgv);
4334
4335 PERL_ARGS_ASSERT_PREGCOMP;
4336
4337 /* Dispatch a request to compile a regexp to correct
4338 regexp engine. */
4339 if (table) {
4340 SV **ptr= hv_fetchs(table, "regcomp", FALSE);
4341 GET_RE_DEBUG_FLAGS_DECL;
4342 if (ptr && SvIOK(*ptr) && SvIV(*ptr)) {
4343 const regexp_engine *eng=INT2PTR(regexp_engine*,SvIV(*ptr));
4344 DEBUG_COMPILE_r({
4345 PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
4346 SvIV(*ptr));
4347 });
4348 return CALLREGCOMP_ENG(eng, pattern, flags);
4349 }
4350 }
4351 return Perl_re_compile(aTHX_ pattern, flags);
4352}
4353#endif
4354
4355REGEXP *
4356Perl_re_compile(pTHX_ SV * const pattern, U32 orig_pm_flags)
4357{
4358 dVAR;
4359 REGEXP *rx;
4360 struct regexp *r;
4361 register regexp_internal *ri;
4362 STRLEN plen;
4363 char *exp;
4364 char* xend;
4365 regnode *scan;
4366 I32 flags;
4367 I32 minlen = 0;
4368 U32 pm_flags;
4369
4370 /* these are all flags - maybe they should be turned
4371 * into a single int with different bit masks */
4372 I32 sawlookahead = 0;
4373 I32 sawplus = 0;
4374 I32 sawopen = 0;
4375 bool used_setjump = FALSE;
4376
4377 U8 jump_ret = 0;
4378 dJMPENV;
4379 scan_data_t data;
4380 RExC_state_t RExC_state;
4381 RExC_state_t * const pRExC_state = &RExC_state;
4382#ifdef TRIE_STUDY_OPT
4383 int restudied;
4384 RExC_state_t copyRExC_state;
4385#endif
4386 GET_RE_DEBUG_FLAGS_DECL;
4387
4388 PERL_ARGS_ASSERT_RE_COMPILE;
4389
4390 DEBUG_r(if (!PL_colorset) reginitcolors());
4391
4392 RExC_utf8 = RExC_orig_utf8 = SvUTF8(pattern);
4393
4394 /****************** LONG JUMP TARGET HERE***********************/
4395 /* Longjmp back to here if have to switch in midstream to utf8 */
4396 if (! RExC_orig_utf8) {
4397 JMPENV_PUSH(jump_ret);
4398 used_setjump = TRUE;
4399 }
4400
4401 if (jump_ret == 0) { /* First time through */
4402 exp = SvPV(pattern, plen);
4403 xend = exp + plen;
4404 /* ignore the utf8ness if the pattern is 0 length */
4405 if (plen == 0) {
4406 RExC_utf8 = RExC_orig_utf8 = 0;
4407 }
4408
4409 DEBUG_COMPILE_r({
4410 SV *dsv= sv_newmortal();
4411 RE_PV_QUOTED_DECL(s, RExC_utf8,
4412 dsv, exp, plen, 60);
4413 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
4414 PL_colors[4],PL_colors[5],s);
4415 });
4416 }
4417 else { /* longjumped back */
4418 STRLEN len = plen;
4419
4420 /* If the cause for the longjmp was other than changing to utf8, pop
4421 * our own setjmp, and longjmp to the correct handler */
4422 if (jump_ret != UTF8_LONGJMP) {
4423 JMPENV_POP;
4424 JMPENV_JUMP(jump_ret);
4425 }
4426
4427 GET_RE_DEBUG_FLAGS;
4428
4429 /* It's possible to write a regexp in ascii that represents Unicode
4430 codepoints outside of the byte range, such as via \x{100}. If we
4431 detect such a sequence we have to convert the entire pattern to utf8
4432 and then recompile, as our sizing calculation will have been based
4433 on 1 byte == 1 character, but we will need to use utf8 to encode
4434 at least some part of the pattern, and therefore must convert the whole
4435 thing.
4436 -- dmq */
4437 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
4438 "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
4439 exp = (char*)Perl_bytes_to_utf8(aTHX_ (U8*)SvPV(pattern, plen), &len);
4440 xend = exp + len;
4441 RExC_orig_utf8 = RExC_utf8 = 1;
4442 SAVEFREEPV(exp);
4443 }
4444
4445#ifdef TRIE_STUDY_OPT
4446 restudied = 0;
4447#endif
4448
4449 /* Set to use unicode semantics if the pattern is in utf8 and has the
4450 * 'depends' charset specified, as it means unicode when utf8 */
4451 pm_flags = orig_pm_flags;
4452
4453 if (RExC_utf8 && get_regex_charset(pm_flags) == REGEX_DEPENDS_CHARSET) {
4454 set_regex_charset(&pm_flags, REGEX_UNICODE_CHARSET);
4455 }
4456
4457 RExC_precomp = exp;
4458 RExC_flags = pm_flags;
4459 RExC_sawback = 0;
4460
4461 RExC_seen = 0;
4462 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
4463 RExC_seen_evals = 0;
4464 RExC_extralen = 0;
4465
4466 /* First pass: determine size, legality. */
4467 RExC_parse = exp;
4468 RExC_start = exp;
4469 RExC_end = xend;
4470 RExC_naughty = 0;
4471 RExC_npar = 1;
4472 RExC_nestroot = 0;
4473 RExC_size = 0L;
4474 RExC_emit = &PL_regdummy;
4475 RExC_whilem_seen = 0;
4476 RExC_open_parens = NULL;
4477 RExC_close_parens = NULL;
4478 RExC_opend = NULL;
4479 RExC_paren_names = NULL;
4480#ifdef DEBUGGING
4481 RExC_paren_name_list = NULL;
4482#endif
4483 RExC_recurse = NULL;
4484 RExC_recurse_count = 0;
4485
4486#if 0 /* REGC() is (currently) a NOP at the first pass.
4487 * Clever compilers notice this and complain. --jhi */
4488 REGC((U8)REG_MAGIC, (char*)RExC_emit);
4489#endif
4490 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n"));
4491 if (reg(pRExC_state, 0, &flags,1) == NULL) {
4492 RExC_precomp = NULL;
4493 return(NULL);
4494 }
4495
4496 /* Here, finished first pass. Get rid of any added setjmp */
4497 if (used_setjump) {
4498 JMPENV_POP;
4499 }
4500 DEBUG_PARSE_r({
4501 PerlIO_printf(Perl_debug_log,
4502 "Required size %"IVdf" nodes\n"
4503 "Starting second pass (creation)\n",
4504 (IV)RExC_size);
4505 RExC_lastnum=0;
4506 RExC_lastparse=NULL;
4507 });
4508 /* Small enough for pointer-storage convention?
4509 If extralen==0, this means that we will not need long jumps. */
4510 if (RExC_size >= 0x10000L && RExC_extralen)
4511 RExC_size += RExC_extralen;
4512 else
4513 RExC_extralen = 0;
4514 if (RExC_whilem_seen > 15)
4515 RExC_whilem_seen = 15;
4516
4517 /* Allocate space and zero-initialize. Note, the two step process
4518 of zeroing when in debug mode, thus anything assigned has to
4519 happen after that */
4520 rx = (REGEXP*) newSV_type(SVt_REGEXP);
4521 r = (struct regexp*)SvANY(rx);
4522 Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
4523 char, regexp_internal);
4524 if ( r == NULL || ri == NULL )
4525 FAIL("Regexp out of space");
4526#ifdef DEBUGGING
4527 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
4528 Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode), char);
4529#else
4530 /* bulk initialize base fields with 0. */
4531 Zero(ri, sizeof(regexp_internal), char);
4532#endif
4533
4534 /* non-zero initialization begins here */
4535 RXi_SET( r, ri );
4536 r->engine= RE_ENGINE_PTR;
4537 r->extflags = pm_flags;
4538 {
4539 bool has_p = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
4540 bool has_charset = (get_regex_charset(r->extflags) != REGEX_DEPENDS_CHARSET);
4541
4542 /* The caret is output if there are any defaults: if not all the STD
4543 * flags are set, or if no character set specifier is needed */
4544 bool has_default =
4545 (((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD)
4546 || ! has_charset);
4547 bool has_runon = ((RExC_seen & REG_SEEN_RUN_ON_COMMENT)==REG_SEEN_RUN_ON_COMMENT);
4548 U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD)
4549 >> RXf_PMf_STD_PMMOD_SHIFT);
4550 const char *fptr = STD_PAT_MODS; /*"msix"*/
4551 char *p;
4552 /* Allocate for the worst case, which is all the std flags are turned
4553 * on. If more precision is desired, we could do a population count of
4554 * the flags set. This could be done with a small lookup table, or by
4555 * shifting, masking and adding, or even, when available, assembly
4556 * language for a machine-language population count.
4557 * We never output a minus, as all those are defaults, so are
4558 * covered by the caret */
4559 const STRLEN wraplen = plen + has_p + has_runon
4560 + has_default /* If needs a caret */
4561
4562 /* If needs a character set specifier */
4563 + ((has_charset) ? MAX_CHARSET_NAME_LENGTH : 0)
4564 + (sizeof(STD_PAT_MODS) - 1)
4565 + (sizeof("(?:)") - 1);
4566
4567 p = sv_grow(MUTABLE_SV(rx), wraplen + 1); /* +1 for the ending NUL */
4568 SvPOK_on(rx);
4569 SvFLAGS(rx) |= SvUTF8(pattern);
4570 *p++='('; *p++='?';
4571
4572 /* If a default, cover it using the caret */
4573 if (has_default) {
4574 *p++= DEFAULT_PAT_MOD;
4575 }
4576 if (has_charset) {
4577 STRLEN len;
4578 const char* const name = get_regex_charset_name(r->extflags, &len);
4579 Copy(name, p, len, char);
4580 p += len;
4581 }
4582 if (has_p)
4583 *p++ = KEEPCOPY_PAT_MOD; /*'p'*/
4584 {
4585 char ch;
4586 while((ch = *fptr++)) {
4587 if(reganch & 1)
4588 *p++ = ch;
4589 reganch >>= 1;
4590 }
4591 }
4592
4593 *p++ = ':';
4594 Copy(RExC_precomp, p, plen, char);
4595 assert ((RX_WRAPPED(rx) - p) < 16);
4596 r->pre_prefix = p - RX_WRAPPED(rx);
4597 p += plen;
4598 if (has_runon)
4599 *p++ = '\n';
4600 *p++ = ')';
4601 *p = 0;
4602 SvCUR_set(rx, p - SvPVX_const(rx));
4603 }
4604
4605 r->intflags = 0;
4606 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
4607
4608 if (RExC_seen & REG_SEEN_RECURSE) {
4609 Newxz(RExC_open_parens, RExC_npar,regnode *);
4610 SAVEFREEPV(RExC_open_parens);
4611 Newxz(RExC_close_parens,RExC_npar,regnode *);
4612 SAVEFREEPV(RExC_close_parens);
4613 }
4614
4615 /* Useful during FAIL. */
4616#ifdef RE_TRACK_PATTERN_OFFSETS
4617 Newxz(ri->u.offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
4618 DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
4619 "%s %"UVuf" bytes for offset annotations.\n",
4620 ri->u.offsets ? "Got" : "Couldn't get",
4621 (UV)((2*RExC_size+1) * sizeof(U32))));
4622#endif
4623 SetProgLen(ri,RExC_size);
4624 RExC_rx_sv = rx;
4625 RExC_rx = r;
4626 RExC_rxi = ri;
4627
4628 /* Second pass: emit code. */
4629 RExC_flags = pm_flags; /* don't let top level (?i) bleed */
4630 RExC_parse = exp;
4631 RExC_end = xend;
4632 RExC_naughty = 0;
4633 RExC_npar = 1;
4634 RExC_emit_start = ri->program;
4635 RExC_emit = ri->program;
4636 RExC_emit_bound = ri->program + RExC_size + 1;
4637
4638 /* Store the count of eval-groups for security checks: */
4639 RExC_rx->seen_evals = RExC_seen_evals;
4640 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
4641 if (reg(pRExC_state, 0, &flags,1) == NULL) {
4642 ReREFCNT_dec(rx);
4643 return(NULL);
4644 }
4645 /* XXXX To minimize changes to RE engine we always allocate
4646 3-units-long substrs field. */
4647 Newx(r->substrs, 1, struct reg_substr_data);
4648 if (RExC_recurse_count) {
4649 Newxz(RExC_recurse,RExC_recurse_count,regnode *);
4650 SAVEFREEPV(RExC_recurse);
4651 }
4652
4653reStudy:
4654 r->minlen = minlen = sawlookahead = sawplus = sawopen = 0;
4655 Zero(r->substrs, 1, struct reg_substr_data);
4656
4657#ifdef TRIE_STUDY_OPT
4658 if (!restudied) {
4659 StructCopy(&zero_scan_data, &data, scan_data_t);
4660 copyRExC_state = RExC_state;
4661 } else {
4662 U32 seen=RExC_seen;
4663 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
4664
4665 RExC_state = copyRExC_state;
4666 if (seen & REG_TOP_LEVEL_BRANCHES)
4667 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
4668 else
4669 RExC_seen &= ~REG_TOP_LEVEL_BRANCHES;
4670 if (data.last_found) {
4671 SvREFCNT_dec(data.longest_fixed);
4672 SvREFCNT_dec(data.longest_float);
4673 SvREFCNT_dec(data.last_found);
4674 }
4675 StructCopy(&zero_scan_data, &data, scan_data_t);
4676 }
4677#else
4678 StructCopy(&zero_scan_data, &data, scan_data_t);
4679#endif
4680
4681 /* Dig out information for optimizations. */
4682 r->extflags = RExC_flags; /* was pm_op */
4683 /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */
4684
4685 if (UTF)
4686 SvUTF8_on(rx); /* Unicode in it? */
4687 ri->regstclass = NULL;
4688 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
4689 r->intflags |= PREGf_NAUGHTY;
4690 scan = ri->program + 1; /* First BRANCH. */
4691
4692 /* testing for BRANCH here tells us whether there is "must appear"
4693 data in the pattern. If there is then we can use it for optimisations */
4694 if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES)) { /* Only one top-level choice. */
4695 I32 fake;
4696 STRLEN longest_float_length, longest_fixed_length;
4697 struct regnode_charclass_class ch_class; /* pointed to by data */
4698 int stclass_flag;
4699 I32 last_close = 0; /* pointed to by data */
4700 regnode *first= scan;
4701 regnode *first_next= regnext(first);
4702 /*
4703 * Skip introductions and multiplicators >= 1
4704 * so that we can extract the 'meat' of the pattern that must
4705 * match in the large if() sequence following.
4706 * NOTE that EXACT is NOT covered here, as it is normally
4707 * picked up by the optimiser separately.
4708 *
4709 * This is unfortunate as the optimiser isnt handling lookahead
4710 * properly currently.
4711 *
4712 */
4713 while ((OP(first) == OPEN && (sawopen = 1)) ||
4714 /* An OR of *one* alternative - should not happen now. */
4715 (OP(first) == BRANCH && OP(first_next) != BRANCH) ||
4716 /* for now we can't handle lookbehind IFMATCH*/
4717 (OP(first) == IFMATCH && !first->flags && (sawlookahead = 1)) ||
4718 (OP(first) == PLUS) ||
4719 (OP(first) == MINMOD) ||
4720 /* An {n,m} with n>0 */
4721 (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) ||
4722 (OP(first) == NOTHING && PL_regkind[OP(first_next)] != END ))
4723 {
4724 /*
4725 * the only op that could be a regnode is PLUS, all the rest
4726 * will be regnode_1 or regnode_2.
4727 *
4728 */
4729 if (OP(first) == PLUS)
4730 sawplus = 1;
4731 else
4732 first += regarglen[OP(first)];
4733
4734 first = NEXTOPER(first);
4735 first_next= regnext(first);
4736 }
4737
4738 /* Starting-point info. */
4739 again:
4740 DEBUG_PEEP("first:",first,0);
4741 /* Ignore EXACT as we deal with it later. */
4742 if (PL_regkind[OP(first)] == EXACT) {
4743 if (OP(first) == EXACT)
4744 NOOP; /* Empty, get anchored substr later. */
4745 else
4746 ri->regstclass = first;
4747 }
4748#ifdef TRIE_STCLASS
4749 else if (PL_regkind[OP(first)] == TRIE &&
4750 ((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0)
4751 {
4752 regnode *trie_op;
4753 /* this can happen only on restudy */
4754 if ( OP(first) == TRIE ) {
4755 struct regnode_1 *trieop = (struct regnode_1 *)
4756 PerlMemShared_calloc(1, sizeof(struct regnode_1));
4757 StructCopy(first,trieop,struct regnode_1);
4758 trie_op=(regnode *)trieop;
4759 } else {
4760 struct regnode_charclass *trieop = (struct regnode_charclass *)
4761 PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
4762 StructCopy(first,trieop,struct regnode_charclass);
4763 trie_op=(regnode *)trieop;
4764 }
4765 OP(trie_op)+=2;
4766 make_trie_failtable(pRExC_state, (regnode *)first, trie_op, 0);
4767 ri->regstclass = trie_op;
4768 }
4769#endif
4770 else if (REGNODE_SIMPLE(OP(first)))
4771 ri->regstclass = first;
4772 else if (PL_regkind[OP(first)] == BOUND ||
4773 PL_regkind[OP(first)] == NBOUND)
4774 ri->regstclass = first;
4775 else if (PL_regkind[OP(first)] == BOL) {
4776 r->extflags |= (OP(first) == MBOL
4777 ? RXf_ANCH_MBOL
4778 : (OP(first) == SBOL
4779 ? RXf_ANCH_SBOL
4780 : RXf_ANCH_BOL));
4781 first = NEXTOPER(first);
4782 goto again;
4783 }
4784 else if (OP(first) == GPOS) {
4785 r->extflags |= RXf_ANCH_GPOS;
4786 first = NEXTOPER(first);
4787 goto again;
4788 }
4789 else if ((!sawopen || !RExC_sawback) &&
4790 (OP(first) == STAR &&
4791 PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
4792 !(r->extflags & RXf_ANCH) && !(RExC_seen & REG_SEEN_EVAL))
4793 {
4794 /* turn .* into ^.* with an implied $*=1 */
4795 const int type =
4796 (OP(NEXTOPER(first)) == REG_ANY)
4797 ? RXf_ANCH_MBOL
4798 : RXf_ANCH_SBOL;
4799 r->extflags |= type;
4800 r->intflags |= PREGf_IMPLICIT;
4801 first = NEXTOPER(first);
4802 goto again;
4803 }
4804 if (sawplus && !sawlookahead && (!sawopen || !RExC_sawback)
4805 && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
4806 /* x+ must match at the 1st pos of run of x's */
4807 r->intflags |= PREGf_SKIP;
4808
4809 /* Scan is after the zeroth branch, first is atomic matcher. */
4810#ifdef TRIE_STUDY_OPT
4811 DEBUG_PARSE_r(
4812 if (!restudied)
4813 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
4814 (IV)(first - scan + 1))
4815 );
4816#else
4817 DEBUG_PARSE_r(
4818 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
4819 (IV)(first - scan + 1))
4820 );
4821#endif
4822
4823
4824 /*
4825 * If there's something expensive in the r.e., find the
4826 * longest literal string that must appear and make it the
4827 * regmust. Resolve ties in favor of later strings, since
4828 * the regstart check works with the beginning of the r.e.
4829 * and avoiding duplication strengthens checking. Not a
4830 * strong reason, but sufficient in the absence of others.
4831 * [Now we resolve ties in favor of the earlier string if
4832 * it happens that c_offset_min has been invalidated, since the
4833 * earlier string may buy us something the later one won't.]
4834 */
4835
4836 data.longest_fixed = newSVpvs("");
4837 data.longest_float = newSVpvs("");
4838 data.last_found = newSVpvs("");
4839 data.longest = &(data.longest_fixed);
4840 first = scan;
4841 if (!ri->regstclass) {
4842 cl_init(pRExC_state, &ch_class);
4843 data.start_class = &ch_class;
4844 stclass_flag = SCF_DO_STCLASS_AND;
4845 } else /* XXXX Check for BOUND? */
4846 stclass_flag = 0;
4847 data.last_closep = &last_close;
4848
4849 minlen = study_chunk(pRExC_state, &first, &minlen, &fake, scan + RExC_size, /* Up to end */
4850 &data, -1, NULL, NULL,
4851 SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
4852
4853
4854 CHECK_RESTUDY_GOTO;
4855
4856
4857 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
4858 && data.last_start_min == 0 && data.last_end > 0
4859 && !RExC_seen_zerolen
4860 && !(RExC_seen & REG_SEEN_VERBARG)
4861 && (!(RExC_seen & REG_SEEN_GPOS) || (r->extflags & RXf_ANCH_GPOS)))
4862 r->extflags |= RXf_CHECK_ALL;
4863 scan_commit(pRExC_state, &data,&minlen,0);
4864 SvREFCNT_dec(data.last_found);
4865
4866 /* Note that code very similar to this but for anchored string
4867 follows immediately below, changes may need to be made to both.
4868 Be careful.
4869 */
4870 longest_float_length = CHR_SVLEN(data.longest_float);
4871 if (longest_float_length
4872 || (data.flags & SF_FL_BEFORE_EOL
4873 && (!(data.flags & SF_FL_BEFORE_MEOL)
4874 || (RExC_flags & RXf_PMf_MULTILINE))))
4875 {
4876 I32 t,ml;
4877
4878 if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
4879 && data.offset_fixed == data.offset_float_min
4880 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
4881 goto remove_float; /* As in (a)+. */
4882
4883 /* copy the information about the longest float from the reg_scan_data
4884 over to the program. */
4885 if (SvUTF8(data.longest_float)) {
4886 r->float_utf8 = data.longest_float;
4887 r->float_substr = NULL;
4888 } else {
4889 r->float_substr = data.longest_float;
4890 r->float_utf8 = NULL;
4891 }
4892 /* float_end_shift is how many chars that must be matched that
4893 follow this item. We calculate it ahead of time as once the
4894 lookbehind offset is added in we lose the ability to correctly
4895 calculate it.*/
4896 ml = data.minlen_float ? *(data.minlen_float)
4897 : (I32)longest_float_length;
4898 r->float_end_shift = ml - data.offset_float_min
4899 - longest_float_length + (SvTAIL(data.longest_float) != 0)
4900 + data.lookbehind_float;
4901 r->float_min_offset = data.offset_float_min - data.lookbehind_float;
4902 r->float_max_offset = data.offset_float_max;
4903 if (data.offset_float_max < I32_MAX) /* Don't offset infinity */
4904 r->float_max_offset -= data.lookbehind_float;
4905
4906 t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
4907 && (!(data.flags & SF_FL_BEFORE_MEOL)
4908 || (RExC_flags & RXf_PMf_MULTILINE)));
4909 fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
4910 }
4911 else {
4912 remove_float:
4913 r->float_substr = r->float_utf8 = NULL;
4914 SvREFCNT_dec(data.longest_float);
4915 longest_float_length = 0;
4916 }
4917
4918 /* Note that code very similar to this but for floating string
4919 is immediately above, changes may need to be made to both.
4920 Be careful.
4921 */
4922 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
4923 if (longest_fixed_length
4924 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
4925 && (!(data.flags & SF_FIX_BEFORE_MEOL)
4926 || (RExC_flags & RXf_PMf_MULTILINE))))
4927 {
4928 I32 t,ml;
4929
4930 /* copy the information about the longest fixed
4931 from the reg_scan_data over to the program. */
4932 if (SvUTF8(data.longest_fixed)) {
4933 r->anchored_utf8 = data.longest_fixed;
4934 r->anchored_substr = NULL;
4935 } else {
4936 r->anchored_substr = data.longest_fixed;
4937 r->anchored_utf8 = NULL;
4938 }
4939 /* fixed_end_shift is how many chars that must be matched that
4940 follow this item. We calculate it ahead of time as once the
4941 lookbehind offset is added in we lose the ability to correctly
4942 calculate it.*/
4943 ml = data.minlen_fixed ? *(data.minlen_fixed)
4944 : (I32)longest_fixed_length;
4945 r->anchored_end_shift = ml - data.offset_fixed
4946 - longest_fixed_length + (SvTAIL(data.longest_fixed) != 0)
4947 + data.lookbehind_fixed;
4948 r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
4949
4950 t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
4951 && (!(data.flags & SF_FIX_BEFORE_MEOL)
4952 || (RExC_flags & RXf_PMf_MULTILINE)));
4953 fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
4954 }
4955 else {
4956 r->anchored_substr = r->anchored_utf8 = NULL;
4957 SvREFCNT_dec(data.longest_fixed);
4958 longest_fixed_length = 0;
4959 }
4960 if (ri->regstclass
4961 && (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY))
4962 ri->regstclass = NULL;
4963
4964 /* If the synthetic start class were to ever be used when EOS is set,
4965 * that bit would have to be cleared, as it is shared with another */
4966 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
4967 && stclass_flag
4968 && !(data.start_class->flags & ANYOF_EOS)
4969 && !cl_is_anything(data.start_class))
4970 {
4971 const U32 n = add_data(pRExC_state, 1, "f");
4972
4973 Newx(RExC_rxi->data->data[n], 1,
4974 struct regnode_charclass_class);
4975 StructCopy(data.start_class,
4976 (struct regnode_charclass_class*)RExC_rxi->data->data[n],
4977 struct regnode_charclass_class);
4978 ri->regstclass = (regnode*)RExC_rxi->data->data[n];
4979 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
4980 DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
4981 regprop(r, sv, (regnode*)data.start_class);
4982 PerlIO_printf(Perl_debug_log,
4983 "synthetic stclass \"%s\".\n",
4984 SvPVX_const(sv));});
4985 }
4986
4987 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
4988 if (longest_fixed_length > longest_float_length) {
4989 r->check_end_shift = r->anchored_end_shift;
4990 r->check_substr = r->anchored_substr;
4991 r->check_utf8 = r->anchored_utf8;
4992 r->check_offset_min = r->check_offset_max = r->anchored_offset;
4993 if (r->extflags & RXf_ANCH_SINGLE)
4994 r->extflags |= RXf_NOSCAN;
4995 }
4996 else {
4997 r->check_end_shift = r->float_end_shift;
4998 r->check_substr = r->float_substr;
4999 r->check_utf8 = r->float_utf8;
5000 r->check_offset_min = r->float_min_offset;
5001 r->check_offset_max = r->float_max_offset;
5002 }
5003 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
5004 This should be changed ASAP! */
5005 if ((r->check_substr || r->check_utf8) && !(r->extflags & RXf_ANCH_GPOS)) {
5006 r->extflags |= RXf_USE_INTUIT;
5007 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
5008 r->extflags |= RXf_INTUIT_TAIL;
5009 }
5010 /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
5011 if ( (STRLEN)minlen < longest_float_length )
5012 minlen= longest_float_length;
5013 if ( (STRLEN)minlen < longest_fixed_length )
5014 minlen= longest_fixed_length;
5015 */
5016 }
5017 else {
5018 /* Several toplevels. Best we can is to set minlen. */
5019 I32 fake;
5020 struct regnode_charclass_class ch_class;
5021 I32 last_close = 0;
5022
5023 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n"));
5024
5025 scan = ri->program + 1;
5026 cl_init(pRExC_state, &ch_class);
5027 data.start_class = &ch_class;
5028 data.last_closep = &last_close;
5029
5030
5031 minlen = study_chunk(pRExC_state, &scan, &minlen, &fake, scan + RExC_size,
5032 &data, -1, NULL, NULL, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
5033
5034 CHECK_RESTUDY_GOTO;
5035
5036 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
5037 = r->float_substr = r->float_utf8 = NULL;
5038
5039 /* If the synthetic start class were to ever be used when EOS is set,
5040 * that bit would have to be cleared, as it is shared with another */
5041 if (!(data.start_class->flags & ANYOF_EOS)
5042 && !cl_is_anything(data.start_class))
5043 {
5044 const U32 n = add_data(pRExC_state, 1, "f");
5045
5046 Newx(RExC_rxi->data->data[n], 1,
5047 struct regnode_charclass_class);
5048 StructCopy(data.start_class,
5049 (struct regnode_charclass_class*)RExC_rxi->data->data[n],
5050 struct regnode_charclass_class);
5051 ri->regstclass = (regnode*)RExC_rxi->data->data[n];
5052 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
5053 DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
5054 regprop(r, sv, (regnode*)data.start_class);
5055 PerlIO_printf(Perl_debug_log,
5056 "synthetic stclass \"%s\".\n",
5057 SvPVX_const(sv));});
5058 }
5059 }
5060
5061 /* Guard against an embedded (?=) or (?<=) with a longer minlen than
5062 the "real" pattern. */
5063 DEBUG_OPTIMISE_r({
5064 PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf"\n",
5065 (IV)minlen, (IV)r->minlen);
5066 });
5067 r->minlenret = minlen;
5068 if (r->minlen < minlen)
5069 r->minlen = minlen;
5070
5071 if (RExC_seen & REG_SEEN_GPOS)
5072 r->extflags |= RXf_GPOS_SEEN;
5073 if (RExC_seen & REG_SEEN_LOOKBEHIND)
5074 r->extflags |= RXf_LOOKBEHIND_SEEN;
5075 if (RExC_seen & REG_SEEN_EVAL)
5076 r->extflags |= RXf_EVAL_SEEN;
5077 if (RExC_seen & REG_SEEN_CANY)
5078 r->extflags |= RXf_CANY_SEEN;
5079 if (RExC_seen & REG_SEEN_VERBARG)
5080 r->intflags |= PREGf_VERBARG_SEEN;
5081 if (RExC_seen & REG_SEEN_CUTGROUP)
5082 r->intflags |= PREGf_CUTGROUP_SEEN;
5083 if (RExC_paren_names)
5084 RXp_PAREN_NAMES(r) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names));
5085 else
5086 RXp_PAREN_NAMES(r) = NULL;
5087
5088#ifdef STUPID_PATTERN_CHECKS
5089 if (RX_PRELEN(rx) == 0)
5090 r->extflags |= RXf_NULL;
5091 if (r->extflags & RXf_SPLIT && RX_PRELEN(rx) == 1 && RX_PRECOMP(rx)[0] == ' ')
5092 /* XXX: this should happen BEFORE we compile */
5093 r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
5094 else if (RX_PRELEN(rx) == 3 && memEQ("\\s+", RX_PRECOMP(rx), 3))
5095 r->extflags |= RXf_WHITE;
5096 else if (RX_PRELEN(rx) == 1 && RXp_PRECOMP(rx)[0] == '^')
5097 r->extflags |= RXf_START_ONLY;
5098#else
5099 if (r->extflags & RXf_SPLIT && RX_PRELEN(rx) == 1 && RX_PRECOMP(rx)[0] == ' ')
5100 /* XXX: this should happen BEFORE we compile */
5101 r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
5102 else {
5103 regnode *first = ri->program + 1;
5104 U8 fop = OP(first);
5105 U8 nop = OP(NEXTOPER(first));
5106
5107 if (PL_regkind[fop] == NOTHING && nop == END)
5108 r->extflags |= RXf_NULL;
5109 else if (PL_regkind[fop] == BOL && nop == END)
5110 r->extflags |= RXf_START_ONLY;
5111 else if (fop == PLUS && nop ==SPACE && OP(regnext(first))==END)
5112 r->extflags |= RXf_WHITE;
5113 }
5114#endif
5115#ifdef DEBUGGING
5116 if (RExC_paren_names) {
5117 ri->name_list_idx = add_data( pRExC_state, 1, "a" );
5118 ri->data->data[ri->name_list_idx] = (void*)SvREFCNT_inc(RExC_paren_name_list);
5119 } else
5120#endif
5121 ri->name_list_idx = 0;
5122
5123 if (RExC_recurse_count) {
5124 for ( ; RExC_recurse_count ; RExC_recurse_count-- ) {
5125 const regnode *scan = RExC_recurse[RExC_recurse_count-1];
5126 ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan );
5127 }
5128 }
5129 Newxz(r->offs, RExC_npar, regexp_paren_pair);
5130 /* assume we don't need to swap parens around before we match */
5131
5132 DEBUG_DUMP_r({
5133 PerlIO_printf(Perl_debug_log,"Final program:\n");
5134 regdump(r);
5135 });
5136#ifdef RE_TRACK_PATTERN_OFFSETS
5137 DEBUG_OFFSETS_r(if (ri->u.offsets) {
5138 const U32 len = ri->u.offsets[0];
5139 U32 i;
5140 GET_RE_DEBUG_FLAGS_DECL;
5141 PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)ri->u.offsets[0]);
5142 for (i = 1; i <= len; i++) {
5143 if (ri->u.offsets[i*2-1] || ri->u.offsets[i*2])
5144 PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
5145 (UV)i, (UV)ri->u.offsets[i*2-1], (UV)ri->u.offsets[i*2]);
5146 }
5147 PerlIO_printf(Perl_debug_log, "\n");
5148 });
5149#endif
5150 return rx;
5151}
5152
5153#undef RE_ENGINE_PTR
5154
5155
5156SV*
5157Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value,
5158 const U32 flags)
5159{
5160 PERL_ARGS_ASSERT_REG_NAMED_BUFF;
5161
5162 PERL_UNUSED_ARG(value);
5163
5164 if (flags & RXapif_FETCH) {
5165 return reg_named_buff_fetch(rx, key, flags);
5166 } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) {
5167 Perl_croak_no_modify(aTHX);
5168 return NULL;
5169 } else if (flags & RXapif_EXISTS) {
5170 return reg_named_buff_exists(rx, key, flags)
5171 ? &PL_sv_yes
5172 : &PL_sv_no;
5173 } else if (flags & RXapif_REGNAMES) {
5174 return reg_named_buff_all(rx, flags);
5175 } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) {
5176 return reg_named_buff_scalar(rx, flags);
5177 } else {
5178 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags);
5179 return NULL;
5180 }
5181}
5182
5183SV*
5184Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey,
5185 const U32 flags)
5186{
5187 PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER;
5188 PERL_UNUSED_ARG(lastkey);
5189
5190 if (flags & RXapif_FIRSTKEY)
5191 return reg_named_buff_firstkey(rx, flags);
5192 else if (flags & RXapif_NEXTKEY)
5193 return reg_named_buff_nextkey(rx, flags);
5194 else {
5195 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter", (int)flags);
5196 return NULL;
5197 }
5198}
5199
5200SV*
5201Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv,
5202 const U32 flags)
5203{
5204 AV *retarray = NULL;
5205 SV *ret;
5206 struct regexp *const rx = (struct regexp *)SvANY(r);
5207
5208 PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH;
5209
5210 if (flags & RXapif_ALL)
5211 retarray=newAV();
5212
5213 if (rx && RXp_PAREN_NAMES(rx)) {
5214 HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 );
5215 if (he_str) {
5216 IV i;
5217 SV* sv_dat=HeVAL(he_str);
5218 I32 *nums=(I32*)SvPVX(sv_dat);
5219 for ( i=0; i<SvIVX(sv_dat); i++ ) {
5220 if ((I32)(rx->nparens) >= nums[i]
5221 && rx->offs[nums[i]].start != -1
5222 && rx->offs[nums[i]].end != -1)
5223 {
5224 ret = newSVpvs("");
5225 CALLREG_NUMBUF_FETCH(r,nums[i],ret);
5226 if (!retarray)
5227 return ret;
5228 } else {
5229 ret = newSVsv(&PL_sv_undef);
5230 }
5231 if (retarray)
5232 av_push(retarray, ret);
5233 }
5234 if (retarray)
5235 return newRV_noinc(MUTABLE_SV(retarray));
5236 }
5237 }
5238 return NULL;
5239}
5240
5241bool
5242Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key,
5243 const U32 flags)
5244{
5245 struct regexp *const rx = (struct regexp *)SvANY(r);
5246
5247 PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS;
5248
5249 if (rx && RXp_PAREN_NAMES(rx)) {
5250 if (flags & RXapif_ALL) {
5251 return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0);
5252 } else {
5253 SV *sv = CALLREG_NAMED_BUFF_FETCH(r, key, flags);
5254 if (sv) {
5255 SvREFCNT_dec(sv);
5256 return TRUE;
5257 } else {
5258 return FALSE;
5259 }
5260 }
5261 } else {
5262 return FALSE;
5263 }
5264}
5265
5266SV*
5267Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags)
5268{
5269 struct regexp *const rx = (struct regexp *)SvANY(r);
5270
5271 PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY;
5272
5273 if ( rx && RXp_PAREN_NAMES(rx) ) {
5274 (void)hv_iterinit(RXp_PAREN_NAMES(rx));
5275
5276 return CALLREG_NAMED_BUFF_NEXTKEY(r, NULL, flags & ~RXapif_FIRSTKEY);
5277 } else {
5278 return FALSE;
5279 }
5280}
5281
5282SV*
5283Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags)
5284{
5285 struct regexp *const rx = (struct regexp *)SvANY(r);
5286 GET_RE_DEBUG_FLAGS_DECL;
5287
5288 PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY;
5289
5290 if (rx && RXp_PAREN_NAMES(rx)) {
5291 HV *hv = RXp_PAREN_NAMES(rx);
5292 HE *temphe;
5293 while ( (temphe = hv_iternext_flags(hv,0)) ) {
5294 IV i;
5295 IV parno = 0;
5296 SV* sv_dat = HeVAL(temphe);
5297 I32 *nums = (I32*)SvPVX(sv_dat);
5298 for ( i = 0; i < SvIVX(sv_dat); i++ ) {
5299 if ((I32)(rx->lastparen) >= nums[i] &&
5300 rx->offs[nums[i]].start != -1 &&
5301 rx->offs[nums[i]].end != -1)
5302 {
5303 parno = nums[i];
5304 break;
5305 }
5306 }
5307 if (parno || flags & RXapif_ALL) {
5308 return newSVhek(HeKEY_hek(temphe));
5309 }
5310 }
5311 }
5312 return NULL;
5313}
5314
5315SV*
5316Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags)
5317{
5318 SV *ret;
5319 AV *av;
5320 I32 length;
5321 struct regexp *const rx = (struct regexp *)SvANY(r);
5322
5323 PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR;
5324
5325 if (rx && RXp_PAREN_NAMES(rx)) {
5326 if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) {
5327 return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx)));
5328 } else if (flags & RXapif_ONE) {
5329 ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES));
5330 av = MUTABLE_AV(SvRV(ret));
5331 length = av_len(av);
5332 SvREFCNT_dec(ret);
5333 return newSViv(length + 1);
5334 } else {
5335 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar", (int)flags);
5336 return NULL;
5337 }
5338 }
5339 return &PL_sv_undef;
5340}
5341
5342SV*
5343Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags)
5344{
5345 struct regexp *const rx = (struct regexp *)SvANY(r);
5346 AV *av = newAV();
5347
5348 PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL;
5349
5350 if (rx && RXp_PAREN_NAMES(rx)) {
5351 HV *hv= RXp_PAREN_NAMES(rx);
5352 HE *temphe;
5353 (void)hv_iterinit(hv);
5354 while ( (temphe = hv_iternext_flags(hv,0)) ) {
5355 IV i;
5356 IV parno = 0;
5357 SV* sv_dat = HeVAL(temphe);
5358 I32 *nums = (I32*)SvPVX(sv_dat);
5359 for ( i = 0; i < SvIVX(sv_dat); i++ ) {
5360 if ((I32)(rx->lastparen) >= nums[i] &&
5361 rx->offs[nums[i]].start != -1 &&
5362 rx->offs[nums[i]].end != -1)
5363 {
5364 parno = nums[i];
5365 break;
5366 }
5367 }
5368 if (parno || flags & RXapif_ALL) {
5369 av_push(av, newSVhek(HeKEY_hek(temphe)));
5370 }
5371 }
5372 }
5373
5374 return newRV_noinc(MUTABLE_SV(av));
5375}
5376
5377void
5378Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren,
5379 SV * const sv)
5380{
5381 struct regexp *const rx = (struct regexp *)SvANY(r);
5382 char *s = NULL;
5383 I32 i = 0;
5384 I32 s1, t1;
5385
5386 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH;
5387
5388 if (!rx->subbeg) {
5389 sv_setsv(sv,&PL_sv_undef);
5390 return;
5391 }
5392 else
5393 if (paren == RX_BUFF_IDX_PREMATCH && rx->offs[0].start != -1) {
5394 /* $` */
5395 i = rx->offs[0].start;
5396 s = rx->subbeg;
5397 }
5398 else
5399 if (paren == RX_BUFF_IDX_POSTMATCH && rx->offs[0].end != -1) {
5400 /* $' */
5401 s = rx->subbeg + rx->offs[0].end;
5402 i = rx->sublen - rx->offs[0].end;
5403 }
5404 else
5405 if ( 0 <= paren && paren <= (I32)rx->nparens &&
5406 (s1 = rx->offs[paren].start) != -1 &&
5407 (t1 = rx->offs[paren].end) != -1)
5408 {
5409 /* $& $1 ... */
5410 i = t1 - s1;
5411 s = rx->subbeg + s1;
5412 } else {
5413 sv_setsv(sv,&PL_sv_undef);
5414 return;
5415 }
5416 assert(rx->sublen >= (s - rx->subbeg) + i );
5417 if (i >= 0) {
5418 const int oldtainted = PL_tainted;
5419 TAINT_NOT;
5420 sv_setpvn(sv, s, i);
5421 PL_tainted = oldtainted;
5422 if ( (rx->extflags & RXf_CANY_SEEN)
5423 ? (RXp_MATCH_UTF8(rx)
5424 && (!i || is_utf8_string((U8*)s, i)))
5425 : (RXp_MATCH_UTF8(rx)) )
5426 {
5427 SvUTF8_on(sv);
5428 }
5429 else
5430 SvUTF8_off(sv);
5431 if (PL_tainting) {
5432 if (RXp_MATCH_TAINTED(rx)) {
5433 if (SvTYPE(sv) >= SVt_PVMG) {
5434 MAGIC* const mg = SvMAGIC(sv);
5435 MAGIC* mgt;
5436 PL_tainted = 1;
5437 SvMAGIC_set(sv, mg->mg_moremagic);
5438 SvTAINT(sv);
5439 if ((mgt = SvMAGIC(sv))) {
5440 mg->mg_moremagic = mgt;
5441 SvMAGIC_set(sv, mg);
5442 }
5443 } else {
5444 PL_tainted = 1;
5445 SvTAINT(sv);
5446 }
5447 } else
5448 SvTAINTED_off(sv);
5449 }
5450 } else {
5451 sv_setsv(sv,&PL_sv_undef);
5452 return;
5453 }
5454}
5455
5456void
5457Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren,
5458 SV const * const value)
5459{
5460 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE;
5461
5462 PERL_UNUSED_ARG(rx);
5463 PERL_UNUSED_ARG(paren);
5464 PERL_UNUSED_ARG(value);
5465
5466 if (!PL_localizing)
5467 Perl_croak_no_modify(aTHX);
5468}
5469
5470I32
5471Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv,
5472 const I32 paren)
5473{
5474 struct regexp *const rx = (struct regexp *)SvANY(r);
5475 I32 i;
5476 I32 s1, t1;
5477
5478 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH;
5479
5480 /* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */
5481 switch (paren) {
5482 /* $` / ${^PREMATCH} */
5483 case RX_BUFF_IDX_PREMATCH:
5484 if (rx->offs[0].start != -1) {
5485 i = rx->offs[0].start;
5486 if (i > 0) {
5487 s1 = 0;
5488 t1 = i;
5489 goto getlen;
5490 }
5491 }
5492 return 0;
5493 /* $' / ${^POSTMATCH} */
5494 case RX_BUFF_IDX_POSTMATCH:
5495 if (rx->offs[0].end != -1) {
5496 i = rx->sublen - rx->offs[0].end;
5497 if (i > 0) {
5498 s1 = rx->offs[0].end;
5499 t1 = rx->sublen;
5500 goto getlen;
5501 }
5502 }
5503 return 0;
5504 /* $& / ${^MATCH}, $1, $2, ... */
5505 default:
5506 if (paren <= (I32)rx->nparens &&
5507 (s1 = rx->offs[paren].start) != -1 &&
5508 (t1 = rx->offs[paren].end) != -1)
5509 {
5510 i = t1 - s1;
5511 goto getlen;
5512 } else {
5513 if (ckWARN(WARN_UNINITIALIZED))
5514 report_uninit((const SV *)sv);
5515 return 0;
5516 }
5517 }
5518 getlen:
5519 if (i > 0 && RXp_MATCH_UTF8(rx)) {
5520 const char * const s = rx->subbeg + s1;
5521 const U8 *ep;
5522 STRLEN el;
5523
5524 i = t1 - s1;
5525 if (is_utf8_string_loclen((U8*)s, i, &ep, &el))
5526 i = el;
5527 }
5528 return i;
5529}
5530
5531SV*
5532Perl_reg_qr_package(pTHX_ REGEXP * const rx)
5533{
5534 PERL_ARGS_ASSERT_REG_QR_PACKAGE;
5535 PERL_UNUSED_ARG(rx);
5536 if (0)
5537 return NULL;
5538 else
5539 return newSVpvs("Regexp");
5540}
5541
5542/* Scans the name of a named buffer from the pattern.
5543 * If flags is REG_RSN_RETURN_NULL returns null.
5544 * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
5545 * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
5546 * to the parsed name as looked up in the RExC_paren_names hash.
5547 * If there is an error throws a vFAIL().. type exception.
5548 */
5549
5550#define REG_RSN_RETURN_NULL 0
5551#define REG_RSN_RETURN_NAME 1
5552#define REG_RSN_RETURN_DATA 2
5553
5554STATIC SV*
5555S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
5556{
5557 char *name_start = RExC_parse;
5558
5559 PERL_ARGS_ASSERT_REG_SCAN_NAME;
5560
5561 if (isIDFIRST_lazy_if(RExC_parse, UTF)) {
5562 /* skip IDFIRST by using do...while */
5563 if (UTF)
5564 do {
5565 RExC_parse += UTF8SKIP(RExC_parse);
5566 } while (isALNUM_utf8((U8*)RExC_parse));
5567 else
5568 do {
5569 RExC_parse++;
5570 } while (isALNUM(*RExC_parse));
5571 }
5572
5573 if ( flags ) {
5574 SV* sv_name
5575 = newSVpvn_flags(name_start, (int)(RExC_parse - name_start),
5576 SVs_TEMP | (UTF ? SVf_UTF8 : 0));
5577 if ( flags == REG_RSN_RETURN_NAME)
5578 return sv_name;
5579 else if (flags==REG_RSN_RETURN_DATA) {
5580 HE *he_str = NULL;
5581 SV *sv_dat = NULL;
5582 if ( ! sv_name ) /* should not happen*/
5583 Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
5584 if (RExC_paren_names)
5585 he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
5586 if ( he_str )
5587 sv_dat = HeVAL(he_str);
5588 if ( ! sv_dat )
5589 vFAIL("Reference to nonexistent named group");
5590 return sv_dat;
5591 }
5592 else {
5593 Perl_croak(aTHX_ "panic: bad flag in reg_scan_name");
5594 }
5595 /* NOT REACHED */
5596 }
5597 return NULL;
5598}
5599
5600#define DEBUG_PARSE_MSG(funcname) DEBUG_PARSE_r({ \
5601 int rem=(int)(RExC_end - RExC_parse); \
5602 int cut; \
5603 int num; \
5604 int iscut=0; \
5605 if (rem>10) { \
5606 rem=10; \
5607 iscut=1; \
5608 } \
5609 cut=10-rem; \
5610 if (RExC_lastparse!=RExC_parse) \
5611 PerlIO_printf(Perl_debug_log," >%.*s%-*s", \
5612 rem, RExC_parse, \
5613 cut + 4, \
5614 iscut ? "..." : "<" \
5615 ); \
5616 else \
5617 PerlIO_printf(Perl_debug_log,"%16s",""); \
5618 \
5619 if (SIZE_ONLY) \
5620 num = RExC_size + 1; \
5621 else \
5622 num=REG_NODE_NUM(RExC_emit); \
5623 if (RExC_lastnum!=num) \
5624 PerlIO_printf(Perl_debug_log,"|%4d",num); \
5625 else \
5626 PerlIO_printf(Perl_debug_log,"|%4s",""); \
5627 PerlIO_printf(Perl_debug_log,"|%*s%-4s", \
5628 (int)((depth*2)), "", \
5629 (funcname) \
5630 ); \
5631 RExC_lastnum=num; \
5632 RExC_lastparse=RExC_parse; \
5633})
5634
5635
5636
5637#define DEBUG_PARSE(funcname) DEBUG_PARSE_r({ \
5638 DEBUG_PARSE_MSG((funcname)); \
5639 PerlIO_printf(Perl_debug_log,"%4s","\n"); \
5640})
5641#define DEBUG_PARSE_FMT(funcname,fmt,args) DEBUG_PARSE_r({ \
5642 DEBUG_PARSE_MSG((funcname)); \
5643 PerlIO_printf(Perl_debug_log,fmt "\n",args); \
5644})
5645/*
5646 - reg - regular expression, i.e. main body or parenthesized thing
5647 *
5648 * Caller must absorb opening parenthesis.
5649 *
5650 * Combining parenthesis handling with the base level of regular expression
5651 * is a trifle forced, but the need to tie the tails of the branches to what
5652 * follows makes it hard to avoid.
5653 */
5654#define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
5655#ifdef DEBUGGING
5656#define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
5657#else
5658#define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
5659#endif
5660
5661STATIC regnode *
5662S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
5663 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
5664{
5665 dVAR;
5666 register regnode *ret; /* Will be the head of the group. */
5667 register regnode *br;
5668 register regnode *lastbr;
5669 register regnode *ender = NULL;
5670 register I32 parno = 0;
5671 I32 flags;
5672 U32 oregflags = RExC_flags;
5673 bool have_branch = 0;
5674 bool is_open = 0;
5675 I32 freeze_paren = 0;
5676 I32 after_freeze = 0;
5677
5678 /* for (?g), (?gc), and (?o) warnings; warning
5679 about (?c) will warn about (?g) -- japhy */
5680
5681#define WASTED_O 0x01
5682#define WASTED_G 0x02
5683#define WASTED_C 0x04
5684#define WASTED_GC (0x02|0x04)
5685 I32 wastedflags = 0x00;
5686
5687 char * parse_start = RExC_parse; /* MJD */
5688 char * const oregcomp_parse = RExC_parse;
5689
5690 GET_RE_DEBUG_FLAGS_DECL;
5691
5692 PERL_ARGS_ASSERT_REG;
5693 DEBUG_PARSE("reg ");
5694
5695 *flagp = 0; /* Tentatively. */
5696
5697
5698 /* Make an OPEN node, if parenthesized. */
5699 if (paren) {
5700 if ( *RExC_parse == '*') { /* (*VERB:ARG) */
5701 char *start_verb = RExC_parse;
5702 STRLEN verb_len = 0;
5703 char *start_arg = NULL;
5704 unsigned char op = 0;
5705 int argok = 1;
5706 int internal_argval = 0; /* internal_argval is only useful if !argok */
5707 while ( *RExC_parse && *RExC_parse != ')' ) {
5708 if ( *RExC_parse == ':' ) {
5709 start_arg = RExC_parse + 1;
5710 break;
5711 }
5712 RExC_parse++;
5713 }
5714 ++start_verb;
5715 verb_len = RExC_parse - start_verb;
5716 if ( start_arg ) {
5717 RExC_parse++;
5718 while ( *RExC_parse && *RExC_parse != ')' )
5719 RExC_parse++;
5720 if ( *RExC_parse != ')' )
5721 vFAIL("Unterminated verb pattern argument");
5722 if ( RExC_parse == start_arg )
5723 start_arg = NULL;
5724 } else {
5725 if ( *RExC_parse != ')' )
5726 vFAIL("Unterminated verb pattern");
5727 }
5728
5729 switch ( *start_verb ) {
5730 case 'A': /* (*ACCEPT) */
5731 if ( memEQs(start_verb,verb_len,"ACCEPT") ) {
5732 op = ACCEPT;
5733 internal_argval = RExC_nestroot;
5734 }
5735 break;
5736 case 'C': /* (*COMMIT) */
5737 if ( memEQs(start_verb,verb_len,"COMMIT") )
5738 op = COMMIT;
5739 break;
5740 case 'F': /* (*FAIL) */
5741 if ( verb_len==1 || memEQs(start_verb,verb_len,"FAIL") ) {
5742 op = OPFAIL;
5743 argok = 0;
5744 }
5745 break;
5746 case ':': /* (*:NAME) */
5747 case 'M': /* (*MARK:NAME) */
5748 if ( verb_len==0 || memEQs(start_verb,verb_len,"MARK") ) {
5749 op = MARKPOINT;
5750 argok = -1;
5751 }
5752 break;
5753 case 'P': /* (*PRUNE) */
5754 if ( memEQs(start_verb,verb_len,"PRUNE") )
5755 op = PRUNE;
5756 break;
5757 case 'S': /* (*SKIP) */
5758 if ( memEQs(start_verb,verb_len,"SKIP") )
5759 op = SKIP;
5760 break;
5761 case 'T': /* (*THEN) */
5762 /* [19:06] <TimToady> :: is then */
5763 if ( memEQs(start_verb,verb_len,"THEN") ) {
5764 op = CUTGROUP;
5765 RExC_seen |= REG_SEEN_CUTGROUP;
5766 }
5767 break;
5768 }
5769 if ( ! op ) {
5770 RExC_parse++;
5771 vFAIL3("Unknown verb pattern '%.*s'",
5772 verb_len, start_verb);
5773 }
5774 if ( argok ) {
5775 if ( start_arg && internal_argval ) {
5776 vFAIL3("Verb pattern '%.*s' may not have an argument",
5777 verb_len, start_verb);
5778 } else if ( argok < 0 && !start_arg ) {
5779 vFAIL3("Verb pattern '%.*s' has a mandatory argument",
5780 verb_len, start_verb);
5781 } else {
5782 ret = reganode(pRExC_state, op, internal_argval);
5783 if ( ! internal_argval && ! SIZE_ONLY ) {
5784 if (start_arg) {
5785 SV *sv = newSVpvn( start_arg, RExC_parse - start_arg);
5786 ARG(ret) = add_data( pRExC_state, 1, "S" );
5787 RExC_rxi->data->data[ARG(ret)]=(void*)sv;
5788 ret->flags = 0;
5789 } else {
5790 ret->flags = 1;
5791 }
5792 }
5793 }
5794 if (!internal_argval)
5795 RExC_seen |= REG_SEEN_VERBARG;
5796 } else if ( start_arg ) {
5797 vFAIL3("Verb pattern '%.*s' may not have an argument",
5798 verb_len, start_verb);
5799 } else {
5800 ret = reg_node(pRExC_state, op);
5801 }
5802 nextchar(pRExC_state);
5803 return ret;
5804 } else
5805 if (*RExC_parse == '?') { /* (?...) */
5806 bool is_logical = 0;
5807 const char * const seqstart = RExC_parse;
5808 bool has_use_defaults = FALSE;
5809
5810 RExC_parse++;
5811 paren = *RExC_parse++;
5812 ret = NULL; /* For look-ahead/behind. */
5813 switch (paren) {
5814
5815 case 'P': /* (?P...) variants for those used to PCRE/Python */
5816 paren = *RExC_parse++;
5817 if ( paren == '<') /* (?P<...>) named capture */
5818 goto named_capture;
5819 else if (paren == '>') { /* (?P>name) named recursion */
5820 goto named_recursion;
5821 }
5822 else if (paren == '=') { /* (?P=...) named backref */
5823 /* this pretty much dupes the code for \k<NAME> in regatom(), if
5824 you change this make sure you change that */
5825 char* name_start = RExC_parse;
5826 U32 num = 0;
5827 SV *sv_dat = reg_scan_name(pRExC_state,
5828 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5829 if (RExC_parse == name_start || *RExC_parse != ')')
5830 vFAIL2("Sequence %.3s... not terminated",parse_start);
5831
5832 if (!SIZE_ONLY) {
5833 num = add_data( pRExC_state, 1, "S" );
5834 RExC_rxi->data->data[num]=(void*)sv_dat;
5835 SvREFCNT_inc_simple_void(sv_dat);
5836 }
5837 RExC_sawback = 1;
5838 ret = reganode(pRExC_state,
5839 ((! FOLD)
5840 ? NREF
5841 : (UNI_SEMANTICS)
5842 ? NREFFU
5843 : (LOC)
5844 ? NREFFL
5845 : NREFF),
5846 num);
5847 *flagp |= HASWIDTH;
5848
5849 Set_Node_Offset(ret, parse_start+1);
5850 Set_Node_Cur_Length(ret); /* MJD */
5851
5852 nextchar(pRExC_state);
5853 return ret;
5854 }
5855 RExC_parse++;
5856 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
5857 /*NOTREACHED*/
5858 case '<': /* (?<...) */
5859 if (*RExC_parse == '!')
5860 paren = ',';
5861 else if (*RExC_parse != '=')
5862 named_capture:
5863 { /* (?<...>) */
5864 char *name_start;
5865 SV *svname;
5866 paren= '>';
5867 case '\'': /* (?'...') */
5868 name_start= RExC_parse;
5869 svname = reg_scan_name(pRExC_state,
5870 SIZE_ONLY ? /* reverse test from the others */
5871 REG_RSN_RETURN_NAME :
5872 REG_RSN_RETURN_NULL);
5873 if (RExC_parse == name_start) {
5874 RExC_parse++;
5875 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
5876 /*NOTREACHED*/
5877 }
5878 if (*RExC_parse != paren)
5879 vFAIL2("Sequence (?%c... not terminated",
5880 paren=='>' ? '<' : paren);
5881 if (SIZE_ONLY) {
5882 HE *he_str;
5883 SV *sv_dat = NULL;
5884 if (!svname) /* shouldn't happen */
5885 Perl_croak(aTHX_
5886 "panic: reg_scan_name returned NULL");
5887 if (!RExC_paren_names) {
5888 RExC_paren_names= newHV();
5889 sv_2mortal(MUTABLE_SV(RExC_paren_names));
5890#ifdef DEBUGGING
5891 RExC_paren_name_list= newAV();
5892 sv_2mortal(MUTABLE_SV(RExC_paren_name_list));
5893#endif
5894 }
5895 he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
5896 if ( he_str )
5897 sv_dat = HeVAL(he_str);
5898 if ( ! sv_dat ) {
5899 /* croak baby croak */
5900 Perl_croak(aTHX_
5901 "panic: paren_name hash element allocation failed");
5902 } else if ( SvPOK(sv_dat) ) {
5903 /* (?|...) can mean we have dupes so scan to check
5904 its already been stored. Maybe a flag indicating
5905 we are inside such a construct would be useful,
5906 but the arrays are likely to be quite small, so
5907 for now we punt -- dmq */
5908 IV count = SvIV(sv_dat);
5909 I32 *pv = (I32*)SvPVX(sv_dat);
5910 IV i;
5911 for ( i = 0 ; i < count ; i++ ) {
5912 if ( pv[i] == RExC_npar ) {
5913 count = 0;
5914 break;
5915 }
5916 }
5917 if ( count ) {
5918 pv = (I32*)SvGROW(sv_dat, SvCUR(sv_dat) + sizeof(I32)+1);
5919 SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32));
5920 pv[count] = RExC_npar;
5921 SvIV_set(sv_dat, SvIVX(sv_dat) + 1);
5922 }
5923 } else {
5924 (void)SvUPGRADE(sv_dat,SVt_PVNV);
5925 sv_setpvn(sv_dat, (char *)&(RExC_npar), sizeof(I32));
5926 SvIOK_on(sv_dat);
5927 SvIV_set(sv_dat, 1);
5928 }
5929#ifdef DEBUGGING
5930 if (!av_store(RExC_paren_name_list, RExC_npar, SvREFCNT_inc(svname)))
5931 SvREFCNT_dec(svname);
5932#endif
5933
5934 /*sv_dump(sv_dat);*/
5935 }
5936 nextchar(pRExC_state);
5937 paren = 1;
5938 goto capturing_parens;
5939 }
5940 RExC_seen |= REG_SEEN_LOOKBEHIND;
5941 RExC_parse++;
5942 case '=': /* (?=...) */
5943 RExC_seen_zerolen++;
5944 break;
5945 case '!': /* (?!...) */
5946 RExC_seen_zerolen++;
5947 if (*RExC_parse == ')') {
5948 ret=reg_node(pRExC_state, OPFAIL);
5949 nextchar(pRExC_state);
5950 return ret;
5951 }
5952 break;
5953 case '|': /* (?|...) */
5954 /* branch reset, behave like a (?:...) except that
5955 buffers in alternations share the same numbers */
5956 paren = ':';
5957 after_freeze = freeze_paren = RExC_npar;
5958 break;
5959 case ':': /* (?:...) */
5960 case '>': /* (?>...) */
5961 break;
5962 case '$': /* (?$...) */
5963 case '@': /* (?@...) */
5964 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
5965 break;
5966 case '#': /* (?#...) */
5967 while (*RExC_parse && *RExC_parse != ')')
5968 RExC_parse++;
5969 if (*RExC_parse != ')')
5970 FAIL("Sequence (?#... not terminated");
5971 nextchar(pRExC_state);
5972 *flagp = TRYAGAIN;
5973 return NULL;
5974 case '0' : /* (?0) */
5975 case 'R' : /* (?R) */
5976 if (*RExC_parse != ')')
5977 FAIL("Sequence (?R) not terminated");
5978 ret = reg_node(pRExC_state, GOSTART);
5979 *flagp |= POSTPONED;
5980 nextchar(pRExC_state);
5981 return ret;
5982 /*notreached*/
5983 { /* named and numeric backreferences */
5984 I32 num;
5985 case '&': /* (?&NAME) */
5986 parse_start = RExC_parse - 1;
5987 named_recursion:
5988 {
5989 SV *sv_dat = reg_scan_name(pRExC_state,
5990 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5991 num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
5992 }
5993 goto gen_recurse_regop;
5994 /* NOT REACHED */
5995 case '+':
5996 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
5997 RExC_parse++;
5998 vFAIL("Illegal pattern");
5999 }
6000 goto parse_recursion;
6001 /* NOT REACHED*/
6002 case '-': /* (?-1) */
6003 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
6004 RExC_parse--; /* rewind to let it be handled later */
6005 goto parse_flags;
6006 }
6007 /*FALLTHROUGH */
6008 case '1': case '2': case '3': case '4': /* (?1) */
6009 case '5': case '6': case '7': case '8': case '9':
6010 RExC_parse--;
6011 parse_recursion:
6012 num = atoi(RExC_parse);
6013 parse_start = RExC_parse - 1; /* MJD */
6014 if (*RExC_parse == '-')
6015 RExC_parse++;
6016 while (isDIGIT(*RExC_parse))
6017 RExC_parse++;
6018 if (*RExC_parse!=')')
6019 vFAIL("Expecting close bracket");
6020
6021 gen_recurse_regop:
6022 if ( paren == '-' ) {
6023 /*
6024 Diagram of capture buffer numbering.
6025 Top line is the normal capture buffer numbers
6026 Bottom line is the negative indexing as from
6027 the X (the (?-2))
6028
6029 + 1 2 3 4 5 X 6 7
6030 /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
6031 - 5 4 3 2 1 X x x
6032
6033 */
6034 num = RExC_npar + num;
6035 if (num < 1) {
6036 RExC_parse++;
6037 vFAIL("Reference to nonexistent group");
6038 }
6039 } else if ( paren == '+' ) {
6040 num = RExC_npar + num - 1;
6041 }
6042
6043 ret = reganode(pRExC_state, GOSUB, num);
6044 if (!SIZE_ONLY) {
6045 if (num > (I32)RExC_rx->nparens) {
6046 RExC_parse++;
6047 vFAIL("Reference to nonexistent group");
6048 }
6049 ARG2L_SET( ret, RExC_recurse_count++);
6050 RExC_emit++;
6051 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
6052 "Recurse #%"UVuf" to %"IVdf"\n", (UV)ARG(ret), (IV)ARG2L(ret)));
6053 } else {
6054 RExC_size++;
6055 }
6056 RExC_seen |= REG_SEEN_RECURSE;
6057 Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
6058 Set_Node_Offset(ret, parse_start); /* MJD */
6059
6060 *flagp |= POSTPONED;
6061 nextchar(pRExC_state);
6062 return ret;
6063 } /* named and numeric backreferences */
6064 /* NOT REACHED */
6065
6066 case '?': /* (??...) */
6067 is_logical = 1;
6068 if (*RExC_parse != '{') {
6069 RExC_parse++;
6070 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
6071 /*NOTREACHED*/
6072 }
6073 *flagp |= POSTPONED;
6074 paren = *RExC_parse++;
6075 /* FALL THROUGH */
6076 case '{': /* (?{...}) */
6077 {
6078 I32 count = 1;
6079 U32 n = 0;
6080 char c;
6081 char *s = RExC_parse;
6082
6083 RExC_seen_zerolen++;
6084 RExC_seen |= REG_SEEN_EVAL;
6085 while (count && (c = *RExC_parse)) {
6086 if (c == '\\') {
6087 if (RExC_parse[1])
6088 RExC_parse++;
6089 }
6090 else if (c == '{')
6091 count++;
6092 else if (c == '}')
6093 count--;
6094 RExC_parse++;
6095 }
6096 if (*RExC_parse != ')') {
6097 RExC_parse = s;
6098 vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
6099 }
6100 if (!SIZE_ONLY) {
6101 PAD *pad;
6102 OP_4tree *sop, *rop;
6103 SV * const sv = newSVpvn(s, RExC_parse - 1 - s);
6104
6105 ENTER;
6106 Perl_save_re_context(aTHX);
6107 rop = Perl_sv_compile_2op_is_broken(aTHX_ sv, &sop, "re", &pad);
6108 sop->op_private |= OPpREFCOUNTED;
6109 /* re_dup will OpREFCNT_inc */
6110 OpREFCNT_set(sop, 1);
6111 LEAVE;
6112
6113 n = add_data(pRExC_state, 3, "nop");
6114 RExC_rxi->data->data[n] = (void*)rop;
6115 RExC_rxi->data->data[n+1] = (void*)sop;
6116 RExC_rxi->data->data[n+2] = (void*)pad;
6117 SvREFCNT_dec(sv);
6118 }
6119 else { /* First pass */
6120 if (PL_reginterp_cnt < ++RExC_seen_evals
6121 && IN_PERL_RUNTIME)
6122 /* No compiled RE interpolated, has runtime
6123 components ===> unsafe. */
6124 FAIL("Eval-group not allowed at runtime, use re 'eval'");
6125 if (PL_tainting && PL_tainted)
6126 FAIL("Eval-group in insecure regular expression");
6127#if PERL_VERSION > 8
6128 if (IN_PERL_COMPILETIME)
6129 PL_cv_has_eval = 1;
6130#endif
6131 }
6132
6133 nextchar(pRExC_state);
6134 if (is_logical) {
6135 ret = reg_node(pRExC_state, LOGICAL);
6136 if (!SIZE_ONLY)
6137 ret->flags = 2;
6138 REGTAIL(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
6139 /* deal with the length of this later - MJD */
6140 return ret;
6141 }
6142 ret = reganode(pRExC_state, EVAL, n);
6143 Set_Node_Length(ret, RExC_parse - parse_start + 1);
6144 Set_Node_Offset(ret, parse_start);
6145 return ret;
6146 }
6147 case '(': /* (?(?{...})...) and (?(?=...)...) */
6148 {
6149 int is_define= 0;
6150 if (RExC_parse[0] == '?') { /* (?(?...)) */
6151 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
6152 || RExC_parse[1] == '<'
6153 || RExC_parse[1] == '{') { /* Lookahead or eval. */
6154 I32 flag;
6155
6156 ret = reg_node(pRExC_state, LOGICAL);
6157 if (!SIZE_ONLY)
6158 ret->flags = 1;
6159 REGTAIL(pRExC_state, ret, reg(pRExC_state, 1, &flag,depth+1));
6160 goto insert_if;
6161 }
6162 }
6163 else if ( RExC_parse[0] == '<' /* (?(<NAME>)...) */
6164 || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
6165 {
6166 char ch = RExC_parse[0] == '<' ? '>' : '\'';
6167 char *name_start= RExC_parse++;
6168 U32 num = 0;
6169 SV *sv_dat=reg_scan_name(pRExC_state,
6170 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
6171 if (RExC_parse == name_start || *RExC_parse != ch)
6172 vFAIL2("Sequence (?(%c... not terminated",
6173 (ch == '>' ? '<' : ch));
6174 RExC_parse++;
6175 if (!SIZE_ONLY) {
6176 num = add_data( pRExC_state, 1, "S" );
6177 RExC_rxi->data->data[num]=(void*)sv_dat;
6178 SvREFCNT_inc_simple_void(sv_dat);
6179 }
6180 ret = reganode(pRExC_state,NGROUPP,num);
6181 goto insert_if_check_paren;
6182 }
6183 else if (RExC_parse[0] == 'D' &&
6184 RExC_parse[1] == 'E' &&
6185 RExC_parse[2] == 'F' &&
6186 RExC_parse[3] == 'I' &&
6187 RExC_parse[4] == 'N' &&
6188 RExC_parse[5] == 'E')
6189 {
6190 ret = reganode(pRExC_state,DEFINEP,0);
6191 RExC_parse +=6 ;
6192 is_define = 1;
6193 goto insert_if_check_paren;
6194 }
6195 else if (RExC_parse[0] == 'R') {
6196 RExC_parse++;
6197 parno = 0;
6198 if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
6199 parno = atoi(RExC_parse++);
6200 while (isDIGIT(*RExC_parse))
6201 RExC_parse++;
6202 } else if (RExC_parse[0] == '&') {
6203 SV *sv_dat;
6204 RExC_parse++;
6205 sv_dat = reg_scan_name(pRExC_state,
6206 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
6207 parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
6208 }
6209 ret = reganode(pRExC_state,INSUBP,parno);
6210 goto insert_if_check_paren;
6211 }
6212 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
6213 /* (?(1)...) */
6214 char c;
6215 parno = atoi(RExC_parse++);
6216
6217 while (isDIGIT(*RExC_parse))
6218 RExC_parse++;
6219 ret = reganode(pRExC_state, GROUPP, parno);
6220
6221 insert_if_check_paren:
6222 if ((c = *nextchar(pRExC_state)) != ')')
6223 vFAIL("Switch condition not recognized");
6224 insert_if:
6225 REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
6226 br = regbranch(pRExC_state, &flags, 1,depth+1);
6227 if (br == NULL)
6228 br = reganode(pRExC_state, LONGJMP, 0);
6229 else
6230 REGTAIL(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
6231 c = *nextchar(pRExC_state);
6232 if (flags&HASWIDTH)
6233 *flagp |= HASWIDTH;
6234 if (c == '|') {
6235 if (is_define)
6236 vFAIL("(?(DEFINE)....) does not allow branches");
6237 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
6238 regbranch(pRExC_state, &flags, 1,depth+1);
6239 REGTAIL(pRExC_state, ret, lastbr);
6240 if (flags&HASWIDTH)
6241 *flagp |= HASWIDTH;
6242 c = *nextchar(pRExC_state);
6243 }
6244 else
6245 lastbr = NULL;
6246 if (c != ')')
6247 vFAIL("Switch (?(condition)... contains too many branches");
6248 ender = reg_node(pRExC_state, TAIL);
6249 REGTAIL(pRExC_state, br, ender);
6250 if (lastbr) {
6251 REGTAIL(pRExC_state, lastbr, ender);
6252 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
6253 }
6254 else
6255 REGTAIL(pRExC_state, ret, ender);
6256 RExC_size++; /* XXX WHY do we need this?!!
6257 For large programs it seems to be required
6258 but I can't figure out why. -- dmq*/
6259 return ret;
6260 }
6261 else {
6262 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
6263 }
6264 }
6265 case 0:
6266 RExC_parse--; /* for vFAIL to print correctly */
6267 vFAIL("Sequence (? incomplete");
6268 break;
6269 case DEFAULT_PAT_MOD: /* Use default flags with the exceptions
6270 that follow */
6271 has_use_defaults = TRUE;
6272 STD_PMMOD_FLAGS_CLEAR(&RExC_flags);
6273 if (RExC_utf8) { /* But the default for a utf8 pattern is
6274 unicode semantics */
6275 set_regex_charset(&RExC_flags, REGEX_UNICODE_CHARSET);
6276 }
6277 goto parse_flags;
6278 default:
6279 --RExC_parse;
6280 parse_flags: /* (?i) */
6281 {
6282 U32 posflags = 0, negflags = 0;
6283 U32 *flagsp = &posflags;
6284 bool has_charset_modifier = 0;
6285 regex_charset cs = REGEX_DEPENDS_CHARSET;
6286
6287 while (*RExC_parse) {
6288 /* && strchr("iogcmsx", *RExC_parse) */
6289 /* (?g), (?gc) and (?o) are useless here
6290 and must be globally applied -- japhy */
6291 switch (*RExC_parse) {
6292 CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp);
6293 case LOCALE_PAT_MOD:
6294 if (has_charset_modifier || flagsp == &negflags) {
6295 goto fail_modifiers;
6296 }
6297 cs = REGEX_LOCALE_CHARSET;
6298 has_charset_modifier = 1;
6299 break;
6300 case UNICODE_PAT_MOD:
6301 if (has_charset_modifier || flagsp == &negflags) {
6302 goto fail_modifiers;
6303 }
6304 cs = REGEX_UNICODE_CHARSET;
6305 has_charset_modifier = 1;
6306 break;
6307 case ASCII_RESTRICT_PAT_MOD:
6308 if (has_charset_modifier || flagsp == &negflags) {
6309 goto fail_modifiers;
6310 }
6311 cs = REGEX_ASCII_RESTRICTED_CHARSET;
6312 has_charset_modifier = 1;
6313 break;
6314 case DEPENDS_PAT_MOD:
6315 if (has_use_defaults
6316 || has_charset_modifier
6317 || flagsp == &negflags)
6318 {
6319 goto fail_modifiers;
6320 }
6321
6322 /* The dual charset means unicode semantics if the
6323 * pattern (or target, not known until runtime) are
6324 * utf8 */
6325 cs = (RExC_utf8)
6326 ? REGEX_UNICODE_CHARSET
6327 : REGEX_DEPENDS_CHARSET;
6328 has_charset_modifier = 1;
6329 break;
6330 case ONCE_PAT_MOD: /* 'o' */
6331 case GLOBAL_PAT_MOD: /* 'g' */
6332 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6333 const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G;
6334 if (! (wastedflags & wflagbit) ) {
6335 wastedflags |= wflagbit;
6336 vWARN5(
6337 RExC_parse + 1,
6338 "Useless (%s%c) - %suse /%c modifier",
6339 flagsp == &negflags ? "?-" : "?",
6340 *RExC_parse,
6341 flagsp == &negflags ? "don't " : "",
6342 *RExC_parse
6343 );
6344 }
6345 }
6346 break;
6347
6348 case CONTINUE_PAT_MOD: /* 'c' */
6349 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6350 if (! (wastedflags & WASTED_C) ) {
6351 wastedflags |= WASTED_GC;
6352 vWARN3(
6353 RExC_parse + 1,
6354 "Useless (%sc) - %suse /gc modifier",
6355 flagsp == &negflags ? "?-" : "?",
6356 flagsp == &negflags ? "don't " : ""
6357 );
6358 }
6359 }
6360 break;
6361 case KEEPCOPY_PAT_MOD: /* 'p' */
6362 if (flagsp == &negflags) {
6363 if (SIZE_ONLY)
6364 ckWARNreg(RExC_parse + 1,"Useless use of (?-p)");
6365 } else {
6366 *flagsp |= RXf_PMf_KEEPCOPY;
6367 }
6368 break;
6369 case '-':
6370 /* A flag is a default iff it is following a minus, so
6371 * if there is a minus, it means will be trying to
6372 * re-specify a default which is an error */
6373 if (has_use_defaults || flagsp == &negflags) {
6374 fail_modifiers:
6375 RExC_parse++;
6376 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
6377 /*NOTREACHED*/
6378 }
6379 flagsp = &negflags;
6380 wastedflags = 0; /* reset so (?g-c) warns twice */
6381 break;
6382 case ':':
6383 paren = ':';
6384 /*FALLTHROUGH*/
6385 case ')':
6386 RExC_flags |= posflags;
6387 RExC_flags &= ~negflags;
6388 set_regex_charset(&RExC_flags, cs);
6389 if (paren != ':') {
6390 oregflags |= posflags;
6391 oregflags &= ~negflags;
6392 set_regex_charset(&oregflags, cs);
6393 }
6394 nextchar(pRExC_state);
6395 if (paren != ':') {
6396 *flagp = TRYAGAIN;
6397 return NULL;
6398 } else {
6399 ret = NULL;
6400 goto parse_rest;
6401 }
6402 /*NOTREACHED*/
6403 default:
6404 RExC_parse++;
6405 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
6406 /*NOTREACHED*/
6407 }
6408 ++RExC_parse;
6409 }
6410 }} /* one for the default block, one for the switch */
6411 }
6412 else { /* (...) */
6413 capturing_parens:
6414 parno = RExC_npar;
6415 RExC_npar++;
6416
6417 ret = reganode(pRExC_state, OPEN, parno);
6418 if (!SIZE_ONLY ){
6419 if (!RExC_nestroot)
6420 RExC_nestroot = parno;
6421 if (RExC_seen & REG_SEEN_RECURSE
6422 && !RExC_open_parens[parno-1])
6423 {
6424 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
6425 "Setting open paren #%"IVdf" to %d\n",
6426 (IV)parno, REG_NODE_NUM(ret)));
6427 RExC_open_parens[parno-1]= ret;
6428 }
6429 }
6430 Set_Node_Length(ret, 1); /* MJD */
6431 Set_Node_Offset(ret, RExC_parse); /* MJD */
6432 is_open = 1;
6433 }
6434 }
6435 else /* ! paren */
6436 ret = NULL;
6437
6438 parse_rest:
6439 /* Pick up the branches, linking them together. */
6440 parse_start = RExC_parse; /* MJD */
6441 br = regbranch(pRExC_state, &flags, 1,depth+1);
6442
6443 if (freeze_paren) {
6444 if (RExC_npar > after_freeze)
6445 after_freeze = RExC_npar;
6446 RExC_npar = freeze_paren;
6447 }
6448
6449 /* branch_len = (paren != 0); */
6450
6451 if (br == NULL)
6452 return(NULL);
6453 if (*RExC_parse == '|') {
6454 if (!SIZE_ONLY && RExC_extralen) {
6455 reginsert(pRExC_state, BRANCHJ, br, depth+1);
6456 }
6457 else { /* MJD */
6458 reginsert(pRExC_state, BRANCH, br, depth+1);
6459 Set_Node_Length(br, paren != 0);
6460 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
6461 }
6462 have_branch = 1;
6463 if (SIZE_ONLY)
6464 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
6465 }
6466 else if (paren == ':') {
6467 *flagp |= flags&SIMPLE;
6468 }
6469 if (is_open) { /* Starts with OPEN. */
6470 REGTAIL(pRExC_state, ret, br); /* OPEN -> first. */
6471 }
6472 else if (paren != '?') /* Not Conditional */
6473 ret = br;
6474 *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
6475 lastbr = br;
6476 while (*RExC_parse == '|') {
6477 if (!SIZE_ONLY && RExC_extralen) {
6478 ender = reganode(pRExC_state, LONGJMP,0);
6479 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
6480 }
6481 if (SIZE_ONLY)
6482 RExC_extralen += 2; /* Account for LONGJMP. */
6483 nextchar(pRExC_state);
6484 if (freeze_paren) {
6485 if (RExC_npar > after_freeze)
6486 after_freeze = RExC_npar;
6487 RExC_npar = freeze_paren;
6488 }
6489 br = regbranch(pRExC_state, &flags, 0, depth+1);
6490
6491 if (br == NULL)
6492 return(NULL);
6493 REGTAIL(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
6494 lastbr = br;
6495 *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
6496 }
6497
6498 if (have_branch || paren != ':') {
6499 /* Make a closing node, and hook it on the end. */
6500 switch (paren) {
6501 case ':':
6502 ender = reg_node(pRExC_state, TAIL);
6503 break;
6504 case 1:
6505 ender = reganode(pRExC_state, CLOSE, parno);
6506 if (!SIZE_ONLY && RExC_seen & REG_SEEN_RECURSE) {
6507 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
6508 "Setting close paren #%"IVdf" to %d\n",
6509 (IV)parno, REG_NODE_NUM(ender)));
6510 RExC_close_parens[parno-1]= ender;
6511 if (RExC_nestroot == parno)
6512 RExC_nestroot = 0;
6513 }
6514 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
6515 Set_Node_Length(ender,1); /* MJD */
6516 break;
6517 case '<':
6518 case ',':
6519 case '=':
6520 case '!':
6521 *flagp &= ~HASWIDTH;
6522 /* FALL THROUGH */
6523 case '>':
6524 ender = reg_node(pRExC_state, SUCCEED);
6525 break;
6526 case 0:
6527 ender = reg_node(pRExC_state, END);
6528 if (!SIZE_ONLY) {
6529 assert(!RExC_opend); /* there can only be one! */
6530 RExC_opend = ender;
6531 }
6532 break;
6533 }
6534 REGTAIL(pRExC_state, lastbr, ender);
6535
6536 if (have_branch && !SIZE_ONLY) {
6537 if (depth==1)
6538 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
6539
6540 /* Hook the tails of the branches to the closing node. */
6541 for (br = ret; br; br = regnext(br)) {
6542 const U8 op = PL_regkind[OP(br)];
6543 if (op == BRANCH) {
6544 REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
6545 }
6546 else if (op == BRANCHJ) {
6547 REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
6548 }
6549 }
6550 }
6551 }
6552
6553 {
6554 const char *p;
6555 static const char parens[] = "=!<,>";
6556
6557 if (paren && (p = strchr(parens, paren))) {
6558 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
6559 int flag = (p - parens) > 1;
6560
6561 if (paren == '>')
6562 node = SUSPEND, flag = 0;
6563 reginsert(pRExC_state, node,ret, depth+1);
6564 Set_Node_Cur_Length(ret);
6565 Set_Node_Offset(ret, parse_start + 1);
6566 ret->flags = flag;
6567 REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
6568 }
6569 }
6570
6571 /* Check for proper termination. */
6572 if (paren) {
6573 RExC_flags = oregflags;
6574 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
6575 RExC_parse = oregcomp_parse;
6576 vFAIL("Unmatched (");
6577 }
6578 }
6579 else if (!paren && RExC_parse < RExC_end) {
6580 if (*RExC_parse == ')') {
6581 RExC_parse++;
6582 vFAIL("Unmatched )");
6583 }
6584 else
6585 FAIL("Junk on end of regexp"); /* "Can't happen". */
6586 /* NOTREACHED */
6587 }
6588 if (after_freeze)
6589 RExC_npar = after_freeze;
6590 return(ret);
6591}
6592
6593/*
6594 - regbranch - one alternative of an | operator
6595 *
6596 * Implements the concatenation operator.
6597 */
6598STATIC regnode *
6599S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
6600{
6601 dVAR;
6602 register regnode *ret;
6603 register regnode *chain = NULL;
6604 register regnode *latest;
6605 I32 flags = 0, c = 0;
6606 GET_RE_DEBUG_FLAGS_DECL;
6607
6608 PERL_ARGS_ASSERT_REGBRANCH;
6609
6610 DEBUG_PARSE("brnc");
6611
6612 if (first)
6613 ret = NULL;
6614 else {
6615 if (!SIZE_ONLY && RExC_extralen)
6616 ret = reganode(pRExC_state, BRANCHJ,0);
6617 else {
6618 ret = reg_node(pRExC_state, BRANCH);
6619 Set_Node_Length(ret, 1);
6620 }
6621 }
6622
6623 if (!first && SIZE_ONLY)
6624 RExC_extralen += 1; /* BRANCHJ */
6625
6626 *flagp = WORST; /* Tentatively. */
6627
6628 RExC_parse--;
6629 nextchar(pRExC_state);
6630 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
6631 flags &= ~TRYAGAIN;
6632 latest = regpiece(pRExC_state, &flags,depth+1);
6633 if (latest == NULL) {
6634 if (flags & TRYAGAIN)
6635 continue;
6636 return(NULL);
6637 }
6638 else if (ret == NULL)
6639 ret = latest;
6640 *flagp |= flags&(HASWIDTH|POSTPONED);
6641 if (chain == NULL) /* First piece. */
6642 *flagp |= flags&SPSTART;
6643 else {
6644 RExC_naughty++;
6645 REGTAIL(pRExC_state, chain, latest);
6646 }
6647 chain = latest;
6648 c++;
6649 }
6650 if (chain == NULL) { /* Loop ran zero times. */
6651 chain = reg_node(pRExC_state, NOTHING);
6652 if (ret == NULL)
6653 ret = chain;
6654 }
6655 if (c == 1) {
6656 *flagp |= flags&SIMPLE;
6657 }
6658
6659 return ret;
6660}
6661
6662/*
6663 - regpiece - something followed by possible [*+?]
6664 *
6665 * Note that the branching code sequences used for ? and the general cases
6666 * of * and + are somewhat optimized: they use the same NOTHING node as
6667 * both the endmarker for their branch list and the body of the last branch.
6668 * It might seem that this node could be dispensed with entirely, but the
6669 * endmarker role is not redundant.
6670 */
6671STATIC regnode *
6672S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
6673{
6674 dVAR;
6675 register regnode *ret;
6676 register char op;
6677 register char *next;
6678 I32 flags;
6679 const char * const origparse = RExC_parse;
6680 I32 min;
6681 I32 max = REG_INFTY;
6682 char *parse_start;
6683 const char *maxpos = NULL;
6684 GET_RE_DEBUG_FLAGS_DECL;
6685
6686 PERL_ARGS_ASSERT_REGPIECE;
6687
6688 DEBUG_PARSE("piec");
6689
6690 ret = regatom(pRExC_state, &flags,depth+1);
6691 if (ret == NULL) {
6692 if (flags & TRYAGAIN)
6693 *flagp |= TRYAGAIN;
6694 return(NULL);
6695 }
6696
6697 op = *RExC_parse;
6698
6699 if (op == '{' && regcurly(RExC_parse)) {
6700 maxpos = NULL;
6701 parse_start = RExC_parse; /* MJD */
6702 next = RExC_parse + 1;
6703 while (isDIGIT(*next) || *next == ',') {
6704 if (*next == ',') {
6705 if (maxpos)
6706 break;
6707 else
6708 maxpos = next;
6709 }
6710 next++;
6711 }
6712 if (*next == '}') { /* got one */
6713 if (!maxpos)
6714 maxpos = next;
6715 RExC_parse++;
6716 min = atoi(RExC_parse);
6717 if (*maxpos == ',')
6718 maxpos++;
6719 else
6720 maxpos = RExC_parse;
6721 max = atoi(maxpos);
6722 if (!max && *maxpos != '0')
6723 max = REG_INFTY; /* meaning "infinity" */
6724 else if (max >= REG_INFTY)
6725 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
6726 RExC_parse = next;
6727 nextchar(pRExC_state);
6728
6729 do_curly:
6730 if ((flags&SIMPLE)) {
6731 RExC_naughty += 2 + RExC_naughty / 2;
6732 reginsert(pRExC_state, CURLY, ret, depth+1);
6733 Set_Node_Offset(ret, parse_start+1); /* MJD */
6734 Set_Node_Cur_Length(ret);
6735 }
6736 else {
6737 regnode * const w = reg_node(pRExC_state, WHILEM);
6738
6739 w->flags = 0;
6740 REGTAIL(pRExC_state, ret, w);
6741 if (!SIZE_ONLY && RExC_extralen) {
6742 reginsert(pRExC_state, LONGJMP,ret, depth+1);
6743 reginsert(pRExC_state, NOTHING,ret, depth+1);
6744 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
6745 }
6746 reginsert(pRExC_state, CURLYX,ret, depth+1);
6747 /* MJD hk */
6748 Set_Node_Offset(ret, parse_start+1);
6749 Set_Node_Length(ret,
6750 op == '{' ? (RExC_parse - parse_start) : 1);
6751
6752 if (!SIZE_ONLY && RExC_extralen)
6753 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
6754 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
6755 if (SIZE_ONLY)
6756 RExC_whilem_seen++, RExC_extralen += 3;
6757 RExC_naughty += 4 + RExC_naughty; /* compound interest */
6758 }
6759 ret->flags = 0;
6760
6761 if (min > 0)
6762 *flagp = WORST;
6763 if (max > 0)
6764 *flagp |= HASWIDTH;
6765 if (max < min)
6766 vFAIL("Can't do {n,m} with n > m");
6767 if (!SIZE_ONLY) {
6768 ARG1_SET(ret, (U16)min);
6769 ARG2_SET(ret, (U16)max);
6770 }
6771
6772 goto nest_check;
6773 }
6774 }
6775
6776 if (!ISMULT1(op)) {
6777 *flagp = flags;
6778 return(ret);
6779 }
6780
6781#if 0 /* Now runtime fix should be reliable. */
6782
6783 /* if this is reinstated, don't forget to put this back into perldiag:
6784
6785 =item Regexp *+ operand could be empty at {#} in regex m/%s/
6786
6787 (F) The part of the regexp subject to either the * or + quantifier
6788 could match an empty string. The {#} shows in the regular
6789 expression about where the problem was discovered.
6790
6791 */
6792
6793 if (!(flags&HASWIDTH) && op != '?')
6794 vFAIL("Regexp *+ operand could be empty");
6795#endif
6796
6797 parse_start = RExC_parse;
6798 nextchar(pRExC_state);
6799
6800 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
6801
6802 if (op == '*' && (flags&SIMPLE)) {
6803 reginsert(pRExC_state, STAR, ret, depth+1);
6804 ret->flags = 0;
6805 RExC_naughty += 4;
6806 }
6807 else if (op == '*') {
6808 min = 0;
6809 goto do_curly;
6810 }
6811 else if (op == '+' && (flags&SIMPLE)) {
6812 reginsert(pRExC_state, PLUS, ret, depth+1);
6813 ret->flags = 0;
6814 RExC_naughty += 3;
6815 }
6816 else if (op == '+') {
6817 min = 1;
6818 goto do_curly;
6819 }
6820 else if (op == '?') {
6821 min = 0; max = 1;
6822 goto do_curly;
6823 }
6824 nest_check:
6825 if (!SIZE_ONLY && !(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3) {
6826 ckWARN3reg(RExC_parse,
6827 "%.*s matches null string many times",
6828 (int)(RExC_parse >= origparse ? RExC_parse - origparse : 0),
6829 origparse);
6830 }
6831
6832 if (RExC_parse < RExC_end && *RExC_parse == '?') {
6833 nextchar(pRExC_state);
6834 reginsert(pRExC_state, MINMOD, ret, depth+1);
6835 REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
6836 }
6837#ifndef REG_ALLOW_MINMOD_SUSPEND
6838 else
6839#endif
6840 if (RExC_parse < RExC_end && *RExC_parse == '+') {
6841 regnode *ender;
6842 nextchar(pRExC_state);
6843 ender = reg_node(pRExC_state, SUCCEED);
6844 REGTAIL(pRExC_state, ret, ender);
6845 reginsert(pRExC_state, SUSPEND, ret, depth+1);
6846 ret->flags = 0;
6847 ender = reg_node(pRExC_state, TAIL);
6848 REGTAIL(pRExC_state, ret, ender);
6849 /*ret= ender;*/
6850 }
6851
6852 if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) {
6853 RExC_parse++;
6854 vFAIL("Nested quantifiers");
6855 }
6856
6857 return(ret);
6858}
6859
6860
6861/* reg_namedseq(pRExC_state,UVp)
6862
6863 This is expected to be called by a parser routine that has
6864 recognized '\N' and needs to handle the rest. RExC_parse is
6865 expected to point at the first char following the N at the time
6866 of the call.
6867
6868 The \N may be inside (indicated by valuep not being NULL) or outside a
6869 character class.
6870
6871 \N may begin either a named sequence, or if outside a character class, mean
6872 to match a non-newline. For non single-quoted regexes, the tokenizer has
6873 attempted to decide which, and in the case of a named sequence converted it
6874 into one of the forms: \N{} (if the sequence is null), or \N{U+c1.c2...},
6875 where c1... are the characters in the sequence. For single-quoted regexes,
6876 the tokenizer passes the \N sequence through unchanged; this code will not
6877 attempt to determine this nor expand those. The net effect is that if the
6878 beginning of the passed-in pattern isn't '{U+' or there is no '}', it
6879 signals that this \N occurrence means to match a non-newline.
6880
6881 Only the \N{U+...} form should occur in a character class, for the same
6882 reason that '.' inside a character class means to just match a period: it
6883 just doesn't make sense.
6884
6885 If valuep is non-null then it is assumed that we are parsing inside
6886 of a charclass definition and the first codepoint in the resolved
6887 string is returned via *valuep and the routine will return NULL.
6888 In this mode if a multichar string is returned from the charnames
6889 handler, a warning will be issued, and only the first char in the
6890 sequence will be examined. If the string returned is zero length
6891 then the value of *valuep is undefined and NON-NULL will
6892 be returned to indicate failure. (This will NOT be a valid pointer
6893 to a regnode.)
6894
6895 If valuep is null then it is assumed that we are parsing normal text and a
6896 new EXACT node is inserted into the program containing the resolved string,
6897 and a pointer to the new node is returned. But if the string is zero length
6898 a NOTHING node is emitted instead.
6899
6900 On success RExC_parse is set to the char following the endbrace.
6901 Parsing failures will generate a fatal error via vFAIL(...)
6902 */
6903STATIC regnode *
6904S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep, I32 *flagp)
6905{
6906 char * endbrace; /* '}' following the name */
6907 regnode *ret = NULL;
6908#ifdef DEBUGGING
6909 char* parse_start = RExC_parse - 2; /* points to the '\N' */
6910#endif
6911 char* p;
6912
6913 GET_RE_DEBUG_FLAGS_DECL;
6914
6915 PERL_ARGS_ASSERT_REG_NAMEDSEQ;
6916
6917 GET_RE_DEBUG_FLAGS;
6918
6919 /* The [^\n] meaning of \N ignores spaces and comments under the /x
6920 * modifier. The other meaning does not */
6921 p = (RExC_flags & RXf_PMf_EXTENDED)
6922 ? regwhite( pRExC_state, RExC_parse )
6923 : RExC_parse;
6924
6925 /* Disambiguate between \N meaning a named character versus \N meaning
6926 * [^\n]. The former is assumed when it can't be the latter. */
6927 if (*p != '{' || regcurly(p)) {
6928 RExC_parse = p;
6929 if (valuep) {
6930 /* no bare \N in a charclass */
6931 vFAIL("\\N in a character class must be a named character: \\N{...}");
6932 }
6933 nextchar(pRExC_state);
6934 ret = reg_node(pRExC_state, REG_ANY);
6935 *flagp |= HASWIDTH|SIMPLE;
6936 RExC_naughty++;
6937 RExC_parse--;
6938 Set_Node_Length(ret, 1); /* MJD */
6939 return ret;
6940 }
6941
6942 /* Here, we have decided it should be a named sequence */
6943
6944 /* The test above made sure that the next real character is a '{', but
6945 * under the /x modifier, it could be separated by space (or a comment and
6946 * \n) and this is not allowed (for consistency with \x{...} and the
6947 * tokenizer handling of \N{NAME}). */
6948 if (*RExC_parse != '{') {
6949 vFAIL("Missing braces on \\N{}");
6950 }
6951
6952 RExC_parse++; /* Skip past the '{' */
6953
6954 if (! (endbrace = strchr(RExC_parse, '}')) /* no trailing brace */
6955 || ! (endbrace == RExC_parse /* nothing between the {} */
6956 || (endbrace - RExC_parse >= 2 /* U+ (bad hex is checked below */
6957 && strnEQ(RExC_parse, "U+", 2)))) /* for a better error msg) */
6958 {
6959 if (endbrace) RExC_parse = endbrace; /* position msg's '<--HERE' */
6960 vFAIL("\\N{NAME} must be resolved by the lexer");
6961 }
6962
6963 if (endbrace == RExC_parse) { /* empty: \N{} */
6964 if (! valuep) {
6965 RExC_parse = endbrace + 1;
6966 return reg_node(pRExC_state,NOTHING);
6967 }
6968
6969 if (SIZE_ONLY) {
6970 ckWARNreg(RExC_parse,
6971 "Ignoring zero length \\N{} in character class"
6972 );
6973 RExC_parse = endbrace + 1;
6974 }
6975 *valuep = 0;
6976 return (regnode *) &RExC_parse; /* Invalid regnode pointer */
6977 }
6978
6979 REQUIRE_UTF8; /* named sequences imply Unicode semantics */
6980 RExC_parse += 2; /* Skip past the 'U+' */
6981
6982 if (valuep) { /* In a bracketed char class */
6983 /* We only pay attention to the first char of
6984 multichar strings being returned. I kinda wonder
6985 if this makes sense as it does change the behaviour
6986 from earlier versions, OTOH that behaviour was broken
6987 as well. XXX Solution is to recharacterize as
6988 [rest-of-class]|multi1|multi2... */
6989
6990 STRLEN length_of_hex;
6991 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
6992 | PERL_SCAN_DISALLOW_PREFIX
6993 | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
6994
6995 char * endchar = RExC_parse + strcspn(RExC_parse, ".}");
6996 if (endchar < endbrace) {
6997 ckWARNreg(endchar, "Using just the first character returned by \\N{} in character class");
6998 }
6999
7000 length_of_hex = (STRLEN)(endchar - RExC_parse);
7001 *valuep = grok_hex(RExC_parse, &length_of_hex, &flags, NULL);
7002
7003 /* The tokenizer should have guaranteed validity, but it's possible to
7004 * bypass it by using single quoting, so check */
7005 if (length_of_hex == 0
7006 || length_of_hex != (STRLEN)(endchar - RExC_parse) )
7007 {
7008 RExC_parse += length_of_hex; /* Includes all the valid */
7009 RExC_parse += (RExC_orig_utf8) /* point to after 1st invalid */
7010 ? UTF8SKIP(RExC_parse)
7011 : 1;
7012 /* Guard against malformed utf8 */
7013 if (RExC_parse >= endchar) RExC_parse = endchar;
7014 vFAIL("Invalid hexadecimal number in \\N{U+...}");
7015 }
7016
7017 RExC_parse = endbrace + 1;
7018 if (endchar == endbrace) return NULL;
7019
7020 ret = (regnode *) &RExC_parse; /* Invalid regnode pointer */
7021 }
7022 else { /* Not a char class */
7023 char *s; /* String to put in generated EXACT node */
7024 STRLEN len = 0; /* Its current byte length */
7025 char *endchar; /* Points to '.' or '}' ending cur char in the input
7026 stream */
7027
7028 ret = reg_node(pRExC_state, (U8) ((! FOLD) ? EXACT
7029 : (LOC)
7030 ? EXACTFL
7031 : UNI_SEMANTICS
7032 ? EXACTFU
7033 : EXACTF));
7034 s= STRING(ret);
7035
7036 /* Exact nodes can hold only a U8 length's of text = 255. Loop through
7037 * the input which is of the form now 'c1.c2.c3...}' until find the
7038 * ending brace or exceed length 255. The characters that exceed this
7039 * limit are dropped. The limit could be relaxed should it become
7040 * desirable by reparsing this as (?:\N{NAME}), so could generate
7041 * multiple EXACT nodes, as is done for just regular input. But this
7042 * is primarily a named character, and not intended to be a huge long
7043 * string, so 255 bytes should be good enough */
7044 while (1) {
7045 STRLEN length_of_hex;
7046 I32 grok_flags = PERL_SCAN_ALLOW_UNDERSCORES
7047 | PERL_SCAN_DISALLOW_PREFIX
7048 | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
7049 UV cp; /* Ord of current character */
7050
7051 /* Code points are separated by dots. If none, there is only one
7052 * code point, and is terminated by the brace */
7053 endchar = RExC_parse + strcspn(RExC_parse, ".}");
7054
7055 /* The values are Unicode even on EBCDIC machines */
7056 length_of_hex = (STRLEN)(endchar - RExC_parse);
7057 cp = grok_hex(RExC_parse, &length_of_hex, &grok_flags, NULL);
7058 if ( length_of_hex == 0
7059 || length_of_hex != (STRLEN)(endchar - RExC_parse) )
7060 {
7061 RExC_parse += length_of_hex; /* Includes all the valid */
7062 RExC_parse += (RExC_orig_utf8) /* point to after 1st invalid */
7063 ? UTF8SKIP(RExC_parse)
7064 : 1;
7065 /* Guard against malformed utf8 */
7066 if (RExC_parse >= endchar) RExC_parse = endchar;
7067 vFAIL("Invalid hexadecimal number in \\N{U+...}");
7068 }
7069
7070 if (! FOLD) { /* Not folding, just append to the string */
7071 STRLEN unilen;
7072
7073 /* Quit before adding this character if would exceed limit */
7074 if (len + UNISKIP(cp) > U8_MAX) break;
7075
7076 unilen = reguni(pRExC_state, cp, s);
7077 if (unilen > 0) {
7078 s += unilen;
7079 len += unilen;
7080 }
7081 } else { /* Folding, output the folded equivalent */
7082 STRLEN foldlen,numlen;
7083 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
7084 cp = toFOLD_uni(cp, tmpbuf, &foldlen);
7085
7086 /* Quit before exceeding size limit */
7087 if (len + foldlen > U8_MAX) break;
7088
7089 for (foldbuf = tmpbuf;
7090 foldlen;
7091 foldlen -= numlen)
7092 {
7093 cp = utf8_to_uvchr(foldbuf, &numlen);
7094 if (numlen > 0) {
7095 const STRLEN unilen = reguni(pRExC_state, cp, s);
7096 s += unilen;
7097 len += unilen;
7098 /* In EBCDIC the numlen and unilen can differ. */
7099 foldbuf += numlen;
7100 if (numlen >= foldlen)
7101 break;
7102 }
7103 else
7104 break; /* "Can't happen." */
7105 }
7106 }
7107
7108 /* Point to the beginning of the next character in the sequence. */
7109 RExC_parse = endchar + 1;
7110
7111 /* Quit if no more characters */
7112 if (RExC_parse >= endbrace) break;
7113 }
7114
7115
7116 if (SIZE_ONLY) {
7117 if (RExC_parse < endbrace) {
7118 ckWARNreg(RExC_parse - 1,
7119 "Using just the first characters returned by \\N{}");
7120 }
7121
7122 RExC_size += STR_SZ(len);
7123 } else {
7124 STR_LEN(ret) = len;
7125 RExC_emit += STR_SZ(len);
7126 }
7127
7128 RExC_parse = endbrace + 1;
7129
7130 *flagp |= HASWIDTH; /* Not SIMPLE, as that causes the engine to fail
7131 with malformed in t/re/pat_advanced.t */
7132 RExC_parse --;
7133 Set_Node_Cur_Length(ret); /* MJD */
7134 nextchar(pRExC_state);
7135 }
7136
7137 return ret;
7138}
7139
7140
7141/*
7142 * reg_recode
7143 *
7144 * It returns the code point in utf8 for the value in *encp.
7145 * value: a code value in the source encoding
7146 * encp: a pointer to an Encode object
7147 *
7148 * If the result from Encode is not a single character,
7149 * it returns U+FFFD (Replacement character) and sets *encp to NULL.
7150 */
7151STATIC UV
7152S_reg_recode(pTHX_ const char value, SV **encp)
7153{
7154 STRLEN numlen = 1;
7155 SV * const sv = newSVpvn_flags(&value, numlen, SVs_TEMP);
7156 const char * const s = *encp ? sv_recode_to_utf8(sv, *encp) : SvPVX(sv);
7157 const STRLEN newlen = SvCUR(sv);
7158 UV uv = UNICODE_REPLACEMENT;
7159
7160 PERL_ARGS_ASSERT_REG_RECODE;
7161
7162 if (newlen)
7163 uv = SvUTF8(sv)
7164 ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT)
7165 : *(U8*)s;
7166
7167 if (!newlen || numlen != newlen) {
7168 uv = UNICODE_REPLACEMENT;
7169 *encp = NULL;
7170 }
7171 return uv;
7172}
7173
7174
7175/*
7176 - regatom - the lowest level
7177
7178 Try to identify anything special at the start of the pattern. If there
7179 is, then handle it as required. This may involve generating a single regop,
7180 such as for an assertion; or it may involve recursing, such as to
7181 handle a () structure.
7182
7183 If the string doesn't start with something special then we gobble up
7184 as much literal text as we can.
7185
7186 Once we have been able to handle whatever type of thing started the
7187 sequence, we return.
7188
7189 Note: we have to be careful with escapes, as they can be both literal
7190 and special, and in the case of \10 and friends can either, depending
7191 on context. Specifically there are two separate switches for handling
7192 escape sequences, with the one for handling literal escapes requiring
7193 a dummy entry for all of the special escapes that are actually handled
7194 by the other.
7195*/
7196
7197STATIC regnode *
7198S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
7199{
7200 dVAR;
7201 register regnode *ret = NULL;
7202 I32 flags;
7203 char *parse_start = RExC_parse;
7204 U8 op;
7205 GET_RE_DEBUG_FLAGS_DECL;
7206 DEBUG_PARSE("atom");
7207 *flagp = WORST; /* Tentatively. */
7208
7209 PERL_ARGS_ASSERT_REGATOM;
7210
7211tryagain:
7212 switch ((U8)*RExC_parse) {
7213 case '^':
7214 RExC_seen_zerolen++;
7215 nextchar(pRExC_state);
7216 if (RExC_flags & RXf_PMf_MULTILINE)
7217 ret = reg_node(pRExC_state, MBOL);
7218 else if (RExC_flags & RXf_PMf_SINGLELINE)
7219 ret = reg_node(pRExC_state, SBOL);
7220 else
7221 ret = reg_node(pRExC_state, BOL);
7222 Set_Node_Length(ret, 1); /* MJD */
7223 break;
7224 case '$':
7225 nextchar(pRExC_state);
7226 if (*RExC_parse)
7227 RExC_seen_zerolen++;
7228 if (RExC_flags & RXf_PMf_MULTILINE)
7229 ret = reg_node(pRExC_state, MEOL);
7230 else if (RExC_flags & RXf_PMf_SINGLELINE)
7231 ret = reg_node(pRExC_state, SEOL);
7232 else
7233 ret = reg_node(pRExC_state, EOL);
7234 Set_Node_Length(ret, 1); /* MJD */
7235 break;
7236 case '.':
7237 nextchar(pRExC_state);
7238 if (RExC_flags & RXf_PMf_SINGLELINE)
7239 ret = reg_node(pRExC_state, SANY);
7240 else
7241 ret = reg_node(pRExC_state, REG_ANY);
7242 *flagp |= HASWIDTH|SIMPLE;
7243 RExC_naughty++;
7244 Set_Node_Length(ret, 1); /* MJD */
7245 break;
7246 case '[':
7247 {
7248 char * const oregcomp_parse = ++RExC_parse;
7249 ret = regclass(pRExC_state,depth+1);
7250 if (*RExC_parse != ']') {
7251 RExC_parse = oregcomp_parse;
7252 vFAIL("Unmatched [");
7253 }
7254 nextchar(pRExC_state);
7255 *flagp |= HASWIDTH|SIMPLE;
7256 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
7257 break;
7258 }
7259 case '(':
7260 nextchar(pRExC_state);
7261 ret = reg(pRExC_state, 1, &flags,depth+1);
7262 if (ret == NULL) {
7263 if (flags & TRYAGAIN) {
7264 if (RExC_parse == RExC_end) {
7265 /* Make parent create an empty node if needed. */
7266 *flagp |= TRYAGAIN;
7267 return(NULL);
7268 }
7269 goto tryagain;
7270 }
7271 return(NULL);
7272 }
7273 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
7274 break;
7275 case '|':
7276 case ')':
7277 if (flags & TRYAGAIN) {
7278 *flagp |= TRYAGAIN;
7279 return NULL;
7280 }
7281 vFAIL("Internal urp");
7282 /* Supposed to be caught earlier. */
7283 break;
7284 case '{':
7285 if (!regcurly(RExC_parse)) {
7286 RExC_parse++;
7287 goto defchar;
7288 }
7289 /* FALL THROUGH */
7290 case '?':
7291 case '+':
7292 case '*':
7293 RExC_parse++;
7294 vFAIL("Quantifier follows nothing");
7295 break;
7296 case LATIN_SMALL_LETTER_SHARP_S:
7297 case UTF8_TWO_BYTE_HI_nocast(LATIN_SMALL_LETTER_SHARP_S):
7298 case UTF8_TWO_BYTE_HI_nocast(IOTA_D_T):
7299#if UTF8_TWO_BYTE_HI_nocast(UPSILON_D_T) != UTF8_TWO_BYTE_HI_nocast(IOTA_D_T)
7300#error The beginning utf8 byte of IOTA_D_T and UPSILON_D_T unexpectedly differ. Other instances in this code should have the case statement below.
7301 case UTF8_TWO_BYTE_HI_nocast(UPSILON_D_T):
7302#endif
7303 do_foldchar:
7304 if (!LOC && FOLD) {
7305 U32 len,cp;
7306 len=0; /* silence a spurious compiler warning */
7307 if ((cp = what_len_TRICKYFOLD_safe(RExC_parse,RExC_end,UTF,len))) {
7308 *flagp |= HASWIDTH; /* could be SIMPLE too, but needs a handler in regexec.regrepeat */
7309 RExC_parse+=len-1; /* we get one from nextchar() as well. :-( */
7310 ret = reganode(pRExC_state, FOLDCHAR, cp);
7311 Set_Node_Length(ret, 1); /* MJD */
7312 nextchar(pRExC_state); /* kill whitespace under /x */
7313 return ret;
7314 }
7315 }
7316 goto outer_default;
7317 case '\\':
7318 /* Special Escapes
7319
7320 This switch handles escape sequences that resolve to some kind
7321 of special regop and not to literal text. Escape sequnces that
7322 resolve to literal text are handled below in the switch marked
7323 "Literal Escapes".
7324
7325 Every entry in this switch *must* have a corresponding entry
7326 in the literal escape switch. However, the opposite is not
7327 required, as the default for this switch is to jump to the
7328 literal text handling code.
7329 */
7330 switch ((U8)*++RExC_parse) {
7331 case LATIN_SMALL_LETTER_SHARP_S:
7332 case UTF8_TWO_BYTE_HI_nocast(LATIN_SMALL_LETTER_SHARP_S):
7333 case UTF8_TWO_BYTE_HI_nocast(IOTA_D_T):
7334 goto do_foldchar;
7335 /* Special Escapes */
7336 case 'A':
7337 RExC_seen_zerolen++;
7338 ret = reg_node(pRExC_state, SBOL);
7339 *flagp |= SIMPLE;
7340 goto finish_meta_pat;
7341 case 'G':
7342 ret = reg_node(pRExC_state, GPOS);
7343 RExC_seen |= REG_SEEN_GPOS;
7344 *flagp |= SIMPLE;
7345 goto finish_meta_pat;
7346 case 'K':
7347 RExC_seen_zerolen++;
7348 ret = reg_node(pRExC_state, KEEPS);
7349 *flagp |= SIMPLE;
7350 /* XXX:dmq : disabling in-place substitution seems to
7351 * be necessary here to avoid cases of memory corruption, as
7352 * with: C<$_="x" x 80; s/x\K/y/> -- rgs
7353 */
7354 RExC_seen |= REG_SEEN_LOOKBEHIND;
7355 goto finish_meta_pat;
7356 case 'Z':
7357 ret = reg_node(pRExC_state, SEOL);
7358 *flagp |= SIMPLE;
7359 RExC_seen_zerolen++; /* Do not optimize RE away */
7360 goto finish_meta_pat;
7361 case 'z':
7362 ret = reg_node(pRExC_state, EOS);
7363 *flagp |= SIMPLE;
7364 RExC_seen_zerolen++; /* Do not optimize RE away */
7365 goto finish_meta_pat;
7366 case 'C':
7367 ret = reg_node(pRExC_state, CANY);
7368 RExC_seen |= REG_SEEN_CANY;
7369 *flagp |= HASWIDTH|SIMPLE;
7370 goto finish_meta_pat;
7371 case 'X':
7372 ret = reg_node(pRExC_state, CLUMP);
7373 *flagp |= HASWIDTH;
7374 goto finish_meta_pat;
7375 case 'w':
7376 switch (get_regex_charset(RExC_flags)) {
7377 case REGEX_LOCALE_CHARSET:
7378 op = ALNUML;
7379 break;
7380 case REGEX_UNICODE_CHARSET:
7381 op = ALNUMU;
7382 break;
7383 case REGEX_ASCII_RESTRICTED_CHARSET:
7384 op = ALNUMA;
7385 break;
7386 case REGEX_DEPENDS_CHARSET:
7387 op = ALNUM;
7388 break;
7389 default:
7390 goto bad_charset;
7391 }
7392 ret = reg_node(pRExC_state, op);
7393 *flagp |= HASWIDTH|SIMPLE;
7394 goto finish_meta_pat;
7395 case 'W':
7396 switch (get_regex_charset(RExC_flags)) {
7397 case REGEX_LOCALE_CHARSET:
7398 op = NALNUML;
7399 break;
7400 case REGEX_UNICODE_CHARSET:
7401 op = NALNUMU;
7402 break;
7403 case REGEX_ASCII_RESTRICTED_CHARSET:
7404 op = NALNUMA;
7405 break;
7406 case REGEX_DEPENDS_CHARSET:
7407 op = NALNUM;
7408 break;
7409 default:
7410 goto bad_charset;
7411 }
7412 ret = reg_node(pRExC_state, op);
7413 *flagp |= HASWIDTH|SIMPLE;
7414 goto finish_meta_pat;
7415 case 'b':
7416 RExC_seen_zerolen++;
7417 RExC_seen |= REG_SEEN_LOOKBEHIND;
7418 switch (get_regex_charset(RExC_flags)) {
7419 case REGEX_LOCALE_CHARSET:
7420 op = BOUNDL;
7421 break;
7422 case REGEX_UNICODE_CHARSET:
7423 op = BOUNDU;
7424 break;
7425 case REGEX_ASCII_RESTRICTED_CHARSET:
7426 op = BOUNDA;
7427 break;
7428 case REGEX_DEPENDS_CHARSET:
7429 op = BOUND;
7430 break;
7431 default:
7432 goto bad_charset;
7433 }
7434 ret = reg_node(pRExC_state, op);
7435 FLAGS(ret) = get_regex_charset(RExC_flags);
7436 *flagp |= SIMPLE;
7437 goto finish_meta_pat;
7438 case 'B':
7439 RExC_seen_zerolen++;
7440 RExC_seen |= REG_SEEN_LOOKBEHIND;
7441 switch (get_regex_charset(RExC_flags)) {
7442 case REGEX_LOCALE_CHARSET:
7443 op = NBOUNDL;
7444 break;
7445 case REGEX_UNICODE_CHARSET:
7446 op = NBOUNDU;
7447 break;
7448 case REGEX_ASCII_RESTRICTED_CHARSET:
7449 op = NBOUNDA;
7450 break;
7451 case REGEX_DEPENDS_CHARSET:
7452 op = NBOUND;
7453 break;
7454 default:
7455 goto bad_charset;
7456 }
7457 ret = reg_node(pRExC_state, op);
7458 FLAGS(ret) = get_regex_charset(RExC_flags);
7459 *flagp |= SIMPLE;
7460 goto finish_meta_pat;
7461 case 's':
7462 switch (get_regex_charset(RExC_flags)) {
7463 case REGEX_LOCALE_CHARSET:
7464 op = SPACEL;
7465 break;
7466 case REGEX_UNICODE_CHARSET:
7467 op = SPACEU;
7468 break;
7469 case REGEX_ASCII_RESTRICTED_CHARSET:
7470 op = SPACEA;
7471 break;
7472 case REGEX_DEPENDS_CHARSET:
7473 op = SPACE;
7474 break;
7475 default:
7476 goto bad_charset;
7477 }
7478 ret = reg_node(pRExC_state, op);
7479 *flagp |= HASWIDTH|SIMPLE;
7480 goto finish_meta_pat;
7481 case 'S':
7482 switch (get_regex_charset(RExC_flags)) {
7483 case REGEX_LOCALE_CHARSET:
7484 op = NSPACEL;
7485 break;
7486 case REGEX_UNICODE_CHARSET:
7487 op = NSPACEU;
7488 break;
7489 case REGEX_ASCII_RESTRICTED_CHARSET:
7490 op = NSPACEA;
7491 break;
7492 case REGEX_DEPENDS_CHARSET:
7493 op = NSPACE;
7494 break;
7495 default:
7496 goto bad_charset;
7497 }
7498 ret = reg_node(pRExC_state, op);
7499 *flagp |= HASWIDTH|SIMPLE;
7500 goto finish_meta_pat;
7501 case 'd':
7502 switch (get_regex_charset(RExC_flags)) {
7503 case REGEX_LOCALE_CHARSET:
7504 op = DIGITL;
7505 break;
7506 case REGEX_ASCII_RESTRICTED_CHARSET:
7507 op = DIGITA;
7508 break;
7509 case REGEX_DEPENDS_CHARSET: /* No difference between these */
7510 case REGEX_UNICODE_CHARSET:
7511 op = DIGIT;
7512 break;
7513 default:
7514 goto bad_charset;
7515 }
7516 ret = reg_node(pRExC_state, op);
7517 *flagp |= HASWIDTH|SIMPLE;
7518 goto finish_meta_pat;
7519 case 'D':
7520 switch (get_regex_charset(RExC_flags)) {
7521 case REGEX_LOCALE_CHARSET:
7522 op = NDIGITL;
7523 break;
7524 case REGEX_ASCII_RESTRICTED_CHARSET:
7525 op = NDIGITA;
7526 break;
7527 case REGEX_DEPENDS_CHARSET: /* No difference between these */
7528 case REGEX_UNICODE_CHARSET:
7529 op = NDIGIT;
7530 break;
7531 default:
7532 goto bad_charset;
7533 }
7534 ret = reg_node(pRExC_state, op);
7535 *flagp |= HASWIDTH|SIMPLE;
7536 goto finish_meta_pat;
7537 case 'R':
7538 ret = reg_node(pRExC_state, LNBREAK);
7539 *flagp |= HASWIDTH|SIMPLE;
7540 goto finish_meta_pat;
7541 case 'h':
7542 ret = reg_node(pRExC_state, HORIZWS);
7543 *flagp |= HASWIDTH|SIMPLE;
7544 goto finish_meta_pat;
7545 case 'H':
7546 ret = reg_node(pRExC_state, NHORIZWS);
7547 *flagp |= HASWIDTH|SIMPLE;
7548 goto finish_meta_pat;
7549 case 'v':
7550 ret = reg_node(pRExC_state, VERTWS);
7551 *flagp |= HASWIDTH|SIMPLE;
7552 goto finish_meta_pat;
7553 case 'V':
7554 ret = reg_node(pRExC_state, NVERTWS);
7555 *flagp |= HASWIDTH|SIMPLE;
7556 finish_meta_pat:
7557 nextchar(pRExC_state);
7558 Set_Node_Length(ret, 2); /* MJD */
7559 break;
7560 case 'p':
7561 case 'P':
7562 {
7563 char* const oldregxend = RExC_end;
7564#ifdef DEBUGGING
7565 char* parse_start = RExC_parse - 2;
7566#endif
7567
7568 if (RExC_parse[1] == '{') {
7569 /* a lovely hack--pretend we saw [\pX] instead */
7570 RExC_end = strchr(RExC_parse, '}');
7571 if (!RExC_end) {
7572 const U8 c = (U8)*RExC_parse;
7573 RExC_parse += 2;
7574 RExC_end = oldregxend;
7575 vFAIL2("Missing right brace on \\%c{}", c);
7576 }
7577 RExC_end++;
7578 }
7579 else {
7580 RExC_end = RExC_parse + 2;
7581 if (RExC_end > oldregxend)
7582 RExC_end = oldregxend;
7583 }
7584 RExC_parse--;
7585
7586 ret = regclass(pRExC_state,depth+1);
7587
7588 RExC_end = oldregxend;
7589 RExC_parse--;
7590
7591 Set_Node_Offset(ret, parse_start + 2);
7592 Set_Node_Cur_Length(ret);
7593 nextchar(pRExC_state);
7594 *flagp |= HASWIDTH|SIMPLE;
7595 }
7596 break;
7597 case 'N':
7598 /* Handle \N and \N{NAME} here and not below because it can be
7599 multicharacter. join_exact() will join them up later on.
7600 Also this makes sure that things like /\N{BLAH}+/ and
7601 \N{BLAH} being multi char Just Happen. dmq*/
7602 ++RExC_parse;
7603 ret= reg_namedseq(pRExC_state, NULL, flagp);
7604 break;
7605 case 'k': /* Handle \k<NAME> and \k'NAME' */
7606 parse_named_seq:
7607 {
7608 char ch= RExC_parse[1];
7609 if (ch != '<' && ch != '\'' && ch != '{') {
7610 RExC_parse++;
7611 vFAIL2("Sequence %.2s... not terminated",parse_start);
7612 } else {
7613 /* this pretty much dupes the code for (?P=...) in reg(), if
7614 you change this make sure you change that */
7615 char* name_start = (RExC_parse += 2);
7616 U32 num = 0;
7617 SV *sv_dat = reg_scan_name(pRExC_state,
7618 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
7619 ch= (ch == '<') ? '>' : (ch == '{') ? '}' : '\'';
7620 if (RExC_parse == name_start || *RExC_parse != ch)
7621 vFAIL2("Sequence %.3s... not terminated",parse_start);
7622
7623 if (!SIZE_ONLY) {
7624 num = add_data( pRExC_state, 1, "S" );
7625 RExC_rxi->data->data[num]=(void*)sv_dat;
7626 SvREFCNT_inc_simple_void(sv_dat);
7627 }
7628
7629 RExC_sawback = 1;
7630 ret = reganode(pRExC_state,
7631 ((! FOLD)
7632 ? NREF
7633 : (AT_LEAST_UNI_SEMANTICS)
7634 ? NREFFU
7635 : (LOC)
7636 ? NREFFL
7637 : NREFF),
7638 num);
7639 *flagp |= HASWIDTH;
7640
7641 /* override incorrect value set in reganode MJD */
7642 Set_Node_Offset(ret, parse_start+1);
7643 Set_Node_Cur_Length(ret); /* MJD */
7644 nextchar(pRExC_state);
7645
7646 }
7647 break;
7648 }
7649 case 'g':
7650 case '1': case '2': case '3': case '4':
7651 case '5': case '6': case '7': case '8': case '9':
7652 {
7653 I32 num;
7654 bool isg = *RExC_parse == 'g';
7655 bool isrel = 0;
7656 bool hasbrace = 0;
7657 if (isg) {
7658 RExC_parse++;
7659 if (*RExC_parse == '{') {
7660 RExC_parse++;
7661 hasbrace = 1;
7662 }
7663 if (*RExC_parse == '-') {
7664 RExC_parse++;
7665 isrel = 1;
7666 }
7667 if (hasbrace && !isDIGIT(*RExC_parse)) {
7668 if (isrel) RExC_parse--;
7669 RExC_parse -= 2;
7670 goto parse_named_seq;
7671 } }
7672 num = atoi(RExC_parse);
7673 if (isg && num == 0)
7674 vFAIL("Reference to invalid group 0");
7675 if (isrel) {
7676 num = RExC_npar - num;
7677 if (num < 1)
7678 vFAIL("Reference to nonexistent or unclosed group");
7679 }
7680 if (!isg && num > 9 && num >= RExC_npar)
7681 goto defchar;
7682 else {
7683 char * const parse_start = RExC_parse - 1; /* MJD */
7684 while (isDIGIT(*RExC_parse))
7685 RExC_parse++;
7686 if (parse_start == RExC_parse - 1)
7687 vFAIL("Unterminated \\g... pattern");
7688 if (hasbrace) {
7689 if (*RExC_parse != '}')
7690 vFAIL("Unterminated \\g{...} pattern");
7691 RExC_parse++;
7692 }
7693 if (!SIZE_ONLY) {
7694 if (num > (I32)RExC_rx->nparens)
7695 vFAIL("Reference to nonexistent group");
7696 }
7697 RExC_sawback = 1;
7698 ret = reganode(pRExC_state,
7699 ((! FOLD)
7700 ? REF
7701 : (AT_LEAST_UNI_SEMANTICS)
7702 ? REFFU
7703 : (LOC)
7704 ? REFFL
7705 : REFF),
7706 num);
7707 *flagp |= HASWIDTH;
7708
7709 /* override incorrect value set in reganode MJD */
7710 Set_Node_Offset(ret, parse_start+1);
7711 Set_Node_Cur_Length(ret); /* MJD */
7712 RExC_parse--;
7713 nextchar(pRExC_state);
7714 }
7715 }
7716 break;
7717 case '\0':
7718 if (RExC_parse >= RExC_end)
7719 FAIL("Trailing \\");
7720 /* FALL THROUGH */
7721 default:
7722 /* Do not generate "unrecognized" warnings here, we fall
7723 back into the quick-grab loop below */
7724 parse_start--;
7725 goto defchar;
7726 }
7727 break;
7728
7729 case '#':
7730 if (RExC_flags & RXf_PMf_EXTENDED) {
7731 if ( reg_skipcomment( pRExC_state ) )
7732 goto tryagain;
7733 }
7734 /* FALL THROUGH */
7735
7736 default:
7737 outer_default:{
7738 register STRLEN len;
7739 register UV ender;
7740 register char *p;
7741 char *s;
7742 STRLEN foldlen;
7743 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
7744
7745 parse_start = RExC_parse - 1;
7746
7747 RExC_parse++;
7748
7749 defchar:
7750 ender = 0;
7751 ret = reg_node(pRExC_state,
7752 (U8) ((! FOLD) ? EXACT
7753 : (LOC)
7754 ? EXACTFL
7755 : (AT_LEAST_UNI_SEMANTICS)
7756 ? EXACTFU
7757 : EXACTF)
7758 );
7759 s = STRING(ret);
7760 for (len = 0, p = RExC_parse - 1;
7761 len < 127 && p < RExC_end;
7762 len++)
7763 {
7764 char * const oldp = p;
7765
7766 if (RExC_flags & RXf_PMf_EXTENDED)
7767 p = regwhite( pRExC_state, p );
7768 switch ((U8)*p) {
7769 case LATIN_SMALL_LETTER_SHARP_S:
7770 case UTF8_TWO_BYTE_HI_nocast(LATIN_SMALL_LETTER_SHARP_S):
7771 case UTF8_TWO_BYTE_HI_nocast(IOTA_D_T):
7772 if (LOC || !FOLD || !is_TRICKYFOLD_safe(p,RExC_end,UTF))
7773 goto normal_default;
7774 case '^':
7775 case '$':
7776 case '.':
7777 case '[':
7778 case '(':
7779 case ')':
7780 case '|':
7781 goto loopdone;
7782 case '\\':
7783 /* Literal Escapes Switch
7784
7785 This switch is meant to handle escape sequences that
7786 resolve to a literal character.
7787
7788 Every escape sequence that represents something
7789 else, like an assertion or a char class, is handled
7790 in the switch marked 'Special Escapes' above in this
7791 routine, but also has an entry here as anything that
7792 isn't explicitly mentioned here will be treated as
7793 an unescaped equivalent literal.
7794 */
7795
7796 switch ((U8)*++p) {
7797 /* These are all the special escapes. */
7798 case LATIN_SMALL_LETTER_SHARP_S:
7799 case UTF8_TWO_BYTE_HI_nocast(LATIN_SMALL_LETTER_SHARP_S):
7800 case UTF8_TWO_BYTE_HI_nocast(IOTA_D_T):
7801 if (LOC || !FOLD || !is_TRICKYFOLD_safe(p,RExC_end,UTF))
7802 goto normal_default;
7803 case 'A': /* Start assertion */
7804 case 'b': case 'B': /* Word-boundary assertion*/
7805 case 'C': /* Single char !DANGEROUS! */
7806 case 'd': case 'D': /* digit class */
7807 case 'g': case 'G': /* generic-backref, pos assertion */
7808 case 'h': case 'H': /* HORIZWS */
7809 case 'k': case 'K': /* named backref, keep marker */
7810 case 'N': /* named char sequence */
7811 case 'p': case 'P': /* Unicode property */
7812 case 'R': /* LNBREAK */
7813 case 's': case 'S': /* space class */
7814 case 'v': case 'V': /* VERTWS */
7815 case 'w': case 'W': /* word class */
7816 case 'X': /* eXtended Unicode "combining character sequence" */
7817 case 'z': case 'Z': /* End of line/string assertion */
7818 --p;
7819 goto loopdone;
7820
7821 /* Anything after here is an escape that resolves to a
7822 literal. (Except digits, which may or may not)
7823 */
7824 case 'n':
7825 ender = '\n';
7826 p++;
7827 break;
7828 case 'r':
7829 ender = '\r';
7830 p++;
7831 break;
7832 case 't':
7833 ender = '\t';
7834 p++;
7835 break;
7836 case 'f':
7837 ender = '\f';
7838 p++;
7839 break;
7840 case 'e':
7841 ender = ASCII_TO_NATIVE('\033');
7842 p++;
7843 break;
7844 case 'a':
7845 ender = ASCII_TO_NATIVE('\007');
7846 p++;
7847 break;
7848 case 'o':
7849 {
7850 STRLEN brace_len = len;
7851 UV result;
7852 const char* error_msg;
7853
7854 bool valid = grok_bslash_o(p,
7855 &result,
7856 &brace_len,
7857 &error_msg,
7858 1);
7859 p += brace_len;
7860 if (! valid) {
7861 RExC_parse = p; /* going to die anyway; point
7862 to exact spot of failure */
7863 vFAIL(error_msg);
7864 }
7865 else
7866 {
7867 ender = result;
7868 }
7869 if (PL_encoding && ender < 0x100) {
7870 goto recode_encoding;
7871 }
7872 if (ender > 0xff) {
7873 REQUIRE_UTF8;
7874 }
7875 break;
7876 }
7877 case 'x':
7878 if (*++p == '{') {
7879 char* const e = strchr(p, '}');
7880
7881 if (!e) {
7882 RExC_parse = p + 1;
7883 vFAIL("Missing right brace on \\x{}");
7884 }
7885 else {
7886 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
7887 | PERL_SCAN_DISALLOW_PREFIX;
7888 STRLEN numlen = e - p - 1;
7889 ender = grok_hex(p + 1, &numlen, &flags, NULL);
7890 if (ender > 0xff)
7891 REQUIRE_UTF8;
7892 p = e + 1;
7893 }
7894 }
7895 else {
7896 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
7897 STRLEN numlen = 2;
7898 ender = grok_hex(p, &numlen, &flags, NULL);
7899 p += numlen;
7900 }
7901 if (PL_encoding && ender < 0x100)
7902 goto recode_encoding;
7903 break;
7904 case 'c':
7905 p++;
7906 ender = grok_bslash_c(*p++, SIZE_ONLY);
7907 break;
7908 case '0': case '1': case '2': case '3':case '4':
7909 case '5': case '6': case '7': case '8':case '9':
7910 if (*p == '0' ||
7911 (isDIGIT(p[1]) && atoi(p) >= RExC_npar))
7912 {
7913 I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
7914 STRLEN numlen = 3;
7915 ender = grok_oct(p, &numlen, &flags, NULL);
7916 if (ender > 0xff) {
7917 REQUIRE_UTF8;
7918 }
7919 p += numlen;
7920 }
7921 else {
7922 --p;
7923 goto loopdone;
7924 }
7925 if (PL_encoding && ender < 0x100)
7926 goto recode_encoding;
7927 break;
7928 recode_encoding:
7929 {
7930 SV* enc = PL_encoding;
7931 ender = reg_recode((const char)(U8)ender, &enc);
7932 if (!enc && SIZE_ONLY)
7933 ckWARNreg(p, "Invalid escape in the specified encoding");
7934 REQUIRE_UTF8;
7935 }
7936 break;
7937 case '\0':
7938 if (p >= RExC_end)
7939 FAIL("Trailing \\");
7940 /* FALL THROUGH */
7941 default:
7942 if (!SIZE_ONLY&& isALPHA(*p))
7943 ckWARN2reg(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
7944 goto normal_default;
7945 }
7946 break;
7947 default:
7948 normal_default:
7949 if (UTF8_IS_START(*p) && UTF) {
7950 STRLEN numlen;
7951 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
7952 &numlen, UTF8_ALLOW_DEFAULT);
7953 p += numlen;
7954 }
7955 else
7956 ender = *p++;
7957 break;
7958 }
7959 if ( RExC_flags & RXf_PMf_EXTENDED)
7960 p = regwhite( pRExC_state, p );
7961 if (UTF && FOLD) {
7962 /* Prime the casefolded buffer. */
7963 ender = toFOLD_uni(ender, tmpbuf, &foldlen);
7964 }
7965 if (p < RExC_end && ISMULT2(p)) { /* Back off on ?+*. */
7966 if (len)
7967 p = oldp;
7968 else if (UTF) {
7969 if (FOLD) {
7970 /* Emit all the Unicode characters. */
7971 STRLEN numlen;
7972 for (foldbuf = tmpbuf;
7973 foldlen;
7974 foldlen -= numlen) {
7975 ender = utf8_to_uvchr(foldbuf, &numlen);
7976 if (numlen > 0) {
7977 const STRLEN unilen = reguni(pRExC_state, ender, s);
7978 s += unilen;
7979 len += unilen;
7980 /* In EBCDIC the numlen
7981 * and unilen can differ. */
7982 foldbuf += numlen;
7983 if (numlen >= foldlen)
7984 break;
7985 }
7986 else
7987 break; /* "Can't happen." */
7988 }
7989 }
7990 else {
7991 const STRLEN unilen = reguni(pRExC_state, ender, s);
7992 if (unilen > 0) {
7993 s += unilen;
7994 len += unilen;
7995 }
7996 }
7997 }
7998 else {
7999 len++;
8000 REGC((char)ender, s++);
8001 }
8002 break;
8003 }
8004 if (UTF) {
8005 if (FOLD) {
8006 /* Emit all the Unicode characters. */
8007 STRLEN numlen;
8008 for (foldbuf = tmpbuf;
8009 foldlen;
8010 foldlen -= numlen) {
8011 ender = utf8_to_uvchr(foldbuf, &numlen);
8012 if (numlen > 0) {
8013 const STRLEN unilen = reguni(pRExC_state, ender, s);
8014 len += unilen;
8015 s += unilen;
8016 /* In EBCDIC the numlen
8017 * and unilen can differ. */
8018 foldbuf += numlen;
8019 if (numlen >= foldlen)
8020 break;
8021 }
8022 else
8023 break;
8024 }
8025 }
8026 else {
8027 const STRLEN unilen = reguni(pRExC_state, ender, s);
8028 if (unilen > 0) {
8029 s += unilen;
8030 len += unilen;
8031 }
8032 }
8033 len--;
8034 }
8035 else
8036 REGC((char)ender, s++);
8037 }
8038 loopdone:
8039 RExC_parse = p - 1;
8040 Set_Node_Cur_Length(ret); /* MJD */
8041 nextchar(pRExC_state);
8042 {
8043 /* len is STRLEN which is unsigned, need to copy to signed */
8044 IV iv = len;
8045 if (iv < 0)
8046 vFAIL("Internal disaster");
8047 }
8048 if (len > 0)
8049 *flagp |= HASWIDTH;
8050 if (len == 1 && UNI_IS_INVARIANT(ender))
8051 *flagp |= SIMPLE;
8052
8053 if (SIZE_ONLY)
8054 RExC_size += STR_SZ(len);
8055 else {
8056 STR_LEN(ret) = len;
8057 RExC_emit += STR_SZ(len);
8058 }
8059 }
8060 break;
8061 }
8062
8063 return(ret);
8064
8065/* Jumped to when an unrecognized character set is encountered */
8066bad_charset:
8067 Perl_croak(aTHX_ "panic: Unknown regex character set encoding: %u", get_regex_charset(RExC_flags));
8068 return(NULL);
8069}
8070
8071STATIC char *
8072S_regwhite( RExC_state_t *pRExC_state, char *p )
8073{
8074 const char *e = RExC_end;
8075
8076 PERL_ARGS_ASSERT_REGWHITE;
8077
8078 while (p < e) {
8079 if (isSPACE(*p))
8080 ++p;
8081 else if (*p == '#') {
8082 bool ended = 0;
8083 do {
8084 if (*p++ == '\n') {
8085 ended = 1;
8086 break;
8087 }
8088 } while (p < e);
8089 if (!ended)
8090 RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
8091 }
8092 else
8093 break;
8094 }
8095 return p;
8096}
8097
8098/* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
8099 Character classes ([:foo:]) can also be negated ([:^foo:]).
8100 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
8101 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
8102 but trigger failures because they are currently unimplemented. */
8103
8104#define POSIXCC_DONE(c) ((c) == ':')
8105#define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
8106#define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
8107
8108STATIC I32
8109S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
8110{
8111 dVAR;
8112 I32 namedclass = OOB_NAMEDCLASS;
8113
8114 PERL_ARGS_ASSERT_REGPPOSIXCC;
8115
8116 if (value == '[' && RExC_parse + 1 < RExC_end &&
8117 /* I smell either [: or [= or [. -- POSIX has been here, right? */
8118 POSIXCC(UCHARAT(RExC_parse))) {
8119 const char c = UCHARAT(RExC_parse);
8120 char* const s = RExC_parse++;
8121
8122 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
8123 RExC_parse++;
8124 if (RExC_parse == RExC_end)
8125 /* Grandfather lone [:, [=, [. */
8126 RExC_parse = s;
8127 else {
8128 const char* const t = RExC_parse++; /* skip over the c */
8129 assert(*t == c);
8130
8131 if (UCHARAT(RExC_parse) == ']') {
8132 const char *posixcc = s + 1;
8133 RExC_parse++; /* skip over the ending ] */
8134
8135 if (*s == ':') {
8136 const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
8137 const I32 skip = t - posixcc;
8138
8139 /* Initially switch on the length of the name. */
8140 switch (skip) {
8141 case 4:
8142 if (memEQ(posixcc, "word", 4)) /* this is not POSIX, this is the Perl \w */
8143 namedclass = complement ? ANYOF_NALNUM : ANYOF_ALNUM;
8144 break;
8145 case 5:
8146 /* Names all of length 5. */
8147 /* alnum alpha ascii blank cntrl digit graph lower
8148 print punct space upper */
8149 /* Offset 4 gives the best switch position. */
8150 switch (posixcc[4]) {
8151 case 'a':
8152 if (memEQ(posixcc, "alph", 4)) /* alpha */
8153 namedclass = complement ? ANYOF_NALPHA : ANYOF_ALPHA;
8154 break;
8155 case 'e':
8156 if (memEQ(posixcc, "spac", 4)) /* space */
8157 namedclass = complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
8158 break;
8159 case 'h':
8160 if (memEQ(posixcc, "grap", 4)) /* graph */
8161 namedclass = complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
8162 break;
8163 case 'i':
8164 if (memEQ(posixcc, "asci", 4)) /* ascii */
8165 namedclass = complement ? ANYOF_NASCII : ANYOF_ASCII;
8166 break;
8167 case 'k':
8168 if (memEQ(posixcc, "blan", 4)) /* blank */
8169 namedclass = complement ? ANYOF_NBLANK : ANYOF_BLANK;
8170 break;
8171 case 'l':
8172 if (memEQ(posixcc, "cntr", 4)) /* cntrl */
8173 namedclass = complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
8174 break;
8175 case 'm':
8176 if (memEQ(posixcc, "alnu", 4)) /* alnum */
8177 namedclass = complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
8178 break;
8179 case 'r':
8180 if (memEQ(posixcc, "lowe", 4)) /* lower */
8181 namedclass = complement ? ANYOF_NLOWER : ANYOF_LOWER;
8182 else if (memEQ(posixcc, "uppe", 4)) /* upper */
8183 namedclass = complement ? ANYOF_NUPPER : ANYOF_UPPER;
8184 break;
8185 case 't':
8186 if (memEQ(posixcc, "digi", 4)) /* digit */
8187 namedclass = complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
8188 else if (memEQ(posixcc, "prin", 4)) /* print */
8189 namedclass = complement ? ANYOF_NPRINT : ANYOF_PRINT;
8190 else if (memEQ(posixcc, "punc", 4)) /* punct */
8191 namedclass = complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
8192 break;
8193 }
8194 break;
8195 case 6:
8196 if (memEQ(posixcc, "xdigit", 6))
8197 namedclass = complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
8198 break;
8199 }
8200
8201 if (namedclass == OOB_NAMEDCLASS)
8202 Simple_vFAIL3("POSIX class [:%.*s:] unknown",
8203 t - s - 1, s + 1);
8204 assert (posixcc[skip] == ':');
8205 assert (posixcc[skip+1] == ']');
8206 } else if (!SIZE_ONLY) {
8207 /* [[=foo=]] and [[.foo.]] are still future. */
8208
8209 /* adjust RExC_parse so the warning shows after
8210 the class closes */
8211 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
8212 RExC_parse++;
8213 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
8214 }
8215 } else {
8216 /* Maternal grandfather:
8217 * "[:" ending in ":" but not in ":]" */
8218 RExC_parse = s;
8219 }
8220 }
8221 }
8222
8223 return namedclass;
8224}
8225
8226STATIC void
8227S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
8228{
8229 dVAR;
8230
8231 PERL_ARGS_ASSERT_CHECKPOSIXCC;
8232
8233 if (POSIXCC(UCHARAT(RExC_parse))) {
8234 const char *s = RExC_parse;
8235 const char c = *s++;
8236
8237 while (isALNUM(*s))
8238 s++;
8239 if (*s && c == *s && s[1] == ']') {
8240 ckWARN3reg(s+2,
8241 "POSIX syntax [%c %c] belongs inside character classes",
8242 c, c);
8243
8244 /* [[=foo=]] and [[.foo.]] are still future. */
8245 if (POSIXCC_NOTYET(c)) {
8246 /* adjust RExC_parse so the error shows after
8247 the class closes */
8248 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
8249 NOOP;
8250 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
8251 }
8252 }
8253 }
8254}
8255
8256/* No locale test, and always Unicode semantics */
8257#define _C_C_T_NOLOC_(NAME,TEST,WORD) \
8258ANYOF_##NAME: \
8259 for (value = 0; value < 256; value++) \
8260 if (TEST) \
8261 stored += S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) value); \
8262 yesno = '+'; \
8263 what = WORD; \
8264 break; \
8265case ANYOF_N##NAME: \
8266 for (value = 0; value < 256; value++) \
8267 if (!TEST) \
8268 stored += S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) value); \
8269 yesno = '!'; \
8270 what = WORD; \
8271 break
8272
8273/* Like the above, but there are differences if we are in uni-8-bit or not, so
8274 * there are two tests passed in, to use depending on that. There aren't any
8275 * cases where the label is different from the name, so no need for that
8276 * parameter */
8277#define _C_C_T_(NAME,TEST_8,TEST_7,WORD) \
8278ANYOF_##NAME: \
8279 if (LOC) ANYOF_CLASS_SET(ret, ANYOF_##NAME); \
8280 else if (UNI_SEMANTICS) { \
8281 for (value = 0; value < 256; value++) { \
8282 if (TEST_8) stored += \
8283 S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) value); \
8284 } \
8285 } \
8286 else { \
8287 for (value = 0; value < 128; value++) { \
8288 if (TEST_7) stored += \
8289 S_set_regclass_bit(aTHX_ pRExC_state, ret, \
8290 (U8) UNI_TO_NATIVE(value)); \
8291 } \
8292 } \
8293 yesno = '+'; \
8294 what = WORD; \
8295 break; \
8296case ANYOF_N##NAME: \
8297 if (LOC) ANYOF_CLASS_SET(ret, ANYOF_N##NAME); \
8298 else if (UNI_SEMANTICS) { \
8299 for (value = 0; value < 256; value++) { \
8300 if (! TEST_8) stored += \
8301 S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) value); \
8302 } \
8303 } \
8304 else { \
8305 for (value = 0; value < 128; value++) { \
8306 if (! TEST_7) stored += \
8307 S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) value); \
8308 } \
8309 if (ASCII_RESTRICTED) { \
8310 for (value = 128; value < 256; value++) { \
8311 stored += S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) value); \
8312 } \
8313 ANYOF_FLAGS(ret) |= ANYOF_UNICODE_ALL|ANYOF_UTF8; \
8314 } \
8315 else { \
8316 /* For a non-ut8 target string with DEPENDS semantics, all above \
8317 * ASCII Latin1 code points match the complement of any of the \
8318 * classes. But in utf8, they have their Unicode semantics, so \
8319 * can't just set them in the bitmap, or else regexec.c will think \
8320 * they matched when they shouldn't. */ \
8321 ANYOF_FLAGS(ret) |= ANYOF_NON_UTF8_LATIN1_ALL|ANYOF_UTF8; \
8322 } \
8323 } \
8324 yesno = '!'; \
8325 what = WORD; \
8326 break
8327
8328/*
8329 We dont use PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS as the direct test
8330 so that it is possible to override the option here without having to
8331 rebuild the entire core. as we are required to do if we change regcomp.h
8332 which is where PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS is defined.
8333*/
8334#if PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS
8335#define BROKEN_UNICODE_CHARCLASS_MAPPINGS
8336#endif
8337
8338#ifdef BROKEN_UNICODE_CHARCLASS_MAPPINGS
8339#define POSIX_CC_UNI_NAME(CCNAME) CCNAME
8340#else
8341#define POSIX_CC_UNI_NAME(CCNAME) "Posix" CCNAME
8342#endif
8343
8344STATIC U8
8345S_set_regclass_bit_fold(pTHX_ RExC_state_t *pRExC_state, regnode* node, const U8 value)
8346{
8347
8348 /* Handle the setting of folds in the bitmap for non-locale ANYOF nodes.
8349 * Locale folding is done at run-time, so this function should not be
8350 * called for nodes that are for locales.
8351 *
8352 * This function simply sets the bit corresponding to the fold of the input
8353 * 'value', if not already set. The fold of 'f' is 'F', and the fold of
8354 * 'F' is 'f'.
8355 *
8356 * It also sets any necessary flags, and returns the number of bits that
8357 * actually changed from 0 to 1 */
8358
8359 U8 stored = 0;
8360 U8 fold;
8361
8362 fold = (AT_LEAST_UNI_SEMANTICS) ? PL_fold_latin1[value]
8363 : PL_fold[value];
8364
8365 /* It assumes the bit for 'value' has already been set */
8366 if (fold != value && ! ANYOF_BITMAP_TEST(node, fold)) {
8367 ANYOF_BITMAP_SET(node, fold);
8368 stored++;
8369 }
8370 if (_HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(value)
8371 || (! UNI_SEMANTICS
8372 && ! isASCII(value)
8373 && PL_fold_latin1[value] != value))
8374 { /* A character that has a fold outside of Latin1 matches outside the
8375 bitmap, but only when the target string is utf8. Similarly when we
8376 don't have unicode semantics for the above ASCII Latin-1 characters,
8377 and they have a fold, they should match if the target is utf8, and
8378 not otherwise */
8379 ANYOF_FLAGS(node) |= ANYOF_UTF8;
8380 }
8381
8382 return stored;
8383}
8384
8385
8386PERL_STATIC_INLINE U8
8387S_set_regclass_bit(pTHX_ RExC_state_t *pRExC_state, regnode* node, const U8 value)
8388{
8389 /* This inline function sets a bit in the bitmap if not already set, and if
8390 * appropriate, its fold, returning the number of bits that actually
8391 * changed from 0 to 1 */
8392
8393 U8 stored;
8394
8395 if (ANYOF_BITMAP_TEST(node, value)) { /* Already set */
8396 return 0;
8397 }
8398
8399 ANYOF_BITMAP_SET(node, value);
8400 stored = 1;
8401
8402 if (FOLD && ! LOC) { /* Locale folds aren't known until runtime */
8403 stored += S_set_regclass_bit_fold(aTHX_ pRExC_state, node, value);
8404 }
8405
8406 return stored;
8407}
8408
8409/*
8410 parse a class specification and produce either an ANYOF node that
8411 matches the pattern or if the pattern matches a single char only and
8412 that char is < 256 and we are case insensitive then we produce an
8413 EXACT node instead.
8414*/
8415
8416STATIC regnode *
8417S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth)
8418{
8419 dVAR;
8420 register UV nextvalue;
8421 register IV prevvalue = OOB_UNICODE;
8422 register IV range = 0;
8423 UV value = 0; /* XXX:dmq: needs to be referenceable (unfortunately) */
8424 register regnode *ret;
8425 STRLEN numlen;
8426 IV namedclass;
8427 char *rangebegin = NULL;
8428 bool need_class = 0;
8429 SV *listsv = NULL;
8430 UV n;
8431 AV* unicode_alternate = NULL;
8432#ifdef EBCDIC
8433 UV literal_endpoint = 0;
8434#endif
8435 UV stored = 0; /* how many chars stored in the bitmap */
8436
8437 regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
8438 case we need to change the emitted regop to an EXACT. */
8439 const char * orig_parse = RExC_parse;
8440 GET_RE_DEBUG_FLAGS_DECL;
8441
8442 PERL_ARGS_ASSERT_REGCLASS;
8443#ifndef DEBUGGING
8444 PERL_UNUSED_ARG(depth);
8445#endif
8446
8447 DEBUG_PARSE("clas");
8448
8449 /* Assume we are going to generate an ANYOF node. */
8450 ret = reganode(pRExC_state, ANYOF, 0);
8451
8452 if (!SIZE_ONLY)
8453 ANYOF_FLAGS(ret) = 0;
8454
8455 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
8456 RExC_naughty++;
8457 RExC_parse++;
8458 if (!SIZE_ONLY)
8459 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
8460 }
8461
8462 if (SIZE_ONLY) {
8463 RExC_size += ANYOF_SKIP;
8464#ifdef ANYOF_ADD_LOC_SKIP
8465 if (LOC) {
8466 RExC_size += ANYOF_ADD_LOC_SKIP;
8467 }
8468#endif
8469 listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
8470 }
8471 else {
8472 RExC_emit += ANYOF_SKIP;
8473 if (LOC) {
8474 ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
8475#ifdef ANYOF_ADD_LOC_SKIP
8476 RExC_emit += ANYOF_ADD_LOC_SKIP;
8477#endif
8478 }
8479 ANYOF_BITMAP_ZERO(ret);
8480 listsv = newSVpvs("# comment\n");
8481 }
8482
8483 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
8484
8485 if (!SIZE_ONLY && POSIXCC(nextvalue))
8486 checkposixcc(pRExC_state);
8487
8488 /* allow 1st char to be ] (allowing it to be - is dealt with later) */
8489 if (UCHARAT(RExC_parse) == ']')
8490 goto charclassloop;
8491
8492parseit:
8493 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
8494
8495 charclassloop:
8496
8497 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
8498
8499 if (!range)
8500 rangebegin = RExC_parse;
8501 if (UTF) {
8502 value = utf8n_to_uvchr((U8*)RExC_parse,
8503 RExC_end - RExC_parse,
8504 &numlen, UTF8_ALLOW_DEFAULT);
8505 RExC_parse += numlen;
8506 }
8507 else
8508 value = UCHARAT(RExC_parse++);
8509
8510 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
8511 if (value == '[' && POSIXCC(nextvalue))
8512 namedclass = regpposixcc(pRExC_state, value);
8513 else if (value == '\\') {
8514 if (UTF) {
8515 value = utf8n_to_uvchr((U8*)RExC_parse,
8516 RExC_end - RExC_parse,
8517 &numlen, UTF8_ALLOW_DEFAULT);
8518 RExC_parse += numlen;
8519 }
8520 else
8521 value = UCHARAT(RExC_parse++);
8522 /* Some compilers cannot handle switching on 64-bit integer
8523 * values, therefore value cannot be an UV. Yes, this will
8524 * be a problem later if we want switch on Unicode.
8525 * A similar issue a little bit later when switching on
8526 * namedclass. --jhi */
8527 switch ((I32)value) {
8528 case 'w': namedclass = ANYOF_ALNUM; break;
8529 case 'W': namedclass = ANYOF_NALNUM; break;
8530 case 's': namedclass = ANYOF_SPACE; break;
8531 case 'S': namedclass = ANYOF_NSPACE; break;
8532 case 'd': namedclass = ANYOF_DIGIT; break;
8533 case 'D': namedclass = ANYOF_NDIGIT; break;
8534 case 'v': namedclass = ANYOF_VERTWS; break;
8535 case 'V': namedclass = ANYOF_NVERTWS; break;
8536 case 'h': namedclass = ANYOF_HORIZWS; break;
8537 case 'H': namedclass = ANYOF_NHORIZWS; break;
8538 case 'N': /* Handle \N{NAME} in class */
8539 {
8540 /* We only pay attention to the first char of
8541 multichar strings being returned. I kinda wonder
8542 if this makes sense as it does change the behaviour
8543 from earlier versions, OTOH that behaviour was broken
8544 as well. */
8545 UV v; /* value is register so we cant & it /grrr */
8546 if (reg_namedseq(pRExC_state, &v, NULL)) {
8547 goto parseit;
8548 }
8549 value= v;
8550 }
8551 break;
8552 case 'p':
8553 case 'P':
8554 {
8555 char *e;
8556 if (RExC_parse >= RExC_end)
8557 vFAIL2("Empty \\%c{}", (U8)value);
8558 if (*RExC_parse == '{') {
8559 const U8 c = (U8)value;
8560 e = strchr(RExC_parse++, '}');
8561 if (!e)
8562 vFAIL2("Missing right brace on \\%c{}", c);
8563 while (isSPACE(UCHARAT(RExC_parse)))
8564 RExC_parse++;
8565 if (e == RExC_parse)
8566 vFAIL2("Empty \\%c{}", c);
8567 n = e - RExC_parse;
8568 while (isSPACE(UCHARAT(RExC_parse + n - 1)))
8569 n--;
8570 }
8571 else {
8572 e = RExC_parse;
8573 n = 1;
8574 }
8575 if (!SIZE_ONLY) {
8576 if (UCHARAT(RExC_parse) == '^') {
8577 RExC_parse++;
8578 n--;
8579 value = value == 'p' ? 'P' : 'p'; /* toggle */
8580 while (isSPACE(UCHARAT(RExC_parse))) {
8581 RExC_parse++;
8582 n--;
8583 }
8584 }
8585 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%.*s\n",
8586 (value=='p' ? '+' : '!'), (int)n, RExC_parse);
8587 }
8588 RExC_parse = e + 1;
8589
8590 /* The \p could match something in the Latin1 range, hence
8591 * something that isn't utf8 */
8592 ANYOF_FLAGS(ret) |= ANYOF_NONBITMAP;
8593 if (FOLD) { /* And one of these could have a multi-char fold */
8594 OP(ret) = ANYOFV;
8595 }
8596 namedclass = ANYOF_MAX; /* no official name, but it's named */
8597 }
8598 break;
8599 case 'n': value = '\n'; break;
8600 case 'r': value = '\r'; break;
8601 case 't': value = '\t'; break;
8602 case 'f': value = '\f'; break;
8603 case 'b': value = '\b'; break;
8604 case 'e': value = ASCII_TO_NATIVE('\033');break;
8605 case 'a': value = ASCII_TO_NATIVE('\007');break;
8606 case 'o':
8607 RExC_parse--; /* function expects to be pointed at the 'o' */
8608 {
8609 const char* error_msg;
8610 bool valid = grok_bslash_o(RExC_parse,
8611 &value,
8612 &numlen,
8613 &error_msg,
8614 SIZE_ONLY);
8615 RExC_parse += numlen;
8616 if (! valid) {
8617 vFAIL(error_msg);
8618 }
8619 }
8620 if (PL_encoding && value < 0x100) {
8621 goto recode_encoding;
8622 }
8623 break;
8624 case 'x':
8625 if (*RExC_parse == '{') {
8626 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
8627 | PERL_SCAN_DISALLOW_PREFIX;
8628 char * const e = strchr(RExC_parse++, '}');
8629 if (!e)
8630 vFAIL("Missing right brace on \\x{}");
8631
8632 numlen = e - RExC_parse;
8633 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
8634 RExC_parse = e + 1;
8635 }
8636 else {
8637 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
8638 numlen = 2;
8639 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
8640 RExC_parse += numlen;
8641 }
8642 if (PL_encoding && value < 0x100)
8643 goto recode_encoding;
8644 break;
8645 case 'c':
8646 value = grok_bslash_c(*RExC_parse++, SIZE_ONLY);
8647 break;
8648 case '0': case '1': case '2': case '3': case '4':
8649 case '5': case '6': case '7':
8650 {
8651 /* Take 1-3 octal digits */
8652 I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
8653 numlen = 3;
8654 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
8655 RExC_parse += numlen;
8656 if (PL_encoding && value < 0x100)
8657 goto recode_encoding;
8658 break;
8659 }
8660 recode_encoding:
8661 {
8662 SV* enc = PL_encoding;
8663 value = reg_recode((const char)(U8)value, &enc);
8664 if (!enc && SIZE_ONLY)
8665 ckWARNreg(RExC_parse,
8666 "Invalid escape in the specified encoding");
8667 break;
8668 }
8669 default:
8670 /* Allow \_ to not give an error */
8671 if (!SIZE_ONLY && isALNUM(value) && value != '_') {
8672 ckWARN2reg(RExC_parse,
8673 "Unrecognized escape \\%c in character class passed through",
8674 (int)value);
8675 }
8676 break;
8677 }
8678 } /* end of \blah */
8679#ifdef EBCDIC
8680 else
8681 literal_endpoint++;
8682#endif
8683
8684 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
8685
8686 /* What matches in a locale is not known until runtime, so need to
8687 * (one time per class) allocate extra space to pass to regexec.
8688 * The space will contain a bit for each named class that is to be
8689 * matched against. This isn't needed for \p{} and pseudo-classes,
8690 * as they are not affected by locale, and hence are dealt with
8691 * separately */
8692 if (LOC && namedclass < ANYOF_MAX && ! need_class) {
8693 need_class = 1;
8694 if (SIZE_ONLY) {
8695#ifdef ANYOF_CLASS_ADD_SKIP
8696 RExC_size += ANYOF_CLASS_ADD_SKIP;
8697#endif
8698 }
8699 else {
8700#ifdef ANYOF_CLASS_ADD_SKIP
8701 RExC_emit += ANYOF_CLASS_ADD_SKIP;
8702#endif
8703 ANYOF_CLASS_ZERO(ret);
8704 }
8705 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
8706 }
8707
8708 /* a bad range like a-\d, a-[:digit:]. The '-' is taken as a
8709 * literal */
8710 if (range) {
8711 if (!SIZE_ONLY) {
8712 const int w =
8713 RExC_parse >= rangebegin ?
8714 RExC_parse - rangebegin : 0;
8715 ckWARN4reg(RExC_parse,
8716 "False [] range \"%*.*s\"",
8717 w, w, rangebegin);
8718
8719 if (prevvalue < 256) {
8720 stored +=
8721 S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) prevvalue);
8722 stored +=
8723 S_set_regclass_bit(aTHX_ pRExC_state, ret, '-');
8724 }
8725 else {
8726 ANYOF_FLAGS(ret) |= ANYOF_UTF8;
8727 Perl_sv_catpvf(aTHX_ listsv,
8728 "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
8729 }
8730 }
8731
8732 range = 0; /* this was not a true range */
8733 }
8734
8735
8736
8737 if (!SIZE_ONLY) {
8738 const char *what = NULL;
8739 char yesno = 0;
8740
8741 /* Possible truncation here but in some 64-bit environments
8742 * the compiler gets heartburn about switch on 64-bit values.
8743 * A similar issue a little earlier when switching on value.
8744 * --jhi */
8745 switch ((I32)namedclass) {
8746
8747 case _C_C_T_(ALNUMC, isALNUMC_L1(value), isALNUMC(value), "XPosixAlnum");
8748 case _C_C_T_(ALPHA, isALPHA_L1(value), isALPHA(value), "XPosixAlpha");
8749 case _C_C_T_(BLANK, isBLANK_L1(value), isBLANK(value), "XPosixBlank");
8750 case _C_C_T_(CNTRL, isCNTRL_L1(value), isCNTRL(value), "XPosixCntrl");
8751 case _C_C_T_(GRAPH, isGRAPH_L1(value), isGRAPH(value), "XPosixGraph");
8752 case _C_C_T_(LOWER, isLOWER_L1(value), isLOWER(value), "XPosixLower");
8753 case _C_C_T_(PRINT, isPRINT_L1(value), isPRINT(value), "XPosixPrint");
8754 case _C_C_T_(PSXSPC, isPSXSPC_L1(value), isPSXSPC(value), "XPosixSpace");
8755 case _C_C_T_(PUNCT, isPUNCT_L1(value), isPUNCT(value), "XPosixPunct");
8756 case _C_C_T_(UPPER, isUPPER_L1(value), isUPPER(value), "XPosixUpper");
8757#ifdef BROKEN_UNICODE_CHARCLASS_MAPPINGS
8758 /* \s, \w match all unicode if utf8. */
8759 case _C_C_T_(SPACE, isSPACE_L1(value), isSPACE(value), "SpacePerl");
8760 case _C_C_T_(ALNUM, isWORDCHAR_L1(value), isALNUM(value), "Word");
8761#else
8762 /* \s, \w match ascii and locale only */
8763 case _C_C_T_(SPACE, isSPACE_L1(value), isSPACE(value), "PerlSpace");
8764 case _C_C_T_(ALNUM, isWORDCHAR_L1(value), isALNUM(value), "PerlWord");
8765#endif
8766 case _C_C_T_(XDIGIT, isXDIGIT_L1(value), isXDIGIT(value), "XPosixXDigit");
8767 case _C_C_T_NOLOC_(VERTWS, is_VERTWS_latin1(&value), "VertSpace");
8768 case _C_C_T_NOLOC_(HORIZWS, is_HORIZWS_latin1(&value), "HorizSpace");
8769 case ANYOF_ASCII:
8770 if (LOC)
8771 ANYOF_CLASS_SET(ret, ANYOF_ASCII);
8772 else {
8773 for (value = 0; value < 128; value++)
8774 stored +=
8775 S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) ASCII_TO_NATIVE(value));
8776 }
8777 yesno = '+';
8778 what = NULL; /* Doesn't match outside ascii, so
8779 don't want to add +utf8:: */
8780 break;
8781 case ANYOF_NASCII:
8782 if (LOC)
8783 ANYOF_CLASS_SET(ret, ANYOF_NASCII);
8784 else {
8785 for (value = 128; value < 256; value++)
8786 stored +=
8787 S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) ASCII_TO_NATIVE(value));
8788 }
8789 ANYOF_FLAGS(ret) |= ANYOF_UNICODE_ALL;
8790 yesno = '!';
8791 what = "ASCII";
8792 break;
8793 case ANYOF_DIGIT:
8794 if (LOC)
8795 ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
8796 else {
8797 /* consecutive digits assumed */
8798 for (value = '0'; value <= '9'; value++)
8799 stored +=
8800 S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) value);
8801 }
8802 yesno = '+';
8803 what = POSIX_CC_UNI_NAME("Digit");
8804 break;
8805 case ANYOF_NDIGIT:
8806 if (LOC)
8807 ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
8808 else {
8809 /* consecutive digits assumed */
8810 for (value = 0; value < '0'; value++)
8811 stored +=
8812 S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) value);
8813 for (value = '9' + 1; value < 256; value++)
8814 stored +=
8815 S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) value);
8816 }
8817 yesno = '!';
8818 what = POSIX_CC_UNI_NAME("Digit");
8819 if (ASCII_RESTRICTED ) {
8820 ANYOF_FLAGS(ret) |= ANYOF_UNICODE_ALL;
8821 }
8822 break;
8823 case ANYOF_MAX:
8824 /* this is to handle \p and \P */
8825 break;
8826 default:
8827 vFAIL("Invalid [::] class");
8828 break;
8829 }
8830 if (what && ! (ASCII_RESTRICTED)) {
8831 /* Strings such as "+utf8::isWord\n" */
8832 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::Is%s\n", yesno, what);
8833 ANYOF_FLAGS(ret) |= ANYOF_UTF8;
8834 }
8835
8836 continue;
8837 }
8838 } /* end of namedclass \blah */
8839
8840 if (range) {
8841 if (prevvalue > (IV)value) /* b-a */ {
8842 const int w = RExC_parse - rangebegin;
8843 Simple_vFAIL4("Invalid [] range \"%*.*s\"", w, w, rangebegin);
8844 range = 0; /* not a valid range */
8845 }
8846 }
8847 else {
8848 prevvalue = value; /* save the beginning of the range */
8849 if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
8850 RExC_parse[1] != ']') {
8851 RExC_parse++;
8852
8853 /* a bad range like \w-, [:word:]- ? */
8854 if (namedclass > OOB_NAMEDCLASS) {
8855 if (ckWARN(WARN_REGEXP)) {
8856 const int w =
8857 RExC_parse >= rangebegin ?
8858 RExC_parse - rangebegin : 0;
8859 vWARN4(RExC_parse,
8860 "False [] range \"%*.*s\"",
8861 w, w, rangebegin);
8862 }
8863 if (!SIZE_ONLY)
8864 stored +=
8865 S_set_regclass_bit(aTHX_ pRExC_state, ret, '-');
8866 } else
8867 range = 1; /* yeah, it's a range! */
8868 continue; /* but do it the next time */
8869 }
8870 }
8871
8872 /* now is the next time */
8873 if (!SIZE_ONLY) {
8874 if (prevvalue < 256) {
8875 const IV ceilvalue = value < 256 ? value : 255;
8876 IV i;
8877#ifdef EBCDIC
8878 /* In EBCDIC [\x89-\x91] should include
8879 * the \x8e but [i-j] should not. */
8880 if (literal_endpoint == 2 &&
8881 ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
8882 (isUPPER(prevvalue) && isUPPER(ceilvalue))))
8883 {
8884 if (isLOWER(prevvalue)) {
8885 for (i = prevvalue; i <= ceilvalue; i++)
8886 if (isLOWER(i) && !ANYOF_BITMAP_TEST(ret,i)) {
8887 stored +=
8888 S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) i);
8889 }
8890 } else {
8891 for (i = prevvalue; i <= ceilvalue; i++)
8892 if (isUPPER(i) && !ANYOF_BITMAP_TEST(ret,i)) {
8893 stored +=
8894 S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) i);
8895 }
8896 }
8897 }
8898 else
8899#endif
8900 for (i = prevvalue; i <= ceilvalue; i++) {
8901 stored += S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) i);
8902 }
8903 }
8904 if (value > 255 || UTF) {
8905 const UV prevnatvalue = NATIVE_TO_UNI(prevvalue);
8906 const UV natvalue = NATIVE_TO_UNI(value);
8907
8908 /* If the code point requires utf8 to represent, and we are not
8909 * folding, it can't match unless the target is in utf8. Only
8910 * a few code points above 255 fold to below it, so XXX an
8911 * optimization would be to know which ones and set the flag
8912 * appropriately. */
8913 ANYOF_FLAGS(ret) |= (FOLD || value < 256)
8914 ? ANYOF_NONBITMAP
8915 : ANYOF_UTF8;
8916 if (prevnatvalue < natvalue) { /* '>' case is fatal error above */
8917
8918 /* The \t sets the whole range */
8919 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
8920 prevnatvalue, natvalue);
8921
8922 /* Currently, we don't look at every value in the range.
8923 * Therefore we have to assume the worst case: that if
8924 * folding, it will match more than one character */
8925 if (FOLD) {
8926 OP(ret) = ANYOFV;
8927 }
8928 }
8929 else if (prevnatvalue == natvalue) {
8930 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue);
8931 if (FOLD) {
8932 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
8933 STRLEN foldlen;
8934 const UV f = to_uni_fold(natvalue, foldbuf, &foldlen);
8935
8936#ifdef EBCDIC /* RD t/uni/fold ff and 6b */
8937 if (RExC_precomp[0] == ':' &&
8938 RExC_precomp[1] == '[' &&
8939 (f == 0xDF || f == 0x92)) {
8940 f = NATIVE_TO_UNI(f);
8941 }
8942#endif
8943 /* If folding and foldable and a single
8944 * character, insert also the folded version
8945 * to the charclass. */
8946 if (f != value) {
8947#ifdef EBCDIC /* RD tunifold ligatures s,t fb05, fb06 */
8948 if ((RExC_precomp[0] == ':' &&
8949 RExC_precomp[1] == '[' &&
8950 (f == 0xA2 &&
8951 (value == 0xFB05 || value == 0xFB06))) ?
8952 foldlen == ((STRLEN)UNISKIP(f) - 1) :
8953 foldlen == (STRLEN)UNISKIP(f) )
8954#else
8955 if (foldlen == (STRLEN)UNISKIP(f))
8956#endif
8957 Perl_sv_catpvf(aTHX_ listsv,
8958 "%04"UVxf"\n", f);
8959 else {
8960 /* Any multicharacter foldings
8961 * require the following transform:
8962 * [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst)
8963 * where E folds into "pq" and F folds
8964 * into "rst", all other characters
8965 * fold to single characters. We save
8966 * away these multicharacter foldings,
8967 * to be later saved as part of the
8968 * additional "s" data. */
8969 SV *sv;
8970
8971 if (!unicode_alternate)
8972 unicode_alternate = newAV();
8973 sv = newSVpvn_utf8((char*)foldbuf, foldlen,
8974 TRUE);
8975 av_push(unicode_alternate, sv);
8976 OP(ret) = ANYOFV;
8977 }
8978 }
8979
8980 /* If folding and the value is one of the Greek
8981 * sigmas insert a few more sigmas to make the
8982 * folding rules of the sigmas to work right.
8983 * Note that not all the possible combinations
8984 * are handled here: some of them are handled
8985 * by the standard folding rules, and some of
8986 * them (literal or EXACTF cases) are handled
8987 * during runtime in regexec.c:S_find_byclass(). */
8988 if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) {
8989 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
8990 (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA);
8991 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
8992 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
8993 }
8994 else if (value == UNICODE_GREEK_CAPITAL_LETTER_SIGMA)
8995 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
8996 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
8997 }
8998 }
8999 }
9000#ifdef EBCDIC
9001 literal_endpoint = 0;
9002#endif
9003 }
9004
9005 range = 0; /* this range (if it was one) is done now */
9006 }
9007
9008
9009
9010 if (SIZE_ONLY)
9011 return ret;
9012 /****** !SIZE_ONLY AFTER HERE *********/
9013
9014 /* Optimize inverted simple patterns (e.g. [^a-z]). Note that we haven't
9015 * set the FOLD flag yet, so this this does optimize those. It doesn't
9016 * optimize locale. Doing so perhaps could be done as long as there is
9017 * nothing like \w in it; some thought also would have to be given to the
9018 * interaction with above 0x100 chars */
9019 if (! LOC && (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
9020 for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
9021 ANYOF_BITMAP(ret)[value] ^= 0xFF;
9022 stored = 256 - stored;
9023
9024 /* The inversion means that everything above 255 is matched; and at the
9025 * same time we clear the invert flag */
9026 ANYOF_FLAGS(ret) = ANYOF_UTF8|ANYOF_UNICODE_ALL;
9027 }
9028
9029 if (FOLD) {
9030 SV *sv;
9031
9032 /* This is the one character in the bitmap that needs special handling
9033 * under non-locale folding, as it folds to two characters 'ss'. This
9034 * happens if it is set and not inverting, or isn't set and are
9035 * inverting */
9036 if (! LOC
9037 && (cBOOL(ANYOF_BITMAP_TEST(ret, LATIN_SMALL_LETTER_SHARP_S))
9038 ^ cBOOL(ANYOF_FLAGS(ret) & ANYOF_INVERT)))
9039 {
9040 OP(ret) = ANYOFV; /* Can match more than a single char */
9041
9042 /* Under Unicode semantics), it can do this when the target string
9043 * isn't in utf8 */
9044 if (UNI_SEMANTICS) {
9045 ANYOF_FLAGS(ret) |= ANYOF_NONBITMAP_NON_UTF8;
9046 }
9047
9048 if (!unicode_alternate) {
9049 unicode_alternate = newAV();
9050 }
9051 sv = newSVpvn_utf8("ss", 2, TRUE);
9052 av_push(unicode_alternate, sv);
9053 }
9054
9055 /* Folding in the bitmap is taken care of above, but not for locale
9056 * (for which we have to wait to see what folding is in effect at
9057 * runtime), and for things not in the bitmap. Set run-time fold flag
9058 * for these */
9059 if ((LOC || (ANYOF_FLAGS(ret) & ANYOF_NONBITMAP))) {
9060 ANYOF_FLAGS(ret) |= ANYOF_LOC_NONBITMAP_FOLD;
9061 }
9062 }
9063
9064 /* A single character class can be "optimized" into an EXACTish node.
9065 * Note that since we don't currently count how many characters there are
9066 * outside the bitmap, we are XXX missing optimization possibilities for
9067 * them. This optimization can't happen unless this is a truly single
9068 * character class, which means that it can't be an inversion into a
9069 * many-character class, and there must be no possibility of there being
9070 * things outside the bitmap. 'stored' (only) for locales doesn't include
9071 * \w, etc, so have to make a special test that they aren't present
9072 *
9073 * Similarly A 2-character class of the very special form like [bB] can be
9074 * optimized into an EXACTFish node, but only for non-locales, and for
9075 * characters which only have the two folds; so things like 'fF' and 'Ii'
9076 * wouldn't work because they are part of the fold of 'LATIN SMALL LIGATURE
9077 * FI'. */
9078 if (! (ANYOF_FLAGS(ret) & (ANYOF_NONBITMAP|ANYOF_INVERT|ANYOF_UNICODE_ALL))
9079 && (((stored == 1 && ((! (ANYOF_FLAGS(ret) & ANYOF_LOCALE))
9080 || (! ANYOF_CLASS_TEST_ANY_SET(ret)))))
9081 || (stored == 2 && ((! (ANYOF_FLAGS(ret) & ANYOF_LOCALE))
9082 && (! _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(value))
9083 /* If the latest code point has a fold whose
9084 * bit is set, it must be the only other one */
9085 && ((prevvalue = PL_fold_latin1[value]) != (IV)value)
9086 && ANYOF_BITMAP_TEST(ret, prevvalue)))))
9087 {
9088 /* Note that the information needed to decide to do this optimization
9089 * is not currently available until the 2nd pass, and that the actually
9090 * used EXACTish node takes less space than the calculated ANYOF node,
9091 * and hence the amount of space calculated in the first pass is larger
9092 * than actually used, so this optimization doesn't gain us any space.
9093 * But an EXACT node is faster than an ANYOF node, and can be combined
9094 * with any adjacent EXACT nodes later by the optimizer for further
9095 * gains. The speed of executing an EXACTF is similar to an ANYOF
9096 * node, so the optimization advantage comes from the ability to join
9097 * it to adjacent EXACT nodes */
9098
9099 const char * cur_parse= RExC_parse;
9100 U8 op;
9101 RExC_emit = (regnode *)orig_emit;
9102 RExC_parse = (char *)orig_parse;
9103
9104 if (stored == 1) {
9105
9106 /* A locale node with one point can be folded; all the other cases
9107 * with folding will have two points, since we calculate them above
9108 */
9109 if (ANYOF_FLAGS(ret) & ANYOF_LOC_NONBITMAP_FOLD) {
9110 op = EXACTFL;
9111 }
9112 else {
9113 op = EXACT;
9114 }
9115 } /* else 2 chars in the bit map: the folds of each other */
9116 else if (AT_LEAST_UNI_SEMANTICS || !isASCII(value)) {
9117
9118 /* To join adjacent nodes, they must be the exact EXACTish type.
9119 * Try to use the most likely type, by using EXACTFU if the regex
9120 * calls for them, or is required because the character is
9121 * non-ASCII */
9122 op = EXACTFU;
9123 }
9124 else { /* Otherwise, more likely to be EXACTF type */
9125 op = EXACTF;
9126 }
9127
9128 ret = reg_node(pRExC_state, op);
9129 RExC_parse = (char *)cur_parse;
9130 if (UTF && ! NATIVE_IS_INVARIANT(value)) {
9131 *STRING(ret)= UTF8_EIGHT_BIT_HI((U8) value);
9132 *(STRING(ret) + 1)= UTF8_EIGHT_BIT_LO((U8) value);
9133 STR_LEN(ret)= 2;
9134 RExC_emit += STR_SZ(2);
9135 }
9136 else {
9137 *STRING(ret)= (char)value;
9138 STR_LEN(ret)= 1;
9139 RExC_emit += STR_SZ(1);
9140 }
9141 SvREFCNT_dec(listsv);
9142 return ret;
9143 }
9144
9145 {
9146 AV * const av = newAV();
9147 SV *rv;
9148 /* The 0th element stores the character class description
9149 * in its textual form: used later (regexec.c:Perl_regclass_swash())
9150 * to initialize the appropriate swash (which gets stored in
9151 * the 1st element), and also useful for dumping the regnode.
9152 * The 2nd element stores the multicharacter foldings,
9153 * used later (regexec.c:S_reginclass()). */
9154 av_store(av, 0, listsv);
9155 av_store(av, 1, NULL);
9156 av_store(av, 2, MUTABLE_SV(unicode_alternate));
9157 rv = newRV_noinc(MUTABLE_SV(av));
9158 n = add_data(pRExC_state, 1, "s");
9159 RExC_rxi->data->data[n] = (void*)rv;
9160 ARG_SET(ret, n);
9161 }
9162 return ret;
9163}
9164#undef _C_C_T_
9165
9166
9167/* reg_skipcomment()
9168
9169 Absorbs an /x style # comments from the input stream.
9170 Returns true if there is more text remaining in the stream.
9171 Will set the REG_SEEN_RUN_ON_COMMENT flag if the comment
9172 terminates the pattern without including a newline.
9173
9174 Note its the callers responsibility to ensure that we are
9175 actually in /x mode
9176
9177*/
9178
9179STATIC bool
9180S_reg_skipcomment(pTHX_ RExC_state_t *pRExC_state)
9181{
9182 bool ended = 0;
9183
9184 PERL_ARGS_ASSERT_REG_SKIPCOMMENT;
9185
9186 while (RExC_parse < RExC_end)
9187 if (*RExC_parse++ == '\n') {
9188 ended = 1;
9189 break;
9190 }
9191 if (!ended) {
9192 /* we ran off the end of the pattern without ending
9193 the comment, so we have to add an \n when wrapping */
9194 RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
9195 return 0;
9196 } else
9197 return 1;
9198}
9199
9200/* nextchar()
9201
9202 Advances the parse position, and optionally absorbs
9203 "whitespace" from the inputstream.
9204
9205 Without /x "whitespace" means (?#...) style comments only,
9206 with /x this means (?#...) and # comments and whitespace proper.
9207
9208 Returns the RExC_parse point from BEFORE the scan occurs.
9209
9210 This is the /x friendly way of saying RExC_parse++.
9211*/
9212
9213STATIC char*
9214S_nextchar(pTHX_ RExC_state_t *pRExC_state)
9215{
9216 char* const retval = RExC_parse++;
9217
9218 PERL_ARGS_ASSERT_NEXTCHAR;
9219
9220 for (;;) {
9221 if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
9222 RExC_parse[2] == '#') {
9223 while (*RExC_parse != ')') {
9224 if (RExC_parse == RExC_end)
9225 FAIL("Sequence (?#... not terminated");
9226 RExC_parse++;
9227 }
9228 RExC_parse++;
9229 continue;
9230 }
9231 if (RExC_flags & RXf_PMf_EXTENDED) {
9232 if (isSPACE(*RExC_parse)) {
9233 RExC_parse++;
9234 continue;
9235 }
9236 else if (*RExC_parse == '#') {
9237 if ( reg_skipcomment( pRExC_state ) )
9238 continue;
9239 }
9240 }
9241 return retval;
9242 }
9243}
9244
9245/*
9246- reg_node - emit a node
9247*/
9248STATIC regnode * /* Location. */
9249S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
9250{
9251 dVAR;
9252 register regnode *ptr;
9253 regnode * const ret = RExC_emit;
9254 GET_RE_DEBUG_FLAGS_DECL;
9255
9256 PERL_ARGS_ASSERT_REG_NODE;
9257
9258 if (SIZE_ONLY) {
9259 SIZE_ALIGN(RExC_size);
9260 RExC_size += 1;
9261 return(ret);
9262 }
9263 if (RExC_emit >= RExC_emit_bound)
9264 Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d", op);
9265
9266 NODE_ALIGN_FILL(ret);
9267 ptr = ret;
9268 FILL_ADVANCE_NODE(ptr, op);
9269#ifdef RE_TRACK_PATTERN_OFFSETS
9270 if (RExC_offsets) { /* MJD */
9271 MJD_OFFSET_DEBUG(("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
9272 "reg_node", __LINE__,
9273 PL_reg_name[op],
9274 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
9275 ? "Overwriting end of array!\n" : "OK",
9276 (UV)(RExC_emit - RExC_emit_start),
9277 (UV)(RExC_parse - RExC_start),
9278 (UV)RExC_offsets[0]));
9279 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
9280 }
9281#endif
9282 RExC_emit = ptr;
9283 return(ret);
9284}
9285
9286/*
9287- reganode - emit a node with an argument
9288*/
9289STATIC regnode * /* Location. */
9290S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
9291{
9292 dVAR;
9293 register regnode *ptr;
9294 regnode * const ret = RExC_emit;
9295 GET_RE_DEBUG_FLAGS_DECL;
9296
9297 PERL_ARGS_ASSERT_REGANODE;
9298
9299 if (SIZE_ONLY) {
9300 SIZE_ALIGN(RExC_size);
9301 RExC_size += 2;
9302 /*
9303 We can't do this:
9304
9305 assert(2==regarglen[op]+1);
9306
9307 Anything larger than this has to allocate the extra amount.
9308 If we changed this to be:
9309
9310 RExC_size += (1 + regarglen[op]);
9311
9312 then it wouldn't matter. Its not clear what side effect
9313 might come from that so its not done so far.
9314 -- dmq
9315 */
9316 return(ret);
9317 }
9318 if (RExC_emit >= RExC_emit_bound)
9319 Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d", op);
9320
9321 NODE_ALIGN_FILL(ret);
9322 ptr = ret;
9323 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
9324#ifdef RE_TRACK_PATTERN_OFFSETS
9325 if (RExC_offsets) { /* MJD */
9326 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
9327 "reganode",
9328 __LINE__,
9329 PL_reg_name[op],
9330 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ?
9331 "Overwriting end of array!\n" : "OK",
9332 (UV)(RExC_emit - RExC_emit_start),
9333 (UV)(RExC_parse - RExC_start),
9334 (UV)RExC_offsets[0]));
9335 Set_Cur_Node_Offset;
9336 }
9337#endif
9338 RExC_emit = ptr;
9339 return(ret);
9340}
9341
9342/*
9343- reguni - emit (if appropriate) a Unicode character
9344*/
9345STATIC STRLEN
9346S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
9347{
9348 dVAR;
9349
9350 PERL_ARGS_ASSERT_REGUNI;
9351
9352 return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
9353}
9354
9355/*
9356- reginsert - insert an operator in front of already-emitted operand
9357*
9358* Means relocating the operand.
9359*/
9360STATIC void
9361S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
9362{
9363 dVAR;
9364 register regnode *src;
9365 register regnode *dst;
9366 register regnode *place;
9367 const int offset = regarglen[(U8)op];
9368 const int size = NODE_STEP_REGNODE + offset;
9369 GET_RE_DEBUG_FLAGS_DECL;
9370
9371 PERL_ARGS_ASSERT_REGINSERT;
9372 PERL_UNUSED_ARG(depth);
9373/* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
9374 DEBUG_PARSE_FMT("inst"," - %s",PL_reg_name[op]);
9375 if (SIZE_ONLY) {
9376 RExC_size += size;
9377 return;
9378 }
9379
9380 src = RExC_emit;
9381 RExC_emit += size;
9382 dst = RExC_emit;
9383 if (RExC_open_parens) {
9384 int paren;
9385 /*DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);*/
9386 for ( paren=0 ; paren < RExC_npar ; paren++ ) {
9387 if ( RExC_open_parens[paren] >= opnd ) {
9388 /*DEBUG_PARSE_FMT("open"," - %d",size);*/
9389 RExC_open_parens[paren] += size;
9390 } else {
9391 /*DEBUG_PARSE_FMT("open"," - %s","ok");*/
9392 }
9393 if ( RExC_close_parens[paren] >= opnd ) {
9394 /*DEBUG_PARSE_FMT("close"," - %d",size);*/
9395 RExC_close_parens[paren] += size;
9396 } else {
9397 /*DEBUG_PARSE_FMT("close"," - %s","ok");*/
9398 }
9399 }
9400 }
9401
9402 while (src > opnd) {
9403 StructCopy(--src, --dst, regnode);
9404#ifdef RE_TRACK_PATTERN_OFFSETS
9405 if (RExC_offsets) { /* MJD 20010112 */
9406 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
9407 "reg_insert",
9408 __LINE__,
9409 PL_reg_name[op],
9410 (UV)(dst - RExC_emit_start) > RExC_offsets[0]
9411 ? "Overwriting end of array!\n" : "OK",
9412 (UV)(src - RExC_emit_start),
9413 (UV)(dst - RExC_emit_start),
9414 (UV)RExC_offsets[0]));
9415 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
9416 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
9417 }
9418#endif
9419 }
9420
9421
9422 place = opnd; /* Op node, where operand used to be. */
9423#ifdef RE_TRACK_PATTERN_OFFSETS
9424 if (RExC_offsets) { /* MJD */
9425 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
9426 "reginsert",
9427 __LINE__,
9428 PL_reg_name[op],
9429 (UV)(place - RExC_emit_start) > RExC_offsets[0]
9430 ? "Overwriting end of array!\n" : "OK",
9431 (UV)(place - RExC_emit_start),
9432 (UV)(RExC_parse - RExC_start),
9433 (UV)RExC_offsets[0]));
9434 Set_Node_Offset(place, RExC_parse);
9435 Set_Node_Length(place, 1);
9436 }
9437#endif
9438 src = NEXTOPER(place);
9439 FILL_ADVANCE_NODE(place, op);
9440 Zero(src, offset, regnode);
9441}
9442
9443/*
9444- regtail - set the next-pointer at the end of a node chain of p to val.
9445- SEE ALSO: regtail_study
9446*/
9447/* TODO: All three parms should be const */
9448STATIC void
9449S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
9450{
9451 dVAR;
9452 register regnode *scan;
9453 GET_RE_DEBUG_FLAGS_DECL;
9454
9455 PERL_ARGS_ASSERT_REGTAIL;
9456#ifndef DEBUGGING
9457 PERL_UNUSED_ARG(depth);
9458#endif
9459
9460 if (SIZE_ONLY)
9461 return;
9462
9463 /* Find last node. */
9464 scan = p;
9465 for (;;) {
9466 regnode * const temp = regnext(scan);
9467 DEBUG_PARSE_r({
9468 SV * const mysv=sv_newmortal();
9469 DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
9470 regprop(RExC_rx, mysv, scan);
9471 PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n",
9472 SvPV_nolen_const(mysv), REG_NODE_NUM(scan),
9473 (temp == NULL ? "->" : ""),
9474 (temp == NULL ? PL_reg_name[OP(val)] : "")
9475 );
9476 });
9477 if (temp == NULL)
9478 break;
9479 scan = temp;
9480 }
9481
9482 if (reg_off_by_arg[OP(scan)]) {
9483 ARG_SET(scan, val - scan);
9484 }
9485 else {
9486 NEXT_OFF(scan) = val - scan;
9487 }
9488}
9489
9490#ifdef DEBUGGING
9491/*
9492- regtail_study - set the next-pointer at the end of a node chain of p to val.
9493- Look for optimizable sequences at the same time.
9494- currently only looks for EXACT chains.
9495
9496This is experimental code. The idea is to use this routine to perform
9497in place optimizations on branches and groups as they are constructed,
9498with the long term intention of removing optimization from study_chunk so
9499that it is purely analytical.
9500
9501Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
9502to control which is which.
9503
9504*/
9505/* TODO: All four parms should be const */
9506
9507STATIC U8
9508S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
9509{
9510 dVAR;
9511 register regnode *scan;
9512 U8 exact = PSEUDO;
9513#ifdef EXPERIMENTAL_INPLACESCAN
9514 I32 min = 0;
9515#endif
9516 GET_RE_DEBUG_FLAGS_DECL;
9517
9518 PERL_ARGS_ASSERT_REGTAIL_STUDY;
9519
9520
9521 if (SIZE_ONLY)
9522 return exact;
9523
9524 /* Find last node. */
9525
9526 scan = p;
9527 for (;;) {
9528 regnode * const temp = regnext(scan);
9529#ifdef EXPERIMENTAL_INPLACESCAN
9530 if (PL_regkind[OP(scan)] == EXACT)
9531 if (join_exact(pRExC_state,scan,&min,1,val,depth+1))
9532 return EXACT;
9533#endif
9534 if ( exact ) {
9535 switch (OP(scan)) {
9536 case EXACT:
9537 case EXACTF:
9538 case EXACTFU:
9539 case EXACTFL:
9540 if( exact == PSEUDO )
9541 exact= OP(scan);
9542 else if ( exact != OP(scan) )
9543 exact= 0;
9544 case NOTHING:
9545 break;
9546 default:
9547 exact= 0;
9548 }
9549 }
9550 DEBUG_PARSE_r({
9551 SV * const mysv=sv_newmortal();
9552 DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
9553 regprop(RExC_rx, mysv, scan);
9554 PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n",
9555 SvPV_nolen_const(mysv),
9556 REG_NODE_NUM(scan),
9557 PL_reg_name[exact]);
9558 });
9559 if (temp == NULL)
9560 break;
9561 scan = temp;
9562 }
9563 DEBUG_PARSE_r({
9564 SV * const mysv_val=sv_newmortal();
9565 DEBUG_PARSE_MSG("");
9566 regprop(RExC_rx, mysv_val, val);
9567 PerlIO_printf(Perl_debug_log, "~ attach to %s (%"IVdf") offset to %"IVdf"\n",
9568 SvPV_nolen_const(mysv_val),
9569 (IV)REG_NODE_NUM(val),
9570 (IV)(val - scan)
9571 );
9572 });
9573 if (reg_off_by_arg[OP(scan)]) {
9574 ARG_SET(scan, val - scan);
9575 }
9576 else {
9577 NEXT_OFF(scan) = val - scan;
9578 }
9579
9580 return exact;
9581}
9582#endif
9583
9584/*
9585 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
9586 */
9587#ifdef DEBUGGING
9588static void
9589S_regdump_extflags(pTHX_ const char *lead, const U32 flags)
9590{
9591 int bit;
9592 int set=0;
9593 regex_charset cs;
9594
9595 for (bit=0; bit<32; bit++) {
9596 if (flags & (1<<bit)) {
9597 if ((1<<bit) & RXf_PMf_CHARSET) { /* Output separately, below */
9598 continue;
9599 }
9600 if (!set++ && lead)
9601 PerlIO_printf(Perl_debug_log, "%s",lead);
9602 PerlIO_printf(Perl_debug_log, "%s ",PL_reg_extflags_name[bit]);
9603 }
9604 }
9605 if ((cs = get_regex_charset(flags)) != REGEX_DEPENDS_CHARSET) {
9606 if (!set++ && lead) {
9607 PerlIO_printf(Perl_debug_log, "%s",lead);
9608 }
9609 switch (cs) {
9610 case REGEX_UNICODE_CHARSET:
9611 PerlIO_printf(Perl_debug_log, "UNICODE");
9612 break;
9613 case REGEX_LOCALE_CHARSET:
9614 PerlIO_printf(Perl_debug_log, "LOCALE");
9615 break;
9616 case REGEX_ASCII_RESTRICTED_CHARSET:
9617 PerlIO_printf(Perl_debug_log, "ASCII-RESTRICTED");
9618 break;
9619 default:
9620 PerlIO_printf(Perl_debug_log, "UNKNOWN CHARACTER SET");
9621 break;
9622 }
9623 }
9624 if (lead) {
9625 if (set)
9626 PerlIO_printf(Perl_debug_log, "\n");
9627 else
9628 PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
9629 }
9630}
9631#endif
9632
9633void
9634Perl_regdump(pTHX_ const regexp *r)
9635{
9636#ifdef DEBUGGING
9637 dVAR;
9638 SV * const sv = sv_newmortal();
9639 SV *dsv= sv_newmortal();
9640 RXi_GET_DECL(r,ri);
9641 GET_RE_DEBUG_FLAGS_DECL;
9642
9643 PERL_ARGS_ASSERT_REGDUMP;
9644
9645 (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
9646
9647 /* Header fields of interest. */
9648 if (r->anchored_substr) {
9649 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
9650 RE_SV_DUMPLEN(r->anchored_substr), 30);
9651 PerlIO_printf(Perl_debug_log,
9652 "anchored %s%s at %"IVdf" ",
9653 s, RE_SV_TAIL(r->anchored_substr),
9654 (IV)r->anchored_offset);
9655 } else if (r->anchored_utf8) {
9656 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
9657 RE_SV_DUMPLEN(r->anchored_utf8), 30);
9658 PerlIO_printf(Perl_debug_log,
9659 "anchored utf8 %s%s at %"IVdf" ",
9660 s, RE_SV_TAIL(r->anchored_utf8),
9661 (IV)r->anchored_offset);
9662 }
9663 if (r->float_substr) {
9664 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
9665 RE_SV_DUMPLEN(r->float_substr), 30);
9666 PerlIO_printf(Perl_debug_log,
9667 "floating %s%s at %"IVdf"..%"UVuf" ",
9668 s, RE_SV_TAIL(r->float_substr),
9669 (IV)r->float_min_offset, (UV)r->float_max_offset);
9670 } else if (r->float_utf8) {
9671 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
9672 RE_SV_DUMPLEN(r->float_utf8), 30);
9673 PerlIO_printf(Perl_debug_log,
9674 "floating utf8 %s%s at %"IVdf"..%"UVuf" ",
9675 s, RE_SV_TAIL(r->float_utf8),
9676 (IV)r->float_min_offset, (UV)r->float_max_offset);
9677 }
9678 if (r->check_substr || r->check_utf8)
9679 PerlIO_printf(Perl_debug_log,
9680 (const char *)
9681 (r->check_substr == r->float_substr
9682 && r->check_utf8 == r->float_utf8
9683 ? "(checking floating" : "(checking anchored"));
9684 if (r->extflags & RXf_NOSCAN)
9685 PerlIO_printf(Perl_debug_log, " noscan");
9686 if (r->extflags & RXf_CHECK_ALL)
9687 PerlIO_printf(Perl_debug_log, " isall");
9688 if (r->check_substr || r->check_utf8)
9689 PerlIO_printf(Perl_debug_log, ") ");
9690
9691 if (ri->regstclass) {
9692 regprop(r, sv, ri->regstclass);
9693 PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
9694 }
9695 if (r->extflags & RXf_ANCH) {
9696 PerlIO_printf(Perl_debug_log, "anchored");
9697 if (r->extflags & RXf_ANCH_BOL)
9698 PerlIO_printf(Perl_debug_log, "(BOL)");
9699 if (r->extflags & RXf_ANCH_MBOL)
9700 PerlIO_printf(Perl_debug_log, "(MBOL)");
9701 if (r->extflags & RXf_ANCH_SBOL)
9702 PerlIO_printf(Perl_debug_log, "(SBOL)");
9703 if (r->extflags & RXf_ANCH_GPOS)
9704 PerlIO_printf(Perl_debug_log, "(GPOS)");
9705 PerlIO_putc(Perl_debug_log, ' ');
9706 }
9707 if (r->extflags & RXf_GPOS_SEEN)
9708 PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", (UV)r->gofs);
9709 if (r->intflags & PREGf_SKIP)
9710 PerlIO_printf(Perl_debug_log, "plus ");
9711 if (r->intflags & PREGf_IMPLICIT)
9712 PerlIO_printf(Perl_debug_log, "implicit ");
9713 PerlIO_printf(Perl_debug_log, "minlen %"IVdf" ", (IV)r->minlen);
9714 if (r->extflags & RXf_EVAL_SEEN)
9715 PerlIO_printf(Perl_debug_log, "with eval ");
9716 PerlIO_printf(Perl_debug_log, "\n");
9717 DEBUG_FLAGS_r(regdump_extflags("r->extflags: ",r->extflags));
9718#else
9719 PERL_ARGS_ASSERT_REGDUMP;
9720 PERL_UNUSED_CONTEXT;
9721 PERL_UNUSED_ARG(r);
9722#endif /* DEBUGGING */
9723}
9724
9725/*
9726- regprop - printable representation of opcode
9727*/
9728#define EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags) \
9729STMT_START { \
9730 if (do_sep) { \
9731 Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]); \
9732 if (flags & ANYOF_INVERT) \
9733 /*make sure the invert info is in each */ \
9734 sv_catpvs(sv, "^"); \
9735 do_sep = 0; \
9736 } \
9737} STMT_END
9738
9739void
9740Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
9741{
9742#ifdef DEBUGGING
9743 dVAR;
9744 register int k;
9745 RXi_GET_DECL(prog,progi);
9746 GET_RE_DEBUG_FLAGS_DECL;
9747
9748 PERL_ARGS_ASSERT_REGPROP;
9749
9750 sv_setpvs(sv, "");
9751
9752 if (OP(o) > REGNODE_MAX) /* regnode.type is unsigned */
9753 /* It would be nice to FAIL() here, but this may be called from
9754 regexec.c, and it would be hard to supply pRExC_state. */
9755 Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(o), (int)REGNODE_MAX);
9756 sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */
9757
9758 k = PL_regkind[OP(o)];
9759
9760 if (k == EXACT) {
9761 sv_catpvs(sv, " ");
9762 /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
9763 * is a crude hack but it may be the best for now since
9764 * we have no flag "this EXACTish node was UTF-8"
9765 * --jhi */
9766 pv_pretty(sv, STRING(o), STR_LEN(o), 60, PL_colors[0], PL_colors[1],
9767 PERL_PV_ESCAPE_UNI_DETECT |
9768 PERL_PV_ESCAPE_NONASCII |
9769 PERL_PV_PRETTY_ELLIPSES |
9770 PERL_PV_PRETTY_LTGT |
9771 PERL_PV_PRETTY_NOCLEAR
9772 );
9773 } else if (k == TRIE) {
9774 /* print the details of the trie in dumpuntil instead, as
9775 * progi->data isn't available here */
9776 const char op = OP(o);
9777 const U32 n = ARG(o);
9778 const reg_ac_data * const ac = IS_TRIE_AC(op) ?
9779 (reg_ac_data *)progi->data->data[n] :
9780 NULL;
9781 const reg_trie_data * const trie
9782 = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
9783
9784 Perl_sv_catpvf(aTHX_ sv, "-%s",PL_reg_name[o->flags]);
9785 DEBUG_TRIE_COMPILE_r(
9786 Perl_sv_catpvf(aTHX_ sv,
9787 "<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
9788 (UV)trie->startstate,
9789 (IV)trie->statecount-1, /* -1 because of the unused 0 element */
9790 (UV)trie->wordcount,
9791 (UV)trie->minlen,
9792 (UV)trie->maxlen,
9793 (UV)TRIE_CHARCOUNT(trie),
9794 (UV)trie->uniquecharcount
9795 )
9796 );
9797 if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
9798 int i;
9799 int rangestart = -1;
9800 U8* bitmap = IS_ANYOF_TRIE(op) ? (U8*)ANYOF_BITMAP(o) : (U8*)TRIE_BITMAP(trie);
9801 sv_catpvs(sv, "[");
9802 for (i = 0; i <= 256; i++) {
9803 if (i < 256 && BITMAP_TEST(bitmap,i)) {
9804 if (rangestart == -1)
9805 rangestart = i;
9806 } else if (rangestart != -1) {
9807 if (i <= rangestart + 3)
9808 for (; rangestart < i; rangestart++)
9809 put_byte(sv, rangestart);
9810 else {
9811 put_byte(sv, rangestart);
9812 sv_catpvs(sv, "-");
9813 put_byte(sv, i - 1);
9814 }
9815 rangestart = -1;
9816 }
9817 }
9818 sv_catpvs(sv, "]");
9819 }
9820
9821 } else if (k == CURLY) {
9822 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
9823 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
9824 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
9825 }
9826 else if (k == WHILEM && o->flags) /* Ordinal/of */
9827 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
9828 else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP || OP(o)==ACCEPT) {
9829 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
9830 if ( RXp_PAREN_NAMES(prog) ) {
9831 if ( k != REF || (OP(o) < NREF)) {
9832 AV *list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
9833 SV **name= av_fetch(list, ARG(o), 0 );
9834 if (name)
9835 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
9836 }
9837 else {
9838 AV *list= MUTABLE_AV(progi->data->data[ progi->name_list_idx ]);
9839 SV *sv_dat= MUTABLE_SV(progi->data->data[ ARG( o ) ]);
9840 I32 *nums=(I32*)SvPVX(sv_dat);
9841 SV **name= av_fetch(list, nums[0], 0 );
9842 I32 n;
9843 if (name) {
9844 for ( n=0; n<SvIVX(sv_dat); n++ ) {
9845 Perl_sv_catpvf(aTHX_ sv, "%s%"IVdf,
9846 (n ? "," : ""), (IV)nums[n]);
9847 }
9848 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
9849 }
9850 }
9851 }
9852 } else if (k == GOSUB)
9853 Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o)); /* Paren and offset */
9854 else if (k == VERB) {
9855 if (!o->flags)
9856 Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
9857 SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ]))));
9858 } else if (k == LOGICAL)
9859 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
9860 else if (k == FOLDCHAR)
9861 Perl_sv_catpvf(aTHX_ sv, "[0x%"UVXf"]", PTR2UV(ARG(o)) );
9862 else if (k == ANYOF) {
9863 int i, rangestart = -1;
9864 const U8 flags = ANYOF_FLAGS(o);
9865 int do_sep = 0;
9866
9867 /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
9868 static const char * const anyofs[] = {
9869 "\\w",
9870 "\\W",
9871 "\\s",
9872 "\\S",
9873 "\\d",
9874 "\\D",
9875 "[:alnum:]",
9876 "[:^alnum:]",
9877 "[:alpha:]",
9878 "[:^alpha:]",
9879 "[:ascii:]",
9880 "[:^ascii:]",
9881 "[:cntrl:]",
9882 "[:^cntrl:]",
9883 "[:graph:]",
9884 "[:^graph:]",
9885 "[:lower:]",
9886 "[:^lower:]",
9887 "[:print:]",
9888 "[:^print:]",
9889 "[:punct:]",
9890 "[:^punct:]",
9891 "[:upper:]",
9892 "[:^upper:]",
9893 "[:xdigit:]",
9894 "[:^xdigit:]",
9895 "[:space:]",
9896 "[:^space:]",
9897 "[:blank:]",
9898 "[:^blank:]"
9899 };
9900
9901 if (flags & ANYOF_LOCALE)
9902 sv_catpvs(sv, "{loc}");
9903 if (flags & ANYOF_LOC_NONBITMAP_FOLD)
9904 sv_catpvs(sv, "{i}");
9905 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
9906 if (flags & ANYOF_INVERT)
9907 sv_catpvs(sv, "^");
9908
9909 /* output what the standard cp 0-255 bitmap matches */
9910 for (i = 0; i <= 256; i++) {
9911 if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
9912 if (rangestart == -1)
9913 rangestart = i;
9914 } else if (rangestart != -1) {
9915 if (i <= rangestart + 3)
9916 for (; rangestart < i; rangestart++)
9917 put_byte(sv, rangestart);
9918 else {
9919 put_byte(sv, rangestart);
9920 sv_catpvs(sv, "-");
9921 put_byte(sv, i - 1);
9922 }
9923 do_sep = 1;
9924 rangestart = -1;
9925 }
9926 }
9927
9928 EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
9929 /* output any special charclass tests (used entirely under use locale) */
9930 if (ANYOF_CLASS_TEST_ANY_SET(o))
9931 for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++)
9932 if (ANYOF_CLASS_TEST(o,i)) {
9933 sv_catpv(sv, anyofs[i]);
9934 do_sep = 1;
9935 }
9936
9937 EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
9938
9939 if (flags & ANYOF_NON_UTF8_LATIN1_ALL) {
9940 sv_catpvs(sv, "{non-utf8-latin1-all}");
9941 }
9942
9943 /* output information about the unicode matching */
9944 if (flags & ANYOF_UNICODE_ALL)
9945 sv_catpvs(sv, "{unicode_all}");
9946 else if (flags & ANYOF_UTF8)
9947 sv_catpvs(sv, "{unicode}");
9948 if (flags & ANYOF_NONBITMAP_NON_UTF8)
9949 sv_catpvs(sv, "{outside bitmap}");
9950
9951 {
9952 SV *lv;
9953 SV * const sw = regclass_swash(prog, o, FALSE, &lv, 0);
9954
9955 if (lv) {
9956 if (sw) {
9957 U8 s[UTF8_MAXBYTES_CASE+1];
9958
9959 for (i = 0; i <= 256; i++) { /* just the first 256 */
9960 uvchr_to_utf8(s, i);
9961
9962 if (i < 256 && swash_fetch(sw, s, TRUE)) {
9963 if (rangestart == -1)
9964 rangestart = i;
9965 } else if (rangestart != -1) {
9966 if (i <= rangestart + 3)
9967 for (; rangestart < i; rangestart++) {
9968 const U8 * const e = uvchr_to_utf8(s,rangestart);
9969 U8 *p;
9970 for(p = s; p < e; p++)
9971 put_byte(sv, *p);
9972 }
9973 else {
9974 const U8 *e = uvchr_to_utf8(s,rangestart);
9975 U8 *p;
9976 for (p = s; p < e; p++)
9977 put_byte(sv, *p);
9978 sv_catpvs(sv, "-");
9979 e = uvchr_to_utf8(s, i-1);
9980 for (p = s; p < e; p++)
9981 put_byte(sv, *p);
9982 }
9983 rangestart = -1;
9984 }
9985 }
9986
9987 sv_catpvs(sv, "..."); /* et cetera */
9988 }
9989
9990 {
9991 char *s = savesvpv(lv);
9992 char * const origs = s;
9993
9994 while (*s && *s != '\n')
9995 s++;
9996
9997 if (*s == '\n') {
9998 const char * const t = ++s;
9999
10000 while (*s) {
10001 if (*s == '\n')
10002 *s = ' ';
10003 s++;
10004 }
10005 if (s[-1] == ' ')
10006 s[-1] = 0;
10007
10008 sv_catpv(sv, t);
10009 }
10010
10011 Safefree(origs);
10012 }
10013 }
10014 }
10015
10016 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
10017 }
10018 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
10019 Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
10020#else
10021 PERL_UNUSED_CONTEXT;
10022 PERL_UNUSED_ARG(sv);
10023 PERL_UNUSED_ARG(o);
10024 PERL_UNUSED_ARG(prog);
10025#endif /* DEBUGGING */
10026}
10027
10028SV *
10029Perl_re_intuit_string(pTHX_ REGEXP * const r)
10030{ /* Assume that RE_INTUIT is set */
10031 dVAR;
10032 struct regexp *const prog = (struct regexp *)SvANY(r);
10033 GET_RE_DEBUG_FLAGS_DECL;
10034
10035 PERL_ARGS_ASSERT_RE_INTUIT_STRING;
10036 PERL_UNUSED_CONTEXT;
10037
10038 DEBUG_COMPILE_r(
10039 {
10040 const char * const s = SvPV_nolen_const(prog->check_substr
10041 ? prog->check_substr : prog->check_utf8);
10042
10043 if (!PL_colorset) reginitcolors();
10044 PerlIO_printf(Perl_debug_log,
10045 "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
10046 PL_colors[4],
10047 prog->check_substr ? "" : "utf8 ",
10048 PL_colors[5],PL_colors[0],
10049 s,
10050 PL_colors[1],
10051 (strlen(s) > 60 ? "..." : ""));
10052 } );
10053
10054 return prog->check_substr ? prog->check_substr : prog->check_utf8;
10055}
10056
10057/*
10058 pregfree()
10059
10060 handles refcounting and freeing the perl core regexp structure. When
10061 it is necessary to actually free the structure the first thing it
10062 does is call the 'free' method of the regexp_engine associated to
10063 the regexp, allowing the handling of the void *pprivate; member
10064 first. (This routine is not overridable by extensions, which is why
10065 the extensions free is called first.)
10066
10067 See regdupe and regdupe_internal if you change anything here.
10068*/
10069#ifndef PERL_IN_XSUB_RE
10070void
10071Perl_pregfree(pTHX_ REGEXP *r)
10072{
10073 SvREFCNT_dec(r);
10074}
10075
10076void
10077Perl_pregfree2(pTHX_ REGEXP *rx)
10078{
10079 dVAR;
10080 struct regexp *const r = (struct regexp *)SvANY(rx);
10081 GET_RE_DEBUG_FLAGS_DECL;
10082
10083 PERL_ARGS_ASSERT_PREGFREE2;
10084
10085 if (r->mother_re) {
10086 ReREFCNT_dec(r->mother_re);
10087 } else {
10088 CALLREGFREE_PVT(rx); /* free the private data */
10089 SvREFCNT_dec(RXp_PAREN_NAMES(r));
10090 }
10091 if (r->substrs) {
10092 SvREFCNT_dec(r->anchored_substr);
10093 SvREFCNT_dec(r->anchored_utf8);
10094 SvREFCNT_dec(r->float_substr);
10095 SvREFCNT_dec(r->float_utf8);
10096 Safefree(r->substrs);
10097 }
10098 RX_MATCH_COPY_FREE(rx);
10099#ifdef PERL_OLD_COPY_ON_WRITE
10100 SvREFCNT_dec(r->saved_copy);
10101#endif
10102 Safefree(r->offs);
10103}
10104
10105/* reg_temp_copy()
10106
10107 This is a hacky workaround to the structural issue of match results
10108 being stored in the regexp structure which is in turn stored in
10109 PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern
10110 could be PL_curpm in multiple contexts, and could require multiple
10111 result sets being associated with the pattern simultaneously, such
10112 as when doing a recursive match with (??{$qr})
10113
10114 The solution is to make a lightweight copy of the regexp structure
10115 when a qr// is returned from the code executed by (??{$qr}) this
10116 lightweight copy doesn't actually own any of its data except for
10117 the starp/end and the actual regexp structure itself.
10118
10119*/
10120
10121
10122REGEXP *
10123Perl_reg_temp_copy (pTHX_ REGEXP *ret_x, REGEXP *rx)
10124{
10125 struct regexp *ret;
10126 struct regexp *const r = (struct regexp *)SvANY(rx);
10127 register const I32 npar = r->nparens+1;
10128
10129 PERL_ARGS_ASSERT_REG_TEMP_COPY;
10130
10131 if (!ret_x)
10132 ret_x = (REGEXP*) newSV_type(SVt_REGEXP);
10133 ret = (struct regexp *)SvANY(ret_x);
10134
10135 (void)ReREFCNT_inc(rx);
10136 /* We can take advantage of the existing "copied buffer" mechanism in SVs
10137 by pointing directly at the buffer, but flagging that the allocated
10138 space in the copy is zero. As we've just done a struct copy, it's now
10139 a case of zero-ing that, rather than copying the current length. */
10140 SvPV_set(ret_x, RX_WRAPPED(rx));
10141 SvFLAGS(ret_x) |= SvFLAGS(rx) & (SVf_POK|SVp_POK|SVf_UTF8);
10142 memcpy(&(ret->xpv_cur), &(r->xpv_cur),
10143 sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur));
10144 SvLEN_set(ret_x, 0);
10145 SvSTASH_set(ret_x, NULL);
10146 SvMAGIC_set(ret_x, NULL);
10147 Newx(ret->offs, npar, regexp_paren_pair);
10148 Copy(r->offs, ret->offs, npar, regexp_paren_pair);
10149 if (r->substrs) {
10150 Newx(ret->substrs, 1, struct reg_substr_data);
10151 StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
10152
10153 SvREFCNT_inc_void(ret->anchored_substr);
10154 SvREFCNT_inc_void(ret->anchored_utf8);
10155 SvREFCNT_inc_void(ret->float_substr);
10156 SvREFCNT_inc_void(ret->float_utf8);
10157
10158 /* check_substr and check_utf8, if non-NULL, point to either their
10159 anchored or float namesakes, and don't hold a second reference. */
10160 }
10161 RX_MATCH_COPIED_off(ret_x);
10162#ifdef PERL_OLD_COPY_ON_WRITE
10163 ret->saved_copy = NULL;
10164#endif
10165 ret->mother_re = rx;
10166
10167 return ret_x;
10168}
10169#endif
10170
10171/* regfree_internal()
10172
10173 Free the private data in a regexp. This is overloadable by
10174 extensions. Perl takes care of the regexp structure in pregfree(),
10175 this covers the *pprivate pointer which technically perl doesn't
10176 know about, however of course we have to handle the
10177 regexp_internal structure when no extension is in use.
10178
10179 Note this is called before freeing anything in the regexp
10180 structure.
10181 */
10182
10183void
10184Perl_regfree_internal(pTHX_ REGEXP * const rx)
10185{
10186 dVAR;
10187 struct regexp *const r = (struct regexp *)SvANY(rx);
10188 RXi_GET_DECL(r,ri);
10189 GET_RE_DEBUG_FLAGS_DECL;
10190
10191 PERL_ARGS_ASSERT_REGFREE_INTERNAL;
10192
10193 DEBUG_COMPILE_r({
10194 if (!PL_colorset)
10195 reginitcolors();
10196 {
10197 SV *dsv= sv_newmortal();
10198 RE_PV_QUOTED_DECL(s, RX_UTF8(rx),
10199 dsv, RX_PRECOMP(rx), RX_PRELEN(rx), 60);
10200 PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
10201 PL_colors[4],PL_colors[5],s);
10202 }
10203 });
10204#ifdef RE_TRACK_PATTERN_OFFSETS
10205 if (ri->u.offsets)
10206 Safefree(ri->u.offsets); /* 20010421 MJD */
10207#endif
10208 if (ri->data) {
10209 int n = ri->data->count;
10210 PAD* new_comppad = NULL;
10211 PAD* old_comppad;
10212 PADOFFSET refcnt;
10213
10214 while (--n >= 0) {
10215 /* If you add a ->what type here, update the comment in regcomp.h */
10216 switch (ri->data->what[n]) {
10217 case 'a':
10218 case 's':
10219 case 'S':
10220 case 'u':
10221 SvREFCNT_dec(MUTABLE_SV(ri->data->data[n]));
10222 break;
10223 case 'f':
10224 Safefree(ri->data->data[n]);
10225 break;
10226 case 'p':
10227 new_comppad = MUTABLE_AV(ri->data->data[n]);
10228 break;
10229 case 'o':
10230 if (new_comppad == NULL)
10231 Perl_croak(aTHX_ "panic: pregfree comppad");
10232 PAD_SAVE_LOCAL(old_comppad,
10233 /* Watch out for global destruction's random ordering. */
10234 (SvTYPE(new_comppad) == SVt_PVAV) ? new_comppad : NULL
10235 );
10236 OP_REFCNT_LOCK;
10237 refcnt = OpREFCNT_dec((OP_4tree*)ri->data->data[n]);
10238 OP_REFCNT_UNLOCK;
10239 if (!refcnt)
10240 op_free((OP_4tree*)ri->data->data[n]);
10241
10242 PAD_RESTORE_LOCAL(old_comppad);
10243 SvREFCNT_dec(MUTABLE_SV(new_comppad));
10244 new_comppad = NULL;
10245 break;
10246 case 'n':
10247 break;
10248 case 'T':
10249 { /* Aho Corasick add-on structure for a trie node.
10250 Used in stclass optimization only */
10251 U32 refcount;
10252 reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
10253 OP_REFCNT_LOCK;
10254 refcount = --aho->refcount;
10255 OP_REFCNT_UNLOCK;
10256 if ( !refcount ) {
10257 PerlMemShared_free(aho->states);
10258 PerlMemShared_free(aho->fail);
10259 /* do this last!!!! */
10260 PerlMemShared_free(ri->data->data[n]);
10261 PerlMemShared_free(ri->regstclass);
10262 }
10263 }
10264 break;
10265 case 't':
10266 {
10267 /* trie structure. */
10268 U32 refcount;
10269 reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
10270 OP_REFCNT_LOCK;
10271 refcount = --trie->refcount;
10272 OP_REFCNT_UNLOCK;
10273 if ( !refcount ) {
10274 PerlMemShared_free(trie->charmap);
10275 PerlMemShared_free(trie->states);
10276 PerlMemShared_free(trie->trans);
10277 if (trie->bitmap)
10278 PerlMemShared_free(trie->bitmap);
10279 if (trie->jump)
10280 PerlMemShared_free(trie->jump);
10281 PerlMemShared_free(trie->wordinfo);
10282 /* do this last!!!! */
10283 PerlMemShared_free(ri->data->data[n]);
10284 }
10285 }
10286 break;
10287 default:
10288 Perl_croak(aTHX_ "panic: regfree data code '%c'", ri->data->what[n]);
10289 }
10290 }
10291 Safefree(ri->data->what);
10292 Safefree(ri->data);
10293 }
10294
10295 Safefree(ri);
10296}
10297
10298#define av_dup_inc(s,t) MUTABLE_AV(sv_dup_inc((const SV *)s,t))
10299#define hv_dup_inc(s,t) MUTABLE_HV(sv_dup_inc((const SV *)s,t))
10300#define SAVEPVN(p,n) ((p) ? savepvn(p,n) : NULL)
10301
10302/*
10303 re_dup - duplicate a regexp.
10304
10305 This routine is expected to clone a given regexp structure. It is only
10306 compiled under USE_ITHREADS.
10307
10308 After all of the core data stored in struct regexp is duplicated
10309 the regexp_engine.dupe method is used to copy any private data
10310 stored in the *pprivate pointer. This allows extensions to handle
10311 any duplication it needs to do.
10312
10313 See pregfree() and regfree_internal() if you change anything here.
10314*/
10315#if defined(USE_ITHREADS)
10316#ifndef PERL_IN_XSUB_RE
10317void
10318Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param)
10319{
10320 dVAR;
10321 I32 npar;
10322 const struct regexp *r = (const struct regexp *)SvANY(sstr);
10323 struct regexp *ret = (struct regexp *)SvANY(dstr);
10324
10325 PERL_ARGS_ASSERT_RE_DUP_GUTS;
10326
10327 npar = r->nparens+1;
10328 Newx(ret->offs, npar, regexp_paren_pair);
10329 Copy(r->offs, ret->offs, npar, regexp_paren_pair);
10330 if(ret->swap) {
10331 /* no need to copy these */
10332 Newx(ret->swap, npar, regexp_paren_pair);
10333 }
10334
10335 if (ret->substrs) {
10336 /* Do it this way to avoid reading from *r after the StructCopy().
10337 That way, if any of the sv_dup_inc()s dislodge *r from the L1
10338 cache, it doesn't matter. */
10339 const bool anchored = r->check_substr
10340 ? r->check_substr == r->anchored_substr
10341 : r->check_utf8 == r->anchored_utf8;
10342 Newx(ret->substrs, 1, struct reg_substr_data);
10343 StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
10344
10345 ret->anchored_substr = sv_dup_inc(ret->anchored_substr, param);
10346 ret->anchored_utf8 = sv_dup_inc(ret->anchored_utf8, param);
10347 ret->float_substr = sv_dup_inc(ret->float_substr, param);
10348 ret->float_utf8 = sv_dup_inc(ret->float_utf8, param);
10349
10350 /* check_substr and check_utf8, if non-NULL, point to either their
10351 anchored or float namesakes, and don't hold a second reference. */
10352
10353 if (ret->check_substr) {
10354 if (anchored) {
10355 assert(r->check_utf8 == r->anchored_utf8);
10356 ret->check_substr = ret->anchored_substr;
10357 ret->check_utf8 = ret->anchored_utf8;
10358 } else {
10359 assert(r->check_substr == r->float_substr);
10360 assert(r->check_utf8 == r->float_utf8);
10361 ret->check_substr = ret->float_substr;
10362 ret->check_utf8 = ret->float_utf8;
10363 }
10364 } else if (ret->check_utf8) {
10365 if (anchored) {
10366 ret->check_utf8 = ret->anchored_utf8;
10367 } else {
10368 ret->check_utf8 = ret->float_utf8;
10369 }
10370 }
10371 }
10372
10373 RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param);
10374
10375 if (ret->pprivate)
10376 RXi_SET(ret,CALLREGDUPE_PVT(dstr,param));
10377
10378 if (RX_MATCH_COPIED(dstr))
10379 ret->subbeg = SAVEPVN(ret->subbeg, ret->sublen);
10380 else
10381 ret->subbeg = NULL;
10382#ifdef PERL_OLD_COPY_ON_WRITE
10383 ret->saved_copy = NULL;
10384#endif
10385
10386 if (ret->mother_re) {
10387 if (SvPVX_const(dstr) == SvPVX_const(ret->mother_re)) {
10388 /* Our storage points directly to our mother regexp, but that's
10389 1: a buffer in a different thread
10390 2: something we no longer hold a reference on
10391 so we need to copy it locally. */
10392 /* Note we need to sue SvCUR() on our mother_re, because it, in
10393 turn, may well be pointing to its own mother_re. */
10394 SvPV_set(dstr, SAVEPVN(SvPVX_const(ret->mother_re),
10395 SvCUR(ret->mother_re)+1));
10396 SvLEN_set(dstr, SvCUR(ret->mother_re)+1);
10397 }
10398 ret->mother_re = NULL;
10399 }
10400 ret->gofs = 0;
10401}
10402#endif /* PERL_IN_XSUB_RE */
10403
10404/*
10405 regdupe_internal()
10406
10407 This is the internal complement to regdupe() which is used to copy
10408 the structure pointed to by the *pprivate pointer in the regexp.
10409 This is the core version of the extension overridable cloning hook.
10410 The regexp structure being duplicated will be copied by perl prior
10411 to this and will be provided as the regexp *r argument, however
10412 with the /old/ structures pprivate pointer value. Thus this routine
10413 may override any copying normally done by perl.
10414
10415 It returns a pointer to the new regexp_internal structure.
10416*/
10417
10418void *
10419Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param)
10420{
10421 dVAR;
10422 struct regexp *const r = (struct regexp *)SvANY(rx);
10423 regexp_internal *reti;
10424 int len, npar;
10425 RXi_GET_DECL(r,ri);
10426
10427 PERL_ARGS_ASSERT_REGDUPE_INTERNAL;
10428
10429 npar = r->nparens+1;
10430 len = ProgLen(ri);
10431
10432 Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode), char, regexp_internal);
10433 Copy(ri->program, reti->program, len+1, regnode);
10434
10435
10436 reti->regstclass = NULL;
10437
10438 if (ri->data) {
10439 struct reg_data *d;
10440 const int count = ri->data->count;
10441 int i;
10442
10443 Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
10444 char, struct reg_data);
10445 Newx(d->what, count, U8);
10446
10447 d->count = count;
10448 for (i = 0; i < count; i++) {
10449 d->what[i] = ri->data->what[i];
10450 switch (d->what[i]) {
10451 /* legal options are one of: sSfpontTua
10452 see also regcomp.h and pregfree() */
10453 case 'a': /* actually an AV, but the dup function is identical. */
10454 case 's':
10455 case 'S':
10456 case 'p': /* actually an AV, but the dup function is identical. */
10457 case 'u': /* actually an HV, but the dup function is identical. */
10458 d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param);
10459 break;
10460 case 'f':
10461 /* This is cheating. */
10462 Newx(d->data[i], 1, struct regnode_charclass_class);
10463 StructCopy(ri->data->data[i], d->data[i],
10464 struct regnode_charclass_class);
10465 reti->regstclass = (regnode*)d->data[i];
10466 break;
10467 case 'o':
10468 /* Compiled op trees are readonly and in shared memory,
10469 and can thus be shared without duplication. */
10470 OP_REFCNT_LOCK;
10471 d->data[i] = (void*)OpREFCNT_inc((OP*)ri->data->data[i]);
10472 OP_REFCNT_UNLOCK;
10473 break;
10474 case 'T':
10475 /* Trie stclasses are readonly and can thus be shared
10476 * without duplication. We free the stclass in pregfree
10477 * when the corresponding reg_ac_data struct is freed.
10478 */
10479 reti->regstclass= ri->regstclass;
10480 /* Fall through */
10481 case 't':
10482 OP_REFCNT_LOCK;
10483 ((reg_trie_data*)ri->data->data[i])->refcount++;
10484 OP_REFCNT_UNLOCK;
10485 /* Fall through */
10486 case 'n':
10487 d->data[i] = ri->data->data[i];
10488 break;
10489 default:
10490 Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'", ri->data->what[i]);
10491 }
10492 }
10493
10494 reti->data = d;
10495 }
10496 else
10497 reti->data = NULL;
10498
10499 reti->name_list_idx = ri->name_list_idx;
10500
10501#ifdef RE_TRACK_PATTERN_OFFSETS
10502 if (ri->u.offsets) {
10503 Newx(reti->u.offsets, 2*len+1, U32);
10504 Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32);
10505 }
10506#else
10507 SetProgLen(reti,len);
10508#endif
10509
10510 return (void*)reti;
10511}
10512
10513#endif /* USE_ITHREADS */
10514
10515#ifndef PERL_IN_XSUB_RE
10516
10517/*
10518 - regnext - dig the "next" pointer out of a node
10519 */
10520regnode *
10521Perl_regnext(pTHX_ register regnode *p)
10522{
10523 dVAR;
10524 register I32 offset;
10525
10526 if (!p)
10527 return(NULL);
10528
10529 if (OP(p) > REGNODE_MAX) { /* regnode.type is unsigned */
10530 Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(p), (int)REGNODE_MAX);
10531 }
10532
10533 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
10534 if (offset == 0)
10535 return(NULL);
10536
10537 return(p+offset);
10538}
10539#endif
10540
10541STATIC void
10542S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
10543{
10544 va_list args;
10545 STRLEN l1 = strlen(pat1);
10546 STRLEN l2 = strlen(pat2);
10547 char buf[512];
10548 SV *msv;
10549 const char *message;
10550
10551 PERL_ARGS_ASSERT_RE_CROAK2;
10552
10553 if (l1 > 510)
10554 l1 = 510;
10555 if (l1 + l2 > 510)
10556 l2 = 510 - l1;
10557 Copy(pat1, buf, l1 , char);
10558 Copy(pat2, buf + l1, l2 , char);
10559 buf[l1 + l2] = '\n';
10560 buf[l1 + l2 + 1] = '\0';
10561#ifdef I_STDARG
10562 /* ANSI variant takes additional second argument */
10563 va_start(args, pat2);
10564#else
10565 va_start(args);
10566#endif
10567 msv = vmess(buf, &args);
10568 va_end(args);
10569 message = SvPV_const(msv,l1);
10570 if (l1 > 512)
10571 l1 = 512;
10572 Copy(message, buf, l1 , char);
10573 buf[l1-1] = '\0'; /* Overwrite \n */
10574 Perl_croak(aTHX_ "%s", buf);
10575}
10576
10577/* XXX Here's a total kludge. But we need to re-enter for swash routines. */
10578
10579#ifndef PERL_IN_XSUB_RE
10580void
10581Perl_save_re_context(pTHX)
10582{
10583 dVAR;
10584
10585 struct re_save_state *state;
10586
10587 SAVEVPTR(PL_curcop);
10588 SSGROW(SAVESTACK_ALLOC_FOR_RE_SAVE_STATE + 1);
10589
10590 state = (struct re_save_state *)(PL_savestack + PL_savestack_ix);
10591 PL_savestack_ix += SAVESTACK_ALLOC_FOR_RE_SAVE_STATE;
10592 SSPUSHUV(SAVEt_RE_STATE);
10593
10594 Copy(&PL_reg_state, state, 1, struct re_save_state);
10595
10596 PL_reg_start_tmp = 0;
10597 PL_reg_start_tmpl = 0;
10598 PL_reg_oldsaved = NULL;
10599 PL_reg_oldsavedlen = 0;
10600 PL_reg_maxiter = 0;
10601 PL_reg_leftiter = 0;
10602 PL_reg_poscache = NULL;
10603 PL_reg_poscache_size = 0;
10604#ifdef PERL_OLD_COPY_ON_WRITE
10605 PL_nrs = NULL;
10606#endif
10607
10608 /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
10609 if (PL_curpm) {
10610 const REGEXP * const rx = PM_GETRE(PL_curpm);
10611 if (rx) {
10612 U32 i;
10613 for (i = 1; i <= RX_NPARENS(rx); i++) {
10614 char digits[TYPE_CHARS(long)];
10615 const STRLEN len = my_snprintf(digits, sizeof(digits), "%lu", (long)i);
10616 GV *const *const gvp
10617 = (GV**)hv_fetch(PL_defstash, digits, len, 0);
10618
10619 if (gvp) {
10620 GV * const gv = *gvp;
10621 if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
10622 save_scalar(gv);
10623 }
10624 }
10625 }
10626 }
10627}
10628#endif
10629
10630static void
10631clear_re(pTHX_ void *r)
10632{
10633 dVAR;
10634 ReREFCNT_dec((REGEXP *)r);
10635}
10636
10637#ifdef DEBUGGING
10638
10639STATIC void
10640S_put_byte(pTHX_ SV *sv, int c)
10641{
10642 PERL_ARGS_ASSERT_PUT_BYTE;
10643
10644 /* Our definition of isPRINT() ignores locales, so only bytes that are
10645 not part of UTF-8 are considered printable. I assume that the same
10646 holds for UTF-EBCDIC.
10647 Also, code point 255 is not printable in either (it's E0 in EBCDIC,
10648 which Wikipedia says:
10649
10650 EO, or Eight Ones, is an 8-bit EBCDIC character code represented as all
10651 ones (binary 1111 1111, hexadecimal FF). It is similar, but not
10652 identical, to the ASCII delete (DEL) or rubout control character.
10653 ) So the old condition can be simplified to !isPRINT(c) */
10654 if (!isPRINT(c)) {
10655 if (c < 256) {
10656 Perl_sv_catpvf(aTHX_ sv, "\\x%02x", c);
10657 }
10658 else {
10659 Perl_sv_catpvf(aTHX_ sv, "\\x{%x}", c);
10660 }
10661 }
10662 else {
10663 const char string = c;
10664 if (c == '-' || c == ']' || c == '\\' || c == '^')
10665 sv_catpvs(sv, "\\");
10666 sv_catpvn(sv, &string, 1);
10667 }
10668}
10669
10670
10671#define CLEAR_OPTSTART \
10672 if (optstart) STMT_START { \
10673 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, " (%"IVdf" nodes)\n", (IV)(node - optstart))); \
10674 optstart=NULL; \
10675 } STMT_END
10676
10677#define DUMPUNTIL(b,e) CLEAR_OPTSTART; node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
10678
10679STATIC const regnode *
10680S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
10681 const regnode *last, const regnode *plast,
10682 SV* sv, I32 indent, U32 depth)
10683{
10684 dVAR;
10685 register U8 op = PSEUDO; /* Arbitrary non-END op. */
10686 register const regnode *next;
10687 const regnode *optstart= NULL;
10688
10689 RXi_GET_DECL(r,ri);
10690 GET_RE_DEBUG_FLAGS_DECL;
10691
10692 PERL_ARGS_ASSERT_DUMPUNTIL;
10693
10694#ifdef DEBUG_DUMPUNTIL
10695 PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
10696 last ? last-start : 0,plast ? plast-start : 0);
10697#endif
10698
10699 if (plast && plast < last)
10700 last= plast;
10701
10702 while (PL_regkind[op] != END && (!last || node < last)) {
10703 /* While that wasn't END last time... */
10704 NODE_ALIGN(node);
10705 op = OP(node);
10706 if (op == CLOSE || op == WHILEM)
10707 indent--;
10708 next = regnext((regnode *)node);
10709
10710 /* Where, what. */
10711 if (OP(node) == OPTIMIZED) {
10712 if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
10713 optstart = node;
10714 else
10715 goto after_print;
10716 } else
10717 CLEAR_OPTSTART;
10718
10719 regprop(r, sv, node);
10720 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
10721 (int)(2*indent + 1), "", SvPVX_const(sv));
10722
10723 if (OP(node) != OPTIMIZED) {
10724 if (next == NULL) /* Next ptr. */
10725 PerlIO_printf(Perl_debug_log, " (0)");
10726 else if (PL_regkind[(U8)op] == BRANCH && PL_regkind[OP(next)] != BRANCH )
10727 PerlIO_printf(Perl_debug_log, " (FAIL)");
10728 else
10729 PerlIO_printf(Perl_debug_log, " (%"IVdf")", (IV)(next - start));
10730 (void)PerlIO_putc(Perl_debug_log, '\n');
10731 }
10732
10733 after_print:
10734 if (PL_regkind[(U8)op] == BRANCHJ) {
10735 assert(next);
10736 {
10737 register const regnode *nnode = (OP(next) == LONGJMP
10738 ? regnext((regnode *)next)
10739 : next);
10740 if (last && nnode > last)
10741 nnode = last;
10742 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
10743 }
10744 }
10745 else if (PL_regkind[(U8)op] == BRANCH) {
10746 assert(next);
10747 DUMPUNTIL(NEXTOPER(node), next);
10748 }
10749 else if ( PL_regkind[(U8)op] == TRIE ) {
10750 const regnode *this_trie = node;
10751 const char op = OP(node);
10752 const U32 n = ARG(node);
10753 const reg_ac_data * const ac = op>=AHOCORASICK ?
10754 (reg_ac_data *)ri->data->data[n] :
10755 NULL;
10756 const reg_trie_data * const trie =
10757 (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
10758#ifdef DEBUGGING
10759 AV *const trie_words = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]);
10760#endif
10761 const regnode *nextbranch= NULL;
10762 I32 word_idx;
10763 sv_setpvs(sv, "");
10764 for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
10765 SV ** const elem_ptr = av_fetch(trie_words,word_idx,0);
10766
10767 PerlIO_printf(Perl_debug_log, "%*s%s ",
10768 (int)(2*(indent+3)), "",
10769 elem_ptr ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr), SvCUR(*elem_ptr), 60,
10770 PL_colors[0], PL_colors[1],
10771 (SvUTF8(*elem_ptr) ? PERL_PV_ESCAPE_UNI : 0) |
10772 PERL_PV_PRETTY_ELLIPSES |
10773 PERL_PV_PRETTY_LTGT
10774 )
10775 : "???"
10776 );
10777 if (trie->jump) {
10778 U16 dist= trie->jump[word_idx+1];
10779 PerlIO_printf(Perl_debug_log, "(%"UVuf")\n",
10780 (UV)((dist ? this_trie + dist : next) - start));
10781 if (dist) {
10782 if (!nextbranch)
10783 nextbranch= this_trie + trie->jump[0];
10784 DUMPUNTIL(this_trie + dist, nextbranch);
10785 }
10786 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
10787 nextbranch= regnext((regnode *)nextbranch);
10788 } else {
10789 PerlIO_printf(Perl_debug_log, "\n");
10790 }
10791 }
10792 if (last && next > last)
10793 node= last;
10794 else
10795 node= next;
10796 }
10797 else if ( op == CURLY ) { /* "next" might be very big: optimizer */
10798 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
10799 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
10800 }
10801 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
10802 assert(next);
10803 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
10804 }
10805 else if ( op == PLUS || op == STAR) {
10806 DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
10807 }
10808 else if (PL_regkind[(U8)op] == ANYOF) {
10809 /* arglen 1 + class block */
10810 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_CLASS)
10811 ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
10812 node = NEXTOPER(node);
10813 }
10814 else if (PL_regkind[(U8)op] == EXACT) {
10815 /* Literal string, where present. */
10816 node += NODE_SZ_STR(node) - 1;
10817 node = NEXTOPER(node);
10818 }
10819 else {
10820 node = NEXTOPER(node);
10821 node += regarglen[(U8)op];
10822 }
10823 if (op == CURLYX || op == OPEN)
10824 indent++;
10825 }
10826 CLEAR_OPTSTART;
10827#ifdef DEBUG_DUMPUNTIL
10828 PerlIO_printf(Perl_debug_log, "--- %d\n", (int)indent);
10829#endif
10830 return node;
10831}
10832
10833#endif /* DEBUGGING */
10834
10835/*
10836 * Local variables:
10837 * c-indentation-style: bsd
10838 * c-basic-offset: 4
10839 * indent-tabs-mode: t
10840 * End:
10841 *
10842 * ex: set ts=8 sts=4 sw=4 noet:
10843 */