This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Simplify a tiny bit of the release manager guide
[perl5.git] / regcomp.c
... / ...
CommitLineData
1/* regcomp.c
2 */
3
4/*
5 * 'A fair jaw-cracker dwarf-language must be.' --Samwise Gamgee
6 *
7 * [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
8 */
9
10/* This file contains functions for compiling a regular expression. See
11 * also regexec.c which funnily enough, contains functions for executing
12 * a regular expression.
13 *
14 * This file is also copied at build time to ext/re/re_comp.c, where
15 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
16 * This causes the main functions to be compiled under new names and with
17 * debugging support added, which makes "use re 'debug'" work.
18 */
19
20/* NOTE: this is derived from Henry Spencer's regexp code, and should not
21 * confused with the original package (see point 3 below). Thanks, Henry!
22 */
23
24/* Additional note: this code is very heavily munged from Henry's version
25 * in places. In some spots I've traded clarity for efficiency, so don't
26 * blame Henry for some of the lack of readability.
27 */
28
29/* The names of the functions have been changed from regcomp and
30 * regexec to pregcomp and pregexec in order to avoid conflicts
31 * with the POSIX routines of the same names.
32*/
33
34#ifdef PERL_EXT_RE_BUILD
35#include "re_top.h"
36#endif
37
38/*
39 * pregcomp and pregexec -- regsub and regerror are not used in perl
40 *
41 * Copyright (c) 1986 by University of Toronto.
42 * Written by Henry Spencer. Not derived from licensed software.
43 *
44 * Permission is granted to anyone to use this software for any
45 * purpose on any computer system, and to redistribute it freely,
46 * subject to the following restrictions:
47 *
48 * 1. The author is not responsible for the consequences of use of
49 * this software, no matter how awful, even if they arise
50 * from defects in it.
51 *
52 * 2. The origin of this software must not be misrepresented, either
53 * by explicit claim or by omission.
54 *
55 * 3. Altered versions must be plainly marked as such, and must not
56 * be misrepresented as being the original software.
57 *
58 *
59 **** Alterations to Henry's code are...
60 ****
61 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
62 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
63 **** by Larry Wall and others
64 ****
65 **** You may distribute under the terms of either the GNU General Public
66 **** License or the Artistic License, as specified in the README file.
67
68 *
69 * Beware that some of this code is subtly aware of the way operator
70 * precedence is structured in regular expressions. Serious changes in
71 * regular-expression syntax might require a total rethink.
72 */
73#include "EXTERN.h"
74#define PERL_IN_REGCOMP_C
75#include "perl.h"
76
77#ifndef PERL_IN_XSUB_RE
78# include "INTERN.h"
79#endif
80
81#define REG_COMP_C
82#ifdef PERL_IN_XSUB_RE
83# include "re_comp.h"
84#else
85# include "regcomp.h"
86#endif
87
88#include "dquote_static.c"
89
90#ifdef op
91#undef op
92#endif /* op */
93
94#ifdef MSDOS
95# if defined(BUGGY_MSC6)
96 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
97# pragma optimize("a",off)
98 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
99# pragma optimize("w",on )
100# endif /* BUGGY_MSC6 */
101#endif /* MSDOS */
102
103#ifndef STATIC
104#define STATIC static
105#endif
106
107typedef struct RExC_state_t {
108 U32 flags; /* are we folding, multilining? */
109 char *precomp; /* uncompiled string. */
110 REGEXP *rx_sv; /* The SV that is the regexp. */
111 regexp *rx; /* perl core regexp structure */
112 regexp_internal *rxi; /* internal data for regexp object pprivate field */
113 char *start; /* Start of input for compile */
114 char *end; /* End of input for compile */
115 char *parse; /* Input-scan pointer. */
116 I32 whilem_seen; /* number of WHILEM in this expr */
117 regnode *emit_start; /* Start of emitted-code area */
118 regnode *emit_bound; /* First regnode outside of the allocated space */
119 regnode *emit; /* Code-emit pointer; &regdummy = don't = compiling */
120 I32 naughty; /* How bad is this pattern? */
121 I32 sawback; /* Did we see \1, ...? */
122 U32 seen;
123 I32 size; /* Code size. */
124 I32 npar; /* Capture buffer count, (OPEN). */
125 I32 cpar; /* Capture buffer count, (CLOSE). */
126 I32 nestroot; /* root parens we are in - used by accept */
127 I32 extralen;
128 I32 seen_zerolen;
129 I32 seen_evals;
130 regnode **open_parens; /* pointers to open parens */
131 regnode **close_parens; /* pointers to close parens */
132 regnode *opend; /* END node in program */
133 I32 utf8; /* whether the pattern is utf8 or not */
134 I32 orig_utf8; /* whether the pattern was originally in utf8 */
135 /* XXX use this for future optimisation of case
136 * where pattern must be upgraded to utf8. */
137 I32 uni_semantics; /* If a d charset modifier should use unicode
138 rules, even if the pattern is not in
139 utf8 */
140 HV *paren_names; /* Paren names */
141
142 regnode **recurse; /* Recurse regops */
143 I32 recurse_count; /* Number of recurse regops */
144 I32 in_lookbehind;
145 I32 contains_locale;
146#if ADD_TO_REGEXEC
147 char *starttry; /* -Dr: where regtry was called. */
148#define RExC_starttry (pRExC_state->starttry)
149#endif
150#ifdef DEBUGGING
151 const char *lastparse;
152 I32 lastnum;
153 AV *paren_name_list; /* idx -> name */
154#define RExC_lastparse (pRExC_state->lastparse)
155#define RExC_lastnum (pRExC_state->lastnum)
156#define RExC_paren_name_list (pRExC_state->paren_name_list)
157#endif
158} RExC_state_t;
159
160#define RExC_flags (pRExC_state->flags)
161#define RExC_precomp (pRExC_state->precomp)
162#define RExC_rx_sv (pRExC_state->rx_sv)
163#define RExC_rx (pRExC_state->rx)
164#define RExC_rxi (pRExC_state->rxi)
165#define RExC_start (pRExC_state->start)
166#define RExC_end (pRExC_state->end)
167#define RExC_parse (pRExC_state->parse)
168#define RExC_whilem_seen (pRExC_state->whilem_seen)
169#ifdef RE_TRACK_PATTERN_OFFSETS
170#define RExC_offsets (pRExC_state->rxi->u.offsets) /* I am not like the others */
171#endif
172#define RExC_emit (pRExC_state->emit)
173#define RExC_emit_start (pRExC_state->emit_start)
174#define RExC_emit_bound (pRExC_state->emit_bound)
175#define RExC_naughty (pRExC_state->naughty)
176#define RExC_sawback (pRExC_state->sawback)
177#define RExC_seen (pRExC_state->seen)
178#define RExC_size (pRExC_state->size)
179#define RExC_npar (pRExC_state->npar)
180#define RExC_nestroot (pRExC_state->nestroot)
181#define RExC_extralen (pRExC_state->extralen)
182#define RExC_seen_zerolen (pRExC_state->seen_zerolen)
183#define RExC_seen_evals (pRExC_state->seen_evals)
184#define RExC_utf8 (pRExC_state->utf8)
185#define RExC_uni_semantics (pRExC_state->uni_semantics)
186#define RExC_orig_utf8 (pRExC_state->orig_utf8)
187#define RExC_open_parens (pRExC_state->open_parens)
188#define RExC_close_parens (pRExC_state->close_parens)
189#define RExC_opend (pRExC_state->opend)
190#define RExC_paren_names (pRExC_state->paren_names)
191#define RExC_recurse (pRExC_state->recurse)
192#define RExC_recurse_count (pRExC_state->recurse_count)
193#define RExC_in_lookbehind (pRExC_state->in_lookbehind)
194#define RExC_contains_locale (pRExC_state->contains_locale)
195
196
197#define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
198#define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
199 ((*s) == '{' && regcurly(s)))
200
201#ifdef SPSTART
202#undef SPSTART /* dratted cpp namespace... */
203#endif
204/*
205 * Flags to be passed up and down.
206 */
207#define WORST 0 /* Worst case. */
208#define HASWIDTH 0x01 /* Known to match non-null strings. */
209
210/* Simple enough to be STAR/PLUS operand, in an EXACT node must be a single
211 * character, and if utf8, must be invariant. Note that this is not the same thing as REGNODE_SIMPLE */
212#define SIMPLE 0x02
213#define SPSTART 0x04 /* Starts with * or +. */
214#define TRYAGAIN 0x08 /* Weeded out a declaration. */
215#define POSTPONED 0x10 /* (?1),(?&name), (??{...}) or similar */
216
217#define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
218
219/* whether trie related optimizations are enabled */
220#if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
221#define TRIE_STUDY_OPT
222#define FULL_TRIE_STUDY
223#define TRIE_STCLASS
224#endif
225
226
227
228#define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
229#define PBITVAL(paren) (1 << ((paren) & 7))
230#define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
231#define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
232#define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
233
234/* If not already in utf8, do a longjmp back to the beginning */
235#define UTF8_LONGJMP 42 /* Choose a value not likely to ever conflict */
236#define REQUIRE_UTF8 STMT_START { \
237 if (! UTF) JMPENV_JUMP(UTF8_LONGJMP); \
238 } STMT_END
239
240/* About scan_data_t.
241
242 During optimisation we recurse through the regexp program performing
243 various inplace (keyhole style) optimisations. In addition study_chunk
244 and scan_commit populate this data structure with information about
245 what strings MUST appear in the pattern. We look for the longest
246 string that must appear at a fixed location, and we look for the
247 longest string that may appear at a floating location. So for instance
248 in the pattern:
249
250 /FOO[xX]A.*B[xX]BAR/
251
252 Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
253 strings (because they follow a .* construct). study_chunk will identify
254 both FOO and BAR as being the longest fixed and floating strings respectively.
255
256 The strings can be composites, for instance
257
258 /(f)(o)(o)/
259
260 will result in a composite fixed substring 'foo'.
261
262 For each string some basic information is maintained:
263
264 - offset or min_offset
265 This is the position the string must appear at, or not before.
266 It also implicitly (when combined with minlenp) tells us how many
267 characters must match before the string we are searching for.
268 Likewise when combined with minlenp and the length of the string it
269 tells us how many characters must appear after the string we have
270 found.
271
272 - max_offset
273 Only used for floating strings. This is the rightmost point that
274 the string can appear at. If set to I32 max it indicates that the
275 string can occur infinitely far to the right.
276
277 - minlenp
278 A pointer to the minimum length of the pattern that the string
279 was found inside. This is important as in the case of positive
280 lookahead or positive lookbehind we can have multiple patterns
281 involved. Consider
282
283 /(?=FOO).*F/
284
285 The minimum length of the pattern overall is 3, the minimum length
286 of the lookahead part is 3, but the minimum length of the part that
287 will actually match is 1. So 'FOO's minimum length is 3, but the
288 minimum length for the F is 1. This is important as the minimum length
289 is used to determine offsets in front of and behind the string being
290 looked for. Since strings can be composites this is the length of the
291 pattern at the time it was committed with a scan_commit. Note that
292 the length is calculated by study_chunk, so that the minimum lengths
293 are not known until the full pattern has been compiled, thus the
294 pointer to the value.
295
296 - lookbehind
297
298 In the case of lookbehind the string being searched for can be
299 offset past the start point of the final matching string.
300 If this value was just blithely removed from the min_offset it would
301 invalidate some of the calculations for how many chars must match
302 before or after (as they are derived from min_offset and minlen and
303 the length of the string being searched for).
304 When the final pattern is compiled and the data is moved from the
305 scan_data_t structure into the regexp structure the information
306 about lookbehind is factored in, with the information that would
307 have been lost precalculated in the end_shift field for the
308 associated string.
309
310 The fields pos_min and pos_delta are used to store the minimum offset
311 and the delta to the maximum offset at the current point in the pattern.
312
313*/
314
315typedef struct scan_data_t {
316 /*I32 len_min; unused */
317 /*I32 len_delta; unused */
318 I32 pos_min;
319 I32 pos_delta;
320 SV *last_found;
321 I32 last_end; /* min value, <0 unless valid. */
322 I32 last_start_min;
323 I32 last_start_max;
324 SV **longest; /* Either &l_fixed, or &l_float. */
325 SV *longest_fixed; /* longest fixed string found in pattern */
326 I32 offset_fixed; /* offset where it starts */
327 I32 *minlen_fixed; /* pointer to the minlen relevant to the string */
328 I32 lookbehind_fixed; /* is the position of the string modfied by LB */
329 SV *longest_float; /* longest floating string found in pattern */
330 I32 offset_float_min; /* earliest point in string it can appear */
331 I32 offset_float_max; /* latest point in string it can appear */
332 I32 *minlen_float; /* pointer to the minlen relevant to the string */
333 I32 lookbehind_float; /* is the position of the string modified by LB */
334 I32 flags;
335 I32 whilem_c;
336 I32 *last_closep;
337 struct regnode_charclass_class *start_class;
338} scan_data_t;
339
340/*
341 * Forward declarations for pregcomp()'s friends.
342 */
343
344static const scan_data_t zero_scan_data =
345 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
346
347#define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
348#define SF_BEFORE_SEOL 0x0001
349#define SF_BEFORE_MEOL 0x0002
350#define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
351#define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
352
353#ifdef NO_UNARY_PLUS
354# define SF_FIX_SHIFT_EOL (0+2)
355# define SF_FL_SHIFT_EOL (0+4)
356#else
357# define SF_FIX_SHIFT_EOL (+2)
358# define SF_FL_SHIFT_EOL (+4)
359#endif
360
361#define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
362#define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
363
364#define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
365#define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
366#define SF_IS_INF 0x0040
367#define SF_HAS_PAR 0x0080
368#define SF_IN_PAR 0x0100
369#define SF_HAS_EVAL 0x0200
370#define SCF_DO_SUBSTR 0x0400
371#define SCF_DO_STCLASS_AND 0x0800
372#define SCF_DO_STCLASS_OR 0x1000
373#define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
374#define SCF_WHILEM_VISITED_POS 0x2000
375
376#define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */
377#define SCF_SEEN_ACCEPT 0x8000
378
379#define UTF cBOOL(RExC_utf8)
380#define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET)
381#define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET)
382#define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_DEPENDS_CHARSET)
383#define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags) >= REGEX_UNICODE_CHARSET)
384#define ASCII_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_RESTRICTED_CHARSET)
385#define MORE_ASCII_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_MORE_RESTRICTED_CHARSET)
386#define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags) >= REGEX_ASCII_RESTRICTED_CHARSET)
387
388#define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
389
390#define OOB_UNICODE 12345678
391#define OOB_NAMEDCLASS -1
392
393#define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
394#define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
395
396
397/* length of regex to show in messages that don't mark a position within */
398#define RegexLengthToShowInErrorMessages 127
399
400/*
401 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
402 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
403 * op/pragma/warn/regcomp.
404 */
405#define MARKER1 "<-- HERE" /* marker as it appears in the description */
406#define MARKER2 " <-- HERE " /* marker as it appears within the regex */
407
408#define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
409
410/*
411 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
412 * arg. Show regex, up to a maximum length. If it's too long, chop and add
413 * "...".
414 */
415#define _FAIL(code) STMT_START { \
416 const char *ellipses = ""; \
417 IV len = RExC_end - RExC_precomp; \
418 \
419 if (!SIZE_ONLY) \
420 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
421 if (len > RegexLengthToShowInErrorMessages) { \
422 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
423 len = RegexLengthToShowInErrorMessages - 10; \
424 ellipses = "..."; \
425 } \
426 code; \
427} STMT_END
428
429#define FAIL(msg) _FAIL( \
430 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
431 msg, (int)len, RExC_precomp, ellipses))
432
433#define FAIL2(msg,arg) _FAIL( \
434 Perl_croak(aTHX_ msg " in regex m/%.*s%s/", \
435 arg, (int)len, RExC_precomp, ellipses))
436
437/*
438 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
439 */
440#define Simple_vFAIL(m) STMT_START { \
441 const IV offset = RExC_parse - RExC_precomp; \
442 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
443 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
444} STMT_END
445
446/*
447 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
448 */
449#define vFAIL(m) STMT_START { \
450 if (!SIZE_ONLY) \
451 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
452 Simple_vFAIL(m); \
453} STMT_END
454
455/*
456 * Like Simple_vFAIL(), but accepts two arguments.
457 */
458#define Simple_vFAIL2(m,a1) STMT_START { \
459 const IV offset = RExC_parse - RExC_precomp; \
460 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
461 (int)offset, RExC_precomp, RExC_precomp + offset); \
462} STMT_END
463
464/*
465 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
466 */
467#define vFAIL2(m,a1) STMT_START { \
468 if (!SIZE_ONLY) \
469 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
470 Simple_vFAIL2(m, a1); \
471} STMT_END
472
473
474/*
475 * Like Simple_vFAIL(), but accepts three arguments.
476 */
477#define Simple_vFAIL3(m, a1, a2) STMT_START { \
478 const IV offset = RExC_parse - RExC_precomp; \
479 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
480 (int)offset, RExC_precomp, RExC_precomp + offset); \
481} STMT_END
482
483/*
484 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
485 */
486#define vFAIL3(m,a1,a2) STMT_START { \
487 if (!SIZE_ONLY) \
488 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
489 Simple_vFAIL3(m, a1, a2); \
490} STMT_END
491
492/*
493 * Like Simple_vFAIL(), but accepts four arguments.
494 */
495#define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
496 const IV offset = RExC_parse - RExC_precomp; \
497 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
498 (int)offset, RExC_precomp, RExC_precomp + offset); \
499} STMT_END
500
501#define ckWARNreg(loc,m) STMT_START { \
502 const IV offset = loc - RExC_precomp; \
503 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
504 (int)offset, RExC_precomp, RExC_precomp + offset); \
505} STMT_END
506
507#define ckWARNregdep(loc,m) STMT_START { \
508 const IV offset = loc - RExC_precomp; \
509 Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
510 m REPORT_LOCATION, \
511 (int)offset, RExC_precomp, RExC_precomp + offset); \
512} STMT_END
513
514#define ckWARN2regdep(loc,m, a1) STMT_START { \
515 const IV offset = loc - RExC_precomp; \
516 Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
517 m REPORT_LOCATION, \
518 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
519} STMT_END
520
521#define ckWARN2reg(loc, m, a1) STMT_START { \
522 const IV offset = loc - RExC_precomp; \
523 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
524 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
525} STMT_END
526
527#define vWARN3(loc, m, a1, a2) STMT_START { \
528 const IV offset = loc - RExC_precomp; \
529 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
530 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
531} STMT_END
532
533#define ckWARN3reg(loc, m, a1, a2) STMT_START { \
534 const IV offset = loc - RExC_precomp; \
535 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
536 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
537} STMT_END
538
539#define vWARN4(loc, m, a1, a2, a3) STMT_START { \
540 const IV offset = loc - RExC_precomp; \
541 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
542 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
543} STMT_END
544
545#define ckWARN4reg(loc, m, a1, a2, a3) STMT_START { \
546 const IV offset = loc - RExC_precomp; \
547 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
548 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
549} STMT_END
550
551#define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
552 const IV offset = loc - RExC_precomp; \
553 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
554 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
555} STMT_END
556
557
558/* Allow for side effects in s */
559#define REGC(c,s) STMT_START { \
560 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
561} STMT_END
562
563/* Macros for recording node offsets. 20001227 mjd@plover.com
564 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
565 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
566 * Element 0 holds the number n.
567 * Position is 1 indexed.
568 */
569#ifndef RE_TRACK_PATTERN_OFFSETS
570#define Set_Node_Offset_To_R(node,byte)
571#define Set_Node_Offset(node,byte)
572#define Set_Cur_Node_Offset
573#define Set_Node_Length_To_R(node,len)
574#define Set_Node_Length(node,len)
575#define Set_Node_Cur_Length(node)
576#define Node_Offset(n)
577#define Node_Length(n)
578#define Set_Node_Offset_Length(node,offset,len)
579#define ProgLen(ri) ri->u.proglen
580#define SetProgLen(ri,x) ri->u.proglen = x
581#else
582#define ProgLen(ri) ri->u.offsets[0]
583#define SetProgLen(ri,x) ri->u.offsets[0] = x
584#define Set_Node_Offset_To_R(node,byte) STMT_START { \
585 if (! SIZE_ONLY) { \
586 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
587 __LINE__, (int)(node), (int)(byte))); \
588 if((node) < 0) { \
589 Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
590 } else { \
591 RExC_offsets[2*(node)-1] = (byte); \
592 } \
593 } \
594} STMT_END
595
596#define Set_Node_Offset(node,byte) \
597 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
598#define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
599
600#define Set_Node_Length_To_R(node,len) STMT_START { \
601 if (! SIZE_ONLY) { \
602 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
603 __LINE__, (int)(node), (int)(len))); \
604 if((node) < 0) { \
605 Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
606 } else { \
607 RExC_offsets[2*(node)] = (len); \
608 } \
609 } \
610} STMT_END
611
612#define Set_Node_Length(node,len) \
613 Set_Node_Length_To_R((node)-RExC_emit_start, len)
614#define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
615#define Set_Node_Cur_Length(node) \
616 Set_Node_Length(node, RExC_parse - parse_start)
617
618/* Get offsets and lengths */
619#define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
620#define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
621
622#define Set_Node_Offset_Length(node,offset,len) STMT_START { \
623 Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
624 Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
625} STMT_END
626#endif
627
628#if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
629#define EXPERIMENTAL_INPLACESCAN
630#endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/
631
632#define DEBUG_STUDYDATA(str,data,depth) \
633DEBUG_OPTIMISE_MORE_r(if(data){ \
634 PerlIO_printf(Perl_debug_log, \
635 "%*s" str "Pos:%"IVdf"/%"IVdf \
636 " Flags: 0x%"UVXf" Whilem_c: %"IVdf" Lcp: %"IVdf" %s", \
637 (int)(depth)*2, "", \
638 (IV)((data)->pos_min), \
639 (IV)((data)->pos_delta), \
640 (UV)((data)->flags), \
641 (IV)((data)->whilem_c), \
642 (IV)((data)->last_closep ? *((data)->last_closep) : -1), \
643 is_inf ? "INF " : "" \
644 ); \
645 if ((data)->last_found) \
646 PerlIO_printf(Perl_debug_log, \
647 "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
648 " %sFloat: '%s' @ %"IVdf"/%"IVdf"", \
649 SvPVX_const((data)->last_found), \
650 (IV)((data)->last_end), \
651 (IV)((data)->last_start_min), \
652 (IV)((data)->last_start_max), \
653 ((data)->longest && \
654 (data)->longest==&((data)->longest_fixed)) ? "*" : "", \
655 SvPVX_const((data)->longest_fixed), \
656 (IV)((data)->offset_fixed), \
657 ((data)->longest && \
658 (data)->longest==&((data)->longest_float)) ? "*" : "", \
659 SvPVX_const((data)->longest_float), \
660 (IV)((data)->offset_float_min), \
661 (IV)((data)->offset_float_max) \
662 ); \
663 PerlIO_printf(Perl_debug_log,"\n"); \
664});
665
666static void clear_re(pTHX_ void *r);
667
668/* Mark that we cannot extend a found fixed substring at this point.
669 Update the longest found anchored substring and the longest found
670 floating substrings if needed. */
671
672STATIC void
673S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *minlenp, int is_inf)
674{
675 const STRLEN l = CHR_SVLEN(data->last_found);
676 const STRLEN old_l = CHR_SVLEN(*data->longest);
677 GET_RE_DEBUG_FLAGS_DECL;
678
679 PERL_ARGS_ASSERT_SCAN_COMMIT;
680
681 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
682 SvSetMagicSV(*data->longest, data->last_found);
683 if (*data->longest == data->longest_fixed) {
684 data->offset_fixed = l ? data->last_start_min : data->pos_min;
685 if (data->flags & SF_BEFORE_EOL)
686 data->flags
687 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
688 else
689 data->flags &= ~SF_FIX_BEFORE_EOL;
690 data->minlen_fixed=minlenp;
691 data->lookbehind_fixed=0;
692 }
693 else { /* *data->longest == data->longest_float */
694 data->offset_float_min = l ? data->last_start_min : data->pos_min;
695 data->offset_float_max = (l
696 ? data->last_start_max
697 : data->pos_min + data->pos_delta);
698 if (is_inf || (U32)data->offset_float_max > (U32)I32_MAX)
699 data->offset_float_max = I32_MAX;
700 if (data->flags & SF_BEFORE_EOL)
701 data->flags
702 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
703 else
704 data->flags &= ~SF_FL_BEFORE_EOL;
705 data->minlen_float=minlenp;
706 data->lookbehind_float=0;
707 }
708 }
709 SvCUR_set(data->last_found, 0);
710 {
711 SV * const sv = data->last_found;
712 if (SvUTF8(sv) && SvMAGICAL(sv)) {
713 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
714 if (mg)
715 mg->mg_len = 0;
716 }
717 }
718 data->last_end = -1;
719 data->flags &= ~SF_BEFORE_EOL;
720 DEBUG_STUDYDATA("commit: ",data,0);
721}
722
723/* Can match anything (initialization) */
724STATIC void
725S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
726{
727 PERL_ARGS_ASSERT_CL_ANYTHING;
728
729 ANYOF_BITMAP_SETALL(cl);
730 cl->flags = ANYOF_CLASS|ANYOF_EOS|ANYOF_UNICODE_ALL
731 |ANYOF_LOC_NONBITMAP_FOLD|ANYOF_NON_UTF8_LATIN1_ALL
732 /* Even though no bitmap is in use here, we need to set
733 * the flag below so an AND with a node that does have one
734 * doesn't lose that one. The flag should get cleared if
735 * the other one doesn't; and the code in regexec.c is
736 * structured so this being set when not needed does no
737 * harm. It seemed a little cleaner to set it here than do
738 * a special case in cl_and() */
739 |ANYOF_NONBITMAP_NON_UTF8;
740
741 /* If any portion of the regex is to operate under locale rules,
742 * initialization includes it. The reason this isn't done for all regexes
743 * is that the optimizer was written under the assumption that locale was
744 * all-or-nothing. Given the complexity and lack of documentation in the
745 * optimizer, and that there are inadequate test cases for locale, so many
746 * parts of it may not work properly, it is safest to avoid locale unless
747 * necessary. */
748 if (RExC_contains_locale) {
749 ANYOF_CLASS_SETALL(cl); /* /l uses class */
750 cl->flags |= ANYOF_LOCALE;
751 }
752 else {
753 ANYOF_CLASS_ZERO(cl); /* Only /l uses class now */
754 }
755}
756
757/* Can match anything (initialization) */
758STATIC int
759S_cl_is_anything(const struct regnode_charclass_class *cl)
760{
761 int value;
762
763 PERL_ARGS_ASSERT_CL_IS_ANYTHING;
764
765 for (value = 0; value <= ANYOF_MAX; value += 2)
766 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
767 return 1;
768 if (!(cl->flags & ANYOF_UNICODE_ALL))
769 return 0;
770 if (!ANYOF_BITMAP_TESTALLSET((const void*)cl))
771 return 0;
772 return 1;
773}
774
775/* Can match anything (initialization) */
776STATIC void
777S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
778{
779 PERL_ARGS_ASSERT_CL_INIT;
780
781 Zero(cl, 1, struct regnode_charclass_class);
782 cl->type = ANYOF;
783 cl_anything(pRExC_state, cl);
784 ARG_SET(cl, ANYOF_NONBITMAP_EMPTY);
785}
786
787/* These two functions currently do the exact same thing */
788#define cl_init_zero S_cl_init
789
790/* 'AND' a given class with another one. Can create false positives. 'cl'
791 * should not be inverted. 'and_with->flags & ANYOF_CLASS' should be 0 if
792 * 'and_with' is a regnode_charclass instead of a regnode_charclass_class. */
793STATIC void
794S_cl_and(struct regnode_charclass_class *cl,
795 const struct regnode_charclass_class *and_with)
796{
797 PERL_ARGS_ASSERT_CL_AND;
798
799 assert(and_with->type == ANYOF);
800
801 /* I (khw) am not sure all these restrictions are necessary XXX */
802 if (!(ANYOF_CLASS_TEST_ANY_SET(and_with))
803 && !(ANYOF_CLASS_TEST_ANY_SET(cl))
804 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
805 && !(and_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
806 && !(cl->flags & ANYOF_LOC_NONBITMAP_FOLD)) {
807 int i;
808
809 if (and_with->flags & ANYOF_INVERT)
810 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
811 cl->bitmap[i] &= ~and_with->bitmap[i];
812 else
813 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
814 cl->bitmap[i] &= and_with->bitmap[i];
815 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
816
817 if (and_with->flags & ANYOF_INVERT) {
818
819 /* Here, the and'ed node is inverted. Get the AND of the flags that
820 * aren't affected by the inversion. Those that are affected are
821 * handled individually below */
822 U8 affected_flags = cl->flags & ~INVERSION_UNAFFECTED_FLAGS;
823 cl->flags &= (and_with->flags & INVERSION_UNAFFECTED_FLAGS);
824 cl->flags |= affected_flags;
825
826 /* We currently don't know how to deal with things that aren't in the
827 * bitmap, but we know that the intersection is no greater than what
828 * is already in cl, so let there be false positives that get sorted
829 * out after the synthetic start class succeeds, and the node is
830 * matched for real. */
831
832 /* The inversion of these two flags indicate that the resulting
833 * intersection doesn't have them */
834 if (and_with->flags & ANYOF_UNICODE_ALL) {
835 cl->flags &= ~ANYOF_UNICODE_ALL;
836 }
837 if (and_with->flags & ANYOF_NON_UTF8_LATIN1_ALL) {
838 cl->flags &= ~ANYOF_NON_UTF8_LATIN1_ALL;
839 }
840 }
841 else { /* and'd node is not inverted */
842 if (! ANYOF_NONBITMAP(and_with)) {
843
844 /* Here 'and_with' doesn't match anything outside the bitmap
845 * (except possibly ANYOF_UNICODE_ALL), which means the
846 * intersection can't either, except for ANYOF_UNICODE_ALL, in
847 * which case we don't know what the intersection is, but it's no
848 * greater than what cl already has, so can just leave it alone,
849 * with possible false positives */
850 if (! (and_with->flags & ANYOF_UNICODE_ALL)) {
851 ARG_SET(cl, ANYOF_NONBITMAP_EMPTY);
852 cl->flags &= ~ANYOF_NONBITMAP_NON_UTF8;
853 }
854 }
855 else if (! ANYOF_NONBITMAP(cl)) {
856
857 /* Here, 'and_with' does match something outside the bitmap, and cl
858 * doesn't have a list of things to match outside the bitmap. If
859 * cl can match all code points above 255, the intersection will
860 * be those above-255 code points that 'and_with' matches. There
861 * may be false positives from code points in 'and_with' that are
862 * outside the bitmap but below 256, but those get sorted out
863 * after the synthetic start class succeeds). If cl can't match
864 * all Unicode code points, it means here that it can't match *
865 * anything outside the bitmap, so we leave the bitmap empty */
866 if (cl->flags & ANYOF_UNICODE_ALL) {
867 ARG_SET(cl, ARG(and_with));
868 }
869 }
870 else {
871 /* Here, both 'and_with' and cl match something outside the
872 * bitmap. Currently we do not do the intersection, so just match
873 * whatever cl had at the beginning. */
874 }
875
876
877 /* Take the intersection of the two sets of flags */
878 cl->flags &= and_with->flags;
879 }
880}
881
882/* 'OR' a given class with another one. Can create false positives. 'cl'
883 * should not be inverted. 'or_with->flags & ANYOF_CLASS' should be 0 if
884 * 'or_with' is a regnode_charclass instead of a regnode_charclass_class. */
885STATIC void
886S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
887{
888 PERL_ARGS_ASSERT_CL_OR;
889
890 if (or_with->flags & ANYOF_INVERT) {
891
892 /* Here, the or'd node is to be inverted. This means we take the
893 * complement of everything not in the bitmap, but currently we don't
894 * know what that is, so give up and match anything */
895 if (ANYOF_NONBITMAP(or_with)) {
896 cl_anything(pRExC_state, cl);
897 }
898 /* We do not use
899 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
900 * <= (B1 | !B2) | (CL1 | !CL2)
901 * which is wasteful if CL2 is small, but we ignore CL2:
902 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
903 * XXXX Can we handle case-fold? Unclear:
904 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
905 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
906 */
907 else if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
908 && !(or_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
909 && !(cl->flags & ANYOF_LOC_NONBITMAP_FOLD) ) {
910 int i;
911
912 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
913 cl->bitmap[i] |= ~or_with->bitmap[i];
914 } /* XXXX: logic is complicated otherwise */
915 else {
916 cl_anything(pRExC_state, cl);
917 }
918
919 /* And, we can just take the union of the flags that aren't affected
920 * by the inversion */
921 cl->flags |= or_with->flags & INVERSION_UNAFFECTED_FLAGS;
922
923 /* For the remaining flags:
924 ANYOF_UNICODE_ALL and inverted means to not match anything above
925 255, which means that the union with cl should just be
926 what cl has in it, so can ignore this flag
927 ANYOF_NON_UTF8_LATIN1_ALL and inverted means if not utf8 and ord
928 is 127-255 to match them, but then invert that, so the
929 union with cl should just be what cl has in it, so can
930 ignore this flag
931 */
932 } else { /* 'or_with' is not inverted */
933 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
934 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
935 && (!(or_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
936 || (cl->flags & ANYOF_LOC_NONBITMAP_FOLD)) ) {
937 int i;
938
939 /* OR char bitmap and class bitmap separately */
940 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
941 cl->bitmap[i] |= or_with->bitmap[i];
942 if (ANYOF_CLASS_TEST_ANY_SET(or_with)) {
943 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
944 cl->classflags[i] |= or_with->classflags[i];
945 cl->flags |= ANYOF_CLASS;
946 }
947 }
948 else { /* XXXX: logic is complicated, leave it along for a moment. */
949 cl_anything(pRExC_state, cl);
950 }
951
952 if (ANYOF_NONBITMAP(or_with)) {
953
954 /* Use the added node's outside-the-bit-map match if there isn't a
955 * conflict. If there is a conflict (both nodes match something
956 * outside the bitmap, but what they match outside is not the same
957 * pointer, and hence not easily compared until XXX we extend
958 * inversion lists this far), give up and allow the start class to
959 * match everything outside the bitmap. If that stuff is all above
960 * 255, can just set UNICODE_ALL, otherwise caould be anything. */
961 if (! ANYOF_NONBITMAP(cl)) {
962 ARG_SET(cl, ARG(or_with));
963 }
964 else if (ARG(cl) != ARG(or_with)) {
965
966 if ((or_with->flags & ANYOF_NONBITMAP_NON_UTF8)) {
967 cl_anything(pRExC_state, cl);
968 }
969 else {
970 cl->flags |= ANYOF_UNICODE_ALL;
971 }
972 }
973
974 /* Take the union */
975 cl->flags |= or_with->flags;
976 }
977 }
978}
979
980#define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
981#define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
982#define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
983#define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
984
985
986#ifdef DEBUGGING
987/*
988 dump_trie(trie,widecharmap,revcharmap)
989 dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
990 dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
991
992 These routines dump out a trie in a somewhat readable format.
993 The _interim_ variants are used for debugging the interim
994 tables that are used to generate the final compressed
995 representation which is what dump_trie expects.
996
997 Part of the reason for their existence is to provide a form
998 of documentation as to how the different representations function.
999
1000*/
1001
1002/*
1003 Dumps the final compressed table form of the trie to Perl_debug_log.
1004 Used for debugging make_trie().
1005*/
1006
1007STATIC void
1008S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
1009 AV *revcharmap, U32 depth)
1010{
1011 U32 state;
1012 SV *sv=sv_newmortal();
1013 int colwidth= widecharmap ? 6 : 4;
1014 U16 word;
1015 GET_RE_DEBUG_FLAGS_DECL;
1016
1017 PERL_ARGS_ASSERT_DUMP_TRIE;
1018
1019 PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
1020 (int)depth * 2 + 2,"",
1021 "Match","Base","Ofs" );
1022
1023 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
1024 SV ** const tmp = av_fetch( revcharmap, state, 0);
1025 if ( tmp ) {
1026 PerlIO_printf( Perl_debug_log, "%*s",
1027 colwidth,
1028 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1029 PL_colors[0], PL_colors[1],
1030 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1031 PERL_PV_ESCAPE_FIRSTCHAR
1032 )
1033 );
1034 }
1035 }
1036 PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
1037 (int)depth * 2 + 2,"");
1038
1039 for( state = 0 ; state < trie->uniquecharcount ; state++ )
1040 PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
1041 PerlIO_printf( Perl_debug_log, "\n");
1042
1043 for( state = 1 ; state < trie->statecount ; state++ ) {
1044 const U32 base = trie->states[ state ].trans.base;
1045
1046 PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
1047
1048 if ( trie->states[ state ].wordnum ) {
1049 PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
1050 } else {
1051 PerlIO_printf( Perl_debug_log, "%6s", "" );
1052 }
1053
1054 PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
1055
1056 if ( base ) {
1057 U32 ofs = 0;
1058
1059 while( ( base + ofs < trie->uniquecharcount ) ||
1060 ( base + ofs - trie->uniquecharcount < trie->lasttrans
1061 && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
1062 ofs++;
1063
1064 PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
1065
1066 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
1067 if ( ( base + ofs >= trie->uniquecharcount ) &&
1068 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
1069 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1070 {
1071 PerlIO_printf( Perl_debug_log, "%*"UVXf,
1072 colwidth,
1073 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
1074 } else {
1075 PerlIO_printf( Perl_debug_log, "%*s",colwidth," ." );
1076 }
1077 }
1078
1079 PerlIO_printf( Perl_debug_log, "]");
1080
1081 }
1082 PerlIO_printf( Perl_debug_log, "\n" );
1083 }
1084 PerlIO_printf(Perl_debug_log, "%*sword_info N:(prev,len)=", (int)depth*2, "");
1085 for (word=1; word <= trie->wordcount; word++) {
1086 PerlIO_printf(Perl_debug_log, " %d:(%d,%d)",
1087 (int)word, (int)(trie->wordinfo[word].prev),
1088 (int)(trie->wordinfo[word].len));
1089 }
1090 PerlIO_printf(Perl_debug_log, "\n" );
1091}
1092/*
1093 Dumps a fully constructed but uncompressed trie in list form.
1094 List tries normally only are used for construction when the number of
1095 possible chars (trie->uniquecharcount) is very high.
1096 Used for debugging make_trie().
1097*/
1098STATIC void
1099S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
1100 HV *widecharmap, AV *revcharmap, U32 next_alloc,
1101 U32 depth)
1102{
1103 U32 state;
1104 SV *sv=sv_newmortal();
1105 int colwidth= widecharmap ? 6 : 4;
1106 GET_RE_DEBUG_FLAGS_DECL;
1107
1108 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
1109
1110 /* print out the table precompression. */
1111 PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
1112 (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
1113 "------:-----+-----------------\n" );
1114
1115 for( state=1 ; state < next_alloc ; state ++ ) {
1116 U16 charid;
1117
1118 PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
1119 (int)depth * 2 + 2,"", (UV)state );
1120 if ( ! trie->states[ state ].wordnum ) {
1121 PerlIO_printf( Perl_debug_log, "%5s| ","");
1122 } else {
1123 PerlIO_printf( Perl_debug_log, "W%4x| ",
1124 trie->states[ state ].wordnum
1125 );
1126 }
1127 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
1128 SV ** const tmp = av_fetch( revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
1129 if ( tmp ) {
1130 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
1131 colwidth,
1132 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1133 PL_colors[0], PL_colors[1],
1134 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1135 PERL_PV_ESCAPE_FIRSTCHAR
1136 ) ,
1137 TRIE_LIST_ITEM(state,charid).forid,
1138 (UV)TRIE_LIST_ITEM(state,charid).newstate
1139 );
1140 if (!(charid % 10))
1141 PerlIO_printf(Perl_debug_log, "\n%*s| ",
1142 (int)((depth * 2) + 14), "");
1143 }
1144 }
1145 PerlIO_printf( Perl_debug_log, "\n");
1146 }
1147}
1148
1149/*
1150 Dumps a fully constructed but uncompressed trie in table form.
1151 This is the normal DFA style state transition table, with a few
1152 twists to facilitate compression later.
1153 Used for debugging make_trie().
1154*/
1155STATIC void
1156S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
1157 HV *widecharmap, AV *revcharmap, U32 next_alloc,
1158 U32 depth)
1159{
1160 U32 state;
1161 U16 charid;
1162 SV *sv=sv_newmortal();
1163 int colwidth= widecharmap ? 6 : 4;
1164 GET_RE_DEBUG_FLAGS_DECL;
1165
1166 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
1167
1168 /*
1169 print out the table precompression so that we can do a visual check
1170 that they are identical.
1171 */
1172
1173 PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
1174
1175 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1176 SV ** const tmp = av_fetch( revcharmap, charid, 0);
1177 if ( tmp ) {
1178 PerlIO_printf( Perl_debug_log, "%*s",
1179 colwidth,
1180 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1181 PL_colors[0], PL_colors[1],
1182 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1183 PERL_PV_ESCAPE_FIRSTCHAR
1184 )
1185 );
1186 }
1187 }
1188
1189 PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
1190
1191 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
1192 PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
1193 }
1194
1195 PerlIO_printf( Perl_debug_log, "\n" );
1196
1197 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1198
1199 PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
1200 (int)depth * 2 + 2,"",
1201 (UV)TRIE_NODENUM( state ) );
1202
1203 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1204 UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1205 if (v)
1206 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1207 else
1208 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
1209 }
1210 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1211 PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
1212 } else {
1213 PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
1214 trie->states[ TRIE_NODENUM( state ) ].wordnum );
1215 }
1216 }
1217}
1218
1219#endif
1220
1221
1222/* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1223 startbranch: the first branch in the whole branch sequence
1224 first : start branch of sequence of branch-exact nodes.
1225 May be the same as startbranch
1226 last : Thing following the last branch.
1227 May be the same as tail.
1228 tail : item following the branch sequence
1229 count : words in the sequence
1230 flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/
1231 depth : indent depth
1232
1233Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
1234
1235A trie is an N'ary tree where the branches are determined by digital
1236decomposition of the key. IE, at the root node you look up the 1st character and
1237follow that branch repeat until you find the end of the branches. Nodes can be
1238marked as "accepting" meaning they represent a complete word. Eg:
1239
1240 /he|she|his|hers/
1241
1242would convert into the following structure. Numbers represent states, letters
1243following numbers represent valid transitions on the letter from that state, if
1244the number is in square brackets it represents an accepting state, otherwise it
1245will be in parenthesis.
1246
1247 +-h->+-e->[3]-+-r->(8)-+-s->[9]
1248 | |
1249 | (2)
1250 | |
1251 (1) +-i->(6)-+-s->[7]
1252 |
1253 +-s->(3)-+-h->(4)-+-e->[5]
1254
1255 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1256
1257This shows that when matching against the string 'hers' we will begin at state 1
1258read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1259then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1260is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1261single traverse. We store a mapping from accepting to state to which word was
1262matched, and then when we have multiple possibilities we try to complete the
1263rest of the regex in the order in which they occured in the alternation.
1264
1265The only prior NFA like behaviour that would be changed by the TRIE support is
1266the silent ignoring of duplicate alternations which are of the form:
1267
1268 / (DUPE|DUPE) X? (?{ ... }) Y /x
1269
1270Thus EVAL blocks following a trie may be called a different number of times with
1271and without the optimisation. With the optimisations dupes will be silently
1272ignored. This inconsistent behaviour of EVAL type nodes is well established as
1273the following demonstrates:
1274
1275 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1276
1277which prints out 'word' three times, but
1278
1279 'words'=~/(word|word|word)(?{ print $1 })S/
1280
1281which doesnt print it out at all. This is due to other optimisations kicking in.
1282
1283Example of what happens on a structural level:
1284
1285The regexp /(ac|ad|ab)+/ will produce the following debug output:
1286
1287 1: CURLYM[1] {1,32767}(18)
1288 5: BRANCH(8)
1289 6: EXACT <ac>(16)
1290 8: BRANCH(11)
1291 9: EXACT <ad>(16)
1292 11: BRANCH(14)
1293 12: EXACT <ab>(16)
1294 16: SUCCEED(0)
1295 17: NOTHING(18)
1296 18: END(0)
1297
1298This would be optimizable with startbranch=5, first=5, last=16, tail=16
1299and should turn into:
1300
1301 1: CURLYM[1] {1,32767}(18)
1302 5: TRIE(16)
1303 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1304 <ac>
1305 <ad>
1306 <ab>
1307 16: SUCCEED(0)
1308 17: NOTHING(18)
1309 18: END(0)
1310
1311Cases where tail != last would be like /(?foo|bar)baz/:
1312
1313 1: BRANCH(4)
1314 2: EXACT <foo>(8)
1315 4: BRANCH(7)
1316 5: EXACT <bar>(8)
1317 7: TAIL(8)
1318 8: EXACT <baz>(10)
1319 10: END(0)
1320
1321which would be optimizable with startbranch=1, first=1, last=7, tail=8
1322and would end up looking like:
1323
1324 1: TRIE(8)
1325 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1326 <foo>
1327 <bar>
1328 7: TAIL(8)
1329 8: EXACT <baz>(10)
1330 10: END(0)
1331
1332 d = uvuni_to_utf8_flags(d, uv, 0);
1333
1334is the recommended Unicode-aware way of saying
1335
1336 *(d++) = uv;
1337*/
1338
1339#define TRIE_STORE_REVCHAR \
1340 STMT_START { \
1341 if (UTF) { \
1342 SV *zlopp = newSV(2); \
1343 unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp); \
1344 unsigned const char *const kapow = uvuni_to_utf8(flrbbbbb, uvc & 0xFF); \
1345 SvCUR_set(zlopp, kapow - flrbbbbb); \
1346 SvPOK_on(zlopp); \
1347 SvUTF8_on(zlopp); \
1348 av_push(revcharmap, zlopp); \
1349 } else { \
1350 char ooooff = (char)uvc; \
1351 av_push(revcharmap, newSVpvn(&ooooff, 1)); \
1352 } \
1353 } STMT_END
1354
1355#define TRIE_READ_CHAR STMT_START { \
1356 wordlen++; \
1357 if ( UTF ) { \
1358 if ( folder ) { \
1359 if ( foldlen > 0 ) { \
1360 uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags ); \
1361 foldlen -= len; \
1362 scan += len; \
1363 len = 0; \
1364 } else { \
1365 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1366 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
1367 foldlen -= UNISKIP( uvc ); \
1368 scan = foldbuf + UNISKIP( uvc ); \
1369 } \
1370 } else { \
1371 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1372 } \
1373 } else { \
1374 uvc = (U32)*uc; \
1375 len = 1; \
1376 } \
1377} STMT_END
1378
1379
1380
1381#define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
1382 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
1383 U32 ging = TRIE_LIST_LEN( state ) *= 2; \
1384 Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
1385 } \
1386 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
1387 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
1388 TRIE_LIST_CUR( state )++; \
1389} STMT_END
1390
1391#define TRIE_LIST_NEW(state) STMT_START { \
1392 Newxz( trie->states[ state ].trans.list, \
1393 4, reg_trie_trans_le ); \
1394 TRIE_LIST_CUR( state ) = 1; \
1395 TRIE_LIST_LEN( state ) = 4; \
1396} STMT_END
1397
1398#define TRIE_HANDLE_WORD(state) STMT_START { \
1399 U16 dupe= trie->states[ state ].wordnum; \
1400 regnode * const noper_next = regnext( noper ); \
1401 \
1402 DEBUG_r({ \
1403 /* store the word for dumping */ \
1404 SV* tmp; \
1405 if (OP(noper) != NOTHING) \
1406 tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF); \
1407 else \
1408 tmp = newSVpvn_utf8( "", 0, UTF ); \
1409 av_push( trie_words, tmp ); \
1410 }); \
1411 \
1412 curword++; \
1413 trie->wordinfo[curword].prev = 0; \
1414 trie->wordinfo[curword].len = wordlen; \
1415 trie->wordinfo[curword].accept = state; \
1416 \
1417 if ( noper_next < tail ) { \
1418 if (!trie->jump) \
1419 trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, sizeof(U16) ); \
1420 trie->jump[curword] = (U16)(noper_next - convert); \
1421 if (!jumper) \
1422 jumper = noper_next; \
1423 if (!nextbranch) \
1424 nextbranch= regnext(cur); \
1425 } \
1426 \
1427 if ( dupe ) { \
1428 /* It's a dupe. Pre-insert into the wordinfo[].prev */\
1429 /* chain, so that when the bits of chain are later */\
1430 /* linked together, the dups appear in the chain */\
1431 trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
1432 trie->wordinfo[dupe].prev = curword; \
1433 } else { \
1434 /* we haven't inserted this word yet. */ \
1435 trie->states[ state ].wordnum = curword; \
1436 } \
1437} STMT_END
1438
1439
1440#define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
1441 ( ( base + charid >= ucharcount \
1442 && base + charid < ubound \
1443 && state == trie->trans[ base - ucharcount + charid ].check \
1444 && trie->trans[ base - ucharcount + charid ].next ) \
1445 ? trie->trans[ base - ucharcount + charid ].next \
1446 : ( state==1 ? special : 0 ) \
1447 )
1448
1449#define MADE_TRIE 1
1450#define MADE_JUMP_TRIE 2
1451#define MADE_EXACT_TRIE 4
1452
1453STATIC I32
1454S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth)
1455{
1456 dVAR;
1457 /* first pass, loop through and scan words */
1458 reg_trie_data *trie;
1459 HV *widecharmap = NULL;
1460 AV *revcharmap = newAV();
1461 regnode *cur;
1462 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1463 STRLEN len = 0;
1464 UV uvc = 0;
1465 U16 curword = 0;
1466 U32 next_alloc = 0;
1467 regnode *jumper = NULL;
1468 regnode *nextbranch = NULL;
1469 regnode *convert = NULL;
1470 U32 *prev_states; /* temp array mapping each state to previous one */
1471 /* we just use folder as a flag in utf8 */
1472 const U8 * folder = NULL;
1473
1474#ifdef DEBUGGING
1475 const U32 data_slot = add_data( pRExC_state, 4, "tuuu" );
1476 AV *trie_words = NULL;
1477 /* along with revcharmap, this only used during construction but both are
1478 * useful during debugging so we store them in the struct when debugging.
1479 */
1480#else
1481 const U32 data_slot = add_data( pRExC_state, 2, "tu" );
1482 STRLEN trie_charcount=0;
1483#endif
1484 SV *re_trie_maxbuff;
1485 GET_RE_DEBUG_FLAGS_DECL;
1486
1487 PERL_ARGS_ASSERT_MAKE_TRIE;
1488#ifndef DEBUGGING
1489 PERL_UNUSED_ARG(depth);
1490#endif
1491
1492 switch (flags) {
1493 case EXACTFA:
1494 case EXACTFU: folder = PL_fold_latin1; break;
1495 case EXACTF: folder = PL_fold; break;
1496 case EXACTFL: folder = PL_fold_locale; break;
1497 }
1498
1499 trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
1500 trie->refcount = 1;
1501 trie->startstate = 1;
1502 trie->wordcount = word_count;
1503 RExC_rxi->data->data[ data_slot ] = (void*)trie;
1504 trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
1505 if (!(UTF && folder))
1506 trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
1507 trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
1508 trie->wordcount+1, sizeof(reg_trie_wordinfo));
1509
1510 DEBUG_r({
1511 trie_words = newAV();
1512 });
1513
1514 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
1515 if (!SvIOK(re_trie_maxbuff)) {
1516 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
1517 }
1518 DEBUG_OPTIMISE_r({
1519 PerlIO_printf( Perl_debug_log,
1520 "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
1521 (int)depth * 2 + 2, "",
1522 REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
1523 REG_NODE_NUM(last), REG_NODE_NUM(tail),
1524 (int)depth);
1525 });
1526
1527 /* Find the node we are going to overwrite */
1528 if ( first == startbranch && OP( last ) != BRANCH ) {
1529 /* whole branch chain */
1530 convert = first;
1531 } else {
1532 /* branch sub-chain */
1533 convert = NEXTOPER( first );
1534 }
1535
1536 /* -- First loop and Setup --
1537
1538 We first traverse the branches and scan each word to determine if it
1539 contains widechars, and how many unique chars there are, this is
1540 important as we have to build a table with at least as many columns as we
1541 have unique chars.
1542
1543 We use an array of integers to represent the character codes 0..255
1544 (trie->charmap) and we use a an HV* to store Unicode characters. We use the
1545 native representation of the character value as the key and IV's for the
1546 coded index.
1547
1548 *TODO* If we keep track of how many times each character is used we can
1549 remap the columns so that the table compression later on is more
1550 efficient in terms of memory by ensuring the most common value is in the
1551 middle and the least common are on the outside. IMO this would be better
1552 than a most to least common mapping as theres a decent chance the most
1553 common letter will share a node with the least common, meaning the node
1554 will not be compressible. With a middle is most common approach the worst
1555 case is when we have the least common nodes twice.
1556
1557 */
1558
1559 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1560 regnode * const noper = NEXTOPER( cur );
1561 const U8 *uc = (U8*)STRING( noper );
1562 const U8 * const e = uc + STR_LEN( noper );
1563 STRLEN foldlen = 0;
1564 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1565 const U8 *scan = (U8*)NULL;
1566 U32 wordlen = 0; /* required init */
1567 STRLEN chars = 0;
1568 bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the bitmap?*/
1569
1570 if (OP(noper) == NOTHING) {
1571 trie->minlen= 0;
1572 continue;
1573 }
1574 if ( set_bit ) /* bitmap only alloced when !(UTF&&Folding) */
1575 TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
1576 regardless of encoding */
1577
1578 for ( ; uc < e ; uc += len ) {
1579 TRIE_CHARCOUNT(trie)++;
1580 TRIE_READ_CHAR;
1581 chars++;
1582 if ( uvc < 256 ) {
1583 if ( !trie->charmap[ uvc ] ) {
1584 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
1585 if ( folder )
1586 trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
1587 TRIE_STORE_REVCHAR;
1588 }
1589 if ( set_bit ) {
1590 /* store the codepoint in the bitmap, and its folded
1591 * equivalent. */
1592 TRIE_BITMAP_SET(trie,uvc);
1593
1594 /* store the folded codepoint */
1595 if ( folder ) TRIE_BITMAP_SET(trie,folder[ uvc ]);
1596
1597 if ( !UTF ) {
1598 /* store first byte of utf8 representation of
1599 variant codepoints */
1600 if (! UNI_IS_INVARIANT(uvc)) {
1601 TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc));
1602 }
1603 }
1604 set_bit = 0; /* We've done our bit :-) */
1605 }
1606 } else {
1607 SV** svpp;
1608 if ( !widecharmap )
1609 widecharmap = newHV();
1610
1611 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
1612
1613 if ( !svpp )
1614 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
1615
1616 if ( !SvTRUE( *svpp ) ) {
1617 sv_setiv( *svpp, ++trie->uniquecharcount );
1618 TRIE_STORE_REVCHAR;
1619 }
1620 }
1621 }
1622 if( cur == first ) {
1623 trie->minlen=chars;
1624 trie->maxlen=chars;
1625 } else if (chars < trie->minlen) {
1626 trie->minlen=chars;
1627 } else if (chars > trie->maxlen) {
1628 trie->maxlen=chars;
1629 }
1630
1631 } /* end first pass */
1632 DEBUG_TRIE_COMPILE_r(
1633 PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1634 (int)depth * 2 + 2,"",
1635 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
1636 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
1637 (int)trie->minlen, (int)trie->maxlen )
1638 );
1639
1640 /*
1641 We now know what we are dealing with in terms of unique chars and
1642 string sizes so we can calculate how much memory a naive
1643 representation using a flat table will take. If it's over a reasonable
1644 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
1645 conservative but potentially much slower representation using an array
1646 of lists.
1647
1648 At the end we convert both representations into the same compressed
1649 form that will be used in regexec.c for matching with. The latter
1650 is a form that cannot be used to construct with but has memory
1651 properties similar to the list form and access properties similar
1652 to the table form making it both suitable for fast searches and
1653 small enough that its feasable to store for the duration of a program.
1654
1655 See the comment in the code where the compressed table is produced
1656 inplace from the flat tabe representation for an explanation of how
1657 the compression works.
1658
1659 */
1660
1661
1662 Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
1663 prev_states[1] = 0;
1664
1665 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
1666 /*
1667 Second Pass -- Array Of Lists Representation
1668
1669 Each state will be represented by a list of charid:state records
1670 (reg_trie_trans_le) the first such element holds the CUR and LEN
1671 points of the allocated array. (See defines above).
1672
1673 We build the initial structure using the lists, and then convert
1674 it into the compressed table form which allows faster lookups
1675 (but cant be modified once converted).
1676 */
1677
1678 STRLEN transcount = 1;
1679
1680 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1681 "%*sCompiling trie using list compiler\n",
1682 (int)depth * 2 + 2, ""));
1683
1684 trie->states = (reg_trie_state *)
1685 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1686 sizeof(reg_trie_state) );
1687 TRIE_LIST_NEW(1);
1688 next_alloc = 2;
1689
1690 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1691
1692 regnode * const noper = NEXTOPER( cur );
1693 U8 *uc = (U8*)STRING( noper );
1694 const U8 * const e = uc + STR_LEN( noper );
1695 U32 state = 1; /* required init */
1696 U16 charid = 0; /* sanity init */
1697 U8 *scan = (U8*)NULL; /* sanity init */
1698 STRLEN foldlen = 0; /* required init */
1699 U32 wordlen = 0; /* required init */
1700 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1701
1702 if (OP(noper) != NOTHING) {
1703 for ( ; uc < e ; uc += len ) {
1704
1705 TRIE_READ_CHAR;
1706
1707 if ( uvc < 256 ) {
1708 charid = trie->charmap[ uvc ];
1709 } else {
1710 SV** const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
1711 if ( !svpp ) {
1712 charid = 0;
1713 } else {
1714 charid=(U16)SvIV( *svpp );
1715 }
1716 }
1717 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1718 if ( charid ) {
1719
1720 U16 check;
1721 U32 newstate = 0;
1722
1723 charid--;
1724 if ( !trie->states[ state ].trans.list ) {
1725 TRIE_LIST_NEW( state );
1726 }
1727 for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1728 if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1729 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1730 break;
1731 }
1732 }
1733 if ( ! newstate ) {
1734 newstate = next_alloc++;
1735 prev_states[newstate] = state;
1736 TRIE_LIST_PUSH( state, charid, newstate );
1737 transcount++;
1738 }
1739 state = newstate;
1740 } else {
1741 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1742 }
1743 }
1744 }
1745 TRIE_HANDLE_WORD(state);
1746
1747 } /* end second pass */
1748
1749 /* next alloc is the NEXT state to be allocated */
1750 trie->statecount = next_alloc;
1751 trie->states = (reg_trie_state *)
1752 PerlMemShared_realloc( trie->states,
1753 next_alloc
1754 * sizeof(reg_trie_state) );
1755
1756 /* and now dump it out before we compress it */
1757 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
1758 revcharmap, next_alloc,
1759 depth+1)
1760 );
1761
1762 trie->trans = (reg_trie_trans *)
1763 PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
1764 {
1765 U32 state;
1766 U32 tp = 0;
1767 U32 zp = 0;
1768
1769
1770 for( state=1 ; state < next_alloc ; state ++ ) {
1771 U32 base=0;
1772
1773 /*
1774 DEBUG_TRIE_COMPILE_MORE_r(
1775 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1776 );
1777 */
1778
1779 if (trie->states[state].trans.list) {
1780 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1781 U16 maxid=minid;
1782 U16 idx;
1783
1784 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1785 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1786 if ( forid < minid ) {
1787 minid=forid;
1788 } else if ( forid > maxid ) {
1789 maxid=forid;
1790 }
1791 }
1792 if ( transcount < tp + maxid - minid + 1) {
1793 transcount *= 2;
1794 trie->trans = (reg_trie_trans *)
1795 PerlMemShared_realloc( trie->trans,
1796 transcount
1797 * sizeof(reg_trie_trans) );
1798 Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1799 }
1800 base = trie->uniquecharcount + tp - minid;
1801 if ( maxid == minid ) {
1802 U32 set = 0;
1803 for ( ; zp < tp ; zp++ ) {
1804 if ( ! trie->trans[ zp ].next ) {
1805 base = trie->uniquecharcount + zp - minid;
1806 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1807 trie->trans[ zp ].check = state;
1808 set = 1;
1809 break;
1810 }
1811 }
1812 if ( !set ) {
1813 trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1814 trie->trans[ tp ].check = state;
1815 tp++;
1816 zp = tp;
1817 }
1818 } else {
1819 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1820 const U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
1821 trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1822 trie->trans[ tid ].check = state;
1823 }
1824 tp += ( maxid - minid + 1 );
1825 }
1826 Safefree(trie->states[ state ].trans.list);
1827 }
1828 /*
1829 DEBUG_TRIE_COMPILE_MORE_r(
1830 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1831 );
1832 */
1833 trie->states[ state ].trans.base=base;
1834 }
1835 trie->lasttrans = tp + 1;
1836 }
1837 } else {
1838 /*
1839 Second Pass -- Flat Table Representation.
1840
1841 we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1842 We know that we will need Charcount+1 trans at most to store the data
1843 (one row per char at worst case) So we preallocate both structures
1844 assuming worst case.
1845
1846 We then construct the trie using only the .next slots of the entry
1847 structs.
1848
1849 We use the .check field of the first entry of the node temporarily to
1850 make compression both faster and easier by keeping track of how many non
1851 zero fields are in the node.
1852
1853 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1854 transition.
1855
1856 There are two terms at use here: state as a TRIE_NODEIDX() which is a
1857 number representing the first entry of the node, and state as a
1858 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1859 TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1860 are 2 entrys per node. eg:
1861
1862 A B A B
1863 1. 2 4 1. 3 7
1864 2. 0 3 3. 0 5
1865 3. 0 0 5. 0 0
1866 4. 0 0 7. 0 0
1867
1868 The table is internally in the right hand, idx form. However as we also
1869 have to deal with the states array which is indexed by nodenum we have to
1870 use TRIE_NODENUM() to convert.
1871
1872 */
1873 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1874 "%*sCompiling trie using table compiler\n",
1875 (int)depth * 2 + 2, ""));
1876
1877 trie->trans = (reg_trie_trans *)
1878 PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
1879 * trie->uniquecharcount + 1,
1880 sizeof(reg_trie_trans) );
1881 trie->states = (reg_trie_state *)
1882 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1883 sizeof(reg_trie_state) );
1884 next_alloc = trie->uniquecharcount + 1;
1885
1886
1887 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1888
1889 regnode * const noper = NEXTOPER( cur );
1890 const U8 *uc = (U8*)STRING( noper );
1891 const U8 * const e = uc + STR_LEN( noper );
1892
1893 U32 state = 1; /* required init */
1894
1895 U16 charid = 0; /* sanity init */
1896 U32 accept_state = 0; /* sanity init */
1897 U8 *scan = (U8*)NULL; /* sanity init */
1898
1899 STRLEN foldlen = 0; /* required init */
1900 U32 wordlen = 0; /* required init */
1901 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1902
1903 if ( OP(noper) != NOTHING ) {
1904 for ( ; uc < e ; uc += len ) {
1905
1906 TRIE_READ_CHAR;
1907
1908 if ( uvc < 256 ) {
1909 charid = trie->charmap[ uvc ];
1910 } else {
1911 SV* const * const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
1912 charid = svpp ? (U16)SvIV(*svpp) : 0;
1913 }
1914 if ( charid ) {
1915 charid--;
1916 if ( !trie->trans[ state + charid ].next ) {
1917 trie->trans[ state + charid ].next = next_alloc;
1918 trie->trans[ state ].check++;
1919 prev_states[TRIE_NODENUM(next_alloc)]
1920 = TRIE_NODENUM(state);
1921 next_alloc += trie->uniquecharcount;
1922 }
1923 state = trie->trans[ state + charid ].next;
1924 } else {
1925 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1926 }
1927 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1928 }
1929 }
1930 accept_state = TRIE_NODENUM( state );
1931 TRIE_HANDLE_WORD(accept_state);
1932
1933 } /* end second pass */
1934
1935 /* and now dump it out before we compress it */
1936 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
1937 revcharmap,
1938 next_alloc, depth+1));
1939
1940 {
1941 /*
1942 * Inplace compress the table.*
1943
1944 For sparse data sets the table constructed by the trie algorithm will
1945 be mostly 0/FAIL transitions or to put it another way mostly empty.
1946 (Note that leaf nodes will not contain any transitions.)
1947
1948 This algorithm compresses the tables by eliminating most such
1949 transitions, at the cost of a modest bit of extra work during lookup:
1950
1951 - Each states[] entry contains a .base field which indicates the
1952 index in the state[] array wheres its transition data is stored.
1953
1954 - If .base is 0 there are no valid transitions from that node.
1955
1956 - If .base is nonzero then charid is added to it to find an entry in
1957 the trans array.
1958
1959 -If trans[states[state].base+charid].check!=state then the
1960 transition is taken to be a 0/Fail transition. Thus if there are fail
1961 transitions at the front of the node then the .base offset will point
1962 somewhere inside the previous nodes data (or maybe even into a node
1963 even earlier), but the .check field determines if the transition is
1964 valid.
1965
1966 XXX - wrong maybe?
1967 The following process inplace converts the table to the compressed
1968 table: We first do not compress the root node 1,and mark all its
1969 .check pointers as 1 and set its .base pointer as 1 as well. This
1970 allows us to do a DFA construction from the compressed table later,
1971 and ensures that any .base pointers we calculate later are greater
1972 than 0.
1973
1974 - We set 'pos' to indicate the first entry of the second node.
1975
1976 - We then iterate over the columns of the node, finding the first and
1977 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
1978 and set the .check pointers accordingly, and advance pos
1979 appropriately and repreat for the next node. Note that when we copy
1980 the next pointers we have to convert them from the original
1981 NODEIDX form to NODENUM form as the former is not valid post
1982 compression.
1983
1984 - If a node has no transitions used we mark its base as 0 and do not
1985 advance the pos pointer.
1986
1987 - If a node only has one transition we use a second pointer into the
1988 structure to fill in allocated fail transitions from other states.
1989 This pointer is independent of the main pointer and scans forward
1990 looking for null transitions that are allocated to a state. When it
1991 finds one it writes the single transition into the "hole". If the
1992 pointer doesnt find one the single transition is appended as normal.
1993
1994 - Once compressed we can Renew/realloc the structures to release the
1995 excess space.
1996
1997 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
1998 specifically Fig 3.47 and the associated pseudocode.
1999
2000 demq
2001 */
2002 const U32 laststate = TRIE_NODENUM( next_alloc );
2003 U32 state, charid;
2004 U32 pos = 0, zp=0;
2005 trie->statecount = laststate;
2006
2007 for ( state = 1 ; state < laststate ; state++ ) {
2008 U8 flag = 0;
2009 const U32 stateidx = TRIE_NODEIDX( state );
2010 const U32 o_used = trie->trans[ stateidx ].check;
2011 U32 used = trie->trans[ stateidx ].check;
2012 trie->trans[ stateidx ].check = 0;
2013
2014 for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
2015 if ( flag || trie->trans[ stateidx + charid ].next ) {
2016 if ( trie->trans[ stateidx + charid ].next ) {
2017 if (o_used == 1) {
2018 for ( ; zp < pos ; zp++ ) {
2019 if ( ! trie->trans[ zp ].next ) {
2020 break;
2021 }
2022 }
2023 trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
2024 trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
2025 trie->trans[ zp ].check = state;
2026 if ( ++zp > pos ) pos = zp;
2027 break;
2028 }
2029 used--;
2030 }
2031 if ( !flag ) {
2032 flag = 1;
2033 trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
2034 }
2035 trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
2036 trie->trans[ pos ].check = state;
2037 pos++;
2038 }
2039 }
2040 }
2041 trie->lasttrans = pos + 1;
2042 trie->states = (reg_trie_state *)
2043 PerlMemShared_realloc( trie->states, laststate
2044 * sizeof(reg_trie_state) );
2045 DEBUG_TRIE_COMPILE_MORE_r(
2046 PerlIO_printf( Perl_debug_log,
2047 "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
2048 (int)depth * 2 + 2,"",
2049 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
2050 (IV)next_alloc,
2051 (IV)pos,
2052 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
2053 );
2054
2055 } /* end table compress */
2056 }
2057 DEBUG_TRIE_COMPILE_MORE_r(
2058 PerlIO_printf(Perl_debug_log, "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
2059 (int)depth * 2 + 2, "",
2060 (UV)trie->statecount,
2061 (UV)trie->lasttrans)
2062 );
2063 /* resize the trans array to remove unused space */
2064 trie->trans = (reg_trie_trans *)
2065 PerlMemShared_realloc( trie->trans, trie->lasttrans
2066 * sizeof(reg_trie_trans) );
2067
2068 { /* Modify the program and insert the new TRIE node */
2069 U8 nodetype =(U8)(flags & 0xFF);
2070 char *str=NULL;
2071
2072#ifdef DEBUGGING
2073 regnode *optimize = NULL;
2074#ifdef RE_TRACK_PATTERN_OFFSETS
2075
2076 U32 mjd_offset = 0;
2077 U32 mjd_nodelen = 0;
2078#endif /* RE_TRACK_PATTERN_OFFSETS */
2079#endif /* DEBUGGING */
2080 /*
2081 This means we convert either the first branch or the first Exact,
2082 depending on whether the thing following (in 'last') is a branch
2083 or not and whther first is the startbranch (ie is it a sub part of
2084 the alternation or is it the whole thing.)
2085 Assuming its a sub part we convert the EXACT otherwise we convert
2086 the whole branch sequence, including the first.
2087 */
2088 /* Find the node we are going to overwrite */
2089 if ( first != startbranch || OP( last ) == BRANCH ) {
2090 /* branch sub-chain */
2091 NEXT_OFF( first ) = (U16)(last - first);
2092#ifdef RE_TRACK_PATTERN_OFFSETS
2093 DEBUG_r({
2094 mjd_offset= Node_Offset((convert));
2095 mjd_nodelen= Node_Length((convert));
2096 });
2097#endif
2098 /* whole branch chain */
2099 }
2100#ifdef RE_TRACK_PATTERN_OFFSETS
2101 else {
2102 DEBUG_r({
2103 const regnode *nop = NEXTOPER( convert );
2104 mjd_offset= Node_Offset((nop));
2105 mjd_nodelen= Node_Length((nop));
2106 });
2107 }
2108 DEBUG_OPTIMISE_r(
2109 PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
2110 (int)depth * 2 + 2, "",
2111 (UV)mjd_offset, (UV)mjd_nodelen)
2112 );
2113#endif
2114 /* But first we check to see if there is a common prefix we can
2115 split out as an EXACT and put in front of the TRIE node. */
2116 trie->startstate= 1;
2117 if ( trie->bitmap && !widecharmap && !trie->jump ) {
2118 U32 state;
2119 for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
2120 U32 ofs = 0;
2121 I32 idx = -1;
2122 U32 count = 0;
2123 const U32 base = trie->states[ state ].trans.base;
2124
2125 if ( trie->states[state].wordnum )
2126 count = 1;
2127
2128 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
2129 if ( ( base + ofs >= trie->uniquecharcount ) &&
2130 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
2131 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
2132 {
2133 if ( ++count > 1 ) {
2134 SV **tmp = av_fetch( revcharmap, ofs, 0);
2135 const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
2136 if ( state == 1 ) break;
2137 if ( count == 2 ) {
2138 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
2139 DEBUG_OPTIMISE_r(
2140 PerlIO_printf(Perl_debug_log,
2141 "%*sNew Start State=%"UVuf" Class: [",
2142 (int)depth * 2 + 2, "",
2143 (UV)state));
2144 if (idx >= 0) {
2145 SV ** const tmp = av_fetch( revcharmap, idx, 0);
2146 const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
2147
2148 TRIE_BITMAP_SET(trie,*ch);
2149 if ( folder )
2150 TRIE_BITMAP_SET(trie, folder[ *ch ]);
2151 DEBUG_OPTIMISE_r(
2152 PerlIO_printf(Perl_debug_log, "%s", (char*)ch)
2153 );
2154 }
2155 }
2156 TRIE_BITMAP_SET(trie,*ch);
2157 if ( folder )
2158 TRIE_BITMAP_SET(trie,folder[ *ch ]);
2159 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
2160 }
2161 idx = ofs;
2162 }
2163 }
2164 if ( count == 1 ) {
2165 SV **tmp = av_fetch( revcharmap, idx, 0);
2166 STRLEN len;
2167 char *ch = SvPV( *tmp, len );
2168 DEBUG_OPTIMISE_r({
2169 SV *sv=sv_newmortal();
2170 PerlIO_printf( Perl_debug_log,
2171 "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
2172 (int)depth * 2 + 2, "",
2173 (UV)state, (UV)idx,
2174 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
2175 PL_colors[0], PL_colors[1],
2176 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2177 PERL_PV_ESCAPE_FIRSTCHAR
2178 )
2179 );
2180 });
2181 if ( state==1 ) {
2182 OP( convert ) = nodetype;
2183 str=STRING(convert);
2184 STR_LEN(convert)=0;
2185 }
2186 STR_LEN(convert) += len;
2187 while (len--)
2188 *str++ = *ch++;
2189 } else {
2190#ifdef DEBUGGING
2191 if (state>1)
2192 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
2193#endif
2194 break;
2195 }
2196 }
2197 trie->prefixlen = (state-1);
2198 if (str) {
2199 regnode *n = convert+NODE_SZ_STR(convert);
2200 NEXT_OFF(convert) = NODE_SZ_STR(convert);
2201 trie->startstate = state;
2202 trie->minlen -= (state - 1);
2203 trie->maxlen -= (state - 1);
2204#ifdef DEBUGGING
2205 /* At least the UNICOS C compiler choked on this
2206 * being argument to DEBUG_r(), so let's just have
2207 * it right here. */
2208 if (
2209#ifdef PERL_EXT_RE_BUILD
2210 1
2211#else
2212 DEBUG_r_TEST
2213#endif
2214 ) {
2215 regnode *fix = convert;
2216 U32 word = trie->wordcount;
2217 mjd_nodelen++;
2218 Set_Node_Offset_Length(convert, mjd_offset, state - 1);
2219 while( ++fix < n ) {
2220 Set_Node_Offset_Length(fix, 0, 0);
2221 }
2222 while (word--) {
2223 SV ** const tmp = av_fetch( trie_words, word, 0 );
2224 if (tmp) {
2225 if ( STR_LEN(convert) <= SvCUR(*tmp) )
2226 sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
2227 else
2228 sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
2229 }
2230 }
2231 }
2232#endif
2233 if (trie->maxlen) {
2234 convert = n;
2235 } else {
2236 NEXT_OFF(convert) = (U16)(tail - convert);
2237 DEBUG_r(optimize= n);
2238 }
2239 }
2240 }
2241 if (!jumper)
2242 jumper = last;
2243 if ( trie->maxlen ) {
2244 NEXT_OFF( convert ) = (U16)(tail - convert);
2245 ARG_SET( convert, data_slot );
2246 /* Store the offset to the first unabsorbed branch in
2247 jump[0], which is otherwise unused by the jump logic.
2248 We use this when dumping a trie and during optimisation. */
2249 if (trie->jump)
2250 trie->jump[0] = (U16)(nextbranch - convert);
2251
2252 /* If the start state is not accepting (meaning there is no empty string/NOTHING)
2253 * and there is a bitmap
2254 * and the first "jump target" node we found leaves enough room
2255 * then convert the TRIE node into a TRIEC node, with the bitmap
2256 * embedded inline in the opcode - this is hypothetically faster.
2257 */
2258 if ( !trie->states[trie->startstate].wordnum
2259 && trie->bitmap
2260 && ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
2261 {
2262 OP( convert ) = TRIEC;
2263 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
2264 PerlMemShared_free(trie->bitmap);
2265 trie->bitmap= NULL;
2266 } else
2267 OP( convert ) = TRIE;
2268
2269 /* store the type in the flags */
2270 convert->flags = nodetype;
2271 DEBUG_r({
2272 optimize = convert
2273 + NODE_STEP_REGNODE
2274 + regarglen[ OP( convert ) ];
2275 });
2276 /* XXX We really should free up the resource in trie now,
2277 as we won't use them - (which resources?) dmq */
2278 }
2279 /* needed for dumping*/
2280 DEBUG_r(if (optimize) {
2281 regnode *opt = convert;
2282
2283 while ( ++opt < optimize) {
2284 Set_Node_Offset_Length(opt,0,0);
2285 }
2286 /*
2287 Try to clean up some of the debris left after the
2288 optimisation.
2289 */
2290 while( optimize < jumper ) {
2291 mjd_nodelen += Node_Length((optimize));
2292 OP( optimize ) = OPTIMIZED;
2293 Set_Node_Offset_Length(optimize,0,0);
2294 optimize++;
2295 }
2296 Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
2297 });
2298 } /* end node insert */
2299
2300 /* Finish populating the prev field of the wordinfo array. Walk back
2301 * from each accept state until we find another accept state, and if
2302 * so, point the first word's .prev field at the second word. If the
2303 * second already has a .prev field set, stop now. This will be the
2304 * case either if we've already processed that word's accept state,
2305 * or that state had multiple words, and the overspill words were
2306 * already linked up earlier.
2307 */
2308 {
2309 U16 word;
2310 U32 state;
2311 U16 prev;
2312
2313 for (word=1; word <= trie->wordcount; word++) {
2314 prev = 0;
2315 if (trie->wordinfo[word].prev)
2316 continue;
2317 state = trie->wordinfo[word].accept;
2318 while (state) {
2319 state = prev_states[state];
2320 if (!state)
2321 break;
2322 prev = trie->states[state].wordnum;
2323 if (prev)
2324 break;
2325 }
2326 trie->wordinfo[word].prev = prev;
2327 }
2328 Safefree(prev_states);
2329 }
2330
2331
2332 /* and now dump out the compressed format */
2333 DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
2334
2335 RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
2336#ifdef DEBUGGING
2337 RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
2338 RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
2339#else
2340 SvREFCNT_dec(revcharmap);
2341#endif
2342 return trie->jump
2343 ? MADE_JUMP_TRIE
2344 : trie->startstate>1
2345 ? MADE_EXACT_TRIE
2346 : MADE_TRIE;
2347}
2348
2349STATIC void
2350S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode *stclass, U32 depth)
2351{
2352/* The Trie is constructed and compressed now so we can build a fail array if it's needed
2353
2354 This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
2355 "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
2356 ISBN 0-201-10088-6
2357
2358 We find the fail state for each state in the trie, this state is the longest proper
2359 suffix of the current state's 'word' that is also a proper prefix of another word in our
2360 trie. State 1 represents the word '' and is thus the default fail state. This allows
2361 the DFA not to have to restart after its tried and failed a word at a given point, it
2362 simply continues as though it had been matching the other word in the first place.
2363 Consider
2364 'abcdgu'=~/abcdefg|cdgu/
2365 When we get to 'd' we are still matching the first word, we would encounter 'g' which would
2366 fail, which would bring us to the state representing 'd' in the second word where we would
2367 try 'g' and succeed, proceeding to match 'cdgu'.
2368 */
2369 /* add a fail transition */
2370 const U32 trie_offset = ARG(source);
2371 reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
2372 U32 *q;
2373 const U32 ucharcount = trie->uniquecharcount;
2374 const U32 numstates = trie->statecount;
2375 const U32 ubound = trie->lasttrans + ucharcount;
2376 U32 q_read = 0;
2377 U32 q_write = 0;
2378 U32 charid;
2379 U32 base = trie->states[ 1 ].trans.base;
2380 U32 *fail;
2381 reg_ac_data *aho;
2382 const U32 data_slot = add_data( pRExC_state, 1, "T" );
2383 GET_RE_DEBUG_FLAGS_DECL;
2384
2385 PERL_ARGS_ASSERT_MAKE_TRIE_FAILTABLE;
2386#ifndef DEBUGGING
2387 PERL_UNUSED_ARG(depth);
2388#endif
2389
2390
2391 ARG_SET( stclass, data_slot );
2392 aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
2393 RExC_rxi->data->data[ data_slot ] = (void*)aho;
2394 aho->trie=trie_offset;
2395 aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
2396 Copy( trie->states, aho->states, numstates, reg_trie_state );
2397 Newxz( q, numstates, U32);
2398 aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
2399 aho->refcount = 1;
2400 fail = aho->fail;
2401 /* initialize fail[0..1] to be 1 so that we always have
2402 a valid final fail state */
2403 fail[ 0 ] = fail[ 1 ] = 1;
2404
2405 for ( charid = 0; charid < ucharcount ; charid++ ) {
2406 const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
2407 if ( newstate ) {
2408 q[ q_write ] = newstate;
2409 /* set to point at the root */
2410 fail[ q[ q_write++ ] ]=1;
2411 }
2412 }
2413 while ( q_read < q_write) {
2414 const U32 cur = q[ q_read++ % numstates ];
2415 base = trie->states[ cur ].trans.base;
2416
2417 for ( charid = 0 ; charid < ucharcount ; charid++ ) {
2418 const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
2419 if (ch_state) {
2420 U32 fail_state = cur;
2421 U32 fail_base;
2422 do {
2423 fail_state = fail[ fail_state ];
2424 fail_base = aho->states[ fail_state ].trans.base;
2425 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
2426
2427 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
2428 fail[ ch_state ] = fail_state;
2429 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
2430 {
2431 aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum;
2432 }
2433 q[ q_write++ % numstates] = ch_state;
2434 }
2435 }
2436 }
2437 /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
2438 when we fail in state 1, this allows us to use the
2439 charclass scan to find a valid start char. This is based on the principle
2440 that theres a good chance the string being searched contains lots of stuff
2441 that cant be a start char.
2442 */
2443 fail[ 0 ] = fail[ 1 ] = 0;
2444 DEBUG_TRIE_COMPILE_r({
2445 PerlIO_printf(Perl_debug_log,
2446 "%*sStclass Failtable (%"UVuf" states): 0",
2447 (int)(depth * 2), "", (UV)numstates
2448 );
2449 for( q_read=1; q_read<numstates; q_read++ ) {
2450 PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
2451 }
2452 PerlIO_printf(Perl_debug_log, "\n");
2453 });
2454 Safefree(q);
2455 /*RExC_seen |= REG_SEEN_TRIEDFA;*/
2456}
2457
2458
2459/*
2460 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
2461 * These need to be revisited when a newer toolchain becomes available.
2462 */
2463#if defined(__sparc64__) && defined(__GNUC__)
2464# if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
2465# undef SPARC64_GCC_WORKAROUND
2466# define SPARC64_GCC_WORKAROUND 1
2467# endif
2468#endif
2469
2470#define DEBUG_PEEP(str,scan,depth) \
2471 DEBUG_OPTIMISE_r({if (scan){ \
2472 SV * const mysv=sv_newmortal(); \
2473 regnode *Next = regnext(scan); \
2474 regprop(RExC_rx, mysv, scan); \
2475 PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
2476 (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
2477 Next ? (REG_NODE_NUM(Next)) : 0 ); \
2478 }});
2479
2480
2481
2482
2483
2484#define JOIN_EXACT(scan,min,flags) \
2485 if (PL_regkind[OP(scan)] == EXACT) \
2486 join_exact(pRExC_state,(scan),(min),(flags),NULL,depth+1)
2487
2488STATIC U32
2489S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, I32 *min, U32 flags,regnode *val, U32 depth) {
2490 /* Merge several consecutive EXACTish nodes into one. */
2491 regnode *n = regnext(scan);
2492 U32 stringok = 1;
2493 regnode *next = scan + NODE_SZ_STR(scan);
2494 U32 merged = 0;
2495 U32 stopnow = 0;
2496#ifdef DEBUGGING
2497 regnode *stop = scan;
2498 GET_RE_DEBUG_FLAGS_DECL;
2499#else
2500 PERL_UNUSED_ARG(depth);
2501#endif
2502
2503 PERL_ARGS_ASSERT_JOIN_EXACT;
2504#ifndef EXPERIMENTAL_INPLACESCAN
2505 PERL_UNUSED_ARG(flags);
2506 PERL_UNUSED_ARG(val);
2507#endif
2508 DEBUG_PEEP("join",scan,depth);
2509
2510 /* Skip NOTHING, merge EXACT*. */
2511 while (n &&
2512 ( PL_regkind[OP(n)] == NOTHING ||
2513 (stringok && (OP(n) == OP(scan))))
2514 && NEXT_OFF(n)
2515 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
2516
2517 if (OP(n) == TAIL || n > next)
2518 stringok = 0;
2519 if (PL_regkind[OP(n)] == NOTHING) {
2520 DEBUG_PEEP("skip:",n,depth);
2521 NEXT_OFF(scan) += NEXT_OFF(n);
2522 next = n + NODE_STEP_REGNODE;
2523#ifdef DEBUGGING
2524 if (stringok)
2525 stop = n;
2526#endif
2527 n = regnext(n);
2528 }
2529 else if (stringok) {
2530 const unsigned int oldl = STR_LEN(scan);
2531 regnode * const nnext = regnext(n);
2532
2533 DEBUG_PEEP("merg",n,depth);
2534
2535 merged++;
2536 if (oldl + STR_LEN(n) > U8_MAX)
2537 break;
2538 NEXT_OFF(scan) += NEXT_OFF(n);
2539 STR_LEN(scan) += STR_LEN(n);
2540 next = n + NODE_SZ_STR(n);
2541 /* Now we can overwrite *n : */
2542 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
2543#ifdef DEBUGGING
2544 stop = next - 1;
2545#endif
2546 n = nnext;
2547 if (stopnow) break;
2548 }
2549
2550#ifdef EXPERIMENTAL_INPLACESCAN
2551 if (flags && !NEXT_OFF(n)) {
2552 DEBUG_PEEP("atch", val, depth);
2553 if (reg_off_by_arg[OP(n)]) {
2554 ARG_SET(n, val - n);
2555 }
2556 else {
2557 NEXT_OFF(n) = val - n;
2558 }
2559 stopnow = 1;
2560 }
2561#endif
2562 }
2563#define GREEK_SMALL_LETTER_IOTA_WITH_DIALYTIKA_AND_TONOS 0x0390
2564#define IOTA_D_T GREEK_SMALL_LETTER_IOTA_WITH_DIALYTIKA_AND_TONOS
2565#define GREEK_SMALL_LETTER_UPSILON_WITH_DIALYTIKA_AND_TONOS 0x03B0
2566#define UPSILON_D_T GREEK_SMALL_LETTER_UPSILON_WITH_DIALYTIKA_AND_TONOS
2567
2568 if (UTF
2569 && ( OP(scan) == EXACTF || OP(scan) == EXACTFU || OP(scan) == EXACTFA)
2570 && ( STR_LEN(scan) >= 6 ) )
2571 {
2572 /*
2573 Two problematic code points in Unicode casefolding of EXACT nodes:
2574
2575 U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
2576 U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
2577
2578 which casefold to
2579
2580 Unicode UTF-8
2581
2582 U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
2583 U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
2584
2585 This means that in case-insensitive matching (or "loose matching",
2586 as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
2587 length of the above casefolded versions) can match a target string
2588 of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
2589 This would rather mess up the minimum length computation.
2590
2591 What we'll do is to look for the tail four bytes, and then peek
2592 at the preceding two bytes to see whether we need to decrease
2593 the minimum length by four (six minus two).
2594
2595 Thanks to the design of UTF-8, there cannot be false matches:
2596 A sequence of valid UTF-8 bytes cannot be a subsequence of
2597 another valid sequence of UTF-8 bytes.
2598
2599 */
2600 char * const s0 = STRING(scan), *s, *t;
2601 char * const s1 = s0 + STR_LEN(scan) - 1;
2602 char * const s2 = s1 - 4;
2603#ifdef EBCDIC /* RD tunifold greek 0390 and 03B0 */
2604 const char t0[] = "\xaf\x49\xaf\x42";
2605#else
2606 const char t0[] = "\xcc\x88\xcc\x81";
2607#endif
2608 const char * const t1 = t0 + 3;
2609
2610 for (s = s0 + 2;
2611 s < s2 && (t = ninstr(s, s1, t0, t1));
2612 s = t + 4) {
2613#ifdef EBCDIC
2614 if (((U8)t[-1] == 0x68 && (U8)t[-2] == 0xB4) ||
2615 ((U8)t[-1] == 0x46 && (U8)t[-2] == 0xB5))
2616#else
2617 if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
2618 ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
2619#endif
2620 *min -= 4;
2621 }
2622 }
2623
2624#ifdef DEBUGGING
2625 /* Allow dumping */
2626 n = scan + NODE_SZ_STR(scan);
2627 while (n <= stop) {
2628 if (PL_regkind[OP(n)] != NOTHING || OP(n) == NOTHING) {
2629 OP(n) = OPTIMIZED;
2630 NEXT_OFF(n) = 0;
2631 }
2632 n++;
2633 }
2634#endif
2635 DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
2636 return stopnow;
2637}
2638
2639/* REx optimizer. Converts nodes into quicker variants "in place".
2640 Finds fixed substrings. */
2641
2642/* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
2643 to the position after last scanned or to NULL. */
2644
2645#define INIT_AND_WITHP \
2646 assert(!and_withp); \
2647 Newx(and_withp,1,struct regnode_charclass_class); \
2648 SAVEFREEPV(and_withp)
2649
2650/* this is a chain of data about sub patterns we are processing that
2651 need to be handled separately/specially in study_chunk. Its so
2652 we can simulate recursion without losing state. */
2653struct scan_frame;
2654typedef struct scan_frame {
2655 regnode *last; /* last node to process in this frame */
2656 regnode *next; /* next node to process when last is reached */
2657 struct scan_frame *prev; /*previous frame*/
2658 I32 stop; /* what stopparen do we use */
2659} scan_frame;
2660
2661
2662#define SCAN_COMMIT(s, data, m) scan_commit(s, data, m, is_inf)
2663
2664#define CASE_SYNST_FNC(nAmE) \
2665case nAmE: \
2666 if (flags & SCF_DO_STCLASS_AND) { \
2667 for (value = 0; value < 256; value++) \
2668 if (!is_ ## nAmE ## _cp(value)) \
2669 ANYOF_BITMAP_CLEAR(data->start_class, value); \
2670 } \
2671 else { \
2672 for (value = 0; value < 256; value++) \
2673 if (is_ ## nAmE ## _cp(value)) \
2674 ANYOF_BITMAP_SET(data->start_class, value); \
2675 } \
2676 break; \
2677case N ## nAmE: \
2678 if (flags & SCF_DO_STCLASS_AND) { \
2679 for (value = 0; value < 256; value++) \
2680 if (is_ ## nAmE ## _cp(value)) \
2681 ANYOF_BITMAP_CLEAR(data->start_class, value); \
2682 } \
2683 else { \
2684 for (value = 0; value < 256; value++) \
2685 if (!is_ ## nAmE ## _cp(value)) \
2686 ANYOF_BITMAP_SET(data->start_class, value); \
2687 } \
2688 break
2689
2690
2691
2692STATIC I32
2693S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
2694 I32 *minlenp, I32 *deltap,
2695 regnode *last,
2696 scan_data_t *data,
2697 I32 stopparen,
2698 U8* recursed,
2699 struct regnode_charclass_class *and_withp,
2700 U32 flags, U32 depth)
2701 /* scanp: Start here (read-write). */
2702 /* deltap: Write maxlen-minlen here. */
2703 /* last: Stop before this one. */
2704 /* data: string data about the pattern */
2705 /* stopparen: treat close N as END */
2706 /* recursed: which subroutines have we recursed into */
2707 /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
2708{
2709 dVAR;
2710 I32 min = 0, pars = 0, code;
2711 regnode *scan = *scanp, *next;
2712 I32 delta = 0;
2713 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
2714 int is_inf_internal = 0; /* The studied chunk is infinite */
2715 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
2716 scan_data_t data_fake;
2717 SV *re_trie_maxbuff = NULL;
2718 regnode *first_non_open = scan;
2719 I32 stopmin = I32_MAX;
2720 scan_frame *frame = NULL;
2721 GET_RE_DEBUG_FLAGS_DECL;
2722
2723 PERL_ARGS_ASSERT_STUDY_CHUNK;
2724
2725#ifdef DEBUGGING
2726 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
2727#endif
2728
2729 if ( depth == 0 ) {
2730 while (first_non_open && OP(first_non_open) == OPEN)
2731 first_non_open=regnext(first_non_open);
2732 }
2733
2734
2735 fake_study_recurse:
2736 while ( scan && OP(scan) != END && scan < last ){
2737 /* Peephole optimizer: */
2738 DEBUG_STUDYDATA("Peep:", data,depth);
2739 DEBUG_PEEP("Peep",scan,depth);
2740 JOIN_EXACT(scan,&min,0);
2741
2742 /* Follow the next-chain of the current node and optimize
2743 away all the NOTHINGs from it. */
2744 if (OP(scan) != CURLYX) {
2745 const int max = (reg_off_by_arg[OP(scan)]
2746 ? I32_MAX
2747 /* I32 may be smaller than U16 on CRAYs! */
2748 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
2749 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
2750 int noff;
2751 regnode *n = scan;
2752
2753 /* Skip NOTHING and LONGJMP. */
2754 while ((n = regnext(n))
2755 && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
2756 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
2757 && off + noff < max)
2758 off += noff;
2759 if (reg_off_by_arg[OP(scan)])
2760 ARG(scan) = off;
2761 else
2762 NEXT_OFF(scan) = off;
2763 }
2764
2765
2766
2767 /* The principal pseudo-switch. Cannot be a switch, since we
2768 look into several different things. */
2769 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
2770 || OP(scan) == IFTHEN) {
2771 next = regnext(scan);
2772 code = OP(scan);
2773 /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
2774
2775 if (OP(next) == code || code == IFTHEN) {
2776 /* NOTE - There is similar code to this block below for handling
2777 TRIE nodes on a re-study. If you change stuff here check there
2778 too. */
2779 I32 max1 = 0, min1 = I32_MAX, num = 0;
2780 struct regnode_charclass_class accum;
2781 regnode * const startbranch=scan;
2782
2783 if (flags & SCF_DO_SUBSTR)
2784 SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
2785 if (flags & SCF_DO_STCLASS)
2786 cl_init_zero(pRExC_state, &accum);
2787
2788 while (OP(scan) == code) {
2789 I32 deltanext, minnext, f = 0, fake;
2790 struct regnode_charclass_class this_class;
2791
2792 num++;
2793 data_fake.flags = 0;
2794 if (data) {
2795 data_fake.whilem_c = data->whilem_c;
2796 data_fake.last_closep = data->last_closep;
2797 }
2798 else
2799 data_fake.last_closep = &fake;
2800
2801 data_fake.pos_delta = delta;
2802 next = regnext(scan);
2803 scan = NEXTOPER(scan);
2804 if (code != BRANCH)
2805 scan = NEXTOPER(scan);
2806 if (flags & SCF_DO_STCLASS) {
2807 cl_init(pRExC_state, &this_class);
2808 data_fake.start_class = &this_class;
2809 f = SCF_DO_STCLASS_AND;
2810 }
2811 if (flags & SCF_WHILEM_VISITED_POS)
2812 f |= SCF_WHILEM_VISITED_POS;
2813
2814 /* we suppose the run is continuous, last=next...*/
2815 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
2816 next, &data_fake,
2817 stopparen, recursed, NULL, f,depth+1);
2818 if (min1 > minnext)
2819 min1 = minnext;
2820 if (max1 < minnext + deltanext)
2821 max1 = minnext + deltanext;
2822 if (deltanext == I32_MAX)
2823 is_inf = is_inf_internal = 1;
2824 scan = next;
2825 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
2826 pars++;
2827 if (data_fake.flags & SCF_SEEN_ACCEPT) {
2828 if ( stopmin > minnext)
2829 stopmin = min + min1;
2830 flags &= ~SCF_DO_SUBSTR;
2831 if (data)
2832 data->flags |= SCF_SEEN_ACCEPT;
2833 }
2834 if (data) {
2835 if (data_fake.flags & SF_HAS_EVAL)
2836 data->flags |= SF_HAS_EVAL;
2837 data->whilem_c = data_fake.whilem_c;
2838 }
2839 if (flags & SCF_DO_STCLASS)
2840 cl_or(pRExC_state, &accum, &this_class);
2841 }
2842 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
2843 min1 = 0;
2844 if (flags & SCF_DO_SUBSTR) {
2845 data->pos_min += min1;
2846 data->pos_delta += max1 - min1;
2847 if (max1 != min1 || is_inf)
2848 data->longest = &(data->longest_float);
2849 }
2850 min += min1;
2851 delta += max1 - min1;
2852 if (flags & SCF_DO_STCLASS_OR) {
2853 cl_or(pRExC_state, data->start_class, &accum);
2854 if (min1) {
2855 cl_and(data->start_class, and_withp);
2856 flags &= ~SCF_DO_STCLASS;
2857 }
2858 }
2859 else if (flags & SCF_DO_STCLASS_AND) {
2860 if (min1) {
2861 cl_and(data->start_class, &accum);
2862 flags &= ~SCF_DO_STCLASS;
2863 }
2864 else {
2865 /* Switch to OR mode: cache the old value of
2866 * data->start_class */
2867 INIT_AND_WITHP;
2868 StructCopy(data->start_class, and_withp,
2869 struct regnode_charclass_class);
2870 flags &= ~SCF_DO_STCLASS_AND;
2871 StructCopy(&accum, data->start_class,
2872 struct regnode_charclass_class);
2873 flags |= SCF_DO_STCLASS_OR;
2874 data->start_class->flags |= ANYOF_EOS;
2875 }
2876 }
2877
2878 if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) {
2879 /* demq.
2880
2881 Assuming this was/is a branch we are dealing with: 'scan' now
2882 points at the item that follows the branch sequence, whatever
2883 it is. We now start at the beginning of the sequence and look
2884 for subsequences of
2885
2886 BRANCH->EXACT=>x1
2887 BRANCH->EXACT=>x2
2888 tail
2889
2890 which would be constructed from a pattern like /A|LIST|OF|WORDS/
2891
2892 If we can find such a subsequence we need to turn the first
2893 element into a trie and then add the subsequent branch exact
2894 strings to the trie.
2895
2896 We have two cases
2897
2898 1. patterns where the whole set of branches can be converted.
2899
2900 2. patterns where only a subset can be converted.
2901
2902 In case 1 we can replace the whole set with a single regop
2903 for the trie. In case 2 we need to keep the start and end
2904 branches so
2905
2906 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
2907 becomes BRANCH TRIE; BRANCH X;
2908
2909 There is an additional case, that being where there is a
2910 common prefix, which gets split out into an EXACT like node
2911 preceding the TRIE node.
2912
2913 If x(1..n)==tail then we can do a simple trie, if not we make
2914 a "jump" trie, such that when we match the appropriate word
2915 we "jump" to the appropriate tail node. Essentially we turn
2916 a nested if into a case structure of sorts.
2917
2918 */
2919
2920 int made=0;
2921 if (!re_trie_maxbuff) {
2922 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
2923 if (!SvIOK(re_trie_maxbuff))
2924 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2925 }
2926 if ( SvIV(re_trie_maxbuff)>=0 ) {
2927 regnode *cur;
2928 regnode *first = (regnode *)NULL;
2929 regnode *last = (regnode *)NULL;
2930 regnode *tail = scan;
2931 U8 optype = 0;
2932 U32 count=0;
2933
2934#ifdef DEBUGGING
2935 SV * const mysv = sv_newmortal(); /* for dumping */
2936#endif
2937 /* var tail is used because there may be a TAIL
2938 regop in the way. Ie, the exacts will point to the
2939 thing following the TAIL, but the last branch will
2940 point at the TAIL. So we advance tail. If we
2941 have nested (?:) we may have to move through several
2942 tails.
2943 */
2944
2945 while ( OP( tail ) == TAIL ) {
2946 /* this is the TAIL generated by (?:) */
2947 tail = regnext( tail );
2948 }
2949
2950
2951 DEBUG_OPTIMISE_r({
2952 regprop(RExC_rx, mysv, tail );
2953 PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
2954 (int)depth * 2 + 2, "",
2955 "Looking for TRIE'able sequences. Tail node is: ",
2956 SvPV_nolen_const( mysv )
2957 );
2958 });
2959
2960 /*
2961
2962 step through the branches, cur represents each
2963 branch, noper is the first thing to be matched
2964 as part of that branch and noper_next is the
2965 regnext() of that node. if noper is an EXACT
2966 and noper_next is the same as scan (our current
2967 position in the regex) then the EXACT branch is
2968 a possible optimization target. Once we have
2969 two or more consecutive such branches we can
2970 create a trie of the EXACT's contents and stich
2971 it in place. If the sequence represents all of
2972 the branches we eliminate the whole thing and
2973 replace it with a single TRIE. If it is a
2974 subsequence then we need to stitch it in. This
2975 means the first branch has to remain, and needs
2976 to be repointed at the item on the branch chain
2977 following the last branch optimized. This could
2978 be either a BRANCH, in which case the
2979 subsequence is internal, or it could be the
2980 item following the branch sequence in which
2981 case the subsequence is at the end.
2982
2983 */
2984
2985 /* dont use tail as the end marker for this traverse */
2986 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
2987 regnode * const noper = NEXTOPER( cur );
2988#if defined(DEBUGGING) || defined(NOJUMPTRIE)
2989 regnode * const noper_next = regnext( noper );
2990#endif
2991
2992 DEBUG_OPTIMISE_r({
2993 regprop(RExC_rx, mysv, cur);
2994 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
2995 (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
2996
2997 regprop(RExC_rx, mysv, noper);
2998 PerlIO_printf( Perl_debug_log, " -> %s",
2999 SvPV_nolen_const(mysv));
3000
3001 if ( noper_next ) {
3002 regprop(RExC_rx, mysv, noper_next );
3003 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
3004 SvPV_nolen_const(mysv));
3005 }
3006 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d)\n",
3007 REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur) );
3008 });
3009 if ( (((first && optype!=NOTHING) ? OP( noper ) == optype
3010 : PL_regkind[ OP( noper ) ] == EXACT )
3011 || OP(noper) == NOTHING )
3012#ifdef NOJUMPTRIE
3013 && noper_next == tail
3014#endif
3015 && count < U16_MAX)
3016 {
3017 count++;
3018 if ( !first || optype == NOTHING ) {
3019 if (!first) first = cur;
3020 optype = OP( noper );
3021 } else {
3022 last = cur;
3023 }
3024 } else {
3025/*
3026 Currently we do not believe that the trie logic can
3027 handle case insensitive matching properly when the
3028 pattern is not unicode (thus forcing unicode semantics).
3029
3030 If/when this is fixed the following define can be swapped
3031 in below to fully enable trie logic.
3032
3033 XXX It may work if not UTF and/or /a (AT_LEAST_UNI_SEMANTICS) but perhaps
3034 not /aa
3035
3036#define TRIE_TYPE_IS_SAFE 1
3037
3038*/
3039#define TRIE_TYPE_IS_SAFE ((UTF && UNI_SEMANTICS) || optype==EXACT)
3040
3041 if ( last && TRIE_TYPE_IS_SAFE ) {
3042 make_trie( pRExC_state,
3043 startbranch, first, cur, tail, count,
3044 optype, depth+1 );
3045 }
3046 if ( PL_regkind[ OP( noper ) ] == EXACT
3047#ifdef NOJUMPTRIE
3048 && noper_next == tail
3049#endif
3050 ){
3051 count = 1;
3052 first = cur;
3053 optype = OP( noper );
3054 } else {
3055 count = 0;
3056 first = NULL;
3057 optype = 0;
3058 }
3059 last = NULL;
3060 }
3061 }
3062 DEBUG_OPTIMISE_r({
3063 regprop(RExC_rx, mysv, cur);
3064 PerlIO_printf( Perl_debug_log,
3065 "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
3066 "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
3067
3068 });
3069
3070 if ( last && TRIE_TYPE_IS_SAFE ) {
3071 made= make_trie( pRExC_state, startbranch, first, scan, tail, count, optype, depth+1 );
3072#ifdef TRIE_STUDY_OPT
3073 if ( ((made == MADE_EXACT_TRIE &&
3074 startbranch == first)
3075 || ( first_non_open == first )) &&
3076 depth==0 ) {
3077 flags |= SCF_TRIE_RESTUDY;
3078 if ( startbranch == first
3079 && scan == tail )
3080 {
3081 RExC_seen &=~REG_TOP_LEVEL_BRANCHES;
3082 }
3083 }
3084#endif
3085 }
3086 }
3087
3088 } /* do trie */
3089
3090 }
3091 else if ( code == BRANCHJ ) { /* single branch is optimized. */
3092 scan = NEXTOPER(NEXTOPER(scan));
3093 } else /* single branch is optimized. */
3094 scan = NEXTOPER(scan);
3095 continue;
3096 } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
3097 scan_frame *newframe = NULL;
3098 I32 paren;
3099 regnode *start;
3100 regnode *end;
3101
3102 if (OP(scan) != SUSPEND) {
3103 /* set the pointer */
3104 if (OP(scan) == GOSUB) {
3105 paren = ARG(scan);
3106 RExC_recurse[ARG2L(scan)] = scan;
3107 start = RExC_open_parens[paren-1];
3108 end = RExC_close_parens[paren-1];
3109 } else {
3110 paren = 0;
3111 start = RExC_rxi->program + 1;
3112 end = RExC_opend;
3113 }
3114 if (!recursed) {
3115 Newxz(recursed, (((RExC_npar)>>3) +1), U8);
3116 SAVEFREEPV(recursed);
3117 }
3118 if (!PAREN_TEST(recursed,paren+1)) {
3119 PAREN_SET(recursed,paren+1);
3120 Newx(newframe,1,scan_frame);
3121 } else {
3122 if (flags & SCF_DO_SUBSTR) {
3123 SCAN_COMMIT(pRExC_state,data,minlenp);
3124 data->longest = &(data->longest_float);
3125 }
3126 is_inf = is_inf_internal = 1;
3127 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3128 cl_anything(pRExC_state, data->start_class);
3129 flags &= ~SCF_DO_STCLASS;
3130 }
3131 } else {
3132 Newx(newframe,1,scan_frame);
3133 paren = stopparen;
3134 start = scan+2;
3135 end = regnext(scan);
3136 }
3137 if (newframe) {
3138 assert(start);
3139 assert(end);
3140 SAVEFREEPV(newframe);
3141 newframe->next = regnext(scan);
3142 newframe->last = last;
3143 newframe->stop = stopparen;
3144 newframe->prev = frame;
3145
3146 frame = newframe;
3147 scan = start;
3148 stopparen = paren;
3149 last = end;
3150
3151 continue;
3152 }
3153 }
3154 else if (OP(scan) == EXACT) {
3155 I32 l = STR_LEN(scan);
3156 UV uc;
3157 if (UTF) {
3158 const U8 * const s = (U8*)STRING(scan);
3159 l = utf8_length(s, s + l);
3160 uc = utf8_to_uvchr(s, NULL);
3161 } else {
3162 uc = *((U8*)STRING(scan));
3163 }
3164 min += l;
3165 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
3166 /* The code below prefers earlier match for fixed
3167 offset, later match for variable offset. */
3168 if (data->last_end == -1) { /* Update the start info. */
3169 data->last_start_min = data->pos_min;
3170 data->last_start_max = is_inf
3171 ? I32_MAX : data->pos_min + data->pos_delta;
3172 }
3173 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
3174 if (UTF)
3175 SvUTF8_on(data->last_found);
3176 {
3177 SV * const sv = data->last_found;
3178 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3179 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3180 if (mg && mg->mg_len >= 0)
3181 mg->mg_len += utf8_length((U8*)STRING(scan),
3182 (U8*)STRING(scan)+STR_LEN(scan));
3183 }
3184 data->last_end = data->pos_min + l;
3185 data->pos_min += l; /* As in the first entry. */
3186 data->flags &= ~SF_BEFORE_EOL;
3187 }
3188 if (flags & SCF_DO_STCLASS_AND) {
3189 /* Check whether it is compatible with what we know already! */
3190 int compat = 1;
3191
3192
3193 /* If compatible, we or it in below. It is compatible if is
3194 * in the bitmp and either 1) its bit or its fold is set, or 2)
3195 * it's for a locale. Even if there isn't unicode semantics
3196 * here, at runtime there may be because of matching against a
3197 * utf8 string, so accept a possible false positive for
3198 * latin1-range folds */
3199 if (uc >= 0x100 ||
3200 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
3201 && !ANYOF_BITMAP_TEST(data->start_class, uc)
3202 && (!(data->start_class->flags & ANYOF_LOC_NONBITMAP_FOLD)
3203 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc])))
3204 )
3205 {
3206 compat = 0;
3207 }
3208 ANYOF_CLASS_ZERO(data->start_class);
3209 ANYOF_BITMAP_ZERO(data->start_class);
3210 if (compat)
3211 ANYOF_BITMAP_SET(data->start_class, uc);
3212 else if (uc >= 0x100) {
3213 int i;
3214
3215 /* Some Unicode code points fold to the Latin1 range; as
3216 * XXX temporary code, instead of figuring out if this is
3217 * one, just assume it is and set all the start class bits
3218 * that could be some such above 255 code point's fold
3219 * which will generate fals positives. As the code
3220 * elsewhere that does compute the fold settles down, it
3221 * can be extracted out and re-used here */
3222 for (i = 0; i < 256; i++){
3223 if (_HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)) {
3224 ANYOF_BITMAP_SET(data->start_class, i);
3225 }
3226 }
3227 }
3228 data->start_class->flags &= ~ANYOF_EOS;
3229 if (uc < 0x100)
3230 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
3231 }
3232 else if (flags & SCF_DO_STCLASS_OR) {
3233 /* false positive possible if the class is case-folded */
3234 if (uc < 0x100)
3235 ANYOF_BITMAP_SET(data->start_class, uc);
3236 else
3237 data->start_class->flags |= ANYOF_UNICODE_ALL;
3238 data->start_class->flags &= ~ANYOF_EOS;
3239 cl_and(data->start_class, and_withp);
3240 }
3241 flags &= ~SCF_DO_STCLASS;
3242 }
3243 else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
3244 I32 l = STR_LEN(scan);
3245 UV uc = *((U8*)STRING(scan));
3246
3247 /* Search for fixed substrings supports EXACT only. */
3248 if (flags & SCF_DO_SUBSTR) {
3249 assert(data);
3250 SCAN_COMMIT(pRExC_state, data, minlenp);
3251 }
3252 if (UTF) {
3253 const U8 * const s = (U8 *)STRING(scan);
3254 l = utf8_length(s, s + l);
3255 uc = utf8_to_uvchr(s, NULL);
3256 }
3257 min += l;
3258 if (flags & SCF_DO_SUBSTR)
3259 data->pos_min += l;
3260 if (flags & SCF_DO_STCLASS_AND) {
3261 /* Check whether it is compatible with what we know already! */
3262 int compat = 1;
3263 if (uc >= 0x100 ||
3264 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
3265 && !ANYOF_BITMAP_TEST(data->start_class, uc)
3266 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc])))
3267 {
3268 compat = 0;
3269 }
3270 ANYOF_CLASS_ZERO(data->start_class);
3271 ANYOF_BITMAP_ZERO(data->start_class);
3272 if (compat) {
3273 ANYOF_BITMAP_SET(data->start_class, uc);
3274 data->start_class->flags &= ~ANYOF_EOS;
3275 data->start_class->flags |= ANYOF_LOC_NONBITMAP_FOLD;
3276 if (OP(scan) == EXACTFL) {
3277 /* XXX This set is probably no longer necessary, and
3278 * probably wrong as LOCALE now is on in the initial
3279 * state */
3280 data->start_class->flags |= ANYOF_LOCALE;
3281 }
3282 else {
3283
3284 /* Also set the other member of the fold pair. In case
3285 * that unicode semantics is called for at runtime, use
3286 * the full latin1 fold. (Can't do this for locale,
3287 * because not known until runtime */
3288 ANYOF_BITMAP_SET(data->start_class, PL_fold_latin1[uc]);
3289 }
3290 }
3291 else if (uc >= 0x100) {
3292 int i;
3293 for (i = 0; i < 256; i++){
3294 if (_HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)) {
3295 ANYOF_BITMAP_SET(data->start_class, i);
3296 }
3297 }
3298 }
3299 }
3300 else if (flags & SCF_DO_STCLASS_OR) {
3301 if (data->start_class->flags & ANYOF_LOC_NONBITMAP_FOLD) {
3302 /* false positive possible if the class is case-folded.
3303 Assume that the locale settings are the same... */
3304 if (uc < 0x100) {
3305 ANYOF_BITMAP_SET(data->start_class, uc);
3306 if (OP(scan) != EXACTFL) {
3307
3308 /* And set the other member of the fold pair, but
3309 * can't do that in locale because not known until
3310 * run-time */
3311 ANYOF_BITMAP_SET(data->start_class,
3312 PL_fold_latin1[uc]);
3313 }
3314 }
3315 data->start_class->flags &= ~ANYOF_EOS;
3316 }
3317 cl_and(data->start_class, and_withp);
3318 }
3319 flags &= ~SCF_DO_STCLASS;
3320 }
3321 else if (REGNODE_VARIES(OP(scan))) {
3322 I32 mincount, maxcount, minnext, deltanext, fl = 0;
3323 I32 f = flags, pos_before = 0;
3324 regnode * const oscan = scan;
3325 struct regnode_charclass_class this_class;
3326 struct regnode_charclass_class *oclass = NULL;
3327 I32 next_is_eval = 0;
3328
3329 switch (PL_regkind[OP(scan)]) {
3330 case WHILEM: /* End of (?:...)* . */
3331 scan = NEXTOPER(scan);
3332 goto finish;
3333 case PLUS:
3334 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
3335 next = NEXTOPER(scan);
3336 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
3337 mincount = 1;
3338 maxcount = REG_INFTY;
3339 next = regnext(scan);
3340 scan = NEXTOPER(scan);
3341 goto do_curly;
3342 }
3343 }
3344 if (flags & SCF_DO_SUBSTR)
3345 data->pos_min++;
3346 min++;
3347 /* Fall through. */
3348 case STAR:
3349 if (flags & SCF_DO_STCLASS) {
3350 mincount = 0;
3351 maxcount = REG_INFTY;
3352 next = regnext(scan);
3353 scan = NEXTOPER(scan);
3354 goto do_curly;
3355 }
3356 is_inf = is_inf_internal = 1;
3357 scan = regnext(scan);
3358 if (flags & SCF_DO_SUBSTR) {
3359 SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */
3360 data->longest = &(data->longest_float);
3361 }
3362 goto optimize_curly_tail;
3363 case CURLY:
3364 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
3365 && (scan->flags == stopparen))
3366 {
3367 mincount = 1;
3368 maxcount = 1;
3369 } else {
3370 mincount = ARG1(scan);
3371 maxcount = ARG2(scan);
3372 }
3373 next = regnext(scan);
3374 if (OP(scan) == CURLYX) {
3375 I32 lp = (data ? *(data->last_closep) : 0);
3376 scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
3377 }
3378 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
3379 next_is_eval = (OP(scan) == EVAL);
3380 do_curly:
3381 if (flags & SCF_DO_SUBSTR) {
3382 if (mincount == 0) SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */
3383 pos_before = data->pos_min;
3384 }
3385 if (data) {
3386 fl = data->flags;
3387 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
3388 if (is_inf)
3389 data->flags |= SF_IS_INF;
3390 }
3391 if (flags & SCF_DO_STCLASS) {
3392 cl_init(pRExC_state, &this_class);
3393 oclass = data->start_class;
3394 data->start_class = &this_class;
3395 f |= SCF_DO_STCLASS_AND;
3396 f &= ~SCF_DO_STCLASS_OR;
3397 }
3398 /* Exclude from super-linear cache processing any {n,m}
3399 regops for which the combination of input pos and regex
3400 pos is not enough information to determine if a match
3401 will be possible.
3402
3403 For example, in the regex /foo(bar\s*){4,8}baz/ with the
3404 regex pos at the \s*, the prospects for a match depend not
3405 only on the input position but also on how many (bar\s*)
3406 repeats into the {4,8} we are. */
3407 if ((mincount > 1) || (maxcount > 1 && maxcount != REG_INFTY))
3408 f &= ~SCF_WHILEM_VISITED_POS;
3409
3410 /* This will finish on WHILEM, setting scan, or on NULL: */
3411 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
3412 last, data, stopparen, recursed, NULL,
3413 (mincount == 0
3414 ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
3415
3416 if (flags & SCF_DO_STCLASS)
3417 data->start_class = oclass;
3418 if (mincount == 0 || minnext == 0) {
3419 if (flags & SCF_DO_STCLASS_OR) {
3420 cl_or(pRExC_state, data->start_class, &this_class);
3421 }
3422 else if (flags & SCF_DO_STCLASS_AND) {
3423 /* Switch to OR mode: cache the old value of
3424 * data->start_class */
3425 INIT_AND_WITHP;
3426 StructCopy(data->start_class, and_withp,
3427 struct regnode_charclass_class);
3428 flags &= ~SCF_DO_STCLASS_AND;
3429 StructCopy(&this_class, data->start_class,
3430 struct regnode_charclass_class);
3431 flags |= SCF_DO_STCLASS_OR;
3432 data->start_class->flags |= ANYOF_EOS;
3433 }
3434 } else { /* Non-zero len */
3435 if (flags & SCF_DO_STCLASS_OR) {
3436 cl_or(pRExC_state, data->start_class, &this_class);
3437 cl_and(data->start_class, and_withp);
3438 }
3439 else if (flags & SCF_DO_STCLASS_AND)
3440 cl_and(data->start_class, &this_class);
3441 flags &= ~SCF_DO_STCLASS;
3442 }
3443 if (!scan) /* It was not CURLYX, but CURLY. */
3444 scan = next;
3445 if ( /* ? quantifier ok, except for (?{ ... }) */
3446 (next_is_eval || !(mincount == 0 && maxcount == 1))
3447 && (minnext == 0) && (deltanext == 0)
3448 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
3449 && maxcount <= REG_INFTY/3) /* Complement check for big count */
3450 {
3451 ckWARNreg(RExC_parse,
3452 "Quantifier unexpected on zero-length expression");
3453 }
3454
3455 min += minnext * mincount;
3456 is_inf_internal |= ((maxcount == REG_INFTY
3457 && (minnext + deltanext) > 0)
3458 || deltanext == I32_MAX);
3459 is_inf |= is_inf_internal;
3460 delta += (minnext + deltanext) * maxcount - minnext * mincount;
3461
3462 /* Try powerful optimization CURLYX => CURLYN. */
3463 if ( OP(oscan) == CURLYX && data
3464 && data->flags & SF_IN_PAR
3465 && !(data->flags & SF_HAS_EVAL)
3466 && !deltanext && minnext == 1 ) {
3467 /* Try to optimize to CURLYN. */
3468 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
3469 regnode * const nxt1 = nxt;
3470#ifdef DEBUGGING
3471 regnode *nxt2;
3472#endif
3473
3474 /* Skip open. */
3475 nxt = regnext(nxt);
3476 if (!REGNODE_SIMPLE(OP(nxt))
3477 && !(PL_regkind[OP(nxt)] == EXACT
3478 && STR_LEN(nxt) == 1))
3479 goto nogo;
3480#ifdef DEBUGGING
3481 nxt2 = nxt;
3482#endif
3483 nxt = regnext(nxt);
3484 if (OP(nxt) != CLOSE)
3485 goto nogo;
3486 if (RExC_open_parens) {
3487 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3488 RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
3489 }
3490 /* Now we know that nxt2 is the only contents: */
3491 oscan->flags = (U8)ARG(nxt);
3492 OP(oscan) = CURLYN;
3493 OP(nxt1) = NOTHING; /* was OPEN. */
3494
3495#ifdef DEBUGGING
3496 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3497 NEXT_OFF(nxt1+ 1) = 0; /* just for consistency. */
3498 NEXT_OFF(nxt2) = 0; /* just for consistency with CURLY. */
3499 OP(nxt) = OPTIMIZED; /* was CLOSE. */
3500 OP(nxt + 1) = OPTIMIZED; /* was count. */
3501 NEXT_OFF(nxt+ 1) = 0; /* just for consistency. */
3502#endif
3503 }
3504 nogo:
3505
3506 /* Try optimization CURLYX => CURLYM. */
3507 if ( OP(oscan) == CURLYX && data
3508 && !(data->flags & SF_HAS_PAR)
3509 && !(data->flags & SF_HAS_EVAL)
3510 && !deltanext /* atom is fixed width */
3511 && minnext != 0 /* CURLYM can't handle zero width */
3512 ) {
3513 /* XXXX How to optimize if data == 0? */
3514 /* Optimize to a simpler form. */
3515 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
3516 regnode *nxt2;
3517
3518 OP(oscan) = CURLYM;
3519 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
3520 && (OP(nxt2) != WHILEM))
3521 nxt = nxt2;
3522 OP(nxt2) = SUCCEED; /* Whas WHILEM */
3523 /* Need to optimize away parenths. */
3524 if ((data->flags & SF_IN_PAR) && OP(nxt) == CLOSE) {
3525 /* Set the parenth number. */
3526 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
3527
3528 oscan->flags = (U8)ARG(nxt);
3529 if (RExC_open_parens) {
3530 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3531 RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
3532 }
3533 OP(nxt1) = OPTIMIZED; /* was OPEN. */
3534 OP(nxt) = OPTIMIZED; /* was CLOSE. */
3535
3536#ifdef DEBUGGING
3537 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3538 OP(nxt + 1) = OPTIMIZED; /* was count. */
3539 NEXT_OFF(nxt1 + 1) = 0; /* just for consistency. */
3540 NEXT_OFF(nxt + 1) = 0; /* just for consistency. */
3541#endif
3542#if 0
3543 while ( nxt1 && (OP(nxt1) != WHILEM)) {
3544 regnode *nnxt = regnext(nxt1);
3545 if (nnxt == nxt) {
3546 if (reg_off_by_arg[OP(nxt1)])
3547 ARG_SET(nxt1, nxt2 - nxt1);
3548 else if (nxt2 - nxt1 < U16_MAX)
3549 NEXT_OFF(nxt1) = nxt2 - nxt1;
3550 else
3551 OP(nxt) = NOTHING; /* Cannot beautify */
3552 }
3553 nxt1 = nnxt;
3554 }
3555#endif
3556 /* Optimize again: */
3557 study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
3558 NULL, stopparen, recursed, NULL, 0,depth+1);
3559 }
3560 else
3561 oscan->flags = 0;
3562 }
3563 else if ((OP(oscan) == CURLYX)
3564 && (flags & SCF_WHILEM_VISITED_POS)
3565 /* See the comment on a similar expression above.
3566 However, this time it's not a subexpression
3567 we care about, but the expression itself. */
3568 && (maxcount == REG_INFTY)
3569 && data && ++data->whilem_c < 16) {
3570 /* This stays as CURLYX, we can put the count/of pair. */
3571 /* Find WHILEM (as in regexec.c) */
3572 regnode *nxt = oscan + NEXT_OFF(oscan);
3573
3574 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
3575 nxt += ARG(nxt);
3576 PREVOPER(nxt)->flags = (U8)(data->whilem_c
3577 | (RExC_whilem_seen << 4)); /* On WHILEM */
3578 }
3579 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
3580 pars++;
3581 if (flags & SCF_DO_SUBSTR) {
3582 SV *last_str = NULL;
3583 int counted = mincount != 0;
3584
3585 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
3586#if defined(SPARC64_GCC_WORKAROUND)
3587 I32 b = 0;
3588 STRLEN l = 0;
3589 const char *s = NULL;
3590 I32 old = 0;
3591
3592 if (pos_before >= data->last_start_min)
3593 b = pos_before;
3594 else
3595 b = data->last_start_min;
3596
3597 l = 0;
3598 s = SvPV_const(data->last_found, l);
3599 old = b - data->last_start_min;
3600
3601#else
3602 I32 b = pos_before >= data->last_start_min
3603 ? pos_before : data->last_start_min;
3604 STRLEN l;
3605 const char * const s = SvPV_const(data->last_found, l);
3606 I32 old = b - data->last_start_min;
3607#endif
3608
3609 if (UTF)
3610 old = utf8_hop((U8*)s, old) - (U8*)s;
3611 l -= old;
3612 /* Get the added string: */
3613 last_str = newSVpvn_utf8(s + old, l, UTF);
3614 if (deltanext == 0 && pos_before == b) {
3615 /* What was added is a constant string */
3616 if (mincount > 1) {
3617 SvGROW(last_str, (mincount * l) + 1);
3618 repeatcpy(SvPVX(last_str) + l,
3619 SvPVX_const(last_str), l, mincount - 1);
3620 SvCUR_set(last_str, SvCUR(last_str) * mincount);
3621 /* Add additional parts. */
3622 SvCUR_set(data->last_found,
3623 SvCUR(data->last_found) - l);
3624 sv_catsv(data->last_found, last_str);
3625 {
3626 SV * sv = data->last_found;
3627 MAGIC *mg =
3628 SvUTF8(sv) && SvMAGICAL(sv) ?
3629 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3630 if (mg && mg->mg_len >= 0)
3631 mg->mg_len += CHR_SVLEN(last_str) - l;
3632 }
3633 data->last_end += l * (mincount - 1);
3634 }
3635 } else {
3636 /* start offset must point into the last copy */
3637 data->last_start_min += minnext * (mincount - 1);
3638 data->last_start_max += is_inf ? I32_MAX
3639 : (maxcount - 1) * (minnext + data->pos_delta);
3640 }
3641 }
3642 /* It is counted once already... */
3643 data->pos_min += minnext * (mincount - counted);
3644 data->pos_delta += - counted * deltanext +
3645 (minnext + deltanext) * maxcount - minnext * mincount;
3646 if (mincount != maxcount) {
3647 /* Cannot extend fixed substrings found inside
3648 the group. */
3649 SCAN_COMMIT(pRExC_state,data,minlenp);
3650 if (mincount && last_str) {
3651 SV * const sv = data->last_found;
3652 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3653 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3654
3655 if (mg)
3656 mg->mg_len = -1;
3657 sv_setsv(sv, last_str);
3658 data->last_end = data->pos_min;
3659 data->last_start_min =
3660 data->pos_min - CHR_SVLEN(last_str);
3661 data->last_start_max = is_inf
3662 ? I32_MAX
3663 : data->pos_min + data->pos_delta
3664 - CHR_SVLEN(last_str);
3665 }
3666 data->longest = &(data->longest_float);
3667 }
3668 SvREFCNT_dec(last_str);
3669 }
3670 if (data && (fl & SF_HAS_EVAL))
3671 data->flags |= SF_HAS_EVAL;
3672 optimize_curly_tail:
3673 if (OP(oscan) != CURLYX) {
3674 while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
3675 && NEXT_OFF(next))
3676 NEXT_OFF(oscan) += NEXT_OFF(next);
3677 }
3678 continue;
3679 default: /* REF, ANYOFV, and CLUMP only? */
3680 if (flags & SCF_DO_SUBSTR) {
3681 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
3682 data->longest = &(data->longest_float);
3683 }
3684 is_inf = is_inf_internal = 1;
3685 if (flags & SCF_DO_STCLASS_OR)
3686 cl_anything(pRExC_state, data->start_class);
3687 flags &= ~SCF_DO_STCLASS;
3688 break;
3689 }
3690 }
3691 else if (OP(scan) == LNBREAK) {
3692 if (flags & SCF_DO_STCLASS) {
3693 int value = 0;
3694 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
3695 if (flags & SCF_DO_STCLASS_AND) {
3696 for (value = 0; value < 256; value++)
3697 if (!is_VERTWS_cp(value))
3698 ANYOF_BITMAP_CLEAR(data->start_class, value);
3699 }
3700 else {
3701 for (value = 0; value < 256; value++)
3702 if (is_VERTWS_cp(value))
3703 ANYOF_BITMAP_SET(data->start_class, value);
3704 }
3705 if (flags & SCF_DO_STCLASS_OR)
3706 cl_and(data->start_class, and_withp);
3707 flags &= ~SCF_DO_STCLASS;
3708 }
3709 min += 1;
3710 delta += 1;
3711 if (flags & SCF_DO_SUBSTR) {
3712 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
3713 data->pos_min += 1;
3714 data->pos_delta += 1;
3715 data->longest = &(data->longest_float);
3716 }
3717 }
3718 else if (OP(scan) == FOLDCHAR) {
3719 int d = ARG(scan) == LATIN_SMALL_LETTER_SHARP_S ? 1 : 2;
3720 flags &= ~SCF_DO_STCLASS;
3721 min += 1;
3722 delta += d;
3723 if (flags & SCF_DO_SUBSTR) {
3724 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
3725 data->pos_min += 1;
3726 data->pos_delta += d;
3727 data->longest = &(data->longest_float);
3728 }
3729 }
3730 else if (REGNODE_SIMPLE(OP(scan))) {
3731 int value = 0;
3732
3733 if (flags & SCF_DO_SUBSTR) {
3734 SCAN_COMMIT(pRExC_state,data,minlenp);
3735 data->pos_min++;
3736 }
3737 min++;
3738 if (flags & SCF_DO_STCLASS) {
3739 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
3740
3741 /* Some of the logic below assumes that switching
3742 locale on will only add false positives. */
3743 switch (PL_regkind[OP(scan)]) {
3744 case SANY:
3745 default:
3746 do_default:
3747 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
3748 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3749 cl_anything(pRExC_state, data->start_class);
3750 break;
3751 case REG_ANY:
3752 if (OP(scan) == SANY)
3753 goto do_default;
3754 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
3755 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
3756 || ANYOF_CLASS_TEST_ANY_SET(data->start_class));
3757 cl_anything(pRExC_state, data->start_class);
3758 }
3759 if (flags & SCF_DO_STCLASS_AND || !value)
3760 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
3761 break;
3762 case ANYOF:
3763 if (flags & SCF_DO_STCLASS_AND)
3764 cl_and(data->start_class,
3765 (struct regnode_charclass_class*)scan);
3766 else
3767 cl_or(pRExC_state, data->start_class,
3768 (struct regnode_charclass_class*)scan);
3769 break;
3770 case ALNUM:
3771 if (flags & SCF_DO_STCLASS_AND) {
3772 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3773 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3774 if (OP(scan) == ALNUMU) {
3775 for (value = 0; value < 256; value++) {
3776 if (!isWORDCHAR_L1(value)) {
3777 ANYOF_BITMAP_CLEAR(data->start_class, value);
3778 }
3779 }
3780 } else {
3781 for (value = 0; value < 256; value++) {
3782 if (!isALNUM(value)) {
3783 ANYOF_BITMAP_CLEAR(data->start_class, value);
3784 }
3785 }
3786 }
3787 }
3788 }
3789 else {
3790 if (data->start_class->flags & ANYOF_LOCALE)
3791 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3792
3793 /* Even if under locale, set the bits for non-locale
3794 * in case it isn't a true locale-node. This will
3795 * create false positives if it truly is locale */
3796 if (OP(scan) == ALNUMU) {
3797 for (value = 0; value < 256; value++) {
3798 if (isWORDCHAR_L1(value)) {
3799 ANYOF_BITMAP_SET(data->start_class, value);
3800 }
3801 }
3802 } else {
3803 for (value = 0; value < 256; value++) {
3804 if (isALNUM(value)) {
3805 ANYOF_BITMAP_SET(data->start_class, value);
3806 }
3807 }
3808 }
3809 }
3810 break;
3811 case NALNUM:
3812 if (flags & SCF_DO_STCLASS_AND) {
3813 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3814 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
3815 if (OP(scan) == NALNUMU) {
3816 for (value = 0; value < 256; value++) {
3817 if (isWORDCHAR_L1(value)) {
3818 ANYOF_BITMAP_CLEAR(data->start_class, value);
3819 }
3820 }
3821 } else {
3822 for (value = 0; value < 256; value++) {
3823 if (isALNUM(value)) {
3824 ANYOF_BITMAP_CLEAR(data->start_class, value);
3825 }
3826 }
3827 }
3828 }
3829 }
3830 else {
3831 if (data->start_class->flags & ANYOF_LOCALE)
3832 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3833
3834 /* Even if under locale, set the bits for non-locale in
3835 * case it isn't a true locale-node. This will create
3836 * false positives if it truly is locale */
3837 if (OP(scan) == NALNUMU) {
3838 for (value = 0; value < 256; value++) {
3839 if (! isWORDCHAR_L1(value)) {
3840 ANYOF_BITMAP_SET(data->start_class, value);
3841 }
3842 }
3843 } else {
3844 for (value = 0; value < 256; value++) {
3845 if (! isALNUM(value)) {
3846 ANYOF_BITMAP_SET(data->start_class, value);
3847 }
3848 }
3849 }
3850 }
3851 break;
3852 case SPACE:
3853 if (flags & SCF_DO_STCLASS_AND) {
3854 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3855 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3856 if (OP(scan) == SPACEU) {
3857 for (value = 0; value < 256; value++) {
3858 if (!isSPACE_L1(value)) {
3859 ANYOF_BITMAP_CLEAR(data->start_class, value);
3860 }
3861 }
3862 } else {
3863 for (value = 0; value < 256; value++) {
3864 if (!isSPACE(value)) {
3865 ANYOF_BITMAP_CLEAR(data->start_class, value);
3866 }
3867 }
3868 }
3869 }
3870 }
3871 else {
3872 if (data->start_class->flags & ANYOF_LOCALE) {
3873 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3874 }
3875 if (OP(scan) == SPACEU) {
3876 for (value = 0; value < 256; value++) {
3877 if (isSPACE_L1(value)) {
3878 ANYOF_BITMAP_SET(data->start_class, value);
3879 }
3880 }
3881 } else {
3882 for (value = 0; value < 256; value++) {
3883 if (isSPACE(value)) {
3884 ANYOF_BITMAP_SET(data->start_class, value);
3885 }
3886 }
3887 }
3888 }
3889 break;
3890 case NSPACE:
3891 if (flags & SCF_DO_STCLASS_AND) {
3892 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3893 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3894 if (OP(scan) == NSPACEU) {
3895 for (value = 0; value < 256; value++) {
3896 if (isSPACE_L1(value)) {
3897 ANYOF_BITMAP_CLEAR(data->start_class, value);
3898 }
3899 }
3900 } else {
3901 for (value = 0; value < 256; value++) {
3902 if (isSPACE(value)) {
3903 ANYOF_BITMAP_CLEAR(data->start_class, value);
3904 }
3905 }
3906 }
3907 }
3908 }
3909 else {
3910 if (data->start_class->flags & ANYOF_LOCALE)
3911 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3912 if (OP(scan) == NSPACEU) {
3913 for (value = 0; value < 256; value++) {
3914 if (!isSPACE_L1(value)) {
3915 ANYOF_BITMAP_SET(data->start_class, value);
3916 }
3917 }
3918 }
3919 else {
3920 for (value = 0; value < 256; value++) {
3921 if (!isSPACE(value)) {
3922 ANYOF_BITMAP_SET(data->start_class, value);
3923 }
3924 }
3925 }
3926 }
3927 break;
3928 case DIGIT:
3929 if (flags & SCF_DO_STCLASS_AND) {
3930 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3931 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
3932 for (value = 0; value < 256; value++)
3933 if (!isDIGIT(value))
3934 ANYOF_BITMAP_CLEAR(data->start_class, value);
3935 }
3936 }
3937 else {
3938 if (data->start_class->flags & ANYOF_LOCALE)
3939 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
3940 for (value = 0; value < 256; value++)
3941 if (isDIGIT(value))
3942 ANYOF_BITMAP_SET(data->start_class, value);
3943 }
3944 break;
3945 case NDIGIT:
3946 if (flags & SCF_DO_STCLASS_AND) {
3947 if (!(data->start_class->flags & ANYOF_LOCALE))
3948 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
3949 for (value = 0; value < 256; value++)
3950 if (isDIGIT(value))
3951 ANYOF_BITMAP_CLEAR(data->start_class, value);
3952 }
3953 else {
3954 if (data->start_class->flags & ANYOF_LOCALE)
3955 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
3956 for (value = 0; value < 256; value++)
3957 if (!isDIGIT(value))
3958 ANYOF_BITMAP_SET(data->start_class, value);
3959 }
3960 break;
3961 CASE_SYNST_FNC(VERTWS);
3962 CASE_SYNST_FNC(HORIZWS);
3963
3964 }
3965 if (flags & SCF_DO_STCLASS_OR)
3966 cl_and(data->start_class, and_withp);
3967 flags &= ~SCF_DO_STCLASS;
3968 }
3969 }
3970 else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
3971 data->flags |= (OP(scan) == MEOL
3972 ? SF_BEFORE_MEOL
3973 : SF_BEFORE_SEOL);
3974 }
3975 else if ( PL_regkind[OP(scan)] == BRANCHJ
3976 /* Lookbehind, or need to calculate parens/evals/stclass: */
3977 && (scan->flags || data || (flags & SCF_DO_STCLASS))
3978 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
3979 if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3980 || OP(scan) == UNLESSM )
3981 {
3982 /* Negative Lookahead/lookbehind
3983 In this case we can't do fixed string optimisation.
3984 */
3985
3986 I32 deltanext, minnext, fake = 0;
3987 regnode *nscan;
3988 struct regnode_charclass_class intrnl;
3989 int f = 0;
3990
3991 data_fake.flags = 0;
3992 if (data) {
3993 data_fake.whilem_c = data->whilem_c;
3994 data_fake.last_closep = data->last_closep;
3995 }
3996 else
3997 data_fake.last_closep = &fake;
3998 data_fake.pos_delta = delta;
3999 if ( flags & SCF_DO_STCLASS && !scan->flags
4000 && OP(scan) == IFMATCH ) { /* Lookahead */
4001 cl_init(pRExC_state, &intrnl);
4002 data_fake.start_class = &intrnl;
4003 f |= SCF_DO_STCLASS_AND;
4004 }
4005 if (flags & SCF_WHILEM_VISITED_POS)
4006 f |= SCF_WHILEM_VISITED_POS;
4007 next = regnext(scan);
4008 nscan = NEXTOPER(NEXTOPER(scan));
4009 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
4010 last, &data_fake, stopparen, recursed, NULL, f, depth+1);
4011 if (scan->flags) {
4012 if (deltanext) {
4013 FAIL("Variable length lookbehind not implemented");
4014 }
4015 else if (minnext > (I32)U8_MAX) {
4016 FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
4017 }
4018 scan->flags = (U8)minnext;
4019 }
4020 if (data) {
4021 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4022 pars++;
4023 if (data_fake.flags & SF_HAS_EVAL)
4024 data->flags |= SF_HAS_EVAL;
4025 data->whilem_c = data_fake.whilem_c;
4026 }
4027 if (f & SCF_DO_STCLASS_AND) {
4028 if (flags & SCF_DO_STCLASS_OR) {
4029 /* OR before, AND after: ideally we would recurse with
4030 * data_fake to get the AND applied by study of the
4031 * remainder of the pattern, and then derecurse;
4032 * *** HACK *** for now just treat as "no information".
4033 * See [perl #56690].
4034 */
4035 cl_init(pRExC_state, data->start_class);
4036 } else {
4037 /* AND before and after: combine and continue */
4038 const int was = (data->start_class->flags & ANYOF_EOS);
4039
4040 cl_and(data->start_class, &intrnl);
4041 if (was)
4042 data->start_class->flags |= ANYOF_EOS;
4043 }
4044 }
4045 }
4046#if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
4047 else {
4048 /* Positive Lookahead/lookbehind
4049 In this case we can do fixed string optimisation,
4050 but we must be careful about it. Note in the case of
4051 lookbehind the positions will be offset by the minimum
4052 length of the pattern, something we won't know about
4053 until after the recurse.
4054 */
4055 I32 deltanext, fake = 0;
4056 regnode *nscan;
4057 struct regnode_charclass_class intrnl;
4058 int f = 0;
4059 /* We use SAVEFREEPV so that when the full compile
4060 is finished perl will clean up the allocated
4061 minlens when it's all done. This way we don't
4062 have to worry about freeing them when we know
4063 they wont be used, which would be a pain.
4064 */
4065 I32 *minnextp;
4066 Newx( minnextp, 1, I32 );
4067 SAVEFREEPV(minnextp);
4068
4069 if (data) {
4070 StructCopy(data, &data_fake, scan_data_t);
4071 if ((flags & SCF_DO_SUBSTR) && data->last_found) {
4072 f |= SCF_DO_SUBSTR;
4073 if (scan->flags)
4074 SCAN_COMMIT(pRExC_state, &data_fake,minlenp);
4075 data_fake.last_found=newSVsv(data->last_found);
4076 }
4077 }
4078 else
4079 data_fake.last_closep = &fake;
4080 data_fake.flags = 0;
4081 data_fake.pos_delta = delta;
4082 if (is_inf)
4083 data_fake.flags |= SF_IS_INF;
4084 if ( flags & SCF_DO_STCLASS && !scan->flags
4085 && OP(scan) == IFMATCH ) { /* Lookahead */
4086 cl_init(pRExC_state, &intrnl);
4087 data_fake.start_class = &intrnl;
4088 f |= SCF_DO_STCLASS_AND;
4089 }
4090 if (flags & SCF_WHILEM_VISITED_POS)
4091 f |= SCF_WHILEM_VISITED_POS;
4092 next = regnext(scan);
4093 nscan = NEXTOPER(NEXTOPER(scan));
4094
4095 *minnextp = study_chunk(pRExC_state, &nscan, minnextp, &deltanext,
4096 last, &data_fake, stopparen, recursed, NULL, f,depth+1);
4097 if (scan->flags) {
4098 if (deltanext) {
4099 FAIL("Variable length lookbehind not implemented");
4100 }
4101 else if (*minnextp > (I32)U8_MAX) {
4102 FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
4103 }
4104 scan->flags = (U8)*minnextp;
4105 }
4106
4107 *minnextp += min;
4108
4109 if (f & SCF_DO_STCLASS_AND) {
4110 const int was = (data->start_class->flags & ANYOF_EOS);
4111
4112 cl_and(data->start_class, &intrnl);
4113 if (was)
4114 data->start_class->flags |= ANYOF_EOS;
4115 }
4116 if (data) {
4117 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4118 pars++;
4119 if (data_fake.flags & SF_HAS_EVAL)
4120 data->flags |= SF_HAS_EVAL;
4121 data->whilem_c = data_fake.whilem_c;
4122 if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
4123 if (RExC_rx->minlen<*minnextp)
4124 RExC_rx->minlen=*minnextp;
4125 SCAN_COMMIT(pRExC_state, &data_fake, minnextp);
4126 SvREFCNT_dec(data_fake.last_found);
4127
4128 if ( data_fake.minlen_fixed != minlenp )
4129 {
4130 data->offset_fixed= data_fake.offset_fixed;
4131 data->minlen_fixed= data_fake.minlen_fixed;
4132 data->lookbehind_fixed+= scan->flags;
4133 }
4134 if ( data_fake.minlen_float != minlenp )
4135 {
4136 data->minlen_float= data_fake.minlen_float;
4137 data->offset_float_min=data_fake.offset_float_min;
4138 data->offset_float_max=data_fake.offset_float_max;
4139 data->lookbehind_float+= scan->flags;
4140 }
4141 }
4142 }
4143
4144
4145 }
4146#endif
4147 }
4148 else if (OP(scan) == OPEN) {
4149 if (stopparen != (I32)ARG(scan))
4150 pars++;
4151 }
4152 else if (OP(scan) == CLOSE) {
4153 if (stopparen == (I32)ARG(scan)) {
4154 break;
4155 }
4156 if ((I32)ARG(scan) == is_par) {
4157 next = regnext(scan);
4158
4159 if ( next && (OP(next) != WHILEM) && next < last)
4160 is_par = 0; /* Disable optimization */
4161 }
4162 if (data)
4163 *(data->last_closep) = ARG(scan);
4164 }
4165 else if (OP(scan) == EVAL) {
4166 if (data)
4167 data->flags |= SF_HAS_EVAL;
4168 }
4169 else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
4170 if (flags & SCF_DO_SUBSTR) {
4171 SCAN_COMMIT(pRExC_state,data,minlenp);
4172 flags &= ~SCF_DO_SUBSTR;
4173 }
4174 if (data && OP(scan)==ACCEPT) {
4175 data->flags |= SCF_SEEN_ACCEPT;
4176 if (stopmin > min)
4177 stopmin = min;
4178 }
4179 }
4180 else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
4181 {
4182 if (flags & SCF_DO_SUBSTR) {
4183 SCAN_COMMIT(pRExC_state,data,minlenp);
4184 data->longest = &(data->longest_float);
4185 }
4186 is_inf = is_inf_internal = 1;
4187 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
4188 cl_anything(pRExC_state, data->start_class);
4189 flags &= ~SCF_DO_STCLASS;
4190 }
4191 else if (OP(scan) == GPOS) {
4192 if (!(RExC_rx->extflags & RXf_GPOS_FLOAT) &&
4193 !(delta || is_inf || (data && data->pos_delta)))
4194 {
4195 if (!(RExC_rx->extflags & RXf_ANCH) && (flags & SCF_DO_SUBSTR))
4196 RExC_rx->extflags |= RXf_ANCH_GPOS;
4197 if (RExC_rx->gofs < (U32)min)
4198 RExC_rx->gofs = min;
4199 } else {
4200 RExC_rx->extflags |= RXf_GPOS_FLOAT;
4201 RExC_rx->gofs = 0;
4202 }
4203 }
4204#ifdef TRIE_STUDY_OPT
4205#ifdef FULL_TRIE_STUDY
4206 else if (PL_regkind[OP(scan)] == TRIE) {
4207 /* NOTE - There is similar code to this block above for handling
4208 BRANCH nodes on the initial study. If you change stuff here
4209 check there too. */
4210 regnode *trie_node= scan;
4211 regnode *tail= regnext(scan);
4212 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
4213 I32 max1 = 0, min1 = I32_MAX;
4214 struct regnode_charclass_class accum;
4215
4216 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
4217 SCAN_COMMIT(pRExC_state, data,minlenp); /* Cannot merge strings after this. */
4218 if (flags & SCF_DO_STCLASS)
4219 cl_init_zero(pRExC_state, &accum);
4220
4221 if (!trie->jump) {
4222 min1= trie->minlen;
4223 max1= trie->maxlen;
4224 } else {
4225 const regnode *nextbranch= NULL;
4226 U32 word;
4227
4228 for ( word=1 ; word <= trie->wordcount ; word++)
4229 {
4230 I32 deltanext=0, minnext=0, f = 0, fake;
4231 struct regnode_charclass_class this_class;
4232
4233 data_fake.flags = 0;
4234 if (data) {
4235 data_fake.whilem_c = data->whilem_c;
4236 data_fake.last_closep = data->last_closep;
4237 }
4238 else
4239 data_fake.last_closep = &fake;
4240 data_fake.pos_delta = delta;
4241 if (flags & SCF_DO_STCLASS) {
4242 cl_init(pRExC_state, &this_class);
4243 data_fake.start_class = &this_class;
4244 f = SCF_DO_STCLASS_AND;
4245 }
4246 if (flags & SCF_WHILEM_VISITED_POS)
4247 f |= SCF_WHILEM_VISITED_POS;
4248
4249 if (trie->jump[word]) {
4250 if (!nextbranch)
4251 nextbranch = trie_node + trie->jump[0];
4252 scan= trie_node + trie->jump[word];
4253 /* We go from the jump point to the branch that follows
4254 it. Note this means we need the vestigal unused branches
4255 even though they arent otherwise used.
4256 */
4257 minnext = study_chunk(pRExC_state, &scan, minlenp,
4258 &deltanext, (regnode *)nextbranch, &data_fake,
4259 stopparen, recursed, NULL, f,depth+1);
4260 }
4261 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
4262 nextbranch= regnext((regnode*)nextbranch);
4263
4264 if (min1 > (I32)(minnext + trie->minlen))
4265 min1 = minnext + trie->minlen;
4266 if (max1 < (I32)(minnext + deltanext + trie->maxlen))
4267 max1 = minnext + deltanext + trie->maxlen;
4268 if (deltanext == I32_MAX)
4269 is_inf = is_inf_internal = 1;
4270
4271 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4272 pars++;
4273 if (data_fake.flags & SCF_SEEN_ACCEPT) {
4274 if ( stopmin > min + min1)
4275 stopmin = min + min1;
4276 flags &= ~SCF_DO_SUBSTR;
4277 if (data)
4278 data->flags |= SCF_SEEN_ACCEPT;
4279 }
4280 if (data) {
4281 if (data_fake.flags & SF_HAS_EVAL)
4282 data->flags |= SF_HAS_EVAL;
4283 data->whilem_c = data_fake.whilem_c;
4284 }
4285 if (flags & SCF_DO_STCLASS)
4286 cl_or(pRExC_state, &accum, &this_class);
4287 }
4288 }
4289 if (flags & SCF_DO_SUBSTR) {
4290 data->pos_min += min1;
4291 data->pos_delta += max1 - min1;
4292 if (max1 != min1 || is_inf)
4293 data->longest = &(data->longest_float);
4294 }
4295 min += min1;
4296 delta += max1 - min1;
4297 if (flags & SCF_DO_STCLASS_OR) {
4298 cl_or(pRExC_state, data->start_class, &accum);
4299 if (min1) {
4300 cl_and(data->start_class, and_withp);
4301 flags &= ~SCF_DO_STCLASS;
4302 }
4303 }
4304 else if (flags & SCF_DO_STCLASS_AND) {
4305 if (min1) {
4306 cl_and(data->start_class, &accum);
4307 flags &= ~SCF_DO_STCLASS;
4308 }
4309 else {
4310 /* Switch to OR mode: cache the old value of
4311 * data->start_class */
4312 INIT_AND_WITHP;
4313 StructCopy(data->start_class, and_withp,
4314 struct regnode_charclass_class);
4315 flags &= ~SCF_DO_STCLASS_AND;
4316 StructCopy(&accum, data->start_class,
4317 struct regnode_charclass_class);
4318 flags |= SCF_DO_STCLASS_OR;
4319 data->start_class->flags |= ANYOF_EOS;
4320 }
4321 }
4322 scan= tail;
4323 continue;
4324 }
4325#else
4326 else if (PL_regkind[OP(scan)] == TRIE) {
4327 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
4328 U8*bang=NULL;
4329
4330 min += trie->minlen;
4331 delta += (trie->maxlen - trie->minlen);
4332 flags &= ~SCF_DO_STCLASS; /* xxx */
4333 if (flags & SCF_DO_SUBSTR) {
4334 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
4335 data->pos_min += trie->minlen;
4336 data->pos_delta += (trie->maxlen - trie->minlen);
4337 if (trie->maxlen != trie->minlen)
4338 data->longest = &(data->longest_float);
4339 }
4340 if (trie->jump) /* no more substrings -- for now /grr*/
4341 flags &= ~SCF_DO_SUBSTR;
4342 }
4343#endif /* old or new */
4344#endif /* TRIE_STUDY_OPT */
4345
4346 /* Else: zero-length, ignore. */
4347 scan = regnext(scan);
4348 }
4349 if (frame) {
4350 last = frame->last;
4351 scan = frame->next;
4352 stopparen = frame->stop;
4353 frame = frame->prev;
4354 goto fake_study_recurse;
4355 }
4356
4357 finish:
4358 assert(!frame);
4359 DEBUG_STUDYDATA("pre-fin:",data,depth);
4360
4361 *scanp = scan;
4362 *deltap = is_inf_internal ? I32_MAX : delta;
4363 if (flags & SCF_DO_SUBSTR && is_inf)
4364 data->pos_delta = I32_MAX - data->pos_min;
4365 if (is_par > (I32)U8_MAX)
4366 is_par = 0;
4367 if (is_par && pars==1 && data) {
4368 data->flags |= SF_IN_PAR;
4369 data->flags &= ~SF_HAS_PAR;
4370 }
4371 else if (pars && data) {
4372 data->flags |= SF_HAS_PAR;
4373 data->flags &= ~SF_IN_PAR;
4374 }
4375 if (flags & SCF_DO_STCLASS_OR)
4376 cl_and(data->start_class, and_withp);
4377 if (flags & SCF_TRIE_RESTUDY)
4378 data->flags |= SCF_TRIE_RESTUDY;
4379
4380 DEBUG_STUDYDATA("post-fin:",data,depth);
4381
4382 return min < stopmin ? min : stopmin;
4383}
4384
4385STATIC U32
4386S_add_data(RExC_state_t *pRExC_state, U32 n, const char *s)
4387{
4388 U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0;
4389
4390 PERL_ARGS_ASSERT_ADD_DATA;
4391
4392 Renewc(RExC_rxi->data,
4393 sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1),
4394 char, struct reg_data);
4395 if(count)
4396 Renew(RExC_rxi->data->what, count + n, U8);
4397 else
4398 Newx(RExC_rxi->data->what, n, U8);
4399 RExC_rxi->data->count = count + n;
4400 Copy(s, RExC_rxi->data->what + count, n, U8);
4401 return count;
4402}
4403
4404/*XXX: todo make this not included in a non debugging perl */
4405#ifndef PERL_IN_XSUB_RE
4406void
4407Perl_reginitcolors(pTHX)
4408{
4409 dVAR;
4410 const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
4411 if (s) {
4412 char *t = savepv(s);
4413 int i = 0;
4414 PL_colors[0] = t;
4415 while (++i < 6) {
4416 t = strchr(t, '\t');
4417 if (t) {
4418 *t = '\0';
4419 PL_colors[i] = ++t;
4420 }
4421 else
4422 PL_colors[i] = t = (char *)"";
4423 }
4424 } else {
4425 int i = 0;
4426 while (i < 6)
4427 PL_colors[i++] = (char *)"";
4428 }
4429 PL_colorset = 1;
4430}
4431#endif
4432
4433
4434#ifdef TRIE_STUDY_OPT
4435#define CHECK_RESTUDY_GOTO \
4436 if ( \
4437 (data.flags & SCF_TRIE_RESTUDY) \
4438 && ! restudied++ \
4439 ) goto reStudy
4440#else
4441#define CHECK_RESTUDY_GOTO
4442#endif
4443
4444/*
4445 - pregcomp - compile a regular expression into internal code
4446 *
4447 * We can't allocate space until we know how big the compiled form will be,
4448 * but we can't compile it (and thus know how big it is) until we've got a
4449 * place to put the code. So we cheat: we compile it twice, once with code
4450 * generation turned off and size counting turned on, and once "for real".
4451 * This also means that we don't allocate space until we are sure that the
4452 * thing really will compile successfully, and we never have to move the
4453 * code and thus invalidate pointers into it. (Note that it has to be in
4454 * one piece because free() must be able to free it all.) [NB: not true in perl]
4455 *
4456 * Beware that the optimization-preparation code in here knows about some
4457 * of the structure of the compiled regexp. [I'll say.]
4458 */
4459
4460
4461
4462#ifndef PERL_IN_XSUB_RE
4463#define RE_ENGINE_PTR &PL_core_reg_engine
4464#else
4465extern const struct regexp_engine my_reg_engine;
4466#define RE_ENGINE_PTR &my_reg_engine
4467#endif
4468
4469#ifndef PERL_IN_XSUB_RE
4470REGEXP *
4471Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags)
4472{
4473 dVAR;
4474 HV * const table = GvHV(PL_hintgv);
4475
4476 PERL_ARGS_ASSERT_PREGCOMP;
4477
4478 /* Dispatch a request to compile a regexp to correct
4479 regexp engine. */
4480 if (table) {
4481 SV **ptr= hv_fetchs(table, "regcomp", FALSE);
4482 GET_RE_DEBUG_FLAGS_DECL;
4483 if (ptr && SvIOK(*ptr) && SvIV(*ptr)) {
4484 const regexp_engine *eng=INT2PTR(regexp_engine*,SvIV(*ptr));
4485 DEBUG_COMPILE_r({
4486 PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
4487 SvIV(*ptr));
4488 });
4489 return CALLREGCOMP_ENG(eng, pattern, flags);
4490 }
4491 }
4492 return Perl_re_compile(aTHX_ pattern, flags);
4493}
4494#endif
4495
4496REGEXP *
4497Perl_re_compile(pTHX_ SV * const pattern, U32 orig_pm_flags)
4498{
4499 dVAR;
4500 REGEXP *rx;
4501 struct regexp *r;
4502 register regexp_internal *ri;
4503 STRLEN plen;
4504 char *exp;
4505 char* xend;
4506 regnode *scan;
4507 I32 flags;
4508 I32 minlen = 0;
4509 U32 pm_flags;
4510
4511 /* these are all flags - maybe they should be turned
4512 * into a single int with different bit masks */
4513 I32 sawlookahead = 0;
4514 I32 sawplus = 0;
4515 I32 sawopen = 0;
4516 bool used_setjump = FALSE;
4517 regex_charset initial_charset = get_regex_charset(orig_pm_flags);
4518
4519 U8 jump_ret = 0;
4520 dJMPENV;
4521 scan_data_t data;
4522 RExC_state_t RExC_state;
4523 RExC_state_t * const pRExC_state = &RExC_state;
4524#ifdef TRIE_STUDY_OPT
4525 int restudied;
4526 RExC_state_t copyRExC_state;
4527#endif
4528 GET_RE_DEBUG_FLAGS_DECL;
4529
4530 PERL_ARGS_ASSERT_RE_COMPILE;
4531
4532 DEBUG_r(if (!PL_colorset) reginitcolors());
4533
4534 RExC_utf8 = RExC_orig_utf8 = SvUTF8(pattern);
4535 RExC_uni_semantics = 0;
4536 RExC_contains_locale = 0;
4537
4538 /****************** LONG JUMP TARGET HERE***********************/
4539 /* Longjmp back to here if have to switch in midstream to utf8 */
4540 if (! RExC_orig_utf8) {
4541 JMPENV_PUSH(jump_ret);
4542 used_setjump = TRUE;
4543 }
4544
4545 if (jump_ret == 0) { /* First time through */
4546 exp = SvPV(pattern, plen);
4547 xend = exp + plen;
4548 /* ignore the utf8ness if the pattern is 0 length */
4549 if (plen == 0) {
4550 RExC_utf8 = RExC_orig_utf8 = 0;
4551 }
4552
4553 DEBUG_COMPILE_r({
4554 SV *dsv= sv_newmortal();
4555 RE_PV_QUOTED_DECL(s, RExC_utf8,
4556 dsv, exp, plen, 60);
4557 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
4558 PL_colors[4],PL_colors[5],s);
4559 });
4560 }
4561 else { /* longjumped back */
4562 STRLEN len = plen;
4563
4564 /* If the cause for the longjmp was other than changing to utf8, pop
4565 * our own setjmp, and longjmp to the correct handler */
4566 if (jump_ret != UTF8_LONGJMP) {
4567 JMPENV_POP;
4568 JMPENV_JUMP(jump_ret);
4569 }
4570
4571 GET_RE_DEBUG_FLAGS;
4572
4573 /* It's possible to write a regexp in ascii that represents Unicode
4574 codepoints outside of the byte range, such as via \x{100}. If we
4575 detect such a sequence we have to convert the entire pattern to utf8
4576 and then recompile, as our sizing calculation will have been based
4577 on 1 byte == 1 character, but we will need to use utf8 to encode
4578 at least some part of the pattern, and therefore must convert the whole
4579 thing.
4580 -- dmq */
4581 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
4582 "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
4583 exp = (char*)Perl_bytes_to_utf8(aTHX_ (U8*)SvPV(pattern, plen), &len);
4584 xend = exp + len;
4585 RExC_orig_utf8 = RExC_utf8 = 1;
4586 SAVEFREEPV(exp);
4587 }
4588
4589#ifdef TRIE_STUDY_OPT
4590 restudied = 0;
4591#endif
4592
4593 pm_flags = orig_pm_flags;
4594
4595 if (initial_charset == REGEX_LOCALE_CHARSET) {
4596 RExC_contains_locale = 1;
4597 }
4598 else if (RExC_utf8 && initial_charset == REGEX_DEPENDS_CHARSET) {
4599
4600 /* Set to use unicode semantics if the pattern is in utf8 and has the
4601 * 'depends' charset specified, as it means unicode when utf8 */
4602 set_regex_charset(&pm_flags, REGEX_UNICODE_CHARSET);
4603 }
4604
4605 RExC_precomp = exp;
4606 RExC_flags = pm_flags;
4607 RExC_sawback = 0;
4608
4609 RExC_seen = 0;
4610 RExC_in_lookbehind = 0;
4611 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
4612 RExC_seen_evals = 0;
4613 RExC_extralen = 0;
4614
4615 /* First pass: determine size, legality. */
4616 RExC_parse = exp;
4617 RExC_start = exp;
4618 RExC_end = xend;
4619 RExC_naughty = 0;
4620 RExC_npar = 1;
4621 RExC_nestroot = 0;
4622 RExC_size = 0L;
4623 RExC_emit = &PL_regdummy;
4624 RExC_whilem_seen = 0;
4625 RExC_open_parens = NULL;
4626 RExC_close_parens = NULL;
4627 RExC_opend = NULL;
4628 RExC_paren_names = NULL;
4629#ifdef DEBUGGING
4630 RExC_paren_name_list = NULL;
4631#endif
4632 RExC_recurse = NULL;
4633 RExC_recurse_count = 0;
4634
4635#if 0 /* REGC() is (currently) a NOP at the first pass.
4636 * Clever compilers notice this and complain. --jhi */
4637 REGC((U8)REG_MAGIC, (char*)RExC_emit);
4638#endif
4639 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n"));
4640 if (reg(pRExC_state, 0, &flags,1) == NULL) {
4641 RExC_precomp = NULL;
4642 return(NULL);
4643 }
4644
4645 /* Here, finished first pass. Get rid of any added setjmp */
4646 if (used_setjump) {
4647 JMPENV_POP;
4648 }
4649
4650 DEBUG_PARSE_r({
4651 PerlIO_printf(Perl_debug_log,
4652 "Required size %"IVdf" nodes\n"
4653 "Starting second pass (creation)\n",
4654 (IV)RExC_size);
4655 RExC_lastnum=0;
4656 RExC_lastparse=NULL;
4657 });
4658
4659 /* The first pass could have found things that force Unicode semantics */
4660 if ((RExC_utf8 || RExC_uni_semantics)
4661 && get_regex_charset(pm_flags) == REGEX_DEPENDS_CHARSET)
4662 {
4663 set_regex_charset(&pm_flags, REGEX_UNICODE_CHARSET);
4664 }
4665
4666 /* Small enough for pointer-storage convention?
4667 If extralen==0, this means that we will not need long jumps. */
4668 if (RExC_size >= 0x10000L && RExC_extralen)
4669 RExC_size += RExC_extralen;
4670 else
4671 RExC_extralen = 0;
4672 if (RExC_whilem_seen > 15)
4673 RExC_whilem_seen = 15;
4674
4675 /* Allocate space and zero-initialize. Note, the two step process
4676 of zeroing when in debug mode, thus anything assigned has to
4677 happen after that */
4678 rx = (REGEXP*) newSV_type(SVt_REGEXP);
4679 r = (struct regexp*)SvANY(rx);
4680 Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
4681 char, regexp_internal);
4682 if ( r == NULL || ri == NULL )
4683 FAIL("Regexp out of space");
4684#ifdef DEBUGGING
4685 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
4686 Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode), char);
4687#else
4688 /* bulk initialize base fields with 0. */
4689 Zero(ri, sizeof(regexp_internal), char);
4690#endif
4691
4692 /* non-zero initialization begins here */
4693 RXi_SET( r, ri );
4694 r->engine= RE_ENGINE_PTR;
4695 r->extflags = pm_flags;
4696 {
4697 bool has_p = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
4698 bool has_charset = (get_regex_charset(r->extflags) != REGEX_DEPENDS_CHARSET);
4699
4700 /* The caret is output if there are any defaults: if not all the STD
4701 * flags are set, or if no character set specifier is needed */
4702 bool has_default =
4703 (((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD)
4704 || ! has_charset);
4705 bool has_runon = ((RExC_seen & REG_SEEN_RUN_ON_COMMENT)==REG_SEEN_RUN_ON_COMMENT);
4706 U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD)
4707 >> RXf_PMf_STD_PMMOD_SHIFT);
4708 const char *fptr = STD_PAT_MODS; /*"msix"*/
4709 char *p;
4710 /* Allocate for the worst case, which is all the std flags are turned
4711 * on. If more precision is desired, we could do a population count of
4712 * the flags set. This could be done with a small lookup table, or by
4713 * shifting, masking and adding, or even, when available, assembly
4714 * language for a machine-language population count.
4715 * We never output a minus, as all those are defaults, so are
4716 * covered by the caret */
4717 const STRLEN wraplen = plen + has_p + has_runon
4718 + has_default /* If needs a caret */
4719
4720 /* If needs a character set specifier */
4721 + ((has_charset) ? MAX_CHARSET_NAME_LENGTH : 0)
4722 + (sizeof(STD_PAT_MODS) - 1)
4723 + (sizeof("(?:)") - 1);
4724
4725 p = sv_grow(MUTABLE_SV(rx), wraplen + 1); /* +1 for the ending NUL */
4726 SvPOK_on(rx);
4727 SvFLAGS(rx) |= SvUTF8(pattern);
4728 *p++='('; *p++='?';
4729
4730 /* If a default, cover it using the caret */
4731 if (has_default) {
4732 *p++= DEFAULT_PAT_MOD;
4733 }
4734 if (has_charset) {
4735 STRLEN len;
4736 const char* const name = get_regex_charset_name(r->extflags, &len);
4737 Copy(name, p, len, char);
4738 p += len;
4739 }
4740 if (has_p)
4741 *p++ = KEEPCOPY_PAT_MOD; /*'p'*/
4742 {
4743 char ch;
4744 while((ch = *fptr++)) {
4745 if(reganch & 1)
4746 *p++ = ch;
4747 reganch >>= 1;
4748 }
4749 }
4750
4751 *p++ = ':';
4752 Copy(RExC_precomp, p, plen, char);
4753 assert ((RX_WRAPPED(rx) - p) < 16);
4754 r->pre_prefix = p - RX_WRAPPED(rx);
4755 p += plen;
4756 if (has_runon)
4757 *p++ = '\n';
4758 *p++ = ')';
4759 *p = 0;
4760 SvCUR_set(rx, p - SvPVX_const(rx));
4761 }
4762
4763 r->intflags = 0;
4764 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
4765
4766 if (RExC_seen & REG_SEEN_RECURSE) {
4767 Newxz(RExC_open_parens, RExC_npar,regnode *);
4768 SAVEFREEPV(RExC_open_parens);
4769 Newxz(RExC_close_parens,RExC_npar,regnode *);
4770 SAVEFREEPV(RExC_close_parens);
4771 }
4772
4773 /* Useful during FAIL. */
4774#ifdef RE_TRACK_PATTERN_OFFSETS
4775 Newxz(ri->u.offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
4776 DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
4777 "%s %"UVuf" bytes for offset annotations.\n",
4778 ri->u.offsets ? "Got" : "Couldn't get",
4779 (UV)((2*RExC_size+1) * sizeof(U32))));
4780#endif
4781 SetProgLen(ri,RExC_size);
4782 RExC_rx_sv = rx;
4783 RExC_rx = r;
4784 RExC_rxi = ri;
4785
4786 /* Second pass: emit code. */
4787 RExC_flags = pm_flags; /* don't let top level (?i) bleed */
4788 RExC_parse = exp;
4789 RExC_end = xend;
4790 RExC_naughty = 0;
4791 RExC_npar = 1;
4792 RExC_emit_start = ri->program;
4793 RExC_emit = ri->program;
4794 RExC_emit_bound = ri->program + RExC_size + 1;
4795
4796 /* Store the count of eval-groups for security checks: */
4797 RExC_rx->seen_evals = RExC_seen_evals;
4798 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
4799 if (reg(pRExC_state, 0, &flags,1) == NULL) {
4800 ReREFCNT_dec(rx);
4801 return(NULL);
4802 }
4803 /* XXXX To minimize changes to RE engine we always allocate
4804 3-units-long substrs field. */
4805 Newx(r->substrs, 1, struct reg_substr_data);
4806 if (RExC_recurse_count) {
4807 Newxz(RExC_recurse,RExC_recurse_count,regnode *);
4808 SAVEFREEPV(RExC_recurse);
4809 }
4810
4811reStudy:
4812 r->minlen = minlen = sawlookahead = sawplus = sawopen = 0;
4813 Zero(r->substrs, 1, struct reg_substr_data);
4814
4815#ifdef TRIE_STUDY_OPT
4816 if (!restudied) {
4817 StructCopy(&zero_scan_data, &data, scan_data_t);
4818 copyRExC_state = RExC_state;
4819 } else {
4820 U32 seen=RExC_seen;
4821 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
4822
4823 RExC_state = copyRExC_state;
4824 if (seen & REG_TOP_LEVEL_BRANCHES)
4825 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
4826 else
4827 RExC_seen &= ~REG_TOP_LEVEL_BRANCHES;
4828 if (data.last_found) {
4829 SvREFCNT_dec(data.longest_fixed);
4830 SvREFCNT_dec(data.longest_float);
4831 SvREFCNT_dec(data.last_found);
4832 }
4833 StructCopy(&zero_scan_data, &data, scan_data_t);
4834 }
4835#else
4836 StructCopy(&zero_scan_data, &data, scan_data_t);
4837#endif
4838
4839 /* Dig out information for optimizations. */
4840 r->extflags = RExC_flags; /* was pm_op */
4841 /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */
4842
4843 if (UTF)
4844 SvUTF8_on(rx); /* Unicode in it? */
4845 ri->regstclass = NULL;
4846 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
4847 r->intflags |= PREGf_NAUGHTY;
4848 scan = ri->program + 1; /* First BRANCH. */
4849
4850 /* testing for BRANCH here tells us whether there is "must appear"
4851 data in the pattern. If there is then we can use it for optimisations */
4852 if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES)) { /* Only one top-level choice. */
4853 I32 fake;
4854 STRLEN longest_float_length, longest_fixed_length;
4855 struct regnode_charclass_class ch_class; /* pointed to by data */
4856 int stclass_flag;
4857 I32 last_close = 0; /* pointed to by data */
4858 regnode *first= scan;
4859 regnode *first_next= regnext(first);
4860 /*
4861 * Skip introductions and multiplicators >= 1
4862 * so that we can extract the 'meat' of the pattern that must
4863 * match in the large if() sequence following.
4864 * NOTE that EXACT is NOT covered here, as it is normally
4865 * picked up by the optimiser separately.
4866 *
4867 * This is unfortunate as the optimiser isnt handling lookahead
4868 * properly currently.
4869 *
4870 */
4871 while ((OP(first) == OPEN && (sawopen = 1)) ||
4872 /* An OR of *one* alternative - should not happen now. */
4873 (OP(first) == BRANCH && OP(first_next) != BRANCH) ||
4874 /* for now we can't handle lookbehind IFMATCH*/
4875 (OP(first) == IFMATCH && !first->flags && (sawlookahead = 1)) ||
4876 (OP(first) == PLUS) ||
4877 (OP(first) == MINMOD) ||
4878 /* An {n,m} with n>0 */
4879 (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) ||
4880 (OP(first) == NOTHING && PL_regkind[OP(first_next)] != END ))
4881 {
4882 /*
4883 * the only op that could be a regnode is PLUS, all the rest
4884 * will be regnode_1 or regnode_2.
4885 *
4886 */
4887 if (OP(first) == PLUS)
4888 sawplus = 1;
4889 else
4890 first += regarglen[OP(first)];
4891
4892 first = NEXTOPER(first);
4893 first_next= regnext(first);
4894 }
4895
4896 /* Starting-point info. */
4897 again:
4898 DEBUG_PEEP("first:",first,0);
4899 /* Ignore EXACT as we deal with it later. */
4900 if (PL_regkind[OP(first)] == EXACT) {
4901 if (OP(first) == EXACT)
4902 NOOP; /* Empty, get anchored substr later. */
4903 else
4904 ri->regstclass = first;
4905 }
4906#ifdef TRIE_STCLASS
4907 else if (PL_regkind[OP(first)] == TRIE &&
4908 ((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0)
4909 {
4910 regnode *trie_op;
4911 /* this can happen only on restudy */
4912 if ( OP(first) == TRIE ) {
4913 struct regnode_1 *trieop = (struct regnode_1 *)
4914 PerlMemShared_calloc(1, sizeof(struct regnode_1));
4915 StructCopy(first,trieop,struct regnode_1);
4916 trie_op=(regnode *)trieop;
4917 } else {
4918 struct regnode_charclass *trieop = (struct regnode_charclass *)
4919 PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
4920 StructCopy(first,trieop,struct regnode_charclass);
4921 trie_op=(regnode *)trieop;
4922 }
4923 OP(trie_op)+=2;
4924 make_trie_failtable(pRExC_state, (regnode *)first, trie_op, 0);
4925 ri->regstclass = trie_op;
4926 }
4927#endif
4928 else if (REGNODE_SIMPLE(OP(first)))
4929 ri->regstclass = first;
4930 else if (PL_regkind[OP(first)] == BOUND ||
4931 PL_regkind[OP(first)] == NBOUND)
4932 ri->regstclass = first;
4933 else if (PL_regkind[OP(first)] == BOL) {
4934 r->extflags |= (OP(first) == MBOL
4935 ? RXf_ANCH_MBOL
4936 : (OP(first) == SBOL
4937 ? RXf_ANCH_SBOL
4938 : RXf_ANCH_BOL));
4939 first = NEXTOPER(first);
4940 goto again;
4941 }
4942 else if (OP(first) == GPOS) {
4943 r->extflags |= RXf_ANCH_GPOS;
4944 first = NEXTOPER(first);
4945 goto again;
4946 }
4947 else if ((!sawopen || !RExC_sawback) &&
4948 (OP(first) == STAR &&
4949 PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
4950 !(r->extflags & RXf_ANCH) && !(RExC_seen & REG_SEEN_EVAL))
4951 {
4952 /* turn .* into ^.* with an implied $*=1 */
4953 const int type =
4954 (OP(NEXTOPER(first)) == REG_ANY)
4955 ? RXf_ANCH_MBOL
4956 : RXf_ANCH_SBOL;
4957 r->extflags |= type;
4958 r->intflags |= PREGf_IMPLICIT;
4959 first = NEXTOPER(first);
4960 goto again;
4961 }
4962 if (sawplus && !sawlookahead && (!sawopen || !RExC_sawback)
4963 && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
4964 /* x+ must match at the 1st pos of run of x's */
4965 r->intflags |= PREGf_SKIP;
4966
4967 /* Scan is after the zeroth branch, first is atomic matcher. */
4968#ifdef TRIE_STUDY_OPT
4969 DEBUG_PARSE_r(
4970 if (!restudied)
4971 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
4972 (IV)(first - scan + 1))
4973 );
4974#else
4975 DEBUG_PARSE_r(
4976 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
4977 (IV)(first - scan + 1))
4978 );
4979#endif
4980
4981
4982 /*
4983 * If there's something expensive in the r.e., find the
4984 * longest literal string that must appear and make it the
4985 * regmust. Resolve ties in favor of later strings, since
4986 * the regstart check works with the beginning of the r.e.
4987 * and avoiding duplication strengthens checking. Not a
4988 * strong reason, but sufficient in the absence of others.
4989 * [Now we resolve ties in favor of the earlier string if
4990 * it happens that c_offset_min has been invalidated, since the
4991 * earlier string may buy us something the later one won't.]
4992 */
4993
4994 data.longest_fixed = newSVpvs("");
4995 data.longest_float = newSVpvs("");
4996 data.last_found = newSVpvs("");
4997 data.longest = &(data.longest_fixed);
4998 first = scan;
4999 if (!ri->regstclass) {
5000 cl_init(pRExC_state, &ch_class);
5001 data.start_class = &ch_class;
5002 stclass_flag = SCF_DO_STCLASS_AND;
5003 } else /* XXXX Check for BOUND? */
5004 stclass_flag = 0;
5005 data.last_closep = &last_close;
5006
5007 minlen = study_chunk(pRExC_state, &first, &minlen, &fake, scan + RExC_size, /* Up to end */
5008 &data, -1, NULL, NULL,
5009 SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
5010
5011
5012 CHECK_RESTUDY_GOTO;
5013
5014
5015 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
5016 && data.last_start_min == 0 && data.last_end > 0
5017 && !RExC_seen_zerolen
5018 && !(RExC_seen & REG_SEEN_VERBARG)
5019 && (!(RExC_seen & REG_SEEN_GPOS) || (r->extflags & RXf_ANCH_GPOS)))
5020 r->extflags |= RXf_CHECK_ALL;
5021 scan_commit(pRExC_state, &data,&minlen,0);
5022 SvREFCNT_dec(data.last_found);
5023
5024 /* Note that code very similar to this but for anchored string
5025 follows immediately below, changes may need to be made to both.
5026 Be careful.
5027 */
5028 longest_float_length = CHR_SVLEN(data.longest_float);
5029 if (longest_float_length
5030 || (data.flags & SF_FL_BEFORE_EOL
5031 && (!(data.flags & SF_FL_BEFORE_MEOL)
5032 || (RExC_flags & RXf_PMf_MULTILINE))))
5033 {
5034 I32 t,ml;
5035
5036 if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
5037 && data.offset_fixed == data.offset_float_min
5038 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
5039 goto remove_float; /* As in (a)+. */
5040
5041 /* copy the information about the longest float from the reg_scan_data
5042 over to the program. */
5043 if (SvUTF8(data.longest_float)) {
5044 r->float_utf8 = data.longest_float;
5045 r->float_substr = NULL;
5046 } else {
5047 r->float_substr = data.longest_float;
5048 r->float_utf8 = NULL;
5049 }
5050 /* float_end_shift is how many chars that must be matched that
5051 follow this item. We calculate it ahead of time as once the
5052 lookbehind offset is added in we lose the ability to correctly
5053 calculate it.*/
5054 ml = data.minlen_float ? *(data.minlen_float)
5055 : (I32)longest_float_length;
5056 r->float_end_shift = ml - data.offset_float_min
5057 - longest_float_length + (SvTAIL(data.longest_float) != 0)
5058 + data.lookbehind_float;
5059 r->float_min_offset = data.offset_float_min - data.lookbehind_float;
5060 r->float_max_offset = data.offset_float_max;
5061 if (data.offset_float_max < I32_MAX) /* Don't offset infinity */
5062 r->float_max_offset -= data.lookbehind_float;
5063
5064 t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
5065 && (!(data.flags & SF_FL_BEFORE_MEOL)
5066 || (RExC_flags & RXf_PMf_MULTILINE)));
5067 fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
5068 }
5069 else {
5070 remove_float:
5071 r->float_substr = r->float_utf8 = NULL;
5072 SvREFCNT_dec(data.longest_float);
5073 longest_float_length = 0;
5074 }
5075
5076 /* Note that code very similar to this but for floating string
5077 is immediately above, changes may need to be made to both.
5078 Be careful.
5079 */
5080 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
5081 if (longest_fixed_length
5082 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
5083 && (!(data.flags & SF_FIX_BEFORE_MEOL)
5084 || (RExC_flags & RXf_PMf_MULTILINE))))
5085 {
5086 I32 t,ml;
5087
5088 /* copy the information about the longest fixed
5089 from the reg_scan_data over to the program. */
5090 if (SvUTF8(data.longest_fixed)) {
5091 r->anchored_utf8 = data.longest_fixed;
5092 r->anchored_substr = NULL;
5093 } else {
5094 r->anchored_substr = data.longest_fixed;
5095 r->anchored_utf8 = NULL;
5096 }
5097 /* fixed_end_shift is how many chars that must be matched that
5098 follow this item. We calculate it ahead of time as once the
5099 lookbehind offset is added in we lose the ability to correctly
5100 calculate it.*/
5101 ml = data.minlen_fixed ? *(data.minlen_fixed)
5102 : (I32)longest_fixed_length;
5103 r->anchored_end_shift = ml - data.offset_fixed
5104 - longest_fixed_length + (SvTAIL(data.longest_fixed) != 0)
5105 + data.lookbehind_fixed;
5106 r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
5107
5108 t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
5109 && (!(data.flags & SF_FIX_BEFORE_MEOL)
5110 || (RExC_flags & RXf_PMf_MULTILINE)));
5111 fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
5112 }
5113 else {
5114 r->anchored_substr = r->anchored_utf8 = NULL;
5115 SvREFCNT_dec(data.longest_fixed);
5116 longest_fixed_length = 0;
5117 }
5118 if (ri->regstclass
5119 && (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY))
5120 ri->regstclass = NULL;
5121
5122 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
5123 && stclass_flag
5124 && !(data.start_class->flags & ANYOF_EOS)
5125 && !cl_is_anything(data.start_class))
5126 {
5127 const U32 n = add_data(pRExC_state, 1, "f");
5128 data.start_class->flags |= ANYOF_IS_SYNTHETIC;
5129
5130 Newx(RExC_rxi->data->data[n], 1,
5131 struct regnode_charclass_class);
5132 StructCopy(data.start_class,
5133 (struct regnode_charclass_class*)RExC_rxi->data->data[n],
5134 struct regnode_charclass_class);
5135 ri->regstclass = (regnode*)RExC_rxi->data->data[n];
5136 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
5137 DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
5138 regprop(r, sv, (regnode*)data.start_class);
5139 PerlIO_printf(Perl_debug_log,
5140 "synthetic stclass \"%s\".\n",
5141 SvPVX_const(sv));});
5142 }
5143
5144 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
5145 if (longest_fixed_length > longest_float_length) {
5146 r->check_end_shift = r->anchored_end_shift;
5147 r->check_substr = r->anchored_substr;
5148 r->check_utf8 = r->anchored_utf8;
5149 r->check_offset_min = r->check_offset_max = r->anchored_offset;
5150 if (r->extflags & RXf_ANCH_SINGLE)
5151 r->extflags |= RXf_NOSCAN;
5152 }
5153 else {
5154 r->check_end_shift = r->float_end_shift;
5155 r->check_substr = r->float_substr;
5156 r->check_utf8 = r->float_utf8;
5157 r->check_offset_min = r->float_min_offset;
5158 r->check_offset_max = r->float_max_offset;
5159 }
5160 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
5161 This should be changed ASAP! */
5162 if ((r->check_substr || r->check_utf8) && !(r->extflags & RXf_ANCH_GPOS)) {
5163 r->extflags |= RXf_USE_INTUIT;
5164 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
5165 r->extflags |= RXf_INTUIT_TAIL;
5166 }
5167 /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
5168 if ( (STRLEN)minlen < longest_float_length )
5169 minlen= longest_float_length;
5170 if ( (STRLEN)minlen < longest_fixed_length )
5171 minlen= longest_fixed_length;
5172 */
5173 }
5174 else {
5175 /* Several toplevels. Best we can is to set minlen. */
5176 I32 fake;
5177 struct regnode_charclass_class ch_class;
5178 I32 last_close = 0;
5179
5180 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n"));
5181
5182 scan = ri->program + 1;
5183 cl_init(pRExC_state, &ch_class);
5184 data.start_class = &ch_class;
5185 data.last_closep = &last_close;
5186
5187
5188 minlen = study_chunk(pRExC_state, &scan, &minlen, &fake, scan + RExC_size,
5189 &data, -1, NULL, NULL, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
5190
5191 CHECK_RESTUDY_GOTO;
5192
5193 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
5194 = r->float_substr = r->float_utf8 = NULL;
5195
5196 if (!(data.start_class->flags & ANYOF_EOS)
5197 && !cl_is_anything(data.start_class))
5198 {
5199 const U32 n = add_data(pRExC_state, 1, "f");
5200 data.start_class->flags |= ANYOF_IS_SYNTHETIC;
5201
5202 Newx(RExC_rxi->data->data[n], 1,
5203 struct regnode_charclass_class);
5204 StructCopy(data.start_class,
5205 (struct regnode_charclass_class*)RExC_rxi->data->data[n],
5206 struct regnode_charclass_class);
5207 ri->regstclass = (regnode*)RExC_rxi->data->data[n];
5208 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
5209 DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
5210 regprop(r, sv, (regnode*)data.start_class);
5211 PerlIO_printf(Perl_debug_log,
5212 "synthetic stclass \"%s\".\n",
5213 SvPVX_const(sv));});
5214 }
5215 }
5216
5217 /* Guard against an embedded (?=) or (?<=) with a longer minlen than
5218 the "real" pattern. */
5219 DEBUG_OPTIMISE_r({
5220 PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf"\n",
5221 (IV)minlen, (IV)r->minlen);
5222 });
5223 r->minlenret = minlen;
5224 if (r->minlen < minlen)
5225 r->minlen = minlen;
5226
5227 if (RExC_seen & REG_SEEN_GPOS)
5228 r->extflags |= RXf_GPOS_SEEN;
5229 if (RExC_seen & REG_SEEN_LOOKBEHIND)
5230 r->extflags |= RXf_LOOKBEHIND_SEEN;
5231 if (RExC_seen & REG_SEEN_EVAL)
5232 r->extflags |= RXf_EVAL_SEEN;
5233 if (RExC_seen & REG_SEEN_CANY)
5234 r->extflags |= RXf_CANY_SEEN;
5235 if (RExC_seen & REG_SEEN_VERBARG)
5236 r->intflags |= PREGf_VERBARG_SEEN;
5237 if (RExC_seen & REG_SEEN_CUTGROUP)
5238 r->intflags |= PREGf_CUTGROUP_SEEN;
5239 if (RExC_paren_names)
5240 RXp_PAREN_NAMES(r) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names));
5241 else
5242 RXp_PAREN_NAMES(r) = NULL;
5243
5244#ifdef STUPID_PATTERN_CHECKS
5245 if (RX_PRELEN(rx) == 0)
5246 r->extflags |= RXf_NULL;
5247 if (r->extflags & RXf_SPLIT && RX_PRELEN(rx) == 1 && RX_PRECOMP(rx)[0] == ' ')
5248 /* XXX: this should happen BEFORE we compile */
5249 r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
5250 else if (RX_PRELEN(rx) == 3 && memEQ("\\s+", RX_PRECOMP(rx), 3))
5251 r->extflags |= RXf_WHITE;
5252 else if (RX_PRELEN(rx) == 1 && RXp_PRECOMP(rx)[0] == '^')
5253 r->extflags |= RXf_START_ONLY;
5254#else
5255 if (r->extflags & RXf_SPLIT && RX_PRELEN(rx) == 1 && RX_PRECOMP(rx)[0] == ' ')
5256 /* XXX: this should happen BEFORE we compile */
5257 r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
5258 else {
5259 regnode *first = ri->program + 1;
5260 U8 fop = OP(first);
5261
5262 if (PL_regkind[fop] == NOTHING && OP(NEXTOPER(first)) == END)
5263 r->extflags |= RXf_NULL;
5264 else if (PL_regkind[fop] == BOL && OP(NEXTOPER(first)) == END)
5265 r->extflags |= RXf_START_ONLY;
5266 else if (fop == PLUS && OP(NEXTOPER(first)) == SPACE
5267 && OP(regnext(first)) == END)
5268 r->extflags |= RXf_WHITE;
5269 }
5270#endif
5271#ifdef DEBUGGING
5272 if (RExC_paren_names) {
5273 ri->name_list_idx = add_data( pRExC_state, 1, "a" );
5274 ri->data->data[ri->name_list_idx] = (void*)SvREFCNT_inc(RExC_paren_name_list);
5275 } else
5276#endif
5277 ri->name_list_idx = 0;
5278
5279 if (RExC_recurse_count) {
5280 for ( ; RExC_recurse_count ; RExC_recurse_count-- ) {
5281 const regnode *scan = RExC_recurse[RExC_recurse_count-1];
5282 ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan );
5283 }
5284 }
5285 Newxz(r->offs, RExC_npar, regexp_paren_pair);
5286 /* assume we don't need to swap parens around before we match */
5287
5288 DEBUG_DUMP_r({
5289 PerlIO_printf(Perl_debug_log,"Final program:\n");
5290 regdump(r);
5291 });
5292#ifdef RE_TRACK_PATTERN_OFFSETS
5293 DEBUG_OFFSETS_r(if (ri->u.offsets) {
5294 const U32 len = ri->u.offsets[0];
5295 U32 i;
5296 GET_RE_DEBUG_FLAGS_DECL;
5297 PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)ri->u.offsets[0]);
5298 for (i = 1; i <= len; i++) {
5299 if (ri->u.offsets[i*2-1] || ri->u.offsets[i*2])
5300 PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
5301 (UV)i, (UV)ri->u.offsets[i*2-1], (UV)ri->u.offsets[i*2]);
5302 }
5303 PerlIO_printf(Perl_debug_log, "\n");
5304 });
5305#endif
5306 return rx;
5307}
5308
5309#undef RE_ENGINE_PTR
5310
5311
5312SV*
5313Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value,
5314 const U32 flags)
5315{
5316 PERL_ARGS_ASSERT_REG_NAMED_BUFF;
5317
5318 PERL_UNUSED_ARG(value);
5319
5320 if (flags & RXapif_FETCH) {
5321 return reg_named_buff_fetch(rx, key, flags);
5322 } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) {
5323 Perl_croak_no_modify(aTHX);
5324 return NULL;
5325 } else if (flags & RXapif_EXISTS) {
5326 return reg_named_buff_exists(rx, key, flags)
5327 ? &PL_sv_yes
5328 : &PL_sv_no;
5329 } else if (flags & RXapif_REGNAMES) {
5330 return reg_named_buff_all(rx, flags);
5331 } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) {
5332 return reg_named_buff_scalar(rx, flags);
5333 } else {
5334 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags);
5335 return NULL;
5336 }
5337}
5338
5339SV*
5340Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey,
5341 const U32 flags)
5342{
5343 PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER;
5344 PERL_UNUSED_ARG(lastkey);
5345
5346 if (flags & RXapif_FIRSTKEY)
5347 return reg_named_buff_firstkey(rx, flags);
5348 else if (flags & RXapif_NEXTKEY)
5349 return reg_named_buff_nextkey(rx, flags);
5350 else {
5351 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter", (int)flags);
5352 return NULL;
5353 }
5354}
5355
5356SV*
5357Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv,
5358 const U32 flags)
5359{
5360 AV *retarray = NULL;
5361 SV *ret;
5362 struct regexp *const rx = (struct regexp *)SvANY(r);
5363
5364 PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH;
5365
5366 if (flags & RXapif_ALL)
5367 retarray=newAV();
5368
5369 if (rx && RXp_PAREN_NAMES(rx)) {
5370 HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 );
5371 if (he_str) {
5372 IV i;
5373 SV* sv_dat=HeVAL(he_str);
5374 I32 *nums=(I32*)SvPVX(sv_dat);
5375 for ( i=0; i<SvIVX(sv_dat); i++ ) {
5376 if ((I32)(rx->nparens) >= nums[i]
5377 && rx->offs[nums[i]].start != -1
5378 && rx->offs[nums[i]].end != -1)
5379 {
5380 ret = newSVpvs("");
5381 CALLREG_NUMBUF_FETCH(r,nums[i],ret);
5382 if (!retarray)
5383 return ret;
5384 } else {
5385 ret = newSVsv(&PL_sv_undef);
5386 }
5387 if (retarray)
5388 av_push(retarray, ret);
5389 }
5390 if (retarray)
5391 return newRV_noinc(MUTABLE_SV(retarray));
5392 }
5393 }
5394 return NULL;
5395}
5396
5397bool
5398Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key,
5399 const U32 flags)
5400{
5401 struct regexp *const rx = (struct regexp *)SvANY(r);
5402
5403 PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS;
5404
5405 if (rx && RXp_PAREN_NAMES(rx)) {
5406 if (flags & RXapif_ALL) {
5407 return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0);
5408 } else {
5409 SV *sv = CALLREG_NAMED_BUFF_FETCH(r, key, flags);
5410 if (sv) {
5411 SvREFCNT_dec(sv);
5412 return TRUE;
5413 } else {
5414 return FALSE;
5415 }
5416 }
5417 } else {
5418 return FALSE;
5419 }
5420}
5421
5422SV*
5423Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags)
5424{
5425 struct regexp *const rx = (struct regexp *)SvANY(r);
5426
5427 PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY;
5428
5429 if ( rx && RXp_PAREN_NAMES(rx) ) {
5430 (void)hv_iterinit(RXp_PAREN_NAMES(rx));
5431
5432 return CALLREG_NAMED_BUFF_NEXTKEY(r, NULL, flags & ~RXapif_FIRSTKEY);
5433 } else {
5434 return FALSE;
5435 }
5436}
5437
5438SV*
5439Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags)
5440{
5441 struct regexp *const rx = (struct regexp *)SvANY(r);
5442 GET_RE_DEBUG_FLAGS_DECL;
5443
5444 PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY;
5445
5446 if (rx && RXp_PAREN_NAMES(rx)) {
5447 HV *hv = RXp_PAREN_NAMES(rx);
5448 HE *temphe;
5449 while ( (temphe = hv_iternext_flags(hv,0)) ) {
5450 IV i;
5451 IV parno = 0;
5452 SV* sv_dat = HeVAL(temphe);
5453 I32 *nums = (I32*)SvPVX(sv_dat);
5454 for ( i = 0; i < SvIVX(sv_dat); i++ ) {
5455 if ((I32)(rx->lastparen) >= nums[i] &&
5456 rx->offs[nums[i]].start != -1 &&
5457 rx->offs[nums[i]].end != -1)
5458 {
5459 parno = nums[i];
5460 break;
5461 }
5462 }
5463 if (parno || flags & RXapif_ALL) {
5464 return newSVhek(HeKEY_hek(temphe));
5465 }
5466 }
5467 }
5468 return NULL;
5469}
5470
5471SV*
5472Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags)
5473{
5474 SV *ret;
5475 AV *av;
5476 I32 length;
5477 struct regexp *const rx = (struct regexp *)SvANY(r);
5478
5479 PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR;
5480
5481 if (rx && RXp_PAREN_NAMES(rx)) {
5482 if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) {
5483 return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx)));
5484 } else if (flags & RXapif_ONE) {
5485 ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES));
5486 av = MUTABLE_AV(SvRV(ret));
5487 length = av_len(av);
5488 SvREFCNT_dec(ret);
5489 return newSViv(length + 1);
5490 } else {
5491 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar", (int)flags);
5492 return NULL;
5493 }
5494 }
5495 return &PL_sv_undef;
5496}
5497
5498SV*
5499Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags)
5500{
5501 struct regexp *const rx = (struct regexp *)SvANY(r);
5502 AV *av = newAV();
5503
5504 PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL;
5505
5506 if (rx && RXp_PAREN_NAMES(rx)) {
5507 HV *hv= RXp_PAREN_NAMES(rx);
5508 HE *temphe;
5509 (void)hv_iterinit(hv);
5510 while ( (temphe = hv_iternext_flags(hv,0)) ) {
5511 IV i;
5512 IV parno = 0;
5513 SV* sv_dat = HeVAL(temphe);
5514 I32 *nums = (I32*)SvPVX(sv_dat);
5515 for ( i = 0; i < SvIVX(sv_dat); i++ ) {
5516 if ((I32)(rx->lastparen) >= nums[i] &&
5517 rx->offs[nums[i]].start != -1 &&
5518 rx->offs[nums[i]].end != -1)
5519 {
5520 parno = nums[i];
5521 break;
5522 }
5523 }
5524 if (parno || flags & RXapif_ALL) {
5525 av_push(av, newSVhek(HeKEY_hek(temphe)));
5526 }
5527 }
5528 }
5529
5530 return newRV_noinc(MUTABLE_SV(av));
5531}
5532
5533void
5534Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren,
5535 SV * const sv)
5536{
5537 struct regexp *const rx = (struct regexp *)SvANY(r);
5538 char *s = NULL;
5539 I32 i = 0;
5540 I32 s1, t1;
5541
5542 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH;
5543
5544 if (!rx->subbeg) {
5545 sv_setsv(sv,&PL_sv_undef);
5546 return;
5547 }
5548 else
5549 if (paren == RX_BUFF_IDX_PREMATCH && rx->offs[0].start != -1) {
5550 /* $` */
5551 i = rx->offs[0].start;
5552 s = rx->subbeg;
5553 }
5554 else
5555 if (paren == RX_BUFF_IDX_POSTMATCH && rx->offs[0].end != -1) {
5556 /* $' */
5557 s = rx->subbeg + rx->offs[0].end;
5558 i = rx->sublen - rx->offs[0].end;
5559 }
5560 else
5561 if ( 0 <= paren && paren <= (I32)rx->nparens &&
5562 (s1 = rx->offs[paren].start) != -1 &&
5563 (t1 = rx->offs[paren].end) != -1)
5564 {
5565 /* $& $1 ... */
5566 i = t1 - s1;
5567 s = rx->subbeg + s1;
5568 } else {
5569 sv_setsv(sv,&PL_sv_undef);
5570 return;
5571 }
5572 assert(rx->sublen >= (s - rx->subbeg) + i );
5573 if (i >= 0) {
5574 const int oldtainted = PL_tainted;
5575 TAINT_NOT;
5576 sv_setpvn(sv, s, i);
5577 PL_tainted = oldtainted;
5578 if ( (rx->extflags & RXf_CANY_SEEN)
5579 ? (RXp_MATCH_UTF8(rx)
5580 && (!i || is_utf8_string((U8*)s, i)))
5581 : (RXp_MATCH_UTF8(rx)) )
5582 {
5583 SvUTF8_on(sv);
5584 }
5585 else
5586 SvUTF8_off(sv);
5587 if (PL_tainting) {
5588 if (RXp_MATCH_TAINTED(rx)) {
5589 if (SvTYPE(sv) >= SVt_PVMG) {
5590 MAGIC* const mg = SvMAGIC(sv);
5591 MAGIC* mgt;
5592 PL_tainted = 1;
5593 SvMAGIC_set(sv, mg->mg_moremagic);
5594 SvTAINT(sv);
5595 if ((mgt = SvMAGIC(sv))) {
5596 mg->mg_moremagic = mgt;
5597 SvMAGIC_set(sv, mg);
5598 }
5599 } else {
5600 PL_tainted = 1;
5601 SvTAINT(sv);
5602 }
5603 } else
5604 SvTAINTED_off(sv);
5605 }
5606 } else {
5607 sv_setsv(sv,&PL_sv_undef);
5608 return;
5609 }
5610}
5611
5612void
5613Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren,
5614 SV const * const value)
5615{
5616 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE;
5617
5618 PERL_UNUSED_ARG(rx);
5619 PERL_UNUSED_ARG(paren);
5620 PERL_UNUSED_ARG(value);
5621
5622 if (!PL_localizing)
5623 Perl_croak_no_modify(aTHX);
5624}
5625
5626I32
5627Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv,
5628 const I32 paren)
5629{
5630 struct regexp *const rx = (struct regexp *)SvANY(r);
5631 I32 i;
5632 I32 s1, t1;
5633
5634 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH;
5635
5636 /* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */
5637 switch (paren) {
5638 /* $` / ${^PREMATCH} */
5639 case RX_BUFF_IDX_PREMATCH:
5640 if (rx->offs[0].start != -1) {
5641 i = rx->offs[0].start;
5642 if (i > 0) {
5643 s1 = 0;
5644 t1 = i;
5645 goto getlen;
5646 }
5647 }
5648 return 0;
5649 /* $' / ${^POSTMATCH} */
5650 case RX_BUFF_IDX_POSTMATCH:
5651 if (rx->offs[0].end != -1) {
5652 i = rx->sublen - rx->offs[0].end;
5653 if (i > 0) {
5654 s1 = rx->offs[0].end;
5655 t1 = rx->sublen;
5656 goto getlen;
5657 }
5658 }
5659 return 0;
5660 /* $& / ${^MATCH}, $1, $2, ... */
5661 default:
5662 if (paren <= (I32)rx->nparens &&
5663 (s1 = rx->offs[paren].start) != -1 &&
5664 (t1 = rx->offs[paren].end) != -1)
5665 {
5666 i = t1 - s1;
5667 goto getlen;
5668 } else {
5669 if (ckWARN(WARN_UNINITIALIZED))
5670 report_uninit((const SV *)sv);
5671 return 0;
5672 }
5673 }
5674 getlen:
5675 if (i > 0 && RXp_MATCH_UTF8(rx)) {
5676 const char * const s = rx->subbeg + s1;
5677 const U8 *ep;
5678 STRLEN el;
5679
5680 i = t1 - s1;
5681 if (is_utf8_string_loclen((U8*)s, i, &ep, &el))
5682 i = el;
5683 }
5684 return i;
5685}
5686
5687SV*
5688Perl_reg_qr_package(pTHX_ REGEXP * const rx)
5689{
5690 PERL_ARGS_ASSERT_REG_QR_PACKAGE;
5691 PERL_UNUSED_ARG(rx);
5692 if (0)
5693 return NULL;
5694 else
5695 return newSVpvs("Regexp");
5696}
5697
5698/* Scans the name of a named buffer from the pattern.
5699 * If flags is REG_RSN_RETURN_NULL returns null.
5700 * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
5701 * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
5702 * to the parsed name as looked up in the RExC_paren_names hash.
5703 * If there is an error throws a vFAIL().. type exception.
5704 */
5705
5706#define REG_RSN_RETURN_NULL 0
5707#define REG_RSN_RETURN_NAME 1
5708#define REG_RSN_RETURN_DATA 2
5709
5710STATIC SV*
5711S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
5712{
5713 char *name_start = RExC_parse;
5714
5715 PERL_ARGS_ASSERT_REG_SCAN_NAME;
5716
5717 if (isIDFIRST_lazy_if(RExC_parse, UTF)) {
5718 /* skip IDFIRST by using do...while */
5719 if (UTF)
5720 do {
5721 RExC_parse += UTF8SKIP(RExC_parse);
5722 } while (isALNUM_utf8((U8*)RExC_parse));
5723 else
5724 do {
5725 RExC_parse++;
5726 } while (isALNUM(*RExC_parse));
5727 }
5728
5729 if ( flags ) {
5730 SV* sv_name
5731 = newSVpvn_flags(name_start, (int)(RExC_parse - name_start),
5732 SVs_TEMP | (UTF ? SVf_UTF8 : 0));
5733 if ( flags == REG_RSN_RETURN_NAME)
5734 return sv_name;
5735 else if (flags==REG_RSN_RETURN_DATA) {
5736 HE *he_str = NULL;
5737 SV *sv_dat = NULL;
5738 if ( ! sv_name ) /* should not happen*/
5739 Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
5740 if (RExC_paren_names)
5741 he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
5742 if ( he_str )
5743 sv_dat = HeVAL(he_str);
5744 if ( ! sv_dat )
5745 vFAIL("Reference to nonexistent named group");
5746 return sv_dat;
5747 }
5748 else {
5749 Perl_croak(aTHX_ "panic: bad flag in reg_scan_name");
5750 }
5751 /* NOT REACHED */
5752 }
5753 return NULL;
5754}
5755
5756#define DEBUG_PARSE_MSG(funcname) DEBUG_PARSE_r({ \
5757 int rem=(int)(RExC_end - RExC_parse); \
5758 int cut; \
5759 int num; \
5760 int iscut=0; \
5761 if (rem>10) { \
5762 rem=10; \
5763 iscut=1; \
5764 } \
5765 cut=10-rem; \
5766 if (RExC_lastparse!=RExC_parse) \
5767 PerlIO_printf(Perl_debug_log," >%.*s%-*s", \
5768 rem, RExC_parse, \
5769 cut + 4, \
5770 iscut ? "..." : "<" \
5771 ); \
5772 else \
5773 PerlIO_printf(Perl_debug_log,"%16s",""); \
5774 \
5775 if (SIZE_ONLY) \
5776 num = RExC_size + 1; \
5777 else \
5778 num=REG_NODE_NUM(RExC_emit); \
5779 if (RExC_lastnum!=num) \
5780 PerlIO_printf(Perl_debug_log,"|%4d",num); \
5781 else \
5782 PerlIO_printf(Perl_debug_log,"|%4s",""); \
5783 PerlIO_printf(Perl_debug_log,"|%*s%-4s", \
5784 (int)((depth*2)), "", \
5785 (funcname) \
5786 ); \
5787 RExC_lastnum=num; \
5788 RExC_lastparse=RExC_parse; \
5789})
5790
5791
5792
5793#define DEBUG_PARSE(funcname) DEBUG_PARSE_r({ \
5794 DEBUG_PARSE_MSG((funcname)); \
5795 PerlIO_printf(Perl_debug_log,"%4s","\n"); \
5796})
5797#define DEBUG_PARSE_FMT(funcname,fmt,args) DEBUG_PARSE_r({ \
5798 DEBUG_PARSE_MSG((funcname)); \
5799 PerlIO_printf(Perl_debug_log,fmt "\n",args); \
5800})
5801
5802/* This section of code defines the inversion list object and its methods. The
5803 * interfaces are highly subject to change, so as much as possible is static to
5804 * this file. An inversion list is here implemented as a malloc'd C array with
5805 * some added info. More will be coming when functionality is added later.
5806 *
5807 * Some of the methods should always be private to the implementation, and some
5808 * should eventually be made public */
5809
5810#define INVLIST_INITIAL_LEN 10
5811#define INVLIST_ARRAY_KEY "array"
5812#define INVLIST_MAX_KEY "max"
5813#define INVLIST_LEN_KEY "len"
5814
5815PERL_STATIC_INLINE UV*
5816S_invlist_array(pTHX_ HV* const invlist)
5817{
5818 /* Returns the pointer to the inversion list's array. Every time the
5819 * length changes, this needs to be called in case malloc or realloc moved
5820 * it */
5821
5822 SV** list_ptr = hv_fetchs(invlist, INVLIST_ARRAY_KEY, FALSE);
5823
5824 PERL_ARGS_ASSERT_INVLIST_ARRAY;
5825
5826 if (list_ptr == NULL) {
5827 Perl_croak(aTHX_ "panic: inversion list without a '%s' element",
5828 INVLIST_ARRAY_KEY);
5829 }
5830
5831 return INT2PTR(UV *, SvUV(*list_ptr));
5832}
5833
5834PERL_STATIC_INLINE void
5835S_invlist_set_array(pTHX_ HV* const invlist, const UV* const array)
5836{
5837 PERL_ARGS_ASSERT_INVLIST_SET_ARRAY;
5838
5839 /* Sets the array stored in the inversion list to the memory beginning with
5840 * the parameter */
5841
5842 if (hv_stores(invlist, INVLIST_ARRAY_KEY, newSVuv(PTR2UV(array))) == NULL) {
5843 Perl_croak(aTHX_ "panic: can't store '%s' entry in inversion list",
5844 INVLIST_ARRAY_KEY);
5845 }
5846}
5847
5848PERL_STATIC_INLINE UV
5849S_invlist_len(pTHX_ HV* const invlist)
5850{
5851 /* Returns the current number of elements in the inversion list's array */
5852
5853 SV** len_ptr = hv_fetchs(invlist, INVLIST_LEN_KEY, FALSE);
5854
5855 PERL_ARGS_ASSERT_INVLIST_LEN;
5856
5857 if (len_ptr == NULL) {
5858 Perl_croak(aTHX_ "panic: inversion list without a '%s' element",
5859 INVLIST_LEN_KEY);
5860 }
5861
5862 return SvUV(*len_ptr);
5863}
5864
5865PERL_STATIC_INLINE UV
5866S_invlist_max(pTHX_ HV* const invlist)
5867{
5868 /* Returns the maximum number of elements storable in the inversion list's
5869 * array, without having to realloc() */
5870
5871 SV** max_ptr = hv_fetchs(invlist, INVLIST_MAX_KEY, FALSE);
5872
5873 PERL_ARGS_ASSERT_INVLIST_MAX;
5874
5875 if (max_ptr == NULL) {
5876 Perl_croak(aTHX_ "panic: inversion list without a '%s' element",
5877 INVLIST_MAX_KEY);
5878 }
5879
5880 return SvUV(*max_ptr);
5881}
5882
5883PERL_STATIC_INLINE void
5884S_invlist_set_len(pTHX_ HV* const invlist, const UV len)
5885{
5886 /* Sets the current number of elements stored in the inversion list */
5887
5888 PERL_ARGS_ASSERT_INVLIST_SET_LEN;
5889
5890 if (len != 0 && len > invlist_max(invlist)) {
5891 Perl_croak(aTHX_ "panic: Can't make '%s=%"UVuf"' more than %s=%"UVuf" in inversion list", INVLIST_LEN_KEY, len, INVLIST_MAX_KEY, invlist_max(invlist));
5892 }
5893
5894 if (hv_stores(invlist, INVLIST_LEN_KEY, newSVuv(len)) == NULL) {
5895 Perl_croak(aTHX_ "panic: can't store '%s' entry in inversion list",
5896 INVLIST_LEN_KEY);
5897 }
5898}
5899
5900PERL_STATIC_INLINE void
5901S_invlist_set_max(pTHX_ HV* const invlist, const UV max)
5902{
5903
5904 /* Sets the maximum number of elements storable in the inversion list
5905 * without having to realloc() */
5906
5907 PERL_ARGS_ASSERT_INVLIST_SET_MAX;
5908
5909 if (max < invlist_len(invlist)) {
5910 Perl_croak(aTHX_ "panic: Can't make '%s=%"UVuf"' less than %s=%"UVuf" in inversion list", INVLIST_MAX_KEY, invlist_len(invlist), INVLIST_LEN_KEY, invlist_max(invlist));
5911 }
5912
5913 if (hv_stores(invlist, INVLIST_MAX_KEY, newSVuv(max)) == NULL) {
5914 Perl_croak(aTHX_ "panic: can't store '%s' entry in inversion list",
5915 INVLIST_LEN_KEY);
5916 }
5917}
5918
5919#ifndef PERL_IN_XSUB_RE
5920HV*
5921Perl__new_invlist(pTHX_ IV initial_size)
5922{
5923
5924 /* Return a pointer to a newly constructed inversion list, with enough
5925 * space to store 'initial_size' elements. If that number is negative, a
5926 * system default is used instead */
5927
5928 HV* invlist = newHV();
5929 UV* list;
5930
5931 if (initial_size < 0) {
5932 initial_size = INVLIST_INITIAL_LEN;
5933 }
5934
5935 /* Allocate the initial space */
5936 Newx(list, initial_size, UV);
5937 invlist_set_array(invlist, list);
5938
5939 /* set_len has to come before set_max, as the latter inspects the len */
5940 invlist_set_len(invlist, 0);
5941 invlist_set_max(invlist, initial_size);
5942
5943 return invlist;
5944}
5945#endif
5946
5947PERL_STATIC_INLINE void
5948S_invlist_destroy(pTHX_ HV* const invlist)
5949{
5950 /* Inversion list destructor */
5951
5952 SV** list_ptr = hv_fetchs(invlist, INVLIST_ARRAY_KEY, FALSE);
5953
5954 PERL_ARGS_ASSERT_INVLIST_DESTROY;
5955
5956 if (list_ptr != NULL) {
5957 UV *list = INT2PTR(UV *, SvUV(*list_ptr)); /* PERL_POISON needs lvalue */
5958 Safefree(list);
5959 }
5960}
5961
5962STATIC void
5963S_invlist_extend(pTHX_ HV* const invlist, const UV new_max)
5964{
5965 /* Change the maximum size of an inversion list (up or down) */
5966
5967 UV* orig_array;
5968 UV* array;
5969 const UV old_max = invlist_max(invlist);
5970
5971 PERL_ARGS_ASSERT_INVLIST_EXTEND;
5972
5973 if (old_max == new_max) { /* If a no-op */
5974 return;
5975 }
5976
5977 array = orig_array = invlist_array(invlist);
5978 Renew(array, new_max, UV);
5979
5980 /* If the size change moved the list in memory, set the new one */
5981 if (array != orig_array) {
5982 invlist_set_array(invlist, array);
5983 }
5984
5985 invlist_set_max(invlist, new_max);
5986
5987}
5988
5989PERL_STATIC_INLINE void
5990S_invlist_trim(pTHX_ HV* const invlist)
5991{
5992 PERL_ARGS_ASSERT_INVLIST_TRIM;
5993
5994 /* Change the length of the inversion list to how many entries it currently
5995 * has */
5996
5997 invlist_extend(invlist, invlist_len(invlist));
5998}
5999
6000/* An element is in an inversion list iff its index is even numbered: 0, 2, 4,
6001 * etc */
6002
6003#define ELEMENT_IN_INVLIST_SET(i) (! ((i) & 1))
6004
6005#ifndef PERL_IN_XSUB_RE
6006void
6007Perl__append_range_to_invlist(pTHX_ HV* const invlist, const UV start, const UV end)
6008{
6009 /* Subject to change or removal. Append the range from 'start' to 'end' at
6010 * the end of the inversion list. The range must be above any existing
6011 * ones. */
6012
6013 UV* array = invlist_array(invlist);
6014 UV max = invlist_max(invlist);
6015 UV len = invlist_len(invlist);
6016
6017 PERL_ARGS_ASSERT__APPEND_RANGE_TO_INVLIST;
6018
6019 if (len > 0) {
6020
6021 /* Here, the existing list is non-empty. The current max entry in the
6022 * list is generally the first value not in the set, except when the
6023 * set extends to the end of permissible values, in which case it is
6024 * the first entry in that final set, and so this call is an attempt to
6025 * append out-of-order */
6026
6027 UV final_element = len - 1;
6028 if (array[final_element] > start
6029 || ELEMENT_IN_INVLIST_SET(final_element))
6030 {
6031 Perl_croak(aTHX_ "panic: attempting to append to an inversion list, but wasn't at the end of the list");
6032 }
6033
6034 /* Here, it is a legal append. If the new range begins with the first
6035 * value not in the set, it is extending the set, so the new first
6036 * value not in the set is one greater than the newly extended range.
6037 * */
6038 if (array[final_element] == start) {
6039 if (end != UV_MAX) {
6040 array[final_element] = end + 1;
6041 }
6042 else {
6043 /* But if the end is the maximum representable on the machine,
6044 * just let the range that this would extend have no end */
6045 invlist_set_len(invlist, len - 1);
6046 }
6047 return;
6048 }
6049 }
6050
6051 /* Here the new range doesn't extend any existing set. Add it */
6052
6053 len += 2; /* Includes an element each for the start and end of range */
6054
6055 /* If overflows the existing space, extend, which may cause the array to be
6056 * moved */
6057 if (max < len) {
6058 invlist_extend(invlist, len);
6059 array = invlist_array(invlist);
6060 }
6061
6062 invlist_set_len(invlist, len);
6063
6064 /* The next item on the list starts the range, the one after that is
6065 * one past the new range. */
6066 array[len - 2] = start;
6067 if (end != UV_MAX) {
6068 array[len - 1] = end + 1;
6069 }
6070 else {
6071 /* But if the end is the maximum representable on the machine, just let
6072 * the range have no end */
6073 invlist_set_len(invlist, len - 1);
6074 }
6075}
6076#endif
6077
6078STATIC HV*
6079S_invlist_union(pTHX_ HV* const a, HV* const b)
6080{
6081 /* Return a new inversion list which is the union of two inversion lists.
6082 * The basis for this comes from "Unicode Demystified" Chapter 13 by
6083 * Richard Gillam, published by Addison-Wesley, and explained at some
6084 * length there. The preface says to incorporate its examples into your
6085 * code at your own risk.
6086 *
6087 * The algorithm is like a merge sort.
6088 *
6089 * XXX A potential performance improvement is to keep track as we go along
6090 * if only one of the inputs contributes to the result, meaning the other
6091 * is a subset of that one. In that case, we can skip the final copy and
6092 * return the larger of the input lists */
6093
6094 UV* array_a = invlist_array(a); /* a's array */
6095 UV* array_b = invlist_array(b);
6096 UV len_a = invlist_len(a); /* length of a's array */
6097 UV len_b = invlist_len(b);
6098
6099 HV* u; /* the resulting union */
6100 UV* array_u;
6101 UV len_u;
6102
6103 UV i_a = 0; /* current index into a's array */
6104 UV i_b = 0;
6105 UV i_u = 0;
6106
6107 /* running count, as explained in the algorithm source book; items are
6108 * stopped accumulating and are output when the count changes to/from 0.
6109 * The count is incremented when we start a range that's in the set, and
6110 * decremented when we start a range that's not in the set. So its range
6111 * is 0 to 2. Only when the count is zero is something not in the set.
6112 */
6113 UV count = 0;
6114
6115 PERL_ARGS_ASSERT_INVLIST_UNION;
6116
6117 /* Size the union for the worst case: that the sets are completely
6118 * disjoint */
6119 u = _new_invlist(len_a + len_b);
6120 array_u = invlist_array(u);
6121
6122 /* Go through each list item by item, stopping when exhausted one of
6123 * them */
6124 while (i_a < len_a && i_b < len_b) {
6125 UV cp; /* The element to potentially add to the union's array */
6126 bool cp_in_set; /* is it in the the input list's set or not */
6127
6128 /* We need to take one or the other of the two inputs for the union.
6129 * Since we are merging two sorted lists, we take the smaller of the
6130 * next items. In case of a tie, we take the one that is in its set
6131 * first. If we took one not in the set first, it would decrement the
6132 * count, possibly to 0 which would cause it to be output as ending the
6133 * range, and the next time through we would take the same number, and
6134 * output it again as beginning the next range. By doing it the
6135 * opposite way, there is no possibility that the count will be
6136 * momentarily decremented to 0, and thus the two adjoining ranges will
6137 * be seamlessly merged. (In a tie and both are in the set or both not
6138 * in the set, it doesn't matter which we take first.) */
6139 if (array_a[i_a] < array_b[i_b]
6140 || (array_a[i_a] == array_b[i_b] && ELEMENT_IN_INVLIST_SET(i_a)))
6141 {
6142 cp_in_set = ELEMENT_IN_INVLIST_SET(i_a);
6143 cp= array_a[i_a++];
6144 }
6145 else {
6146 cp_in_set = ELEMENT_IN_INVLIST_SET(i_b);
6147 cp= array_b[i_b++];
6148 }
6149
6150 /* Here, have chosen which of the two inputs to look at. Only output
6151 * if the running count changes to/from 0, which marks the
6152 * beginning/end of a range in that's in the set */
6153 if (cp_in_set) {
6154 if (count == 0) {
6155 array_u[i_u++] = cp;
6156 }
6157 count++;
6158 }
6159 else {
6160 count--;
6161 if (count == 0) {
6162 array_u[i_u++] = cp;
6163 }
6164 }
6165 }
6166
6167 /* Here, we are finished going through at least one of the lists, which
6168 * means there is something remaining in at most one. We check if the list
6169 * that hasn't been exhausted is positioned such that we are in the middle
6170 * of a range in its set or not. (We are in the set if the next item in
6171 * the array marks the beginning of something not in the set) If in the
6172 * set, we decrement 'count'; if 0, there is potentially more to output.
6173 * There are four cases:
6174 * 1) Both weren't in their sets, count is 0, and remains 0. What's left
6175 * in the union is entirely from the non-exhausted set.
6176 * 2) Both were in their sets, count is 2. Nothing further should
6177 * be output, as everything that remains will be in the exhausted
6178 * list's set, hence in the union; decrementing to 1 but not 0 insures
6179 * that
6180 * 3) the exhausted was in its set, non-exhausted isn't, count is 1.
6181 * Nothing further should be output because the union includes
6182 * everything from the exhausted set. Not decrementing insures that.
6183 * 4) the exhausted wasn't in its set, non-exhausted is, count is 1;
6184 * decrementing to 0 insures that we look at the remainder of the
6185 * non-exhausted set */
6186 if ((i_a != len_a && ! ELEMENT_IN_INVLIST_SET(i_a))
6187 || (i_b != len_b && ! ELEMENT_IN_INVLIST_SET(i_b)))
6188 {
6189 count--;
6190 }
6191
6192 /* The final length is what we've output so far, plus what else is about to
6193 * be output. (If 'count' is non-zero, then the input list we exhausted
6194 * has everything remaining up to the machine's limit in its set, and hence
6195 * in the union, so there will be no further output. */
6196 len_u = i_u;
6197 if (count == 0) {
6198 /* At most one of the subexpressions will be non-zero */
6199 len_u += (len_a - i_a) + (len_b - i_b);
6200 }
6201
6202 /* Set result to final length, which can change the pointer to array_u, so
6203 * re-find it */
6204 if (len_u != invlist_len(u)) {
6205 invlist_set_len(u, len_u);
6206 invlist_trim(u);
6207 array_u = invlist_array(u);
6208 }
6209
6210 /* When 'count' is 0, the list that was exhausted (if one was shorter than
6211 * the other) ended with everything above it not in its set. That means
6212 * that the remaining part of the union is precisely the same as the
6213 * non-exhausted list, so can just copy it unchanged. (If both list were
6214 * exhausted at the same time, then the operations below will be both 0.)
6215 */
6216 if (count == 0) {
6217 IV copy_count; /* At most one will have a non-zero copy count */
6218 if ((copy_count = len_a - i_a) > 0) {
6219 Copy(array_a + i_a, array_u + i_u, copy_count, UV);
6220 }
6221 else if ((copy_count = len_b - i_b) > 0) {
6222 Copy(array_b + i_b, array_u + i_u, copy_count, UV);
6223 }
6224 }
6225
6226 return u;
6227}
6228
6229STATIC HV*
6230S_invlist_intersection(pTHX_ HV* const a, HV* const b)
6231{
6232 /* Return the intersection of two inversion lists. The basis for this
6233 * comes from "Unicode Demystified" Chapter 13 by Richard Gillam, published
6234 * by Addison-Wesley, and explained at some length there. The preface says
6235 * to incorporate its examples into your code at your own risk.
6236 *
6237 * The algorithm is like a merge sort, and is essentially the same as the
6238 * union above
6239 */
6240
6241 UV* array_a = invlist_array(a); /* a's array */
6242 UV* array_b = invlist_array(b);
6243 UV len_a = invlist_len(a); /* length of a's array */
6244 UV len_b = invlist_len(b);
6245
6246 HV* r; /* the resulting intersection */
6247 UV* array_r;
6248 UV len_r;
6249
6250 UV i_a = 0; /* current index into a's array */
6251 UV i_b = 0;
6252 UV i_r = 0;
6253
6254 /* running count, as explained in the algorithm source book; items are
6255 * stopped accumulating and are output when the count changes to/from 2.
6256 * The count is incremented when we start a range that's in the set, and
6257 * decremented when we start a range that's not in the set. So its range
6258 * is 0 to 2. Only when the count is 2 is something in the intersection.
6259 */
6260 UV count = 0;
6261
6262 PERL_ARGS_ASSERT_INVLIST_INTERSECTION;
6263
6264 /* Size the intersection for the worst case: that the intersection ends up
6265 * fragmenting everything to be completely disjoint */
6266 r= _new_invlist(len_a + len_b);
6267 array_r = invlist_array(r);
6268
6269 /* Go through each list item by item, stopping when exhausted one of
6270 * them */
6271 while (i_a < len_a && i_b < len_b) {
6272 UV cp; /* The element to potentially add to the intersection's
6273 array */
6274 bool cp_in_set; /* Is it in the input list's set or not */
6275
6276 /* We need to take one or the other of the two inputs for the union.
6277 * Since we are merging two sorted lists, we take the smaller of the
6278 * next items. In case of a tie, we take the one that is not in its
6279 * set first (a difference from the union algorithm). If we took one
6280 * in the set first, it would increment the count, possibly to 2 which
6281 * would cause it to be output as starting a range in the intersection,
6282 * and the next time through we would take that same number, and output
6283 * it again as ending the set. By doing it the opposite of this, we
6284 * there is no possibility that the count will be momentarily
6285 * incremented to 2. (In a tie and both are in the set or both not in
6286 * the set, it doesn't matter which we take first.) */
6287 if (array_a[i_a] < array_b[i_b]
6288 || (array_a[i_a] == array_b[i_b] && ! ELEMENT_IN_INVLIST_SET(i_a)))
6289 {
6290 cp_in_set = ELEMENT_IN_INVLIST_SET(i_a);
6291 cp= array_a[i_a++];
6292 }
6293 else {
6294 cp_in_set = ELEMENT_IN_INVLIST_SET(i_b);
6295 cp= array_b[i_b++];
6296 }
6297
6298 /* Here, have chosen which of the two inputs to look at. Only output
6299 * if the running count changes to/from 2, which marks the
6300 * beginning/end of a range that's in the intersection */
6301 if (cp_in_set) {
6302 count++;
6303 if (count == 2) {
6304 array_r[i_r++] = cp;
6305 }
6306 }
6307 else {
6308 if (count == 2) {
6309 array_r[i_r++] = cp;
6310 }
6311 count--;
6312 }
6313 }
6314
6315 /* Here, we are finished going through at least one of the sets, which
6316 * means there is something remaining in at most one. See the comments in
6317 * the union code */
6318 if ((i_a != len_a && ! ELEMENT_IN_INVLIST_SET(i_a))
6319 || (i_b != len_b && ! ELEMENT_IN_INVLIST_SET(i_b)))
6320 {
6321 count--;
6322 }
6323
6324 /* The final length is what we've output so far plus what else is in the
6325 * intersection. Only one of the subexpressions below will be non-zero */
6326 len_r = i_r;
6327 if (count == 2) {
6328 len_r += (len_a - i_a) + (len_b - i_b);
6329 }
6330
6331 /* Set result to final length, which can change the pointer to array_r, so
6332 * re-find it */
6333 if (len_r != invlist_len(r)) {
6334 invlist_set_len(r, len_r);
6335 invlist_trim(r);
6336 array_r = invlist_array(r);
6337 }
6338
6339 /* Finish outputting any remaining */
6340 if (count == 2) { /* Only one of will have a non-zero copy count */
6341 IV copy_count;
6342 if ((copy_count = len_a - i_a) > 0) {
6343 Copy(array_a + i_a, array_r + i_r, copy_count, UV);
6344 }
6345 else if ((copy_count = len_b - i_b) > 0) {
6346 Copy(array_b + i_b, array_r + i_r, copy_count, UV);
6347 }
6348 }
6349
6350 return r;
6351}
6352
6353STATIC HV*
6354S_add_range_to_invlist(pTHX_ HV* invlist, const UV start, const UV end)
6355{
6356 /* Add the range from 'start' to 'end' inclusive to the inversion list's
6357 * set. A pointer to the inversion list is returned. This may actually be
6358 * a new list, in which case the passed in one has been destroyed. The
6359 * passed in inversion list can be NULL, in which case a new one is created
6360 * with just the one range in it */
6361
6362 HV* range_invlist;
6363 HV* added_invlist;
6364 UV len;
6365
6366 if (invlist == NULL) {
6367 invlist = _new_invlist(2);
6368 len = 0;
6369 }
6370 else {
6371 len = invlist_len(invlist);
6372 }
6373
6374 /* If comes after the final entry, can just append it to the end */
6375 if (len == 0
6376 || start >= invlist_array(invlist)
6377 [invlist_len(invlist) - 1])
6378 {
6379 _append_range_to_invlist(invlist, start, end);
6380 return invlist;
6381 }
6382
6383 /* Here, can't just append things, create and return a new inversion list
6384 * which is the union of this range and the existing inversion list */
6385 range_invlist = _new_invlist(2);
6386 _append_range_to_invlist(range_invlist, start, end);
6387
6388 added_invlist = invlist_union(invlist, range_invlist);
6389
6390 /* The passed in list can be freed, as well as our temporary */
6391 invlist_destroy(range_invlist);
6392 if (invlist != added_invlist) {
6393 invlist_destroy(invlist);
6394 }
6395
6396 return added_invlist;
6397}
6398
6399PERL_STATIC_INLINE HV*
6400S_add_cp_to_invlist(pTHX_ HV* invlist, const UV cp) {
6401 return add_range_to_invlist(invlist, cp, cp);
6402}
6403
6404/* End of inversion list object */
6405
6406/*
6407 - reg - regular expression, i.e. main body or parenthesized thing
6408 *
6409 * Caller must absorb opening parenthesis.
6410 *
6411 * Combining parenthesis handling with the base level of regular expression
6412 * is a trifle forced, but the need to tie the tails of the branches to what
6413 * follows makes it hard to avoid.
6414 */
6415#define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
6416#ifdef DEBUGGING
6417#define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
6418#else
6419#define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
6420#endif
6421
6422STATIC regnode *
6423S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
6424 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
6425{
6426 dVAR;
6427 register regnode *ret; /* Will be the head of the group. */
6428 register regnode *br;
6429 register regnode *lastbr;
6430 register regnode *ender = NULL;
6431 register I32 parno = 0;
6432 I32 flags;
6433 U32 oregflags = RExC_flags;
6434 bool have_branch = 0;
6435 bool is_open = 0;
6436 I32 freeze_paren = 0;
6437 I32 after_freeze = 0;
6438
6439 /* for (?g), (?gc), and (?o) warnings; warning
6440 about (?c) will warn about (?g) -- japhy */
6441
6442#define WASTED_O 0x01
6443#define WASTED_G 0x02
6444#define WASTED_C 0x04
6445#define WASTED_GC (0x02|0x04)
6446 I32 wastedflags = 0x00;
6447
6448 char * parse_start = RExC_parse; /* MJD */
6449 char * const oregcomp_parse = RExC_parse;
6450
6451 GET_RE_DEBUG_FLAGS_DECL;
6452
6453 PERL_ARGS_ASSERT_REG;
6454 DEBUG_PARSE("reg ");
6455
6456 *flagp = 0; /* Tentatively. */
6457
6458
6459 /* Make an OPEN node, if parenthesized. */
6460 if (paren) {
6461 if ( *RExC_parse == '*') { /* (*VERB:ARG) */
6462 char *start_verb = RExC_parse;
6463 STRLEN verb_len = 0;
6464 char *start_arg = NULL;
6465 unsigned char op = 0;
6466 int argok = 1;
6467 int internal_argval = 0; /* internal_argval is only useful if !argok */
6468 while ( *RExC_parse && *RExC_parse != ')' ) {
6469 if ( *RExC_parse == ':' ) {
6470 start_arg = RExC_parse + 1;
6471 break;
6472 }
6473 RExC_parse++;
6474 }
6475 ++start_verb;
6476 verb_len = RExC_parse - start_verb;
6477 if ( start_arg ) {
6478 RExC_parse++;
6479 while ( *RExC_parse && *RExC_parse != ')' )
6480 RExC_parse++;
6481 if ( *RExC_parse != ')' )
6482 vFAIL("Unterminated verb pattern argument");
6483 if ( RExC_parse == start_arg )
6484 start_arg = NULL;
6485 } else {
6486 if ( *RExC_parse != ')' )
6487 vFAIL("Unterminated verb pattern");
6488 }
6489
6490 switch ( *start_verb ) {
6491 case 'A': /* (*ACCEPT) */
6492 if ( memEQs(start_verb,verb_len,"ACCEPT") ) {
6493 op = ACCEPT;
6494 internal_argval = RExC_nestroot;
6495 }
6496 break;
6497 case 'C': /* (*COMMIT) */
6498 if ( memEQs(start_verb,verb_len,"COMMIT") )
6499 op = COMMIT;
6500 break;
6501 case 'F': /* (*FAIL) */
6502 if ( verb_len==1 || memEQs(start_verb,verb_len,"FAIL") ) {
6503 op = OPFAIL;
6504 argok = 0;
6505 }
6506 break;
6507 case ':': /* (*:NAME) */
6508 case 'M': /* (*MARK:NAME) */
6509 if ( verb_len==0 || memEQs(start_verb,verb_len,"MARK") ) {
6510 op = MARKPOINT;
6511 argok = -1;
6512 }
6513 break;
6514 case 'P': /* (*PRUNE) */
6515 if ( memEQs(start_verb,verb_len,"PRUNE") )
6516 op = PRUNE;
6517 break;
6518 case 'S': /* (*SKIP) */
6519 if ( memEQs(start_verb,verb_len,"SKIP") )
6520 op = SKIP;
6521 break;
6522 case 'T': /* (*THEN) */
6523 /* [19:06] <TimToady> :: is then */
6524 if ( memEQs(start_verb,verb_len,"THEN") ) {
6525 op = CUTGROUP;
6526 RExC_seen |= REG_SEEN_CUTGROUP;
6527 }
6528 break;
6529 }
6530 if ( ! op ) {
6531 RExC_parse++;
6532 vFAIL3("Unknown verb pattern '%.*s'",
6533 verb_len, start_verb);
6534 }
6535 if ( argok ) {
6536 if ( start_arg && internal_argval ) {
6537 vFAIL3("Verb pattern '%.*s' may not have an argument",
6538 verb_len, start_verb);
6539 } else if ( argok < 0 && !start_arg ) {
6540 vFAIL3("Verb pattern '%.*s' has a mandatory argument",
6541 verb_len, start_verb);
6542 } else {
6543 ret = reganode(pRExC_state, op, internal_argval);
6544 if ( ! internal_argval && ! SIZE_ONLY ) {
6545 if (start_arg) {
6546 SV *sv = newSVpvn( start_arg, RExC_parse - start_arg);
6547 ARG(ret) = add_data( pRExC_state, 1, "S" );
6548 RExC_rxi->data->data[ARG(ret)]=(void*)sv;
6549 ret->flags = 0;
6550 } else {
6551 ret->flags = 1;
6552 }
6553 }
6554 }
6555 if (!internal_argval)
6556 RExC_seen |= REG_SEEN_VERBARG;
6557 } else if ( start_arg ) {
6558 vFAIL3("Verb pattern '%.*s' may not have an argument",
6559 verb_len, start_verb);
6560 } else {
6561 ret = reg_node(pRExC_state, op);
6562 }
6563 nextchar(pRExC_state);
6564 return ret;
6565 } else
6566 if (*RExC_parse == '?') { /* (?...) */
6567 bool is_logical = 0;
6568 const char * const seqstart = RExC_parse;
6569 bool has_use_defaults = FALSE;
6570
6571 RExC_parse++;
6572 paren = *RExC_parse++;
6573 ret = NULL; /* For look-ahead/behind. */
6574 switch (paren) {
6575
6576 case 'P': /* (?P...) variants for those used to PCRE/Python */
6577 paren = *RExC_parse++;
6578 if ( paren == '<') /* (?P<...>) named capture */
6579 goto named_capture;
6580 else if (paren == '>') { /* (?P>name) named recursion */
6581 goto named_recursion;
6582 }
6583 else if (paren == '=') { /* (?P=...) named backref */
6584 /* this pretty much dupes the code for \k<NAME> in regatom(), if
6585 you change this make sure you change that */
6586 char* name_start = RExC_parse;
6587 U32 num = 0;
6588 SV *sv_dat = reg_scan_name(pRExC_state,
6589 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
6590 if (RExC_parse == name_start || *RExC_parse != ')')
6591 vFAIL2("Sequence %.3s... not terminated",parse_start);
6592
6593 if (!SIZE_ONLY) {
6594 num = add_data( pRExC_state, 1, "S" );
6595 RExC_rxi->data->data[num]=(void*)sv_dat;
6596 SvREFCNT_inc_simple_void(sv_dat);
6597 }
6598 RExC_sawback = 1;
6599 ret = reganode(pRExC_state,
6600 ((! FOLD)
6601 ? NREF
6602 : (MORE_ASCII_RESTRICTED)
6603 ? NREFFA
6604 : (AT_LEAST_UNI_SEMANTICS)
6605 ? NREFFU
6606 : (LOC)
6607 ? NREFFL
6608 : NREFF),
6609 num);
6610 *flagp |= HASWIDTH;
6611
6612 Set_Node_Offset(ret, parse_start+1);
6613 Set_Node_Cur_Length(ret); /* MJD */
6614
6615 nextchar(pRExC_state);
6616 return ret;
6617 }
6618 RExC_parse++;
6619 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
6620 /*NOTREACHED*/
6621 case '<': /* (?<...) */
6622 if (*RExC_parse == '!')
6623 paren = ',';
6624 else if (*RExC_parse != '=')
6625 named_capture:
6626 { /* (?<...>) */
6627 char *name_start;
6628 SV *svname;
6629 paren= '>';
6630 case '\'': /* (?'...') */
6631 name_start= RExC_parse;
6632 svname = reg_scan_name(pRExC_state,
6633 SIZE_ONLY ? /* reverse test from the others */
6634 REG_RSN_RETURN_NAME :
6635 REG_RSN_RETURN_NULL);
6636 if (RExC_parse == name_start) {
6637 RExC_parse++;
6638 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
6639 /*NOTREACHED*/
6640 }
6641 if (*RExC_parse != paren)
6642 vFAIL2("Sequence (?%c... not terminated",
6643 paren=='>' ? '<' : paren);
6644 if (SIZE_ONLY) {
6645 HE *he_str;
6646 SV *sv_dat = NULL;
6647 if (!svname) /* shouldn't happen */
6648 Perl_croak(aTHX_
6649 "panic: reg_scan_name returned NULL");
6650 if (!RExC_paren_names) {
6651 RExC_paren_names= newHV();
6652 sv_2mortal(MUTABLE_SV(RExC_paren_names));
6653#ifdef DEBUGGING
6654 RExC_paren_name_list= newAV();
6655 sv_2mortal(MUTABLE_SV(RExC_paren_name_list));
6656#endif
6657 }
6658 he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
6659 if ( he_str )
6660 sv_dat = HeVAL(he_str);
6661 if ( ! sv_dat ) {
6662 /* croak baby croak */
6663 Perl_croak(aTHX_
6664 "panic: paren_name hash element allocation failed");
6665 } else if ( SvPOK(sv_dat) ) {
6666 /* (?|...) can mean we have dupes so scan to check
6667 its already been stored. Maybe a flag indicating
6668 we are inside such a construct would be useful,
6669 but the arrays are likely to be quite small, so
6670 for now we punt -- dmq */
6671 IV count = SvIV(sv_dat);
6672 I32 *pv = (I32*)SvPVX(sv_dat);
6673 IV i;
6674 for ( i = 0 ; i < count ; i++ ) {
6675 if ( pv[i] == RExC_npar ) {
6676 count = 0;
6677 break;
6678 }
6679 }
6680 if ( count ) {
6681 pv = (I32*)SvGROW(sv_dat, SvCUR(sv_dat) + sizeof(I32)+1);
6682 SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32));
6683 pv[count] = RExC_npar;
6684 SvIV_set(sv_dat, SvIVX(sv_dat) + 1);
6685 }
6686 } else {
6687 (void)SvUPGRADE(sv_dat,SVt_PVNV);
6688 sv_setpvn(sv_dat, (char *)&(RExC_npar), sizeof(I32));
6689 SvIOK_on(sv_dat);
6690 SvIV_set(sv_dat, 1);
6691 }
6692#ifdef DEBUGGING
6693 if (!av_store(RExC_paren_name_list, RExC_npar, SvREFCNT_inc(svname)))
6694 SvREFCNT_dec(svname);
6695#endif
6696
6697 /*sv_dump(sv_dat);*/
6698 }
6699 nextchar(pRExC_state);
6700 paren = 1;
6701 goto capturing_parens;
6702 }
6703 RExC_seen |= REG_SEEN_LOOKBEHIND;
6704 RExC_in_lookbehind++;
6705 RExC_parse++;
6706 case '=': /* (?=...) */
6707 RExC_seen_zerolen++;
6708 break;
6709 case '!': /* (?!...) */
6710 RExC_seen_zerolen++;
6711 if (*RExC_parse == ')') {
6712 ret=reg_node(pRExC_state, OPFAIL);
6713 nextchar(pRExC_state);
6714 return ret;
6715 }
6716 break;
6717 case '|': /* (?|...) */
6718 /* branch reset, behave like a (?:...) except that
6719 buffers in alternations share the same numbers */
6720 paren = ':';
6721 after_freeze = freeze_paren = RExC_npar;
6722 break;
6723 case ':': /* (?:...) */
6724 case '>': /* (?>...) */
6725 break;
6726 case '$': /* (?$...) */
6727 case '@': /* (?@...) */
6728 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
6729 break;
6730 case '#': /* (?#...) */
6731 while (*RExC_parse && *RExC_parse != ')')
6732 RExC_parse++;
6733 if (*RExC_parse != ')')
6734 FAIL("Sequence (?#... not terminated");
6735 nextchar(pRExC_state);
6736 *flagp = TRYAGAIN;
6737 return NULL;
6738 case '0' : /* (?0) */
6739 case 'R' : /* (?R) */
6740 if (*RExC_parse != ')')
6741 FAIL("Sequence (?R) not terminated");
6742 ret = reg_node(pRExC_state, GOSTART);
6743 *flagp |= POSTPONED;
6744 nextchar(pRExC_state);
6745 return ret;
6746 /*notreached*/
6747 { /* named and numeric backreferences */
6748 I32 num;
6749 case '&': /* (?&NAME) */
6750 parse_start = RExC_parse - 1;
6751 named_recursion:
6752 {
6753 SV *sv_dat = reg_scan_name(pRExC_state,
6754 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
6755 num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
6756 }
6757 goto gen_recurse_regop;
6758 /* NOT REACHED */
6759 case '+':
6760 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
6761 RExC_parse++;
6762 vFAIL("Illegal pattern");
6763 }
6764 goto parse_recursion;
6765 /* NOT REACHED*/
6766 case '-': /* (?-1) */
6767 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
6768 RExC_parse--; /* rewind to let it be handled later */
6769 goto parse_flags;
6770 }
6771 /*FALLTHROUGH */
6772 case '1': case '2': case '3': case '4': /* (?1) */
6773 case '5': case '6': case '7': case '8': case '9':
6774 RExC_parse--;
6775 parse_recursion:
6776 num = atoi(RExC_parse);
6777 parse_start = RExC_parse - 1; /* MJD */
6778 if (*RExC_parse == '-')
6779 RExC_parse++;
6780 while (isDIGIT(*RExC_parse))
6781 RExC_parse++;
6782 if (*RExC_parse!=')')
6783 vFAIL("Expecting close bracket");
6784
6785 gen_recurse_regop:
6786 if ( paren == '-' ) {
6787 /*
6788 Diagram of capture buffer numbering.
6789 Top line is the normal capture buffer numbers
6790 Bottom line is the negative indexing as from
6791 the X (the (?-2))
6792
6793 + 1 2 3 4 5 X 6 7
6794 /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
6795 - 5 4 3 2 1 X x x
6796
6797 */
6798 num = RExC_npar + num;
6799 if (num < 1) {
6800 RExC_parse++;
6801 vFAIL("Reference to nonexistent group");
6802 }
6803 } else if ( paren == '+' ) {
6804 num = RExC_npar + num - 1;
6805 }
6806
6807 ret = reganode(pRExC_state, GOSUB, num);
6808 if (!SIZE_ONLY) {
6809 if (num > (I32)RExC_rx->nparens) {
6810 RExC_parse++;
6811 vFAIL("Reference to nonexistent group");
6812 }
6813 ARG2L_SET( ret, RExC_recurse_count++);
6814 RExC_emit++;
6815 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
6816 "Recurse #%"UVuf" to %"IVdf"\n", (UV)ARG(ret), (IV)ARG2L(ret)));
6817 } else {
6818 RExC_size++;
6819 }
6820 RExC_seen |= REG_SEEN_RECURSE;
6821 Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
6822 Set_Node_Offset(ret, parse_start); /* MJD */
6823
6824 *flagp |= POSTPONED;
6825 nextchar(pRExC_state);
6826 return ret;
6827 } /* named and numeric backreferences */
6828 /* NOT REACHED */
6829
6830 case '?': /* (??...) */
6831 is_logical = 1;
6832 if (*RExC_parse != '{') {
6833 RExC_parse++;
6834 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
6835 /*NOTREACHED*/
6836 }
6837 *flagp |= POSTPONED;
6838 paren = *RExC_parse++;
6839 /* FALL THROUGH */
6840 case '{': /* (?{...}) */
6841 {
6842 I32 count = 1;
6843 U32 n = 0;
6844 char c;
6845 char *s = RExC_parse;
6846
6847 RExC_seen_zerolen++;
6848 RExC_seen |= REG_SEEN_EVAL;
6849 while (count && (c = *RExC_parse)) {
6850 if (c == '\\') {
6851 if (RExC_parse[1])
6852 RExC_parse++;
6853 }
6854 else if (c == '{')
6855 count++;
6856 else if (c == '}')
6857 count--;
6858 RExC_parse++;
6859 }
6860 if (*RExC_parse != ')') {
6861 RExC_parse = s;
6862 vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
6863 }
6864 if (!SIZE_ONLY) {
6865 PAD *pad;
6866 OP_4tree *sop, *rop;
6867 SV * const sv = newSVpvn(s, RExC_parse - 1 - s);
6868
6869 ENTER;
6870 Perl_save_re_context(aTHX);
6871 rop = Perl_sv_compile_2op_is_broken(aTHX_ sv, &sop, "re", &pad);
6872 sop->op_private |= OPpREFCOUNTED;
6873 /* re_dup will OpREFCNT_inc */
6874 OpREFCNT_set(sop, 1);
6875 LEAVE;
6876
6877 n = add_data(pRExC_state, 3, "nop");
6878 RExC_rxi->data->data[n] = (void*)rop;
6879 RExC_rxi->data->data[n+1] = (void*)sop;
6880 RExC_rxi->data->data[n+2] = (void*)pad;
6881 SvREFCNT_dec(sv);
6882 }
6883 else { /* First pass */
6884 if (PL_reginterp_cnt < ++RExC_seen_evals
6885 && IN_PERL_RUNTIME)
6886 /* No compiled RE interpolated, has runtime
6887 components ===> unsafe. */
6888 FAIL("Eval-group not allowed at runtime, use re 'eval'");
6889 if (PL_tainting && PL_tainted)
6890 FAIL("Eval-group in insecure regular expression");
6891#if PERL_VERSION > 8
6892 if (IN_PERL_COMPILETIME)
6893 PL_cv_has_eval = 1;
6894#endif
6895 }
6896
6897 nextchar(pRExC_state);
6898 if (is_logical) {
6899 ret = reg_node(pRExC_state, LOGICAL);
6900 if (!SIZE_ONLY)
6901 ret->flags = 2;
6902 REGTAIL(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
6903 /* deal with the length of this later - MJD */
6904 return ret;
6905 }
6906 ret = reganode(pRExC_state, EVAL, n);
6907 Set_Node_Length(ret, RExC_parse - parse_start + 1);
6908 Set_Node_Offset(ret, parse_start);
6909 return ret;
6910 }
6911 case '(': /* (?(?{...})...) and (?(?=...)...) */
6912 {
6913 int is_define= 0;
6914 if (RExC_parse[0] == '?') { /* (?(?...)) */
6915 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
6916 || RExC_parse[1] == '<'
6917 || RExC_parse[1] == '{') { /* Lookahead or eval. */
6918 I32 flag;
6919
6920 ret = reg_node(pRExC_state, LOGICAL);
6921 if (!SIZE_ONLY)
6922 ret->flags = 1;
6923 REGTAIL(pRExC_state, ret, reg(pRExC_state, 1, &flag,depth+1));
6924 goto insert_if;
6925 }
6926 }
6927 else if ( RExC_parse[0] == '<' /* (?(<NAME>)...) */
6928 || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
6929 {
6930 char ch = RExC_parse[0] == '<' ? '>' : '\'';
6931 char *name_start= RExC_parse++;
6932 U32 num = 0;
6933 SV *sv_dat=reg_scan_name(pRExC_state,
6934 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
6935 if (RExC_parse == name_start || *RExC_parse != ch)
6936 vFAIL2("Sequence (?(%c... not terminated",
6937 (ch == '>' ? '<' : ch));
6938 RExC_parse++;
6939 if (!SIZE_ONLY) {
6940 num = add_data( pRExC_state, 1, "S" );
6941 RExC_rxi->data->data[num]=(void*)sv_dat;
6942 SvREFCNT_inc_simple_void(sv_dat);
6943 }
6944 ret = reganode(pRExC_state,NGROUPP,num);
6945 goto insert_if_check_paren;
6946 }
6947 else if (RExC_parse[0] == 'D' &&
6948 RExC_parse[1] == 'E' &&
6949 RExC_parse[2] == 'F' &&
6950 RExC_parse[3] == 'I' &&
6951 RExC_parse[4] == 'N' &&
6952 RExC_parse[5] == 'E')
6953 {
6954 ret = reganode(pRExC_state,DEFINEP,0);
6955 RExC_parse +=6 ;
6956 is_define = 1;
6957 goto insert_if_check_paren;
6958 }
6959 else if (RExC_parse[0] == 'R') {
6960 RExC_parse++;
6961 parno = 0;
6962 if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
6963 parno = atoi(RExC_parse++);
6964 while (isDIGIT(*RExC_parse))
6965 RExC_parse++;
6966 } else if (RExC_parse[0] == '&') {
6967 SV *sv_dat;
6968 RExC_parse++;
6969 sv_dat = reg_scan_name(pRExC_state,
6970 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
6971 parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
6972 }
6973 ret = reganode(pRExC_state,INSUBP,parno);
6974 goto insert_if_check_paren;
6975 }
6976 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
6977 /* (?(1)...) */
6978 char c;
6979 parno = atoi(RExC_parse++);
6980
6981 while (isDIGIT(*RExC_parse))
6982 RExC_parse++;
6983 ret = reganode(pRExC_state, GROUPP, parno);
6984
6985 insert_if_check_paren:
6986 if ((c = *nextchar(pRExC_state)) != ')')
6987 vFAIL("Switch condition not recognized");
6988 insert_if:
6989 REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
6990 br = regbranch(pRExC_state, &flags, 1,depth+1);
6991 if (br == NULL)
6992 br = reganode(pRExC_state, LONGJMP, 0);
6993 else
6994 REGTAIL(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
6995 c = *nextchar(pRExC_state);
6996 if (flags&HASWIDTH)
6997 *flagp |= HASWIDTH;
6998 if (c == '|') {
6999 if (is_define)
7000 vFAIL("(?(DEFINE)....) does not allow branches");
7001 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
7002 regbranch(pRExC_state, &flags, 1,depth+1);
7003 REGTAIL(pRExC_state, ret, lastbr);
7004 if (flags&HASWIDTH)
7005 *flagp |= HASWIDTH;
7006 c = *nextchar(pRExC_state);
7007 }
7008 else
7009 lastbr = NULL;
7010 if (c != ')')
7011 vFAIL("Switch (?(condition)... contains too many branches");
7012 ender = reg_node(pRExC_state, TAIL);
7013 REGTAIL(pRExC_state, br, ender);
7014 if (lastbr) {
7015 REGTAIL(pRExC_state, lastbr, ender);
7016 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
7017 }
7018 else
7019 REGTAIL(pRExC_state, ret, ender);
7020 RExC_size++; /* XXX WHY do we need this?!!
7021 For large programs it seems to be required
7022 but I can't figure out why. -- dmq*/
7023 return ret;
7024 }
7025 else {
7026 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
7027 }
7028 }
7029 case 0:
7030 RExC_parse--; /* for vFAIL to print correctly */
7031 vFAIL("Sequence (? incomplete");
7032 break;
7033 case DEFAULT_PAT_MOD: /* Use default flags with the exceptions
7034 that follow */
7035 has_use_defaults = TRUE;
7036 STD_PMMOD_FLAGS_CLEAR(&RExC_flags);
7037 set_regex_charset(&RExC_flags, (RExC_utf8 || RExC_uni_semantics)
7038 ? REGEX_UNICODE_CHARSET
7039 : REGEX_DEPENDS_CHARSET);
7040 goto parse_flags;
7041 default:
7042 --RExC_parse;
7043 parse_flags: /* (?i) */
7044 {
7045 U32 posflags = 0, negflags = 0;
7046 U32 *flagsp = &posflags;
7047 bool has_charset_modifier = 0;
7048 regex_charset cs = (RExC_utf8 || RExC_uni_semantics)
7049 ? REGEX_UNICODE_CHARSET
7050 : REGEX_DEPENDS_CHARSET;
7051
7052 while (*RExC_parse) {
7053 /* && strchr("iogcmsx", *RExC_parse) */
7054 /* (?g), (?gc) and (?o) are useless here
7055 and must be globally applied -- japhy */
7056 switch (*RExC_parse) {
7057 CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp);
7058 case LOCALE_PAT_MOD:
7059 if (has_charset_modifier || flagsp == &negflags) {
7060 goto fail_modifiers;
7061 }
7062 cs = REGEX_LOCALE_CHARSET;
7063 has_charset_modifier = 1;
7064 RExC_contains_locale = 1;
7065 break;
7066 case UNICODE_PAT_MOD:
7067 if (has_charset_modifier || flagsp == &negflags) {
7068 goto fail_modifiers;
7069 }
7070 cs = REGEX_UNICODE_CHARSET;
7071 has_charset_modifier = 1;
7072 break;
7073 case ASCII_RESTRICT_PAT_MOD:
7074 if (has_charset_modifier || flagsp == &negflags) {
7075 goto fail_modifiers;
7076 }
7077 if (*(RExC_parse + 1) == ASCII_RESTRICT_PAT_MOD) {
7078 /* Doubled modifier implies more restricted */
7079 cs = REGEX_ASCII_MORE_RESTRICTED_CHARSET;
7080 RExC_parse++;
7081 }
7082 else {
7083 cs = REGEX_ASCII_RESTRICTED_CHARSET;
7084 }
7085 has_charset_modifier = 1;
7086 break;
7087 case DEPENDS_PAT_MOD:
7088 if (has_use_defaults
7089 || has_charset_modifier
7090 || flagsp == &negflags)
7091 {
7092 goto fail_modifiers;
7093 }
7094
7095 /* The dual charset means unicode semantics if the
7096 * pattern (or target, not known until runtime) are
7097 * utf8, or something in the pattern indicates unicode
7098 * semantics */
7099 cs = (RExC_utf8 || RExC_uni_semantics)
7100 ? REGEX_UNICODE_CHARSET
7101 : REGEX_DEPENDS_CHARSET;
7102 has_charset_modifier = 1;
7103 break;
7104 case ONCE_PAT_MOD: /* 'o' */
7105 case GLOBAL_PAT_MOD: /* 'g' */
7106 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
7107 const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G;
7108 if (! (wastedflags & wflagbit) ) {
7109 wastedflags |= wflagbit;
7110 vWARN5(
7111 RExC_parse + 1,
7112 "Useless (%s%c) - %suse /%c modifier",
7113 flagsp == &negflags ? "?-" : "?",
7114 *RExC_parse,
7115 flagsp == &negflags ? "don't " : "",
7116 *RExC_parse
7117 );
7118 }
7119 }
7120 break;
7121
7122 case CONTINUE_PAT_MOD: /* 'c' */
7123 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
7124 if (! (wastedflags & WASTED_C) ) {
7125 wastedflags |= WASTED_GC;
7126 vWARN3(
7127 RExC_parse + 1,
7128 "Useless (%sc) - %suse /gc modifier",
7129 flagsp == &negflags ? "?-" : "?",
7130 flagsp == &negflags ? "don't " : ""
7131 );
7132 }
7133 }
7134 break;
7135 case KEEPCOPY_PAT_MOD: /* 'p' */
7136 if (flagsp == &negflags) {
7137 if (SIZE_ONLY)
7138 ckWARNreg(RExC_parse + 1,"Useless use of (?-p)");
7139 } else {
7140 *flagsp |= RXf_PMf_KEEPCOPY;
7141 }
7142 break;
7143 case '-':
7144 /* A flag is a default iff it is following a minus, so
7145 * if there is a minus, it means will be trying to
7146 * re-specify a default which is an error */
7147 if (has_use_defaults || flagsp == &negflags) {
7148 fail_modifiers:
7149 RExC_parse++;
7150 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
7151 /*NOTREACHED*/
7152 }
7153 flagsp = &negflags;
7154 wastedflags = 0; /* reset so (?g-c) warns twice */
7155 break;
7156 case ':':
7157 paren = ':';
7158 /*FALLTHROUGH*/
7159 case ')':
7160 RExC_flags |= posflags;
7161 RExC_flags &= ~negflags;
7162 set_regex_charset(&RExC_flags, cs);
7163 if (paren != ':') {
7164 oregflags |= posflags;
7165 oregflags &= ~negflags;
7166 set_regex_charset(&oregflags, cs);
7167 }
7168 nextchar(pRExC_state);
7169 if (paren != ':') {
7170 *flagp = TRYAGAIN;
7171 return NULL;
7172 } else {
7173 ret = NULL;
7174 goto parse_rest;
7175 }
7176 /*NOTREACHED*/
7177 default:
7178 RExC_parse++;
7179 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
7180 /*NOTREACHED*/
7181 }
7182 ++RExC_parse;
7183 }
7184 }} /* one for the default block, one for the switch */
7185 }
7186 else { /* (...) */
7187 capturing_parens:
7188 parno = RExC_npar;
7189 RExC_npar++;
7190
7191 ret = reganode(pRExC_state, OPEN, parno);
7192 if (!SIZE_ONLY ){
7193 if (!RExC_nestroot)
7194 RExC_nestroot = parno;
7195 if (RExC_seen & REG_SEEN_RECURSE
7196 && !RExC_open_parens[parno-1])
7197 {
7198 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
7199 "Setting open paren #%"IVdf" to %d\n",
7200 (IV)parno, REG_NODE_NUM(ret)));
7201 RExC_open_parens[parno-1]= ret;
7202 }
7203 }
7204 Set_Node_Length(ret, 1); /* MJD */
7205 Set_Node_Offset(ret, RExC_parse); /* MJD */
7206 is_open = 1;
7207 }
7208 }
7209 else /* ! paren */
7210 ret = NULL;
7211
7212 parse_rest:
7213 /* Pick up the branches, linking them together. */
7214 parse_start = RExC_parse; /* MJD */
7215 br = regbranch(pRExC_state, &flags, 1,depth+1);
7216
7217 /* branch_len = (paren != 0); */
7218
7219 if (br == NULL)
7220 return(NULL);
7221 if (*RExC_parse == '|') {
7222 if (!SIZE_ONLY && RExC_extralen) {
7223 reginsert(pRExC_state, BRANCHJ, br, depth+1);
7224 }
7225 else { /* MJD */
7226 reginsert(pRExC_state, BRANCH, br, depth+1);
7227 Set_Node_Length(br, paren != 0);
7228 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
7229 }
7230 have_branch = 1;
7231 if (SIZE_ONLY)
7232 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
7233 }
7234 else if (paren == ':') {
7235 *flagp |= flags&SIMPLE;
7236 }
7237 if (is_open) { /* Starts with OPEN. */
7238 REGTAIL(pRExC_state, ret, br); /* OPEN -> first. */
7239 }
7240 else if (paren != '?') /* Not Conditional */
7241 ret = br;
7242 *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
7243 lastbr = br;
7244 while (*RExC_parse == '|') {
7245 if (!SIZE_ONLY && RExC_extralen) {
7246 ender = reganode(pRExC_state, LONGJMP,0);
7247 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
7248 }
7249 if (SIZE_ONLY)
7250 RExC_extralen += 2; /* Account for LONGJMP. */
7251 nextchar(pRExC_state);
7252 if (freeze_paren) {
7253 if (RExC_npar > after_freeze)
7254 after_freeze = RExC_npar;
7255 RExC_npar = freeze_paren;
7256 }
7257 br = regbranch(pRExC_state, &flags, 0, depth+1);
7258
7259 if (br == NULL)
7260 return(NULL);
7261 REGTAIL(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
7262 lastbr = br;
7263 *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
7264 }
7265
7266 if (have_branch || paren != ':') {
7267 /* Make a closing node, and hook it on the end. */
7268 switch (paren) {
7269 case ':':
7270 ender = reg_node(pRExC_state, TAIL);
7271 break;
7272 case 1:
7273 ender = reganode(pRExC_state, CLOSE, parno);
7274 if (!SIZE_ONLY && RExC_seen & REG_SEEN_RECURSE) {
7275 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
7276 "Setting close paren #%"IVdf" to %d\n",
7277 (IV)parno, REG_NODE_NUM(ender)));
7278 RExC_close_parens[parno-1]= ender;
7279 if (RExC_nestroot == parno)
7280 RExC_nestroot = 0;
7281 }
7282 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
7283 Set_Node_Length(ender,1); /* MJD */
7284 break;
7285 case '<':
7286 case ',':
7287 case '=':
7288 case '!':
7289 *flagp &= ~HASWIDTH;
7290 /* FALL THROUGH */
7291 case '>':
7292 ender = reg_node(pRExC_state, SUCCEED);
7293 break;
7294 case 0:
7295 ender = reg_node(pRExC_state, END);
7296 if (!SIZE_ONLY) {
7297 assert(!RExC_opend); /* there can only be one! */
7298 RExC_opend = ender;
7299 }
7300 break;
7301 }
7302 REGTAIL(pRExC_state, lastbr, ender);
7303
7304 if (have_branch && !SIZE_ONLY) {
7305 if (depth==1)
7306 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
7307
7308 /* Hook the tails of the branches to the closing node. */
7309 for (br = ret; br; br = regnext(br)) {
7310 const U8 op = PL_regkind[OP(br)];
7311 if (op == BRANCH) {
7312 REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
7313 }
7314 else if (op == BRANCHJ) {
7315 REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
7316 }
7317 }
7318 }
7319 }
7320
7321 {
7322 const char *p;
7323 static const char parens[] = "=!<,>";
7324
7325 if (paren && (p = strchr(parens, paren))) {
7326 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
7327 int flag = (p - parens) > 1;
7328
7329 if (paren == '>')
7330 node = SUSPEND, flag = 0;
7331 reginsert(pRExC_state, node,ret, depth+1);
7332 Set_Node_Cur_Length(ret);
7333 Set_Node_Offset(ret, parse_start + 1);
7334 ret->flags = flag;
7335 REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
7336 }
7337 }
7338
7339 /* Check for proper termination. */
7340 if (paren) {
7341 RExC_flags = oregflags;
7342 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
7343 RExC_parse = oregcomp_parse;
7344 vFAIL("Unmatched (");
7345 }
7346 }
7347 else if (!paren && RExC_parse < RExC_end) {
7348 if (*RExC_parse == ')') {
7349 RExC_parse++;
7350 vFAIL("Unmatched )");
7351 }
7352 else
7353 FAIL("Junk on end of regexp"); /* "Can't happen". */
7354 /* NOTREACHED */
7355 }
7356
7357 if (RExC_in_lookbehind) {
7358 RExC_in_lookbehind--;
7359 }
7360 if (after_freeze > RExC_npar)
7361 RExC_npar = after_freeze;
7362 return(ret);
7363}
7364
7365/*
7366 - regbranch - one alternative of an | operator
7367 *
7368 * Implements the concatenation operator.
7369 */
7370STATIC regnode *
7371S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
7372{
7373 dVAR;
7374 register regnode *ret;
7375 register regnode *chain = NULL;
7376 register regnode *latest;
7377 I32 flags = 0, c = 0;
7378 GET_RE_DEBUG_FLAGS_DECL;
7379
7380 PERL_ARGS_ASSERT_REGBRANCH;
7381
7382 DEBUG_PARSE("brnc");
7383
7384 if (first)
7385 ret = NULL;
7386 else {
7387 if (!SIZE_ONLY && RExC_extralen)
7388 ret = reganode(pRExC_state, BRANCHJ,0);
7389 else {
7390 ret = reg_node(pRExC_state, BRANCH);
7391 Set_Node_Length(ret, 1);
7392 }
7393 }
7394
7395 if (!first && SIZE_ONLY)
7396 RExC_extralen += 1; /* BRANCHJ */
7397
7398 *flagp = WORST; /* Tentatively. */
7399
7400 RExC_parse--;
7401 nextchar(pRExC_state);
7402 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
7403 flags &= ~TRYAGAIN;
7404 latest = regpiece(pRExC_state, &flags,depth+1);
7405 if (latest == NULL) {
7406 if (flags & TRYAGAIN)
7407 continue;
7408 return(NULL);
7409 }
7410 else if (ret == NULL)
7411 ret = latest;
7412 *flagp |= flags&(HASWIDTH|POSTPONED);
7413 if (chain == NULL) /* First piece. */
7414 *flagp |= flags&SPSTART;
7415 else {
7416 RExC_naughty++;
7417 REGTAIL(pRExC_state, chain, latest);
7418 }
7419 chain = latest;
7420 c++;
7421 }
7422 if (chain == NULL) { /* Loop ran zero times. */
7423 chain = reg_node(pRExC_state, NOTHING);
7424 if (ret == NULL)
7425 ret = chain;
7426 }
7427 if (c == 1) {
7428 *flagp |= flags&SIMPLE;
7429 }
7430
7431 return ret;
7432}
7433
7434/*
7435 - regpiece - something followed by possible [*+?]
7436 *
7437 * Note that the branching code sequences used for ? and the general cases
7438 * of * and + are somewhat optimized: they use the same NOTHING node as
7439 * both the endmarker for their branch list and the body of the last branch.
7440 * It might seem that this node could be dispensed with entirely, but the
7441 * endmarker role is not redundant.
7442 */
7443STATIC regnode *
7444S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
7445{
7446 dVAR;
7447 register regnode *ret;
7448 register char op;
7449 register char *next;
7450 I32 flags;
7451 const char * const origparse = RExC_parse;
7452 I32 min;
7453 I32 max = REG_INFTY;
7454 char *parse_start;
7455 const char *maxpos = NULL;
7456 GET_RE_DEBUG_FLAGS_DECL;
7457
7458 PERL_ARGS_ASSERT_REGPIECE;
7459
7460 DEBUG_PARSE("piec");
7461
7462 ret = regatom(pRExC_state, &flags,depth+1);
7463 if (ret == NULL) {
7464 if (flags & TRYAGAIN)
7465 *flagp |= TRYAGAIN;
7466 return(NULL);
7467 }
7468
7469 op = *RExC_parse;
7470
7471 if (op == '{' && regcurly(RExC_parse)) {
7472 maxpos = NULL;
7473 parse_start = RExC_parse; /* MJD */
7474 next = RExC_parse + 1;
7475 while (isDIGIT(*next) || *next == ',') {
7476 if (*next == ',') {
7477 if (maxpos)
7478 break;
7479 else
7480 maxpos = next;
7481 }
7482 next++;
7483 }
7484 if (*next == '}') { /* got one */
7485 if (!maxpos)
7486 maxpos = next;
7487 RExC_parse++;
7488 min = atoi(RExC_parse);
7489 if (*maxpos == ',')
7490 maxpos++;
7491 else
7492 maxpos = RExC_parse;
7493 max = atoi(maxpos);
7494 if (!max && *maxpos != '0')
7495 max = REG_INFTY; /* meaning "infinity" */
7496 else if (max >= REG_INFTY)
7497 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
7498 RExC_parse = next;
7499 nextchar(pRExC_state);
7500
7501 do_curly:
7502 if ((flags&SIMPLE)) {
7503 RExC_naughty += 2 + RExC_naughty / 2;
7504 reginsert(pRExC_state, CURLY, ret, depth+1);
7505 Set_Node_Offset(ret, parse_start+1); /* MJD */
7506 Set_Node_Cur_Length(ret);
7507 }
7508 else {
7509 regnode * const w = reg_node(pRExC_state, WHILEM);
7510
7511 w->flags = 0;
7512 REGTAIL(pRExC_state, ret, w);
7513 if (!SIZE_ONLY && RExC_extralen) {
7514 reginsert(pRExC_state, LONGJMP,ret, depth+1);
7515 reginsert(pRExC_state, NOTHING,ret, depth+1);
7516 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
7517 }
7518 reginsert(pRExC_state, CURLYX,ret, depth+1);
7519 /* MJD hk */
7520 Set_Node_Offset(ret, parse_start+1);
7521 Set_Node_Length(ret,
7522 op == '{' ? (RExC_parse - parse_start) : 1);
7523
7524 if (!SIZE_ONLY && RExC_extralen)
7525 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
7526 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
7527 if (SIZE_ONLY)
7528 RExC_whilem_seen++, RExC_extralen += 3;
7529 RExC_naughty += 4 + RExC_naughty; /* compound interest */
7530 }
7531 ret->flags = 0;
7532
7533 if (min > 0)
7534 *flagp = WORST;
7535 if (max > 0)
7536 *flagp |= HASWIDTH;
7537 if (max < min)
7538 vFAIL("Can't do {n,m} with n > m");
7539 if (!SIZE_ONLY) {
7540 ARG1_SET(ret, (U16)min);
7541 ARG2_SET(ret, (U16)max);
7542 }
7543
7544 goto nest_check;
7545 }
7546 }
7547
7548 if (!ISMULT1(op)) {
7549 *flagp = flags;
7550 return(ret);
7551 }
7552
7553#if 0 /* Now runtime fix should be reliable. */
7554
7555 /* if this is reinstated, don't forget to put this back into perldiag:
7556
7557 =item Regexp *+ operand could be empty at {#} in regex m/%s/
7558
7559 (F) The part of the regexp subject to either the * or + quantifier
7560 could match an empty string. The {#} shows in the regular
7561 expression about where the problem was discovered.
7562
7563 */
7564
7565 if (!(flags&HASWIDTH) && op != '?')
7566 vFAIL("Regexp *+ operand could be empty");
7567#endif
7568
7569 parse_start = RExC_parse;
7570 nextchar(pRExC_state);
7571
7572 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
7573
7574 if (op == '*' && (flags&SIMPLE)) {
7575 reginsert(pRExC_state, STAR, ret, depth+1);
7576 ret->flags = 0;
7577 RExC_naughty += 4;
7578 }
7579 else if (op == '*') {
7580 min = 0;
7581 goto do_curly;
7582 }
7583 else if (op == '+' && (flags&SIMPLE)) {
7584 reginsert(pRExC_state, PLUS, ret, depth+1);
7585 ret->flags = 0;
7586 RExC_naughty += 3;
7587 }
7588 else if (op == '+') {
7589 min = 1;
7590 goto do_curly;
7591 }
7592 else if (op == '?') {
7593 min = 0; max = 1;
7594 goto do_curly;
7595 }
7596 nest_check:
7597 if (!SIZE_ONLY && !(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3) {
7598 ckWARN3reg(RExC_parse,
7599 "%.*s matches null string many times",
7600 (int)(RExC_parse >= origparse ? RExC_parse - origparse : 0),
7601 origparse);
7602 }
7603
7604 if (RExC_parse < RExC_end && *RExC_parse == '?') {
7605 nextchar(pRExC_state);
7606 reginsert(pRExC_state, MINMOD, ret, depth+1);
7607 REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
7608 }
7609#ifndef REG_ALLOW_MINMOD_SUSPEND
7610 else
7611#endif
7612 if (RExC_parse < RExC_end && *RExC_parse == '+') {
7613 regnode *ender;
7614 nextchar(pRExC_state);
7615 ender = reg_node(pRExC_state, SUCCEED);
7616 REGTAIL(pRExC_state, ret, ender);
7617 reginsert(pRExC_state, SUSPEND, ret, depth+1);
7618 ret->flags = 0;
7619 ender = reg_node(pRExC_state, TAIL);
7620 REGTAIL(pRExC_state, ret, ender);
7621 /*ret= ender;*/
7622 }
7623
7624 if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) {
7625 RExC_parse++;
7626 vFAIL("Nested quantifiers");
7627 }
7628
7629 return(ret);
7630}
7631
7632
7633/* reg_namedseq(pRExC_state,UVp)
7634
7635 This is expected to be called by a parser routine that has
7636 recognized '\N' and needs to handle the rest. RExC_parse is
7637 expected to point at the first char following the N at the time
7638 of the call.
7639
7640 The \N may be inside (indicated by valuep not being NULL) or outside a
7641 character class.
7642
7643 \N may begin either a named sequence, or if outside a character class, mean
7644 to match a non-newline. For non single-quoted regexes, the tokenizer has
7645 attempted to decide which, and in the case of a named sequence converted it
7646 into one of the forms: \N{} (if the sequence is null), or \N{U+c1.c2...},
7647 where c1... are the characters in the sequence. For single-quoted regexes,
7648 the tokenizer passes the \N sequence through unchanged; this code will not
7649 attempt to determine this nor expand those. The net effect is that if the
7650 beginning of the passed-in pattern isn't '{U+' or there is no '}', it
7651 signals that this \N occurrence means to match a non-newline.
7652
7653 Only the \N{U+...} form should occur in a character class, for the same
7654 reason that '.' inside a character class means to just match a period: it
7655 just doesn't make sense.
7656
7657 If valuep is non-null then it is assumed that we are parsing inside
7658 of a charclass definition and the first codepoint in the resolved
7659 string is returned via *valuep and the routine will return NULL.
7660 In this mode if a multichar string is returned from the charnames
7661 handler, a warning will be issued, and only the first char in the
7662 sequence will be examined. If the string returned is zero length
7663 then the value of *valuep is undefined and NON-NULL will
7664 be returned to indicate failure. (This will NOT be a valid pointer
7665 to a regnode.)
7666
7667 If valuep is null then it is assumed that we are parsing normal text and a
7668 new EXACT node is inserted into the program containing the resolved string,
7669 and a pointer to the new node is returned. But if the string is zero length
7670 a NOTHING node is emitted instead.
7671
7672 On success RExC_parse is set to the char following the endbrace.
7673 Parsing failures will generate a fatal error via vFAIL(...)
7674 */
7675STATIC regnode *
7676S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep, I32 *flagp)
7677{
7678 char * endbrace; /* '}' following the name */
7679 regnode *ret = NULL;
7680#ifdef DEBUGGING
7681 char* parse_start = RExC_parse - 2; /* points to the '\N' */
7682#endif
7683 char* p;
7684
7685 GET_RE_DEBUG_FLAGS_DECL;
7686
7687 PERL_ARGS_ASSERT_REG_NAMEDSEQ;
7688
7689 GET_RE_DEBUG_FLAGS;
7690
7691 /* The [^\n] meaning of \N ignores spaces and comments under the /x
7692 * modifier. The other meaning does not */
7693 p = (RExC_flags & RXf_PMf_EXTENDED)
7694 ? regwhite( pRExC_state, RExC_parse )
7695 : RExC_parse;
7696
7697 /* Disambiguate between \N meaning a named character versus \N meaning
7698 * [^\n]. The former is assumed when it can't be the latter. */
7699 if (*p != '{' || regcurly(p)) {
7700 RExC_parse = p;
7701 if (valuep) {
7702 /* no bare \N in a charclass */
7703 vFAIL("\\N in a character class must be a named character: \\N{...}");
7704 }
7705 nextchar(pRExC_state);
7706 ret = reg_node(pRExC_state, REG_ANY);
7707 *flagp |= HASWIDTH|SIMPLE;
7708 RExC_naughty++;
7709 RExC_parse--;
7710 Set_Node_Length(ret, 1); /* MJD */
7711 return ret;
7712 }
7713
7714 /* Here, we have decided it should be a named sequence */
7715
7716 /* The test above made sure that the next real character is a '{', but
7717 * under the /x modifier, it could be separated by space (or a comment and
7718 * \n) and this is not allowed (for consistency with \x{...} and the
7719 * tokenizer handling of \N{NAME}). */
7720 if (*RExC_parse != '{') {
7721 vFAIL("Missing braces on \\N{}");
7722 }
7723
7724 RExC_parse++; /* Skip past the '{' */
7725
7726 if (! (endbrace = strchr(RExC_parse, '}')) /* no trailing brace */
7727 || ! (endbrace == RExC_parse /* nothing between the {} */
7728 || (endbrace - RExC_parse >= 2 /* U+ (bad hex is checked below */
7729 && strnEQ(RExC_parse, "U+", 2)))) /* for a better error msg) */
7730 {
7731 if (endbrace) RExC_parse = endbrace; /* position msg's '<--HERE' */
7732 vFAIL("\\N{NAME} must be resolved by the lexer");
7733 }
7734
7735 if (endbrace == RExC_parse) { /* empty: \N{} */
7736 if (! valuep) {
7737 RExC_parse = endbrace + 1;
7738 return reg_node(pRExC_state,NOTHING);
7739 }
7740
7741 if (SIZE_ONLY) {
7742 ckWARNreg(RExC_parse,
7743 "Ignoring zero length \\N{} in character class"
7744 );
7745 RExC_parse = endbrace + 1;
7746 }
7747 *valuep = 0;
7748 return (regnode *) &RExC_parse; /* Invalid regnode pointer */
7749 }
7750
7751 REQUIRE_UTF8; /* named sequences imply Unicode semantics */
7752 RExC_parse += 2; /* Skip past the 'U+' */
7753
7754 if (valuep) { /* In a bracketed char class */
7755 /* We only pay attention to the first char of
7756 multichar strings being returned. I kinda wonder
7757 if this makes sense as it does change the behaviour
7758 from earlier versions, OTOH that behaviour was broken
7759 as well. XXX Solution is to recharacterize as
7760 [rest-of-class]|multi1|multi2... */
7761
7762 STRLEN length_of_hex;
7763 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
7764 | PERL_SCAN_DISALLOW_PREFIX
7765 | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
7766
7767 char * endchar = RExC_parse + strcspn(RExC_parse, ".}");
7768 if (endchar < endbrace) {
7769 ckWARNreg(endchar, "Using just the first character returned by \\N{} in character class");
7770 }
7771
7772 length_of_hex = (STRLEN)(endchar - RExC_parse);
7773 *valuep = grok_hex(RExC_parse, &length_of_hex, &flags, NULL);
7774
7775 /* The tokenizer should have guaranteed validity, but it's possible to
7776 * bypass it by using single quoting, so check */
7777 if (length_of_hex == 0
7778 || length_of_hex != (STRLEN)(endchar - RExC_parse) )
7779 {
7780 RExC_parse += length_of_hex; /* Includes all the valid */
7781 RExC_parse += (RExC_orig_utf8) /* point to after 1st invalid */
7782 ? UTF8SKIP(RExC_parse)
7783 : 1;
7784 /* Guard against malformed utf8 */
7785 if (RExC_parse >= endchar) RExC_parse = endchar;
7786 vFAIL("Invalid hexadecimal number in \\N{U+...}");
7787 }
7788
7789 RExC_parse = endbrace + 1;
7790 if (endchar == endbrace) return NULL;
7791
7792 ret = (regnode *) &RExC_parse; /* Invalid regnode pointer */
7793 }
7794 else { /* Not a char class */
7795 char *s; /* String to put in generated EXACT node */
7796 STRLEN len = 0; /* Its current byte length */
7797 char *endchar; /* Points to '.' or '}' ending cur char in the input
7798 stream */
7799 ret = reg_node(pRExC_state,
7800 (U8) ((! FOLD) ? EXACT
7801 : (LOC)
7802 ? EXACTFL
7803 : (MORE_ASCII_RESTRICTED)
7804 ? EXACTFA
7805 : (AT_LEAST_UNI_SEMANTICS)
7806 ? EXACTFU
7807 : EXACTF));
7808 s= STRING(ret);
7809
7810 /* Exact nodes can hold only a U8 length's of text = 255. Loop through
7811 * the input which is of the form now 'c1.c2.c3...}' until find the
7812 * ending brace or exceed length 255. The characters that exceed this
7813 * limit are dropped. The limit could be relaxed should it become
7814 * desirable by reparsing this as (?:\N{NAME}), so could generate
7815 * multiple EXACT nodes, as is done for just regular input. But this
7816 * is primarily a named character, and not intended to be a huge long
7817 * string, so 255 bytes should be good enough */
7818 while (1) {
7819 STRLEN length_of_hex;
7820 I32 grok_flags = PERL_SCAN_ALLOW_UNDERSCORES
7821 | PERL_SCAN_DISALLOW_PREFIX
7822 | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
7823 UV cp; /* Ord of current character */
7824 bool use_this_char_fold = FOLD;
7825
7826 /* Code points are separated by dots. If none, there is only one
7827 * code point, and is terminated by the brace */
7828 endchar = RExC_parse + strcspn(RExC_parse, ".}");
7829
7830 /* The values are Unicode even on EBCDIC machines */
7831 length_of_hex = (STRLEN)(endchar - RExC_parse);
7832 cp = grok_hex(RExC_parse, &length_of_hex, &grok_flags, NULL);
7833 if ( length_of_hex == 0
7834 || length_of_hex != (STRLEN)(endchar - RExC_parse) )
7835 {
7836 RExC_parse += length_of_hex; /* Includes all the valid */
7837 RExC_parse += (RExC_orig_utf8) /* point to after 1st invalid */
7838 ? UTF8SKIP(RExC_parse)
7839 : 1;
7840 /* Guard against malformed utf8 */
7841 if (RExC_parse >= endchar) RExC_parse = endchar;
7842 vFAIL("Invalid hexadecimal number in \\N{U+...}");
7843 }
7844
7845 /* XXX ? Change to ANYOF node
7846 if (FOLD
7847 && (cp > 255 || (! MORE_ASCII_RESTRICTED && ! LOC))
7848 && is_TRICKYFOLD_cp(cp))
7849 {
7850 }
7851 */
7852
7853 /* Under /aa, we can't mix ASCII with non- in a fold. If we are
7854 * folding, and the source isn't ASCII, look through all the
7855 * characters it folds to. If any one of them is ASCII, forbid
7856 * this fold. (cp is uni, so the 127 below is correct even for
7857 * EBCDIC). Similarly under locale rules, we don't mix under 256
7858 * with above 255. XXX It really doesn't make sense to have \N{}
7859 * which means a Unicode rules under locale. I (khw) think this
7860 * should be warned about, but the counter argument is that people
7861 * who have programmed around Perl's earlier lack of specifying the
7862 * rules and used \N{} to force Unicode things in a local
7863 * environment shouldn't get suddenly a warning */
7864 if (use_this_char_fold) {
7865 if (LOC && cp < 256) { /* Fold not known until run-time */
7866 use_this_char_fold = FALSE;
7867 }
7868 else if ((cp > 127 && MORE_ASCII_RESTRICTED)
7869 || (cp > 255 && LOC))
7870 {
7871 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
7872 U8* s = tmpbuf;
7873 U8* e;
7874 STRLEN foldlen;
7875
7876 (void) toFOLD_uni(cp, tmpbuf, &foldlen);
7877 e = s + foldlen;
7878
7879 while (s < e) {
7880 if (isASCII(*s)
7881 || (LOC && (UTF8_IS_INVARIANT(*s)
7882 || UTF8_IS_DOWNGRADEABLE_START(*s))))
7883 {
7884 use_this_char_fold = FALSE;
7885 break;
7886 }
7887 s += UTF8SKIP(s);
7888 }
7889 }
7890 }
7891
7892 if (! use_this_char_fold) { /* Not folding, just append to the
7893 string */
7894 STRLEN unilen;
7895
7896 /* Quit before adding this character if would exceed limit */
7897 if (len + UNISKIP(cp) > U8_MAX) break;
7898
7899 unilen = reguni(pRExC_state, cp, s);
7900 if (unilen > 0) {
7901 s += unilen;
7902 len += unilen;
7903 }
7904 } else { /* Folding, output the folded equivalent */
7905 STRLEN foldlen,numlen;
7906 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
7907 cp = toFOLD_uni(cp, tmpbuf, &foldlen);
7908
7909 /* Quit before exceeding size limit */
7910 if (len + foldlen > U8_MAX) break;
7911
7912 for (foldbuf = tmpbuf;
7913 foldlen;
7914 foldlen -= numlen)
7915 {
7916 cp = utf8_to_uvchr(foldbuf, &numlen);
7917 if (numlen > 0) {
7918 const STRLEN unilen = reguni(pRExC_state, cp, s);
7919 s += unilen;
7920 len += unilen;
7921 /* In EBCDIC the numlen and unilen can differ. */
7922 foldbuf += numlen;
7923 if (numlen >= foldlen)
7924 break;
7925 }
7926 else
7927 break; /* "Can't happen." */
7928 }
7929 }
7930
7931 /* Point to the beginning of the next character in the sequence. */
7932 RExC_parse = endchar + 1;
7933
7934 /* Quit if no more characters */
7935 if (RExC_parse >= endbrace) break;
7936 }
7937
7938
7939 if (SIZE_ONLY) {
7940 if (RExC_parse < endbrace) {
7941 ckWARNreg(RExC_parse - 1,
7942 "Using just the first characters returned by \\N{}");
7943 }
7944
7945 RExC_size += STR_SZ(len);
7946 } else {
7947 STR_LEN(ret) = len;
7948 RExC_emit += STR_SZ(len);
7949 }
7950
7951 RExC_parse = endbrace + 1;
7952
7953 *flagp |= HASWIDTH; /* Not SIMPLE, as that causes the engine to fail
7954 with malformed in t/re/pat_advanced.t */
7955 RExC_parse --;
7956 Set_Node_Cur_Length(ret); /* MJD */
7957 nextchar(pRExC_state);
7958 }
7959
7960 return ret;
7961}
7962
7963
7964/*
7965 * reg_recode
7966 *
7967 * It returns the code point in utf8 for the value in *encp.
7968 * value: a code value in the source encoding
7969 * encp: a pointer to an Encode object
7970 *
7971 * If the result from Encode is not a single character,
7972 * it returns U+FFFD (Replacement character) and sets *encp to NULL.
7973 */
7974STATIC UV
7975S_reg_recode(pTHX_ const char value, SV **encp)
7976{
7977 STRLEN numlen = 1;
7978 SV * const sv = newSVpvn_flags(&value, numlen, SVs_TEMP);
7979 const char * const s = *encp ? sv_recode_to_utf8(sv, *encp) : SvPVX(sv);
7980 const STRLEN newlen = SvCUR(sv);
7981 UV uv = UNICODE_REPLACEMENT;
7982
7983 PERL_ARGS_ASSERT_REG_RECODE;
7984
7985 if (newlen)
7986 uv = SvUTF8(sv)
7987 ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT)
7988 : *(U8*)s;
7989
7990 if (!newlen || numlen != newlen) {
7991 uv = UNICODE_REPLACEMENT;
7992 *encp = NULL;
7993 }
7994 return uv;
7995}
7996
7997
7998/*
7999 - regatom - the lowest level
8000
8001 Try to identify anything special at the start of the pattern. If there
8002 is, then handle it as required. This may involve generating a single regop,
8003 such as for an assertion; or it may involve recursing, such as to
8004 handle a () structure.
8005
8006 If the string doesn't start with something special then we gobble up
8007 as much literal text as we can.
8008
8009 Once we have been able to handle whatever type of thing started the
8010 sequence, we return.
8011
8012 Note: we have to be careful with escapes, as they can be both literal
8013 and special, and in the case of \10 and friends can either, depending
8014 on context. Specifically there are two separate switches for handling
8015 escape sequences, with the one for handling literal escapes requiring
8016 a dummy entry for all of the special escapes that are actually handled
8017 by the other.
8018*/
8019
8020STATIC regnode *
8021S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
8022{
8023 dVAR;
8024 register regnode *ret = NULL;
8025 I32 flags;
8026 char *parse_start = RExC_parse;
8027 U8 op;
8028 GET_RE_DEBUG_FLAGS_DECL;
8029 DEBUG_PARSE("atom");
8030 *flagp = WORST; /* Tentatively. */
8031
8032 PERL_ARGS_ASSERT_REGATOM;
8033
8034tryagain:
8035 switch ((U8)*RExC_parse) {
8036 case '^':
8037 RExC_seen_zerolen++;
8038 nextchar(pRExC_state);
8039 if (RExC_flags & RXf_PMf_MULTILINE)
8040 ret = reg_node(pRExC_state, MBOL);
8041 else if (RExC_flags & RXf_PMf_SINGLELINE)
8042 ret = reg_node(pRExC_state, SBOL);
8043 else
8044 ret = reg_node(pRExC_state, BOL);
8045 Set_Node_Length(ret, 1); /* MJD */
8046 break;
8047 case '$':
8048 nextchar(pRExC_state);
8049 if (*RExC_parse)
8050 RExC_seen_zerolen++;
8051 if (RExC_flags & RXf_PMf_MULTILINE)
8052 ret = reg_node(pRExC_state, MEOL);
8053 else if (RExC_flags & RXf_PMf_SINGLELINE)
8054 ret = reg_node(pRExC_state, SEOL);
8055 else
8056 ret = reg_node(pRExC_state, EOL);
8057 Set_Node_Length(ret, 1); /* MJD */
8058 break;
8059 case '.':
8060 nextchar(pRExC_state);
8061 if (RExC_flags & RXf_PMf_SINGLELINE)
8062 ret = reg_node(pRExC_state, SANY);
8063 else
8064 ret = reg_node(pRExC_state, REG_ANY);
8065 *flagp |= HASWIDTH|SIMPLE;
8066 RExC_naughty++;
8067 Set_Node_Length(ret, 1); /* MJD */
8068 break;
8069 case '[':
8070 {
8071 char * const oregcomp_parse = ++RExC_parse;
8072 ret = regclass(pRExC_state,depth+1);
8073 if (*RExC_parse != ']') {
8074 RExC_parse = oregcomp_parse;
8075 vFAIL("Unmatched [");
8076 }
8077 nextchar(pRExC_state);
8078 *flagp |= HASWIDTH|SIMPLE;
8079 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
8080 break;
8081 }
8082 case '(':
8083 nextchar(pRExC_state);
8084 ret = reg(pRExC_state, 1, &flags,depth+1);
8085 if (ret == NULL) {
8086 if (flags & TRYAGAIN) {
8087 if (RExC_parse == RExC_end) {
8088 /* Make parent create an empty node if needed. */
8089 *flagp |= TRYAGAIN;
8090 return(NULL);
8091 }
8092 goto tryagain;
8093 }
8094 return(NULL);
8095 }
8096 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
8097 break;
8098 case '|':
8099 case ')':
8100 if (flags & TRYAGAIN) {
8101 *flagp |= TRYAGAIN;
8102 return NULL;
8103 }
8104 vFAIL("Internal urp");
8105 /* Supposed to be caught earlier. */
8106 break;
8107 case '{':
8108 if (!regcurly(RExC_parse)) {
8109 RExC_parse++;
8110 goto defchar;
8111 }
8112 /* FALL THROUGH */
8113 case '?':
8114 case '+':
8115 case '*':
8116 RExC_parse++;
8117 vFAIL("Quantifier follows nothing");
8118 break;
8119 case LATIN_SMALL_LETTER_SHARP_S:
8120 case UTF8_TWO_BYTE_HI_nocast(LATIN_SMALL_LETTER_SHARP_S):
8121 case UTF8_TWO_BYTE_HI_nocast(IOTA_D_T):
8122#if UTF8_TWO_BYTE_HI_nocast(UPSILON_D_T) != UTF8_TWO_BYTE_HI_nocast(IOTA_D_T)
8123#error The beginning utf8 byte of IOTA_D_T and UPSILON_D_T unexpectedly differ. Other instances in this code should have the case statement below.
8124 case UTF8_TWO_BYTE_HI_nocast(UPSILON_D_T):
8125#endif
8126 do_foldchar:
8127 if (!LOC && FOLD) {
8128 U32 len,cp;
8129 len=0; /* silence a spurious compiler warning */
8130 if ((cp = what_len_TRICKYFOLD_safe(RExC_parse,RExC_end,UTF,len))) {
8131 *flagp |= HASWIDTH; /* could be SIMPLE too, but needs a handler in regexec.regrepeat */
8132 RExC_parse+=len-1; /* we get one from nextchar() as well. :-( */
8133 ret = reganode(pRExC_state, FOLDCHAR, cp);
8134 Set_Node_Length(ret, 1); /* MJD */
8135 nextchar(pRExC_state); /* kill whitespace under /x */
8136 return ret;
8137 }
8138 }
8139 goto outer_default;
8140 case '\\':
8141 /* Special Escapes
8142
8143 This switch handles escape sequences that resolve to some kind
8144 of special regop and not to literal text. Escape sequnces that
8145 resolve to literal text are handled below in the switch marked
8146 "Literal Escapes".
8147
8148 Every entry in this switch *must* have a corresponding entry
8149 in the literal escape switch. However, the opposite is not
8150 required, as the default for this switch is to jump to the
8151 literal text handling code.
8152 */
8153 switch ((U8)*++RExC_parse) {
8154 case LATIN_SMALL_LETTER_SHARP_S:
8155 case UTF8_TWO_BYTE_HI_nocast(LATIN_SMALL_LETTER_SHARP_S):
8156 case UTF8_TWO_BYTE_HI_nocast(IOTA_D_T):
8157 goto do_foldchar;
8158 /* Special Escapes */
8159 case 'A':
8160 RExC_seen_zerolen++;
8161 ret = reg_node(pRExC_state, SBOL);
8162 *flagp |= SIMPLE;
8163 goto finish_meta_pat;
8164 case 'G':
8165 ret = reg_node(pRExC_state, GPOS);
8166 RExC_seen |= REG_SEEN_GPOS;
8167 *flagp |= SIMPLE;
8168 goto finish_meta_pat;
8169 case 'K':
8170 RExC_seen_zerolen++;
8171 ret = reg_node(pRExC_state, KEEPS);
8172 *flagp |= SIMPLE;
8173 /* XXX:dmq : disabling in-place substitution seems to
8174 * be necessary here to avoid cases of memory corruption, as
8175 * with: C<$_="x" x 80; s/x\K/y/> -- rgs
8176 */
8177 RExC_seen |= REG_SEEN_LOOKBEHIND;
8178 goto finish_meta_pat;
8179 case 'Z':
8180 ret = reg_node(pRExC_state, SEOL);
8181 *flagp |= SIMPLE;
8182 RExC_seen_zerolen++; /* Do not optimize RE away */
8183 goto finish_meta_pat;
8184 case 'z':
8185 ret = reg_node(pRExC_state, EOS);
8186 *flagp |= SIMPLE;
8187 RExC_seen_zerolen++; /* Do not optimize RE away */
8188 goto finish_meta_pat;
8189 case 'C':
8190 ret = reg_node(pRExC_state, CANY);
8191 RExC_seen |= REG_SEEN_CANY;
8192 *flagp |= HASWIDTH|SIMPLE;
8193 goto finish_meta_pat;
8194 case 'X':
8195 ret = reg_node(pRExC_state, CLUMP);
8196 *flagp |= HASWIDTH;
8197 goto finish_meta_pat;
8198 case 'w':
8199 switch (get_regex_charset(RExC_flags)) {
8200 case REGEX_LOCALE_CHARSET:
8201 op = ALNUML;
8202 break;
8203 case REGEX_UNICODE_CHARSET:
8204 op = ALNUMU;
8205 break;
8206 case REGEX_ASCII_RESTRICTED_CHARSET:
8207 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
8208 op = ALNUMA;
8209 break;
8210 case REGEX_DEPENDS_CHARSET:
8211 op = ALNUM;
8212 break;
8213 default:
8214 goto bad_charset;
8215 }
8216 ret = reg_node(pRExC_state, op);
8217 *flagp |= HASWIDTH|SIMPLE;
8218 goto finish_meta_pat;
8219 case 'W':
8220 switch (get_regex_charset(RExC_flags)) {
8221 case REGEX_LOCALE_CHARSET:
8222 op = NALNUML;
8223 break;
8224 case REGEX_UNICODE_CHARSET:
8225 op = NALNUMU;
8226 break;
8227 case REGEX_ASCII_RESTRICTED_CHARSET:
8228 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
8229 op = NALNUMA;
8230 break;
8231 case REGEX_DEPENDS_CHARSET:
8232 op = NALNUM;
8233 break;
8234 default:
8235 goto bad_charset;
8236 }
8237 ret = reg_node(pRExC_state, op);
8238 *flagp |= HASWIDTH|SIMPLE;
8239 goto finish_meta_pat;
8240 case 'b':
8241 RExC_seen_zerolen++;
8242 RExC_seen |= REG_SEEN_LOOKBEHIND;
8243 switch (get_regex_charset(RExC_flags)) {
8244 case REGEX_LOCALE_CHARSET:
8245 op = BOUNDL;
8246 break;
8247 case REGEX_UNICODE_CHARSET:
8248 op = BOUNDU;
8249 break;
8250 case REGEX_ASCII_RESTRICTED_CHARSET:
8251 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
8252 op = BOUNDA;
8253 break;
8254 case REGEX_DEPENDS_CHARSET:
8255 op = BOUND;
8256 break;
8257 default:
8258 goto bad_charset;
8259 }
8260 ret = reg_node(pRExC_state, op);
8261 FLAGS(ret) = get_regex_charset(RExC_flags);
8262 *flagp |= SIMPLE;
8263 if (! SIZE_ONLY && (U8) *(RExC_parse + 1) == '{') {
8264 ckWARNregdep(RExC_parse, "\"\\b{\" is deprecated; use \"\\b\\{\" instead");
8265 }
8266 goto finish_meta_pat;
8267 case 'B':
8268 RExC_seen_zerolen++;
8269 RExC_seen |= REG_SEEN_LOOKBEHIND;
8270 switch (get_regex_charset(RExC_flags)) {
8271 case REGEX_LOCALE_CHARSET:
8272 op = NBOUNDL;
8273 break;
8274 case REGEX_UNICODE_CHARSET:
8275 op = NBOUNDU;
8276 break;
8277 case REGEX_ASCII_RESTRICTED_CHARSET:
8278 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
8279 op = NBOUNDA;
8280 break;
8281 case REGEX_DEPENDS_CHARSET:
8282 op = NBOUND;
8283 break;
8284 default:
8285 goto bad_charset;
8286 }
8287 ret = reg_node(pRExC_state, op);
8288 FLAGS(ret) = get_regex_charset(RExC_flags);
8289 *flagp |= SIMPLE;
8290 if (! SIZE_ONLY && (U8) *(RExC_parse + 1) == '{') {
8291 ckWARNregdep(RExC_parse, "\"\\B{\" is deprecated; use \"\\B\\{\" instead");
8292 }
8293 goto finish_meta_pat;
8294 case 's':
8295 switch (get_regex_charset(RExC_flags)) {
8296 case REGEX_LOCALE_CHARSET:
8297 op = SPACEL;
8298 break;
8299 case REGEX_UNICODE_CHARSET:
8300 op = SPACEU;
8301 break;
8302 case REGEX_ASCII_RESTRICTED_CHARSET:
8303 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
8304 op = SPACEA;
8305 break;
8306 case REGEX_DEPENDS_CHARSET:
8307 op = SPACE;
8308 break;
8309 default:
8310 goto bad_charset;
8311 }
8312 ret = reg_node(pRExC_state, op);
8313 *flagp |= HASWIDTH|SIMPLE;
8314 goto finish_meta_pat;
8315 case 'S':
8316 switch (get_regex_charset(RExC_flags)) {
8317 case REGEX_LOCALE_CHARSET:
8318 op = NSPACEL;
8319 break;
8320 case REGEX_UNICODE_CHARSET:
8321 op = NSPACEU;
8322 break;
8323 case REGEX_ASCII_RESTRICTED_CHARSET:
8324 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
8325 op = NSPACEA;
8326 break;
8327 case REGEX_DEPENDS_CHARSET:
8328 op = NSPACE;
8329 break;
8330 default:
8331 goto bad_charset;
8332 }
8333 ret = reg_node(pRExC_state, op);
8334 *flagp |= HASWIDTH|SIMPLE;
8335 goto finish_meta_pat;
8336 case 'd':
8337 switch (get_regex_charset(RExC_flags)) {
8338 case REGEX_LOCALE_CHARSET:
8339 op = DIGITL;
8340 break;
8341 case REGEX_ASCII_RESTRICTED_CHARSET:
8342 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
8343 op = DIGITA;
8344 break;
8345 case REGEX_DEPENDS_CHARSET: /* No difference between these */
8346 case REGEX_UNICODE_CHARSET:
8347 op = DIGIT;
8348 break;
8349 default:
8350 goto bad_charset;
8351 }
8352 ret = reg_node(pRExC_state, op);
8353 *flagp |= HASWIDTH|SIMPLE;
8354 goto finish_meta_pat;
8355 case 'D':
8356 switch (get_regex_charset(RExC_flags)) {
8357 case REGEX_LOCALE_CHARSET:
8358 op = NDIGITL;
8359 break;
8360 case REGEX_ASCII_RESTRICTED_CHARSET:
8361 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
8362 op = NDIGITA;
8363 break;
8364 case REGEX_DEPENDS_CHARSET: /* No difference between these */
8365 case REGEX_UNICODE_CHARSET:
8366 op = NDIGIT;
8367 break;
8368 default:
8369 goto bad_charset;
8370 }
8371 ret = reg_node(pRExC_state, op);
8372 *flagp |= HASWIDTH|SIMPLE;
8373 goto finish_meta_pat;
8374 case 'R':
8375 ret = reg_node(pRExC_state, LNBREAK);
8376 *flagp |= HASWIDTH|SIMPLE;
8377 goto finish_meta_pat;
8378 case 'h':
8379 ret = reg_node(pRExC_state, HORIZWS);
8380 *flagp |= HASWIDTH|SIMPLE;
8381 goto finish_meta_pat;
8382 case 'H':
8383 ret = reg_node(pRExC_state, NHORIZWS);
8384 *flagp |= HASWIDTH|SIMPLE;
8385 goto finish_meta_pat;
8386 case 'v':
8387 ret = reg_node(pRExC_state, VERTWS);
8388 *flagp |= HASWIDTH|SIMPLE;
8389 goto finish_meta_pat;
8390 case 'V':
8391 ret = reg_node(pRExC_state, NVERTWS);
8392 *flagp |= HASWIDTH|SIMPLE;
8393 finish_meta_pat:
8394 nextchar(pRExC_state);
8395 Set_Node_Length(ret, 2); /* MJD */
8396 break;
8397 case 'p':
8398 case 'P':
8399 {
8400 char* const oldregxend = RExC_end;
8401#ifdef DEBUGGING
8402 char* parse_start = RExC_parse - 2;
8403#endif
8404
8405 if (RExC_parse[1] == '{') {
8406 /* a lovely hack--pretend we saw [\pX] instead */
8407 RExC_end = strchr(RExC_parse, '}');
8408 if (!RExC_end) {
8409 const U8 c = (U8)*RExC_parse;
8410 RExC_parse += 2;
8411 RExC_end = oldregxend;
8412 vFAIL2("Missing right brace on \\%c{}", c);
8413 }
8414 RExC_end++;
8415 }
8416 else {
8417 RExC_end = RExC_parse + 2;
8418 if (RExC_end > oldregxend)
8419 RExC_end = oldregxend;
8420 }
8421 RExC_parse--;
8422
8423 ret = regclass(pRExC_state,depth+1);
8424
8425 RExC_end = oldregxend;
8426 RExC_parse--;
8427
8428 Set_Node_Offset(ret, parse_start + 2);
8429 Set_Node_Cur_Length(ret);
8430 nextchar(pRExC_state);
8431 *flagp |= HASWIDTH|SIMPLE;
8432 }
8433 break;
8434 case 'N':
8435 /* Handle \N and \N{NAME} here and not below because it can be
8436 multicharacter. join_exact() will join them up later on.
8437 Also this makes sure that things like /\N{BLAH}+/ and
8438 \N{BLAH} being multi char Just Happen. dmq*/
8439 ++RExC_parse;
8440 ret= reg_namedseq(pRExC_state, NULL, flagp);
8441 break;
8442 case 'k': /* Handle \k<NAME> and \k'NAME' */
8443 parse_named_seq:
8444 {
8445 char ch= RExC_parse[1];
8446 if (ch != '<' && ch != '\'' && ch != '{') {
8447 RExC_parse++;
8448 vFAIL2("Sequence %.2s... not terminated",parse_start);
8449 } else {
8450 /* this pretty much dupes the code for (?P=...) in reg(), if
8451 you change this make sure you change that */
8452 char* name_start = (RExC_parse += 2);
8453 U32 num = 0;
8454 SV *sv_dat = reg_scan_name(pRExC_state,
8455 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
8456 ch= (ch == '<') ? '>' : (ch == '{') ? '}' : '\'';
8457 if (RExC_parse == name_start || *RExC_parse != ch)
8458 vFAIL2("Sequence %.3s... not terminated",parse_start);
8459
8460 if (!SIZE_ONLY) {
8461 num = add_data( pRExC_state, 1, "S" );
8462 RExC_rxi->data->data[num]=(void*)sv_dat;
8463 SvREFCNT_inc_simple_void(sv_dat);
8464 }
8465
8466 RExC_sawback = 1;
8467 ret = reganode(pRExC_state,
8468 ((! FOLD)
8469 ? NREF
8470 : (MORE_ASCII_RESTRICTED)
8471 ? NREFFA
8472 : (AT_LEAST_UNI_SEMANTICS)
8473 ? NREFFU
8474 : (LOC)
8475 ? NREFFL
8476 : NREFF),
8477 num);
8478 *flagp |= HASWIDTH;
8479
8480 /* override incorrect value set in reganode MJD */
8481 Set_Node_Offset(ret, parse_start+1);
8482 Set_Node_Cur_Length(ret); /* MJD */
8483 nextchar(pRExC_state);
8484
8485 }
8486 break;
8487 }
8488 case 'g':
8489 case '1': case '2': case '3': case '4':
8490 case '5': case '6': case '7': case '8': case '9':
8491 {
8492 I32 num;
8493 bool isg = *RExC_parse == 'g';
8494 bool isrel = 0;
8495 bool hasbrace = 0;
8496 if (isg) {
8497 RExC_parse++;
8498 if (*RExC_parse == '{') {
8499 RExC_parse++;
8500 hasbrace = 1;
8501 }
8502 if (*RExC_parse == '-') {
8503 RExC_parse++;
8504 isrel = 1;
8505 }
8506 if (hasbrace && !isDIGIT(*RExC_parse)) {
8507 if (isrel) RExC_parse--;
8508 RExC_parse -= 2;
8509 goto parse_named_seq;
8510 } }
8511 num = atoi(RExC_parse);
8512 if (isg && num == 0)
8513 vFAIL("Reference to invalid group 0");
8514 if (isrel) {
8515 num = RExC_npar - num;
8516 if (num < 1)
8517 vFAIL("Reference to nonexistent or unclosed group");
8518 }
8519 if (!isg && num > 9 && num >= RExC_npar)
8520 goto defchar;
8521 else {
8522 char * const parse_start = RExC_parse - 1; /* MJD */
8523 while (isDIGIT(*RExC_parse))
8524 RExC_parse++;
8525 if (parse_start == RExC_parse - 1)
8526 vFAIL("Unterminated \\g... pattern");
8527 if (hasbrace) {
8528 if (*RExC_parse != '}')
8529 vFAIL("Unterminated \\g{...} pattern");
8530 RExC_parse++;
8531 }
8532 if (!SIZE_ONLY) {
8533 if (num > (I32)RExC_rx->nparens)
8534 vFAIL("Reference to nonexistent group");
8535 }
8536 RExC_sawback = 1;
8537 ret = reganode(pRExC_state,
8538 ((! FOLD)
8539 ? REF
8540 : (MORE_ASCII_RESTRICTED)
8541 ? REFFA
8542 : (AT_LEAST_UNI_SEMANTICS)
8543 ? REFFU
8544 : (LOC)
8545 ? REFFL
8546 : REFF),
8547 num);
8548 *flagp |= HASWIDTH;
8549
8550 /* override incorrect value set in reganode MJD */
8551 Set_Node_Offset(ret, parse_start+1);
8552 Set_Node_Cur_Length(ret); /* MJD */
8553 RExC_parse--;
8554 nextchar(pRExC_state);
8555 }
8556 }
8557 break;
8558 case '\0':
8559 if (RExC_parse >= RExC_end)
8560 FAIL("Trailing \\");
8561 /* FALL THROUGH */
8562 default:
8563 /* Do not generate "unrecognized" warnings here, we fall
8564 back into the quick-grab loop below */
8565 parse_start--;
8566 goto defchar;
8567 }
8568 break;
8569
8570 case '#':
8571 if (RExC_flags & RXf_PMf_EXTENDED) {
8572 if ( reg_skipcomment( pRExC_state ) )
8573 goto tryagain;
8574 }
8575 /* FALL THROUGH */
8576
8577 default:
8578 outer_default:{
8579 register STRLEN len;
8580 register UV ender;
8581 register char *p;
8582 char *s;
8583 STRLEN foldlen;
8584 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
8585 regnode * orig_emit;
8586
8587 parse_start = RExC_parse - 1;
8588
8589 RExC_parse++;
8590
8591 defchar:
8592 ender = 0;
8593 orig_emit = RExC_emit; /* Save the original output node position in
8594 case we need to output a different node
8595 type */
8596 ret = reg_node(pRExC_state,
8597 (U8) ((! FOLD) ? EXACT
8598 : (LOC)
8599 ? EXACTFL
8600 : (MORE_ASCII_RESTRICTED)
8601 ? EXACTFA
8602 : (AT_LEAST_UNI_SEMANTICS)
8603 ? EXACTFU
8604 : EXACTF)
8605 );
8606 s = STRING(ret);
8607 for (len = 0, p = RExC_parse - 1;
8608 len < 127 && p < RExC_end;
8609 len++)
8610 {
8611 char * const oldp = p;
8612
8613 if (RExC_flags & RXf_PMf_EXTENDED)
8614 p = regwhite( pRExC_state, p );
8615 switch ((U8)*p) {
8616 case LATIN_SMALL_LETTER_SHARP_S:
8617 case UTF8_TWO_BYTE_HI_nocast(LATIN_SMALL_LETTER_SHARP_S):
8618 case UTF8_TWO_BYTE_HI_nocast(IOTA_D_T):
8619 if (LOC || !FOLD || !is_TRICKYFOLD_safe(p,RExC_end,UTF))
8620 goto normal_default;
8621 case '^':
8622 case '$':
8623 case '.':
8624 case '[':
8625 case '(':
8626 case ')':
8627 case '|':
8628 goto loopdone;
8629 case '\\':
8630 /* Literal Escapes Switch
8631
8632 This switch is meant to handle escape sequences that
8633 resolve to a literal character.
8634
8635 Every escape sequence that represents something
8636 else, like an assertion or a char class, is handled
8637 in the switch marked 'Special Escapes' above in this
8638 routine, but also has an entry here as anything that
8639 isn't explicitly mentioned here will be treated as
8640 an unescaped equivalent literal.
8641 */
8642
8643 switch ((U8)*++p) {
8644 /* These are all the special escapes. */
8645 case LATIN_SMALL_LETTER_SHARP_S:
8646 case UTF8_TWO_BYTE_HI_nocast(LATIN_SMALL_LETTER_SHARP_S):
8647 case UTF8_TWO_BYTE_HI_nocast(IOTA_D_T):
8648 if (LOC || !FOLD || !is_TRICKYFOLD_safe(p,RExC_end,UTF))
8649 goto normal_default;
8650 case 'A': /* Start assertion */
8651 case 'b': case 'B': /* Word-boundary assertion*/
8652 case 'C': /* Single char !DANGEROUS! */
8653 case 'd': case 'D': /* digit class */
8654 case 'g': case 'G': /* generic-backref, pos assertion */
8655 case 'h': case 'H': /* HORIZWS */
8656 case 'k': case 'K': /* named backref, keep marker */
8657 case 'N': /* named char sequence */
8658 case 'p': case 'P': /* Unicode property */
8659 case 'R': /* LNBREAK */
8660 case 's': case 'S': /* space class */
8661 case 'v': case 'V': /* VERTWS */
8662 case 'w': case 'W': /* word class */
8663 case 'X': /* eXtended Unicode "combining character sequence" */
8664 case 'z': case 'Z': /* End of line/string assertion */
8665 --p;
8666 goto loopdone;
8667
8668 /* Anything after here is an escape that resolves to a
8669 literal. (Except digits, which may or may not)
8670 */
8671 case 'n':
8672 ender = '\n';
8673 p++;
8674 break;
8675 case 'r':
8676 ender = '\r';
8677 p++;
8678 break;
8679 case 't':
8680 ender = '\t';
8681 p++;
8682 break;
8683 case 'f':
8684 ender = '\f';
8685 p++;
8686 break;
8687 case 'e':
8688 ender = ASCII_TO_NATIVE('\033');
8689 p++;
8690 break;
8691 case 'a':
8692 ender = ASCII_TO_NATIVE('\007');
8693 p++;
8694 break;
8695 case 'o':
8696 {
8697 STRLEN brace_len = len;
8698 UV result;
8699 const char* error_msg;
8700
8701 bool valid = grok_bslash_o(p,
8702 &result,
8703 &brace_len,
8704 &error_msg,
8705 1);
8706 p += brace_len;
8707 if (! valid) {
8708 RExC_parse = p; /* going to die anyway; point
8709 to exact spot of failure */
8710 vFAIL(error_msg);
8711 }
8712 else
8713 {
8714 ender = result;
8715 }
8716 if (PL_encoding && ender < 0x100) {
8717 goto recode_encoding;
8718 }
8719 if (ender > 0xff) {
8720 REQUIRE_UTF8;
8721 }
8722 break;
8723 }
8724 case 'x':
8725 if (*++p == '{') {
8726 char* const e = strchr(p, '}');
8727
8728 if (!e) {
8729 RExC_parse = p + 1;
8730 vFAIL("Missing right brace on \\x{}");
8731 }
8732 else {
8733 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
8734 | PERL_SCAN_DISALLOW_PREFIX;
8735 STRLEN numlen = e - p - 1;
8736 ender = grok_hex(p + 1, &numlen, &flags, NULL);
8737 if (ender > 0xff)
8738 REQUIRE_UTF8;
8739 p = e + 1;
8740 }
8741 }
8742 else {
8743 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
8744 STRLEN numlen = 2;
8745 ender = grok_hex(p, &numlen, &flags, NULL);
8746 p += numlen;
8747 }
8748 if (PL_encoding && ender < 0x100)
8749 goto recode_encoding;
8750 break;
8751 case 'c':
8752 p++;
8753 ender = grok_bslash_c(*p++, UTF, SIZE_ONLY);
8754 break;
8755 case '0': case '1': case '2': case '3':case '4':
8756 case '5': case '6': case '7': case '8':case '9':
8757 if (*p == '0' ||
8758 (isDIGIT(p[1]) && atoi(p) >= RExC_npar))
8759 {
8760 I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
8761 STRLEN numlen = 3;
8762 ender = grok_oct(p, &numlen, &flags, NULL);
8763 if (ender > 0xff) {
8764 REQUIRE_UTF8;
8765 }
8766 p += numlen;
8767 }
8768 else {
8769 --p;
8770 goto loopdone;
8771 }
8772 if (PL_encoding && ender < 0x100)
8773 goto recode_encoding;
8774 break;
8775 recode_encoding:
8776 {
8777 SV* enc = PL_encoding;
8778 ender = reg_recode((const char)(U8)ender, &enc);
8779 if (!enc && SIZE_ONLY)
8780 ckWARNreg(p, "Invalid escape in the specified encoding");
8781 REQUIRE_UTF8;
8782 }
8783 break;
8784 case '\0':
8785 if (p >= RExC_end)
8786 FAIL("Trailing \\");
8787 /* FALL THROUGH */
8788 default:
8789 if (!SIZE_ONLY&& isALPHA(*p)) {
8790 /* Include any { following the alpha to emphasize
8791 * that it could be part of an escape at some point
8792 * in the future */
8793 int len = (*(p + 1) == '{') ? 2 : 1;
8794 ckWARN3reg(p + len, "Unrecognized escape \\%.*s passed through", len, p);
8795 }
8796 goto normal_default;
8797 }
8798 break;
8799 default:
8800 normal_default:
8801 if (UTF8_IS_START(*p) && UTF) {
8802 STRLEN numlen;
8803 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
8804 &numlen, UTF8_ALLOW_DEFAULT);
8805 p += numlen;
8806 }
8807 else
8808 ender = (U8) *p++;
8809 break;
8810 } /* End of switch on the literal */
8811
8812 /* Certain characters are problematic because their folded
8813 * length is so different from their original length that it
8814 * isn't handleable by the optimizer. They are therefore not
8815 * placed in an EXACTish node; and are here handled specially.
8816 * (Even if the optimizer handled LATIN_SMALL_LETTER_SHARP_S,
8817 * putting it in a special node keeps regexec from having to
8818 * deal with a non-utf8 multi-char fold */
8819 if (FOLD
8820 && (ender > 255 || (! MORE_ASCII_RESTRICTED && ! LOC))
8821 && is_TRICKYFOLD_cp(ender))
8822 {
8823 /* If is in middle of outputting characters into an
8824 * EXACTish node, go output what we have so far, and
8825 * position the parse so that this will be called again
8826 * immediately */
8827 if (len) {
8828 p = oldp;
8829 goto loopdone;
8830 }
8831 else {
8832
8833 /* Here we are ready to output our tricky fold
8834 * character. What's done is to pretend it's in a
8835 * [bracketed] class, and let the code that deals with
8836 * those handle it, as that code has all the
8837 * intelligence necessary. First save the current
8838 * parse state, get rid of the already allocated EXACT
8839 * node that the ANYOFV node will replace, and point
8840 * the parse to a buffer which we fill with the
8841 * character we want the regclass code to think is
8842 * being parsed */
8843 char* const oldregxend = RExC_end;
8844 char tmpbuf[2];
8845 RExC_emit = orig_emit;
8846 RExC_parse = tmpbuf;
8847 if (UTF) {
8848 tmpbuf[0] = UTF8_TWO_BYTE_HI(ender);
8849 tmpbuf[1] = UTF8_TWO_BYTE_LO(ender);
8850 RExC_end = RExC_parse + 2;
8851 }
8852 else {
8853 tmpbuf[0] = (char) ender;
8854 RExC_end = RExC_parse + 1;
8855 }
8856
8857 ret = regclass(pRExC_state,depth+1);
8858
8859 /* Here, have parsed the buffer. Reset the parse to
8860 * the actual input, and return */
8861 RExC_end = oldregxend;
8862 RExC_parse = p - 1;
8863
8864 Set_Node_Offset(ret, RExC_parse);
8865 Set_Node_Cur_Length(ret);
8866 nextchar(pRExC_state);
8867 *flagp |= HASWIDTH|SIMPLE;
8868 return ret;
8869 }
8870 }
8871
8872 if ( RExC_flags & RXf_PMf_EXTENDED)
8873 p = regwhite( pRExC_state, p );
8874 if (UTF && FOLD) {
8875 /* Prime the casefolded buffer. Locale rules, which apply
8876 * only to code points < 256, aren't known until execution,
8877 * so for them, just output the original character using
8878 * utf8 */
8879 if (LOC && ender < 256) {
8880 if (UNI_IS_INVARIANT(ender)) {
8881 *tmpbuf = (U8) ender;
8882 foldlen = 1;
8883 } else {
8884 *tmpbuf = UTF8_TWO_BYTE_HI(ender);
8885 *(tmpbuf + 1) = UTF8_TWO_BYTE_LO(ender);
8886 foldlen = 2;
8887 }
8888 }
8889 else if (isASCII(ender)) { /* Note: Here can't also be LOC
8890 */
8891 ender = toLOWER(ender);
8892 *tmpbuf = (U8) ender;
8893 foldlen = 1;
8894 }
8895 else if (! MORE_ASCII_RESTRICTED && ! LOC) {
8896
8897 /* Locale and /aa require more selectivity about the
8898 * fold, so are handled below. Otherwise, here, just
8899 * use the fold */
8900 ender = toFOLD_uni(ender, tmpbuf, &foldlen);
8901 }
8902 else {
8903 /* Under locale rules or /aa we are not to mix,
8904 * respectively, ords < 256 or ASCII with non-. So
8905 * reject folds that mix them, using only the
8906 * non-folded code point. So do the fold to a
8907 * temporary, and inspect each character in it. */
8908 U8 trialbuf[UTF8_MAXBYTES_CASE+1];
8909 U8* s = trialbuf;
8910 UV tmpender = toFOLD_uni(ender, trialbuf, &foldlen);
8911 U8* e = s + foldlen;
8912 bool fold_ok = TRUE;
8913
8914 while (s < e) {
8915 if (isASCII(*s)
8916 || (LOC && (UTF8_IS_INVARIANT(*s)
8917 || UTF8_IS_DOWNGRADEABLE_START(*s))))
8918 {
8919 fold_ok = FALSE;
8920 break;
8921 }
8922 s += UTF8SKIP(s);
8923 }
8924 if (fold_ok) {
8925 Copy(trialbuf, tmpbuf, foldlen, U8);
8926 ender = tmpender;
8927 }
8928 else {
8929 uvuni_to_utf8(tmpbuf, ender);
8930 foldlen = UNISKIP(ender);
8931 }
8932 }
8933 }
8934 if (p < RExC_end && ISMULT2(p)) { /* Back off on ?+*. */
8935 if (len)
8936 p = oldp;
8937 else if (UTF) {
8938 if (FOLD) {
8939 /* Emit all the Unicode characters. */
8940 STRLEN numlen;
8941 for (foldbuf = tmpbuf;
8942 foldlen;
8943 foldlen -= numlen) {
8944 ender = utf8_to_uvchr(foldbuf, &numlen);
8945 if (numlen > 0) {
8946 const STRLEN unilen = reguni(pRExC_state, ender, s);
8947 s += unilen;
8948 len += unilen;
8949 /* In EBCDIC the numlen
8950 * and unilen can differ. */
8951 foldbuf += numlen;
8952 if (numlen >= foldlen)
8953 break;
8954 }
8955 else
8956 break; /* "Can't happen." */
8957 }
8958 }
8959 else {
8960 const STRLEN unilen = reguni(pRExC_state, ender, s);
8961 if (unilen > 0) {
8962 s += unilen;
8963 len += unilen;
8964 }
8965 }
8966 }
8967 else {
8968 len++;
8969 REGC((char)ender, s++);
8970 }
8971 break;
8972 }
8973 if (UTF) {
8974 if (FOLD) {
8975 /* Emit all the Unicode characters. */
8976 STRLEN numlen;
8977 for (foldbuf = tmpbuf;
8978 foldlen;
8979 foldlen -= numlen) {
8980 ender = utf8_to_uvchr(foldbuf, &numlen);
8981 if (numlen > 0) {
8982 const STRLEN unilen = reguni(pRExC_state, ender, s);
8983 len += unilen;
8984 s += unilen;
8985 /* In EBCDIC the numlen
8986 * and unilen can differ. */
8987 foldbuf += numlen;
8988 if (numlen >= foldlen)
8989 break;
8990 }
8991 else
8992 break;
8993 }
8994 }
8995 else {
8996 const STRLEN unilen = reguni(pRExC_state, ender, s);
8997 if (unilen > 0) {
8998 s += unilen;
8999 len += unilen;
9000 }
9001 }
9002 len--;
9003 }
9004 else
9005 REGC((char)ender, s++);
9006 }
9007 loopdone: /* Jumped to when encounters something that shouldn't be in
9008 the node */
9009 RExC_parse = p - 1;
9010 Set_Node_Cur_Length(ret); /* MJD */
9011 nextchar(pRExC_state);
9012 {
9013 /* len is STRLEN which is unsigned, need to copy to signed */
9014 IV iv = len;
9015 if (iv < 0)
9016 vFAIL("Internal disaster");
9017 }
9018 if (len > 0)
9019 *flagp |= HASWIDTH;
9020 if (len == 1 && UNI_IS_INVARIANT(ender))
9021 *flagp |= SIMPLE;
9022
9023 if (SIZE_ONLY)
9024 RExC_size += STR_SZ(len);
9025 else {
9026 STR_LEN(ret) = len;
9027 RExC_emit += STR_SZ(len);
9028 }
9029 }
9030 break;
9031 }
9032
9033 return(ret);
9034
9035/* Jumped to when an unrecognized character set is encountered */
9036bad_charset:
9037 Perl_croak(aTHX_ "panic: Unknown regex character set encoding: %u", get_regex_charset(RExC_flags));
9038 return(NULL);
9039}
9040
9041STATIC char *
9042S_regwhite( RExC_state_t *pRExC_state, char *p )
9043{
9044 const char *e = RExC_end;
9045
9046 PERL_ARGS_ASSERT_REGWHITE;
9047
9048 while (p < e) {
9049 if (isSPACE(*p))
9050 ++p;
9051 else if (*p == '#') {
9052 bool ended = 0;
9053 do {
9054 if (*p++ == '\n') {
9055 ended = 1;
9056 break;
9057 }
9058 } while (p < e);
9059 if (!ended)
9060 RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
9061 }
9062 else
9063 break;
9064 }
9065 return p;
9066}
9067
9068/* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
9069 Character classes ([:foo:]) can also be negated ([:^foo:]).
9070 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
9071 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
9072 but trigger failures because they are currently unimplemented. */
9073
9074#define POSIXCC_DONE(c) ((c) == ':')
9075#define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
9076#define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
9077
9078STATIC I32
9079S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
9080{
9081 dVAR;
9082 I32 namedclass = OOB_NAMEDCLASS;
9083
9084 PERL_ARGS_ASSERT_REGPPOSIXCC;
9085
9086 if (value == '[' && RExC_parse + 1 < RExC_end &&
9087 /* I smell either [: or [= or [. -- POSIX has been here, right? */
9088 POSIXCC(UCHARAT(RExC_parse))) {
9089 const char c = UCHARAT(RExC_parse);
9090 char* const s = RExC_parse++;
9091
9092 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
9093 RExC_parse++;
9094 if (RExC_parse == RExC_end)
9095 /* Grandfather lone [:, [=, [. */
9096 RExC_parse = s;
9097 else {
9098 const char* const t = RExC_parse++; /* skip over the c */
9099 assert(*t == c);
9100
9101 if (UCHARAT(RExC_parse) == ']') {
9102 const char *posixcc = s + 1;
9103 RExC_parse++; /* skip over the ending ] */
9104
9105 if (*s == ':') {
9106 const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
9107 const I32 skip = t - posixcc;
9108
9109 /* Initially switch on the length of the name. */
9110 switch (skip) {
9111 case 4:
9112 if (memEQ(posixcc, "word", 4)) /* this is not POSIX, this is the Perl \w */
9113 namedclass = complement ? ANYOF_NALNUM : ANYOF_ALNUM;
9114 break;
9115 case 5:
9116 /* Names all of length 5. */
9117 /* alnum alpha ascii blank cntrl digit graph lower
9118 print punct space upper */
9119 /* Offset 4 gives the best switch position. */
9120 switch (posixcc[4]) {
9121 case 'a':
9122 if (memEQ(posixcc, "alph", 4)) /* alpha */
9123 namedclass = complement ? ANYOF_NALPHA : ANYOF_ALPHA;
9124 break;
9125 case 'e':
9126 if (memEQ(posixcc, "spac", 4)) /* space */
9127 namedclass = complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
9128 break;
9129 case 'h':
9130 if (memEQ(posixcc, "grap", 4)) /* graph */
9131 namedclass = complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
9132 break;
9133 case 'i':
9134 if (memEQ(posixcc, "asci", 4)) /* ascii */
9135 namedclass = complement ? ANYOF_NASCII : ANYOF_ASCII;
9136 break;
9137 case 'k':
9138 if (memEQ(posixcc, "blan", 4)) /* blank */
9139 namedclass = complement ? ANYOF_NBLANK : ANYOF_BLANK;
9140 break;
9141 case 'l':
9142 if (memEQ(posixcc, "cntr", 4)) /* cntrl */
9143 namedclass = complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
9144 break;
9145 case 'm':
9146 if (memEQ(posixcc, "alnu", 4)) /* alnum */
9147 namedclass = complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
9148 break;
9149 case 'r':
9150 if (memEQ(posixcc, "lowe", 4)) /* lower */
9151 namedclass = complement ? ANYOF_NLOWER : ANYOF_LOWER;
9152 else if (memEQ(posixcc, "uppe", 4)) /* upper */
9153 namedclass = complement ? ANYOF_NUPPER : ANYOF_UPPER;
9154 break;
9155 case 't':
9156 if (memEQ(posixcc, "digi", 4)) /* digit */
9157 namedclass = complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
9158 else if (memEQ(posixcc, "prin", 4)) /* print */
9159 namedclass = complement ? ANYOF_NPRINT : ANYOF_PRINT;
9160 else if (memEQ(posixcc, "punc", 4)) /* punct */
9161 namedclass = complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
9162 break;
9163 }
9164 break;
9165 case 6:
9166 if (memEQ(posixcc, "xdigit", 6))
9167 namedclass = complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
9168 break;
9169 }
9170
9171 if (namedclass == OOB_NAMEDCLASS)
9172 Simple_vFAIL3("POSIX class [:%.*s:] unknown",
9173 t - s - 1, s + 1);
9174 assert (posixcc[skip] == ':');
9175 assert (posixcc[skip+1] == ']');
9176 } else if (!SIZE_ONLY) {
9177 /* [[=foo=]] and [[.foo.]] are still future. */
9178
9179 /* adjust RExC_parse so the warning shows after
9180 the class closes */
9181 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
9182 RExC_parse++;
9183 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
9184 }
9185 } else {
9186 /* Maternal grandfather:
9187 * "[:" ending in ":" but not in ":]" */
9188 RExC_parse = s;
9189 }
9190 }
9191 }
9192
9193 return namedclass;
9194}
9195
9196STATIC void
9197S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
9198{
9199 dVAR;
9200
9201 PERL_ARGS_ASSERT_CHECKPOSIXCC;
9202
9203 if (POSIXCC(UCHARAT(RExC_parse))) {
9204 const char *s = RExC_parse;
9205 const char c = *s++;
9206
9207 while (isALNUM(*s))
9208 s++;
9209 if (*s && c == *s && s[1] == ']') {
9210 ckWARN3reg(s+2,
9211 "POSIX syntax [%c %c] belongs inside character classes",
9212 c, c);
9213
9214 /* [[=foo=]] and [[.foo.]] are still future. */
9215 if (POSIXCC_NOTYET(c)) {
9216 /* adjust RExC_parse so the error shows after
9217 the class closes */
9218 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
9219 NOOP;
9220 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
9221 }
9222 }
9223 }
9224}
9225
9226/* No locale test, and always Unicode semantics */
9227#define _C_C_T_NOLOC_(NAME,TEST,WORD) \
9228ANYOF_##NAME: \
9229 for (value = 0; value < 256; value++) \
9230 if (TEST) \
9231 stored += set_regclass_bit(pRExC_state, ret, (U8) value, &l1_fold_invlist, &unicode_alternate); \
9232 yesno = '+'; \
9233 what = WORD; \
9234 break; \
9235case ANYOF_N##NAME: \
9236 for (value = 0; value < 256; value++) \
9237 if (!TEST) \
9238 stored += set_regclass_bit(pRExC_state, ret, (U8) value, &l1_fold_invlist, &unicode_alternate); \
9239 yesno = '!'; \
9240 what = WORD; \
9241 break
9242
9243/* Like the above, but there are differences if we are in uni-8-bit or not, so
9244 * there are two tests passed in, to use depending on that. There aren't any
9245 * cases where the label is different from the name, so no need for that
9246 * parameter */
9247#define _C_C_T_(NAME, TEST_8, TEST_7, WORD) \
9248ANYOF_##NAME: \
9249 if (LOC) ANYOF_CLASS_SET(ret, ANYOF_##NAME); \
9250 else if (UNI_SEMANTICS) { \
9251 for (value = 0; value < 256; value++) { \
9252 if (TEST_8(value)) stored += \
9253 set_regclass_bit(pRExC_state, ret, (U8) value, &l1_fold_invlist, &unicode_alternate); \
9254 } \
9255 } \
9256 else { \
9257 for (value = 0; value < 128; value++) { \
9258 if (TEST_7(UNI_TO_NATIVE(value))) stored += \
9259 set_regclass_bit(pRExC_state, ret, \
9260 (U8) UNI_TO_NATIVE(value), &l1_fold_invlist, &unicode_alternate); \
9261 } \
9262 } \
9263 yesno = '+'; \
9264 what = WORD; \
9265 break; \
9266case ANYOF_N##NAME: \
9267 if (LOC) ANYOF_CLASS_SET(ret, ANYOF_N##NAME); \
9268 else if (UNI_SEMANTICS) { \
9269 for (value = 0; value < 256; value++) { \
9270 if (! TEST_8(value)) stored += \
9271 set_regclass_bit(pRExC_state, ret, (U8) value, &l1_fold_invlist, &unicode_alternate); \
9272 } \
9273 } \
9274 else { \
9275 for (value = 0; value < 128; value++) { \
9276 if (! TEST_7(UNI_TO_NATIVE(value))) stored += set_regclass_bit( \
9277 pRExC_state, ret, (U8) UNI_TO_NATIVE(value), &l1_fold_invlist, &unicode_alternate); \
9278 } \
9279 if (AT_LEAST_ASCII_RESTRICTED) { \
9280 for (value = 128; value < 256; value++) { \
9281 stored += set_regclass_bit( \
9282 pRExC_state, ret, (U8) UNI_TO_NATIVE(value), &l1_fold_invlist, &unicode_alternate); \
9283 } \
9284 ANYOF_FLAGS(ret) |= ANYOF_UNICODE_ALL; \
9285 } \
9286 else { \
9287 /* For a non-ut8 target string with DEPENDS semantics, all above \
9288 * ASCII Latin1 code points match the complement of any of the \
9289 * classes. But in utf8, they have their Unicode semantics, so \
9290 * can't just set them in the bitmap, or else regexec.c will think \
9291 * they matched when they shouldn't. */ \
9292 ANYOF_FLAGS(ret) |= ANYOF_NON_UTF8_LATIN1_ALL; \
9293 } \
9294 } \
9295 yesno = '!'; \
9296 what = WORD; \
9297 break
9298
9299STATIC U8
9300S_set_regclass_bit_fold(pTHX_ RExC_state_t *pRExC_state, regnode* node, const U8 value, HV** invlist_ptr, AV** alternate_ptr)
9301{
9302
9303 /* Handle the setting of folds in the bitmap for non-locale ANYOF nodes.
9304 * Locale folding is done at run-time, so this function should not be
9305 * called for nodes that are for locales.
9306 *
9307 * This function sets the bit corresponding to the fold of the input
9308 * 'value', if not already set. The fold of 'f' is 'F', and the fold of
9309 * 'F' is 'f'.
9310 *
9311 * It also knows about the characters that are in the bitmap that have
9312 * folds that are matchable only outside it, and sets the appropriate lists
9313 * and flags.
9314 *
9315 * It returns the number of bits that actually changed from 0 to 1 */
9316
9317 U8 stored = 0;
9318 U8 fold;
9319
9320 PERL_ARGS_ASSERT_SET_REGCLASS_BIT_FOLD;
9321
9322 fold = (AT_LEAST_UNI_SEMANTICS) ? PL_fold_latin1[value]
9323 : PL_fold[value];
9324
9325 /* It assumes the bit for 'value' has already been set */
9326 if (fold != value && ! ANYOF_BITMAP_TEST(node, fold)) {
9327 ANYOF_BITMAP_SET(node, fold);
9328 stored++;
9329 }
9330 if (_HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(value) && (! isASCII(value) || ! MORE_ASCII_RESTRICTED)) {
9331 /* Certain Latin1 characters have matches outside the bitmap. To get
9332 * here, 'value' is one of those characters. None of these matches is
9333 * valid for ASCII characters under /aa, which have been excluded by
9334 * the 'if' above. The matches fall into three categories:
9335 * 1) They are singly folded-to or -from an above 255 character, as
9336 * LATIN SMALL LETTER Y WITH DIAERESIS and LATIN CAPITAL LETTER Y
9337 * WITH DIAERESIS;
9338 * 2) They are part of a multi-char fold with another character in the
9339 * bitmap, only LATIN SMALL LETTER SHARP S => "ss" fits that bill;
9340 * 3) They are part of a multi-char fold with a character not in the
9341 * bitmap, such as various ligatures.
9342 * We aren't dealing fully with multi-char folds, except we do deal
9343 * with the pattern containing a character that has a multi-char fold
9344 * (not so much the inverse).
9345 * For types 1) and 3), the matches only happen when the target string
9346 * is utf8; that's not true for 2), and we set a flag for it.
9347 *
9348 * The code below adds to the passed in inversion list the single fold
9349 * closures for 'value'. The values are hard-coded here so that an
9350 * innocent-looking character class, like /[ks]/i won't have to go out
9351 * to disk to find the possible matches. XXX It would be better to
9352 * generate these via regen, in case a new version of the Unicode
9353 * standard adds new mappings, though that is not really likely. */
9354 switch (value) {
9355 case 'k':
9356 case 'K':
9357 /* KELVIN SIGN */
9358 *invlist_ptr = add_cp_to_invlist(*invlist_ptr, 0x212A);
9359 break;
9360 case 's':
9361 case 'S':
9362 /* LATIN SMALL LETTER LONG S */
9363 *invlist_ptr = add_cp_to_invlist(*invlist_ptr, 0x017F);
9364 break;
9365 case MICRO_SIGN:
9366 *invlist_ptr = add_cp_to_invlist(*invlist_ptr,
9367 GREEK_SMALL_LETTER_MU);
9368 *invlist_ptr = add_cp_to_invlist(*invlist_ptr,
9369 GREEK_CAPITAL_LETTER_MU);
9370 break;
9371 case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE:
9372 case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE:
9373 /* ANGSTROM SIGN */
9374 *invlist_ptr = add_cp_to_invlist(*invlist_ptr, 0x212B);
9375 if (DEPENDS_SEMANTICS) { /* See DEPENDS comment below */
9376 *invlist_ptr = add_cp_to_invlist(*invlist_ptr,
9377 PL_fold_latin1[value]);
9378 }
9379 break;
9380 case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
9381 *invlist_ptr = add_cp_to_invlist(*invlist_ptr,
9382 LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS);
9383 break;
9384 case LATIN_SMALL_LETTER_SHARP_S:
9385 /* 0x1E9E is LATIN CAPITAL LETTER SHARP S */
9386 *invlist_ptr = add_cp_to_invlist(*invlist_ptr, 0x1E9E);
9387
9388 /* Under /a, /d, and /u, this can match the two chars "ss" */
9389 if (! MORE_ASCII_RESTRICTED) {
9390 add_alternate(alternate_ptr, (U8 *) "ss", 2);
9391
9392 /* And under /u or /a, it can match even if the target is
9393 * not utf8 */
9394 if (AT_LEAST_UNI_SEMANTICS) {
9395 ANYOF_FLAGS(node) |= ANYOF_NONBITMAP_NON_UTF8;
9396 }
9397 }
9398 break;
9399 case 'F': case 'f':
9400 case 'I': case 'i':
9401 case 'L': case 'l':
9402 case 'T': case 't':
9403 /* These all are targets of multi-character folds, which can
9404 * occur with only non-Latin1 characters in the fold, so they
9405 * can match if the target string isn't UTF-8 */
9406 ANYOF_FLAGS(node) |= ANYOF_NONBITMAP_NON_UTF8;
9407 break;
9408 case 'A': case 'a':
9409 case 'H': case 'h':
9410 case 'J': case 'j':
9411 case 'N': case 'n':
9412 case 'W': case 'w':
9413 case 'Y': case 'y':
9414 /* These all are targets of multi-character folds, which occur
9415 * only with a non-Latin1 character as part of the fold, so
9416 * they can't match unless the target string is in UTF-8, so no
9417 * action here is necessary */
9418 break;
9419 default:
9420 /* Use deprecated warning to increase the chances of this
9421 * being output */
9422 ckWARN2regdep(RExC_parse, "Perl folding rules are not up-to-date for 0x%x; please use the perlbug utility to report;", value);
9423 break;
9424 }
9425 }
9426 else if (DEPENDS_SEMANTICS
9427 && ! isASCII(value)
9428 && PL_fold_latin1[value] != value)
9429 {
9430 /* Under DEPENDS rules, non-ASCII Latin1 characters match their
9431 * folds only when the target string is in UTF-8. We add the fold
9432 * here to the list of things to match outside the bitmap, which
9433 * won't be looked at unless it is UTF8 (or else if something else
9434 * says to look even if not utf8, but those things better not happen
9435 * under DEPENDS semantics. */
9436 *invlist_ptr = add_cp_to_invlist(*invlist_ptr, PL_fold_latin1[value]);
9437 }
9438
9439 return stored;
9440}
9441
9442
9443PERL_STATIC_INLINE U8
9444S_set_regclass_bit(pTHX_ RExC_state_t *pRExC_state, regnode* node, const U8 value, HV** invlist_ptr, AV** alternate_ptr)
9445{
9446 /* This inline function sets a bit in the bitmap if not already set, and if
9447 * appropriate, its fold, returning the number of bits that actually
9448 * changed from 0 to 1 */
9449
9450 U8 stored;
9451
9452 PERL_ARGS_ASSERT_SET_REGCLASS_BIT;
9453
9454 if (ANYOF_BITMAP_TEST(node, value)) { /* Already set */
9455 return 0;
9456 }
9457
9458 ANYOF_BITMAP_SET(node, value);
9459 stored = 1;
9460
9461 if (FOLD && ! LOC) { /* Locale folds aren't known until runtime */
9462 stored += set_regclass_bit_fold(pRExC_state, node, value, invlist_ptr, alternate_ptr);
9463 }
9464
9465 return stored;
9466}
9467
9468STATIC void
9469S_add_alternate(pTHX_ AV** alternate_ptr, U8* string, STRLEN len)
9470{
9471 /* Adds input 'string' with length 'len' to the ANYOF node's unicode
9472 * alternate list, pointed to by 'alternate_ptr'. This is an array of
9473 * the multi-character folds of characters in the node */
9474 SV *sv;
9475
9476 PERL_ARGS_ASSERT_ADD_ALTERNATE;
9477
9478 if (! *alternate_ptr) {
9479 *alternate_ptr = newAV();
9480 }
9481 sv = newSVpvn_utf8((char*)string, len, TRUE);
9482 av_push(*alternate_ptr, sv);
9483 return;
9484}
9485
9486/*
9487 parse a class specification and produce either an ANYOF node that
9488 matches the pattern or perhaps will be optimized into an EXACTish node
9489 instead. The node contains a bit map for the first 256 characters, with the
9490 corresponding bit set if that character is in the list. For characters
9491 above 255, a range list is used */
9492
9493STATIC regnode *
9494S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth)
9495{
9496 dVAR;
9497 register UV nextvalue;
9498 register IV prevvalue = OOB_UNICODE;
9499 register IV range = 0;
9500 UV value = 0; /* XXX:dmq: needs to be referenceable (unfortunately) */
9501 register regnode *ret;
9502 STRLEN numlen;
9503 IV namedclass;
9504 char *rangebegin = NULL;
9505 bool need_class = 0;
9506 SV *listsv = NULL;
9507 STRLEN initial_listsv_len = 0; /* Kind of a kludge to see if it is more
9508 than just initialized. */
9509 UV n;
9510
9511 /* code points this node matches that can't be stored in the bitmap */
9512 HV* nonbitmap = NULL;
9513
9514 /* The items that are to match that aren't stored in the bitmap, but are a
9515 * result of things that are stored there. This is the fold closure of
9516 * such a character, either because it has DEPENDS semantics and shouldn't
9517 * be matched unless the target string is utf8, or is a code point that is
9518 * too large for the bit map, as for example, the fold of the MICRO SIGN is
9519 * above 255. This all is solely for performance reasons. By having this
9520 * code know the outside-the-bitmap folds that the bitmapped characters are
9521 * involved with, we don't have to go out to disk to find the list of
9522 * matches, unless the character class includes code points that aren't
9523 * storable in the bit map. That means that a character class with an 's'
9524 * in it, for example, doesn't need to go out to disk to find everything
9525 * that matches. A 2nd list is used so that the 'nonbitmap' list is kept
9526 * empty unless there is something whose fold we don't know about, and will
9527 * have to go out to the disk to find. */
9528 HV* l1_fold_invlist = NULL;
9529
9530 /* List of multi-character folds that are matched by this node */
9531 AV* unicode_alternate = NULL;
9532#ifdef EBCDIC
9533 UV literal_endpoint = 0;
9534#endif
9535 UV stored = 0; /* how many chars stored in the bitmap */
9536
9537 regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
9538 case we need to change the emitted regop to an EXACT. */
9539 const char * orig_parse = RExC_parse;
9540 GET_RE_DEBUG_FLAGS_DECL;
9541
9542 PERL_ARGS_ASSERT_REGCLASS;
9543#ifndef DEBUGGING
9544 PERL_UNUSED_ARG(depth);
9545#endif
9546
9547 DEBUG_PARSE("clas");
9548
9549 /* Assume we are going to generate an ANYOF node. */
9550 ret = reganode(pRExC_state, ANYOF, 0);
9551
9552
9553 if (!SIZE_ONLY) {
9554 ANYOF_FLAGS(ret) = 0;
9555 }
9556
9557 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
9558 RExC_naughty++;
9559 RExC_parse++;
9560 if (!SIZE_ONLY)
9561 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
9562 }
9563
9564 if (SIZE_ONLY) {
9565 RExC_size += ANYOF_SKIP;
9566 listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
9567 }
9568 else {
9569 RExC_emit += ANYOF_SKIP;
9570 if (LOC) {
9571 ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
9572 }
9573 ANYOF_BITMAP_ZERO(ret);
9574 listsv = newSVpvs("# comment\n");
9575 initial_listsv_len = SvCUR(listsv);
9576 }
9577
9578 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
9579
9580 if (!SIZE_ONLY && POSIXCC(nextvalue))
9581 checkposixcc(pRExC_state);
9582
9583 /* allow 1st char to be ] (allowing it to be - is dealt with later) */
9584 if (UCHARAT(RExC_parse) == ']')
9585 goto charclassloop;
9586
9587parseit:
9588 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
9589
9590 charclassloop:
9591
9592 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
9593
9594 if (!range)
9595 rangebegin = RExC_parse;
9596 if (UTF) {
9597 value = utf8n_to_uvchr((U8*)RExC_parse,
9598 RExC_end - RExC_parse,
9599 &numlen, UTF8_ALLOW_DEFAULT);
9600 RExC_parse += numlen;
9601 }
9602 else
9603 value = UCHARAT(RExC_parse++);
9604
9605 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
9606 if (value == '[' && POSIXCC(nextvalue))
9607 namedclass = regpposixcc(pRExC_state, value);
9608 else if (value == '\\') {
9609 if (UTF) {
9610 value = utf8n_to_uvchr((U8*)RExC_parse,
9611 RExC_end - RExC_parse,
9612 &numlen, UTF8_ALLOW_DEFAULT);
9613 RExC_parse += numlen;
9614 }
9615 else
9616 value = UCHARAT(RExC_parse++);
9617 /* Some compilers cannot handle switching on 64-bit integer
9618 * values, therefore value cannot be an UV. Yes, this will
9619 * be a problem later if we want switch on Unicode.
9620 * A similar issue a little bit later when switching on
9621 * namedclass. --jhi */
9622 switch ((I32)value) {
9623 case 'w': namedclass = ANYOF_ALNUM; break;
9624 case 'W': namedclass = ANYOF_NALNUM; break;
9625 case 's': namedclass = ANYOF_SPACE; break;
9626 case 'S': namedclass = ANYOF_NSPACE; break;
9627 case 'd': namedclass = ANYOF_DIGIT; break;
9628 case 'D': namedclass = ANYOF_NDIGIT; break;
9629 case 'v': namedclass = ANYOF_VERTWS; break;
9630 case 'V': namedclass = ANYOF_NVERTWS; break;
9631 case 'h': namedclass = ANYOF_HORIZWS; break;
9632 case 'H': namedclass = ANYOF_NHORIZWS; break;
9633 case 'N': /* Handle \N{NAME} in class */
9634 {
9635 /* We only pay attention to the first char of
9636 multichar strings being returned. I kinda wonder
9637 if this makes sense as it does change the behaviour
9638 from earlier versions, OTOH that behaviour was broken
9639 as well. */
9640 UV v; /* value is register so we cant & it /grrr */
9641 if (reg_namedseq(pRExC_state, &v, NULL)) {
9642 goto parseit;
9643 }
9644 value= v;
9645 }
9646 break;
9647 case 'p':
9648 case 'P':
9649 {
9650 char *e;
9651 if (RExC_parse >= RExC_end)
9652 vFAIL2("Empty \\%c{}", (U8)value);
9653 if (*RExC_parse == '{') {
9654 const U8 c = (U8)value;
9655 e = strchr(RExC_parse++, '}');
9656 if (!e)
9657 vFAIL2("Missing right brace on \\%c{}", c);
9658 while (isSPACE(UCHARAT(RExC_parse)))
9659 RExC_parse++;
9660 if (e == RExC_parse)
9661 vFAIL2("Empty \\%c{}", c);
9662 n = e - RExC_parse;
9663 while (isSPACE(UCHARAT(RExC_parse + n - 1)))
9664 n--;
9665 }
9666 else {
9667 e = RExC_parse;
9668 n = 1;
9669 }
9670 if (!SIZE_ONLY) {
9671 if (UCHARAT(RExC_parse) == '^') {
9672 RExC_parse++;
9673 n--;
9674 value = value == 'p' ? 'P' : 'p'; /* toggle */
9675 while (isSPACE(UCHARAT(RExC_parse))) {
9676 RExC_parse++;
9677 n--;
9678 }
9679 }
9680
9681 /* Add the property name to the list. If /i matching, give
9682 * a different name which consists of the normal name
9683 * sandwiched between two underscores and '_i'. The design
9684 * is discussed in the commit message for this. */
9685 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%s%.*s%s\n",
9686 (value=='p' ? '+' : '!'),
9687 (FOLD) ? "__" : "",
9688 (int)n,
9689 RExC_parse,
9690 (FOLD) ? "_i" : ""
9691 );
9692 }
9693 RExC_parse = e + 1;
9694
9695 /* The \p could match something in the Latin1 range, hence
9696 * something that isn't utf8 */
9697 ANYOF_FLAGS(ret) |= ANYOF_NONBITMAP_NON_UTF8;
9698 namedclass = ANYOF_MAX; /* no official name, but it's named */
9699
9700 /* \p means they want Unicode semantics */
9701 RExC_uni_semantics = 1;
9702 }
9703 break;
9704 case 'n': value = '\n'; break;
9705 case 'r': value = '\r'; break;
9706 case 't': value = '\t'; break;
9707 case 'f': value = '\f'; break;
9708 case 'b': value = '\b'; break;
9709 case 'e': value = ASCII_TO_NATIVE('\033');break;
9710 case 'a': value = ASCII_TO_NATIVE('\007');break;
9711 case 'o':
9712 RExC_parse--; /* function expects to be pointed at the 'o' */
9713 {
9714 const char* error_msg;
9715 bool valid = grok_bslash_o(RExC_parse,
9716 &value,
9717 &numlen,
9718 &error_msg,
9719 SIZE_ONLY);
9720 RExC_parse += numlen;
9721 if (! valid) {
9722 vFAIL(error_msg);
9723 }
9724 }
9725 if (PL_encoding && value < 0x100) {
9726 goto recode_encoding;
9727 }
9728 break;
9729 case 'x':
9730 if (*RExC_parse == '{') {
9731 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
9732 | PERL_SCAN_DISALLOW_PREFIX;
9733 char * const e = strchr(RExC_parse++, '}');
9734 if (!e)
9735 vFAIL("Missing right brace on \\x{}");
9736
9737 numlen = e - RExC_parse;
9738 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
9739 RExC_parse = e + 1;
9740 }
9741 else {
9742 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
9743 numlen = 2;
9744 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
9745 RExC_parse += numlen;
9746 }
9747 if (PL_encoding && value < 0x100)
9748 goto recode_encoding;
9749 break;
9750 case 'c':
9751 value = grok_bslash_c(*RExC_parse++, UTF, SIZE_ONLY);
9752 break;
9753 case '0': case '1': case '2': case '3': case '4':
9754 case '5': case '6': case '7':
9755 {
9756 /* Take 1-3 octal digits */
9757 I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
9758 numlen = 3;
9759 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
9760 RExC_parse += numlen;
9761 if (PL_encoding && value < 0x100)
9762 goto recode_encoding;
9763 break;
9764 }
9765 recode_encoding:
9766 {
9767 SV* enc = PL_encoding;
9768 value = reg_recode((const char)(U8)value, &enc);
9769 if (!enc && SIZE_ONLY)
9770 ckWARNreg(RExC_parse,
9771 "Invalid escape in the specified encoding");
9772 break;
9773 }
9774 default:
9775 /* Allow \_ to not give an error */
9776 if (!SIZE_ONLY && isALNUM(value) && value != '_') {
9777 ckWARN2reg(RExC_parse,
9778 "Unrecognized escape \\%c in character class passed through",
9779 (int)value);
9780 }
9781 break;
9782 }
9783 } /* end of \blah */
9784#ifdef EBCDIC
9785 else
9786 literal_endpoint++;
9787#endif
9788
9789 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
9790
9791 /* What matches in a locale is not known until runtime, so need to
9792 * (one time per class) allocate extra space to pass to regexec.
9793 * The space will contain a bit for each named class that is to be
9794 * matched against. This isn't needed for \p{} and pseudo-classes,
9795 * as they are not affected by locale, and hence are dealt with
9796 * separately */
9797 if (LOC && namedclass < ANYOF_MAX && ! need_class) {
9798 need_class = 1;
9799 if (SIZE_ONLY) {
9800 RExC_size += ANYOF_CLASS_SKIP - ANYOF_SKIP;
9801 }
9802 else {
9803 RExC_emit += ANYOF_CLASS_SKIP - ANYOF_SKIP;
9804 ANYOF_CLASS_ZERO(ret);
9805 }
9806 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
9807 }
9808
9809 /* a bad range like a-\d, a-[:digit:]. The '-' is taken as a
9810 * literal, as is the character that began the false range, i.e.
9811 * the 'a' in the examples */
9812 if (range) {
9813 if (!SIZE_ONLY) {
9814 const int w =
9815 RExC_parse >= rangebegin ?
9816 RExC_parse - rangebegin : 0;
9817 ckWARN4reg(RExC_parse,
9818 "False [] range \"%*.*s\"",
9819 w, w, rangebegin);
9820
9821 stored +=
9822 set_regclass_bit(pRExC_state, ret, '-', &l1_fold_invlist, &unicode_alternate);
9823 if (prevvalue < 256) {
9824 stored +=
9825 set_regclass_bit(pRExC_state, ret, (U8) prevvalue, &l1_fold_invlist, &unicode_alternate);
9826 }
9827 else {
9828 nonbitmap = add_cp_to_invlist(nonbitmap, prevvalue);
9829 }
9830 }
9831
9832 range = 0; /* this was not a true range */
9833 }
9834
9835
9836
9837 if (!SIZE_ONLY) {
9838 const char *what = NULL;
9839 char yesno = 0;
9840
9841 /* Possible truncation here but in some 64-bit environments
9842 * the compiler gets heartburn about switch on 64-bit values.
9843 * A similar issue a little earlier when switching on value.
9844 * --jhi */
9845 switch ((I32)namedclass) {
9846
9847 case _C_C_T_(ALNUMC, isALNUMC_L1, isALNUMC, "XPosixAlnum");
9848 case _C_C_T_(ALPHA, isALPHA_L1, isALPHA, "XPosixAlpha");
9849 case _C_C_T_(BLANK, isBLANK_L1, isBLANK, "XPosixBlank");
9850 case _C_C_T_(CNTRL, isCNTRL_L1, isCNTRL, "XPosixCntrl");
9851 case _C_C_T_(GRAPH, isGRAPH_L1, isGRAPH, "XPosixGraph");
9852 case _C_C_T_(LOWER, isLOWER_L1, isLOWER, "XPosixLower");
9853 case _C_C_T_(PRINT, isPRINT_L1, isPRINT, "XPosixPrint");
9854 case _C_C_T_(PSXSPC, isPSXSPC_L1, isPSXSPC, "XPosixSpace");
9855 case _C_C_T_(PUNCT, isPUNCT_L1, isPUNCT, "XPosixPunct");
9856 case _C_C_T_(UPPER, isUPPER_L1, isUPPER, "XPosixUpper");
9857 /* \s, \w match all unicode if utf8. */
9858 case _C_C_T_(SPACE, isSPACE_L1, isSPACE, "SpacePerl");
9859 case _C_C_T_(ALNUM, isWORDCHAR_L1, isALNUM, "Word");
9860 case _C_C_T_(XDIGIT, isXDIGIT_L1, isXDIGIT, "XPosixXDigit");
9861 case _C_C_T_NOLOC_(VERTWS, is_VERTWS_latin1(&value), "VertSpace");
9862 case _C_C_T_NOLOC_(HORIZWS, is_HORIZWS_latin1(&value), "HorizSpace");
9863 case ANYOF_ASCII:
9864 if (LOC)
9865 ANYOF_CLASS_SET(ret, ANYOF_ASCII);
9866 else {
9867 for (value = 0; value < 128; value++)
9868 stored +=
9869 set_regclass_bit(pRExC_state, ret, (U8) ASCII_TO_NATIVE(value), &l1_fold_invlist, &unicode_alternate);
9870 }
9871 yesno = '+';
9872 what = NULL; /* Doesn't match outside ascii, so
9873 don't want to add +utf8:: */
9874 break;
9875 case ANYOF_NASCII:
9876 if (LOC)
9877 ANYOF_CLASS_SET(ret, ANYOF_NASCII);
9878 else {
9879 for (value = 128; value < 256; value++)
9880 stored +=
9881 set_regclass_bit(pRExC_state, ret, (U8) ASCII_TO_NATIVE(value), &l1_fold_invlist, &unicode_alternate);
9882 }
9883 ANYOF_FLAGS(ret) |= ANYOF_UNICODE_ALL;
9884 yesno = '!';
9885 what = "ASCII";
9886 break;
9887 case ANYOF_DIGIT:
9888 if (LOC)
9889 ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
9890 else {
9891 /* consecutive digits assumed */
9892 for (value = '0'; value <= '9'; value++)
9893 stored +=
9894 set_regclass_bit(pRExC_state, ret, (U8) value, &l1_fold_invlist, &unicode_alternate);
9895 }
9896 yesno = '+';
9897 what = "Digit";
9898 break;
9899 case ANYOF_NDIGIT:
9900 if (LOC)
9901 ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
9902 else {
9903 /* consecutive digits assumed */
9904 for (value = 0; value < '0'; value++)
9905 stored +=
9906 set_regclass_bit(pRExC_state, ret, (U8) value, &l1_fold_invlist, &unicode_alternate);
9907 for (value = '9' + 1; value < 256; value++)
9908 stored +=
9909 set_regclass_bit(pRExC_state, ret, (U8) value, &l1_fold_invlist, &unicode_alternate);
9910 }
9911 yesno = '!';
9912 what = "Digit";
9913 if (AT_LEAST_ASCII_RESTRICTED ) {
9914 ANYOF_FLAGS(ret) |= ANYOF_UNICODE_ALL;
9915 }
9916 break;
9917 case ANYOF_MAX:
9918 /* this is to handle \p and \P */
9919 break;
9920 default:
9921 vFAIL("Invalid [::] class");
9922 break;
9923 }
9924 if (what && ! (AT_LEAST_ASCII_RESTRICTED)) {
9925 /* Strings such as "+utf8::isWord\n" */
9926 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::Is%s\n", yesno, what);
9927 }
9928
9929 continue;
9930 }
9931 } /* end of namedclass \blah */
9932
9933 if (range) {
9934 if (prevvalue > (IV)value) /* b-a */ {
9935 const int w = RExC_parse - rangebegin;
9936 Simple_vFAIL4("Invalid [] range \"%*.*s\"", w, w, rangebegin);
9937 range = 0; /* not a valid range */
9938 }
9939 }
9940 else {
9941 prevvalue = value; /* save the beginning of the range */
9942 if (RExC_parse+1 < RExC_end
9943 && *RExC_parse == '-'
9944 && RExC_parse[1] != ']')
9945 {
9946 RExC_parse++;
9947
9948 /* a bad range like \w-, [:word:]- ? */
9949 if (namedclass > OOB_NAMEDCLASS) {
9950 if (ckWARN(WARN_REGEXP)) {
9951 const int w =
9952 RExC_parse >= rangebegin ?
9953 RExC_parse - rangebegin : 0;
9954 vWARN4(RExC_parse,
9955 "False [] range \"%*.*s\"",
9956 w, w, rangebegin);
9957 }
9958 if (!SIZE_ONLY)
9959 stored +=
9960 set_regclass_bit(pRExC_state, ret, '-', &l1_fold_invlist, &unicode_alternate);
9961 } else
9962 range = 1; /* yeah, it's a range! */
9963 continue; /* but do it the next time */
9964 }
9965 }
9966
9967 /* non-Latin1 code point implies unicode semantics. Must be set in
9968 * pass1 so is there for the whole of pass 2 */
9969 if (value > 255) {
9970 RExC_uni_semantics = 1;
9971 }
9972
9973 /* now is the next time */
9974 if (!SIZE_ONLY) {
9975 if (prevvalue < 256) {
9976 const IV ceilvalue = value < 256 ? value : 255;
9977 IV i;
9978#ifdef EBCDIC
9979 /* In EBCDIC [\x89-\x91] should include
9980 * the \x8e but [i-j] should not. */
9981 if (literal_endpoint == 2 &&
9982 ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
9983 (isUPPER(prevvalue) && isUPPER(ceilvalue))))
9984 {
9985 if (isLOWER(prevvalue)) {
9986 for (i = prevvalue; i <= ceilvalue; i++)
9987 if (isLOWER(i) && !ANYOF_BITMAP_TEST(ret,i)) {
9988 stored +=
9989 set_regclass_bit(pRExC_state, ret, (U8) i, &l1_fold_invlist, &unicode_alternate);
9990 }
9991 } else {
9992 for (i = prevvalue; i <= ceilvalue; i++)
9993 if (isUPPER(i) && !ANYOF_BITMAP_TEST(ret,i)) {
9994 stored +=
9995 set_regclass_bit(pRExC_state, ret, (U8) i, &l1_fold_invlist, &unicode_alternate);
9996 }
9997 }
9998 }
9999 else
10000#endif
10001 for (i = prevvalue; i <= ceilvalue; i++) {
10002 stored += set_regclass_bit(pRExC_state, ret, (U8) i, &l1_fold_invlist, &unicode_alternate);
10003 }
10004 }
10005 if (value > 255) {
10006 const UV prevnatvalue = NATIVE_TO_UNI(prevvalue);
10007 const UV natvalue = NATIVE_TO_UNI(value);
10008 nonbitmap = add_range_to_invlist(nonbitmap, prevnatvalue, natvalue);
10009 }
10010#ifdef EBCDIC
10011 literal_endpoint = 0;
10012#endif
10013 }
10014
10015 range = 0; /* this range (if it was one) is done now */
10016 }
10017
10018
10019
10020 if (SIZE_ONLY)
10021 return ret;
10022 /****** !SIZE_ONLY AFTER HERE *********/
10023
10024 /* If folding and there are code points above 255, we calculate all
10025 * characters that could fold to or from the ones already on the list */
10026 if (FOLD && nonbitmap) {
10027 UV i;
10028
10029 HV* fold_intersection;
10030 UV* fold_list;
10031
10032 /* This is a list of all the characters that participate in folds
10033 * (except marks, etc in multi-char folds */
10034 if (! PL_utf8_foldable) {
10035 SV* swash = swash_init("utf8", "Cased", &PL_sv_undef, 1, 0);
10036 PL_utf8_foldable = _swash_to_invlist(swash);
10037 }
10038
10039 /* This is a hash that for a particular fold gives all characters
10040 * that are involved in it */
10041 if (! PL_utf8_foldclosures) {
10042
10043 /* If we were unable to find any folds, then we likely won't be
10044 * able to find the closures. So just create an empty list.
10045 * Folding will effectively be restricted to the non-Unicode rules
10046 * hard-coded into Perl. (This case happens legitimately during
10047 * compilation of Perl itself before the Unicode tables are
10048 * generated) */
10049 if (invlist_len(PL_utf8_foldable) == 0) {
10050 PL_utf8_foldclosures = _new_invlist(0);
10051 } else {
10052 /* If the folds haven't been read in, call a fold function
10053 * to force that */
10054 if (! PL_utf8_tofold) {
10055 U8 dummy[UTF8_MAXBYTES+1];
10056 STRLEN dummy_len;
10057 to_utf8_fold((U8*) "A", dummy, &dummy_len);
10058 }
10059 PL_utf8_foldclosures = _swash_inversion_hash(PL_utf8_tofold);
10060 }
10061 }
10062
10063 /* Only the characters in this class that participate in folds need
10064 * be checked. Get the intersection of this class and all the
10065 * possible characters that are foldable. This can quickly narrow
10066 * down a large class */
10067 fold_intersection = invlist_intersection(PL_utf8_foldable, nonbitmap);
10068
10069 /* Now look at the foldable characters in this class individually */
10070 fold_list = invlist_array(fold_intersection);
10071 for (i = 0; i < invlist_len(fold_intersection); i++) {
10072 UV j;
10073
10074 /* The next entry is the beginning of the range that is in the
10075 * class */
10076 UV start = fold_list[i++];
10077
10078
10079 /* The next entry is the beginning of the next range, which
10080 * isn't in the class, so the end of the current range is one
10081 * less than that */
10082 UV end = fold_list[i] - 1;
10083
10084 /* Look at every character in the range */
10085 for (j = start; j <= end; j++) {
10086
10087 /* Get its fold */
10088 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
10089 STRLEN foldlen;
10090 const UV f = to_uni_fold(j, foldbuf, &foldlen);
10091
10092 if (foldlen > (STRLEN)UNISKIP(f)) {
10093
10094 /* Any multicharacter foldings (disallowed in
10095 * lookbehind patterns) require the following
10096 * transform: [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst) where
10097 * E folds into "pq" and F folds into "rst", all other
10098 * characters fold to single characters. We save away
10099 * these multicharacter foldings, to be later saved as
10100 * part of the additional "s" data. */
10101 if (! RExC_in_lookbehind) {
10102 U8* loc = foldbuf;
10103 U8* e = foldbuf + foldlen;
10104
10105 /* If any of the folded characters of this are in
10106 * the Latin1 range, tell the regex engine that
10107 * this can match a non-utf8 target string. The
10108 * only multi-byte fold whose source is in the
10109 * Latin1 range (U+00DF) applies only when the
10110 * target string is utf8, or under unicode rules */
10111 if (j > 255 || AT_LEAST_UNI_SEMANTICS) {
10112 while (loc < e) {
10113
10114 /* Can't mix ascii with non- under /aa */
10115 if (MORE_ASCII_RESTRICTED
10116 && (isASCII(*loc) != isASCII(j)))
10117 {
10118 goto end_multi_fold;
10119 }
10120 if (UTF8_IS_INVARIANT(*loc)
10121 || UTF8_IS_DOWNGRADEABLE_START(*loc))
10122 {
10123 /* Can't mix above and below 256 under
10124 * LOC */
10125 if (LOC) {
10126 goto end_multi_fold;
10127 }
10128 ANYOF_FLAGS(ret)
10129 |= ANYOF_NONBITMAP_NON_UTF8;
10130 break;
10131 }
10132 loc += UTF8SKIP(loc);
10133 }
10134 }
10135
10136 add_alternate(&unicode_alternate, foldbuf, foldlen);
10137 end_multi_fold: ;
10138 }
10139 }
10140 else {
10141 /* Single character fold. Add everything in its fold
10142 * closure to the list that this node should match */
10143 SV** listp;
10144
10145 /* The fold closures data structure is a hash with the
10146 * keys being every character that is folded to, like
10147 * 'k', and the values each an array of everything that
10148 * folds to its key. e.g. [ 'k', 'K', KELVIN_SIGN ] */
10149 if ((listp = hv_fetch(PL_utf8_foldclosures,
10150 (char *) foldbuf, foldlen, FALSE)))
10151 {
10152 AV* list = (AV*) *listp;
10153 IV k;
10154 for (k = 0; k <= av_len(list); k++) {
10155 SV** c_p = av_fetch(list, k, FALSE);
10156 UV c;
10157 if (c_p == NULL) {
10158 Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
10159 }
10160 c = SvUV(*c_p);
10161
10162 /* /aa doesn't allow folds between ASCII and
10163 * non-; /l doesn't allow them between above
10164 * and below 256 */
10165 if ((MORE_ASCII_RESTRICTED
10166 && (isASCII(c) != isASCII(j)))
10167 || (LOC && ((c < 256) != (j < 256))))
10168 {
10169 continue;
10170 }
10171
10172 if (c < 256 && AT_LEAST_UNI_SEMANTICS) {
10173 stored += set_regclass_bit(pRExC_state,
10174 ret,
10175 (U8) c,
10176 &l1_fold_invlist, &unicode_alternate);
10177 }
10178 /* It may be that the code point is already
10179 * in this range or already in the bitmap,
10180 * in which case we need do nothing */
10181 else if ((c < start || c > end)
10182 && (c > 255
10183 || ! ANYOF_BITMAP_TEST(ret, c)))
10184 {
10185 nonbitmap = add_cp_to_invlist(nonbitmap, c);
10186 }
10187 }
10188 }
10189 }
10190 }
10191 }
10192 invlist_destroy(fold_intersection);
10193 }
10194
10195 /* Combine the two lists into one. */
10196 if (l1_fold_invlist) {
10197 if (nonbitmap) {
10198 nonbitmap = invlist_union(nonbitmap, l1_fold_invlist);
10199 }
10200 else {
10201 nonbitmap = l1_fold_invlist;
10202 }
10203 }
10204
10205 /* Here, we have calculated what code points should be in the character
10206 * class. Now we can see about various optimizations. Fold calculation
10207 * needs to take place before inversion. Otherwise /[^k]/i would invert to
10208 * include K, which under /i would match k. */
10209
10210 /* Optimize inverted simple patterns (e.g. [^a-z]). Note that we haven't
10211 * set the FOLD flag yet, so this this does optimize those. It doesn't
10212 * optimize locale. Doing so perhaps could be done as long as there is
10213 * nothing like \w in it; some thought also would have to be given to the
10214 * interaction with above 0x100 chars */
10215 if (! LOC
10216 && (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT
10217 && ! unicode_alternate
10218 && ! nonbitmap
10219 && SvCUR(listsv) == initial_listsv_len)
10220 {
10221 for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
10222 ANYOF_BITMAP(ret)[value] ^= 0xFF;
10223 stored = 256 - stored;
10224
10225 /* The inversion means that everything above 255 is matched; and at the
10226 * same time we clear the invert flag */
10227 ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
10228 }
10229
10230 /* Folding in the bitmap is taken care of above, but not for locale (for
10231 * which we have to wait to see what folding is in effect at runtime), and
10232 * for things not in the bitmap. Set run-time fold flag for these */
10233 if (FOLD && (LOC || nonbitmap || unicode_alternate)) {
10234 ANYOF_FLAGS(ret) |= ANYOF_LOC_NONBITMAP_FOLD;
10235 }
10236
10237 /* A single character class can be "optimized" into an EXACTish node.
10238 * Note that since we don't currently count how many characters there are
10239 * outside the bitmap, we are XXX missing optimization possibilities for
10240 * them. This optimization can't happen unless this is a truly single
10241 * character class, which means that it can't be an inversion into a
10242 * many-character class, and there must be no possibility of there being
10243 * things outside the bitmap. 'stored' (only) for locales doesn't include
10244 * \w, etc, so have to make a special test that they aren't present
10245 *
10246 * Similarly A 2-character class of the very special form like [bB] can be
10247 * optimized into an EXACTFish node, but only for non-locales, and for
10248 * characters which only have the two folds; so things like 'fF' and 'Ii'
10249 * wouldn't work because they are part of the fold of 'LATIN SMALL LIGATURE
10250 * FI'. */
10251 if (! nonbitmap
10252 && ! unicode_alternate
10253 && SvCUR(listsv) == initial_listsv_len
10254 && ! (ANYOF_FLAGS(ret) & (ANYOF_INVERT|ANYOF_UNICODE_ALL))
10255 && (((stored == 1 && ((! (ANYOF_FLAGS(ret) & ANYOF_LOCALE))
10256 || (! ANYOF_CLASS_TEST_ANY_SET(ret)))))
10257 || (stored == 2 && ((! (ANYOF_FLAGS(ret) & ANYOF_LOCALE))
10258 && (! _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(value))
10259 /* If the latest code point has a fold whose
10260 * bit is set, it must be the only other one */
10261 && ((prevvalue = PL_fold_latin1[value]) != (IV)value)
10262 && ANYOF_BITMAP_TEST(ret, prevvalue)))))
10263 {
10264 /* Note that the information needed to decide to do this optimization
10265 * is not currently available until the 2nd pass, and that the actually
10266 * used EXACTish node takes less space than the calculated ANYOF node,
10267 * and hence the amount of space calculated in the first pass is larger
10268 * than actually used, so this optimization doesn't gain us any space.
10269 * But an EXACT node is faster than an ANYOF node, and can be combined
10270 * with any adjacent EXACT nodes later by the optimizer for further
10271 * gains. The speed of executing an EXACTF is similar to an ANYOF
10272 * node, so the optimization advantage comes from the ability to join
10273 * it to adjacent EXACT nodes */
10274
10275 const char * cur_parse= RExC_parse;
10276 U8 op;
10277 RExC_emit = (regnode *)orig_emit;
10278 RExC_parse = (char *)orig_parse;
10279
10280 if (stored == 1) {
10281
10282 /* A locale node with one point can be folded; all the other cases
10283 * with folding will have two points, since we calculate them above
10284 */
10285 if (ANYOF_FLAGS(ret) & ANYOF_LOC_NONBITMAP_FOLD) {
10286 op = EXACTFL;
10287 }
10288 else {
10289 op = EXACT;
10290 }
10291 } /* else 2 chars in the bit map: the folds of each other */
10292 else if (AT_LEAST_UNI_SEMANTICS || !isASCII(value)) {
10293
10294 /* To join adjacent nodes, they must be the exact EXACTish type.
10295 * Try to use the most likely type, by using EXACTFU if the regex
10296 * calls for them, or is required because the character is
10297 * non-ASCII */
10298 op = EXACTFU;
10299 }
10300 else { /* Otherwise, more likely to be EXACTF type */
10301 op = EXACTF;
10302 }
10303
10304 ret = reg_node(pRExC_state, op);
10305 RExC_parse = (char *)cur_parse;
10306 if (UTF && ! NATIVE_IS_INVARIANT(value)) {
10307 *STRING(ret)= UTF8_EIGHT_BIT_HI((U8) value);
10308 *(STRING(ret) + 1)= UTF8_EIGHT_BIT_LO((U8) value);
10309 STR_LEN(ret)= 2;
10310 RExC_emit += STR_SZ(2);
10311 }
10312 else {
10313 *STRING(ret)= (char)value;
10314 STR_LEN(ret)= 1;
10315 RExC_emit += STR_SZ(1);
10316 }
10317 SvREFCNT_dec(listsv);
10318 return ret;
10319 }
10320
10321 if (nonbitmap) {
10322 UV* nonbitmap_array = invlist_array(nonbitmap);
10323 UV nonbitmap_len = invlist_len(nonbitmap);
10324 UV i;
10325
10326 /* Here have the full list of items to match that aren't in the
10327 * bitmap. Convert to the structure that the rest of the code is
10328 * expecting. XXX That rest of the code should convert to this
10329 * structure */
10330 for (i = 0; i < nonbitmap_len; i++) {
10331
10332 /* The next entry is the beginning of the range that is in the
10333 * class */
10334 UV start = nonbitmap_array[i++];
10335 UV end;
10336
10337 /* The next entry is the beginning of the next range, which isn't
10338 * in the class, so the end of the current range is one less than
10339 * that. But if there is no next range, it means that the range
10340 * begun by 'start' extends to infinity, which for this platform
10341 * ends at UV_MAX */
10342 if (i == nonbitmap_len) {
10343 end = UV_MAX;
10344 }
10345 else {
10346 end = nonbitmap_array[i] - 1;
10347 }
10348
10349 if (start == end) {
10350 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", start);
10351 }
10352 else {
10353 /* The \t sets the whole range */
10354 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
10355 /* XXX EBCDIC */
10356 start, end);
10357 }
10358 }
10359 invlist_destroy(nonbitmap);
10360 }
10361
10362 if (SvCUR(listsv) == initial_listsv_len && ! unicode_alternate) {
10363 ARG_SET(ret, ANYOF_NONBITMAP_EMPTY);
10364 SvREFCNT_dec(listsv);
10365 SvREFCNT_dec(unicode_alternate);
10366 }
10367 else {
10368
10369 AV * const av = newAV();
10370 SV *rv;
10371 /* The 0th element stores the character class description
10372 * in its textual form: used later (regexec.c:Perl_regclass_swash())
10373 * to initialize the appropriate swash (which gets stored in
10374 * the 1st element), and also useful for dumping the regnode.
10375 * The 2nd element stores the multicharacter foldings,
10376 * used later (regexec.c:S_reginclass()). */
10377 av_store(av, 0, listsv);
10378 av_store(av, 1, NULL);
10379 av_store(av, 2, MUTABLE_SV(unicode_alternate));
10380 if (unicode_alternate) { /* This node is variable length */
10381 OP(ret) = ANYOFV;
10382 }
10383 rv = newRV_noinc(MUTABLE_SV(av));
10384 n = add_data(pRExC_state, 1, "s");
10385 RExC_rxi->data->data[n] = (void*)rv;
10386 ARG_SET(ret, n);
10387 }
10388 return ret;
10389}
10390#undef _C_C_T_
10391
10392
10393/* reg_skipcomment()
10394
10395 Absorbs an /x style # comments from the input stream.
10396 Returns true if there is more text remaining in the stream.
10397 Will set the REG_SEEN_RUN_ON_COMMENT flag if the comment
10398 terminates the pattern without including a newline.
10399
10400 Note its the callers responsibility to ensure that we are
10401 actually in /x mode
10402
10403*/
10404
10405STATIC bool
10406S_reg_skipcomment(pTHX_ RExC_state_t *pRExC_state)
10407{
10408 bool ended = 0;
10409
10410 PERL_ARGS_ASSERT_REG_SKIPCOMMENT;
10411
10412 while (RExC_parse < RExC_end)
10413 if (*RExC_parse++ == '\n') {
10414 ended = 1;
10415 break;
10416 }
10417 if (!ended) {
10418 /* we ran off the end of the pattern without ending
10419 the comment, so we have to add an \n when wrapping */
10420 RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
10421 return 0;
10422 } else
10423 return 1;
10424}
10425
10426/* nextchar()
10427
10428 Advances the parse position, and optionally absorbs
10429 "whitespace" from the inputstream.
10430
10431 Without /x "whitespace" means (?#...) style comments only,
10432 with /x this means (?#...) and # comments and whitespace proper.
10433
10434 Returns the RExC_parse point from BEFORE the scan occurs.
10435
10436 This is the /x friendly way of saying RExC_parse++.
10437*/
10438
10439STATIC char*
10440S_nextchar(pTHX_ RExC_state_t *pRExC_state)
10441{
10442 char* const retval = RExC_parse++;
10443
10444 PERL_ARGS_ASSERT_NEXTCHAR;
10445
10446 for (;;) {
10447 if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
10448 RExC_parse[2] == '#') {
10449 while (*RExC_parse != ')') {
10450 if (RExC_parse == RExC_end)
10451 FAIL("Sequence (?#... not terminated");
10452 RExC_parse++;
10453 }
10454 RExC_parse++;
10455 continue;
10456 }
10457 if (RExC_flags & RXf_PMf_EXTENDED) {
10458 if (isSPACE(*RExC_parse)) {
10459 RExC_parse++;
10460 continue;
10461 }
10462 else if (*RExC_parse == '#') {
10463 if ( reg_skipcomment( pRExC_state ) )
10464 continue;
10465 }
10466 }
10467 return retval;
10468 }
10469}
10470
10471/*
10472- reg_node - emit a node
10473*/
10474STATIC regnode * /* Location. */
10475S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
10476{
10477 dVAR;
10478 register regnode *ptr;
10479 regnode * const ret = RExC_emit;
10480 GET_RE_DEBUG_FLAGS_DECL;
10481
10482 PERL_ARGS_ASSERT_REG_NODE;
10483
10484 if (SIZE_ONLY) {
10485 SIZE_ALIGN(RExC_size);
10486 RExC_size += 1;
10487 return(ret);
10488 }
10489 if (RExC_emit >= RExC_emit_bound)
10490 Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d", op);
10491
10492 NODE_ALIGN_FILL(ret);
10493 ptr = ret;
10494 FILL_ADVANCE_NODE(ptr, op);
10495#ifdef RE_TRACK_PATTERN_OFFSETS
10496 if (RExC_offsets) { /* MJD */
10497 MJD_OFFSET_DEBUG(("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
10498 "reg_node", __LINE__,
10499 PL_reg_name[op],
10500 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
10501 ? "Overwriting end of array!\n" : "OK",
10502 (UV)(RExC_emit - RExC_emit_start),
10503 (UV)(RExC_parse - RExC_start),
10504 (UV)RExC_offsets[0]));
10505 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
10506 }
10507#endif
10508 RExC_emit = ptr;
10509 return(ret);
10510}
10511
10512/*
10513- reganode - emit a node with an argument
10514*/
10515STATIC regnode * /* Location. */
10516S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
10517{
10518 dVAR;
10519 register regnode *ptr;
10520 regnode * const ret = RExC_emit;
10521 GET_RE_DEBUG_FLAGS_DECL;
10522
10523 PERL_ARGS_ASSERT_REGANODE;
10524
10525 if (SIZE_ONLY) {
10526 SIZE_ALIGN(RExC_size);
10527 RExC_size += 2;
10528 /*
10529 We can't do this:
10530
10531 assert(2==regarglen[op]+1);
10532
10533 Anything larger than this has to allocate the extra amount.
10534 If we changed this to be:
10535
10536 RExC_size += (1 + regarglen[op]);
10537
10538 then it wouldn't matter. Its not clear what side effect
10539 might come from that so its not done so far.
10540 -- dmq
10541 */
10542 return(ret);
10543 }
10544 if (RExC_emit >= RExC_emit_bound)
10545 Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d", op);
10546
10547 NODE_ALIGN_FILL(ret);
10548 ptr = ret;
10549 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
10550#ifdef RE_TRACK_PATTERN_OFFSETS
10551 if (RExC_offsets) { /* MJD */
10552 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
10553 "reganode",
10554 __LINE__,
10555 PL_reg_name[op],
10556 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ?
10557 "Overwriting end of array!\n" : "OK",
10558 (UV)(RExC_emit - RExC_emit_start),
10559 (UV)(RExC_parse - RExC_start),
10560 (UV)RExC_offsets[0]));
10561 Set_Cur_Node_Offset;
10562 }
10563#endif
10564 RExC_emit = ptr;
10565 return(ret);
10566}
10567
10568/*
10569- reguni - emit (if appropriate) a Unicode character
10570*/
10571STATIC STRLEN
10572S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
10573{
10574 dVAR;
10575
10576 PERL_ARGS_ASSERT_REGUNI;
10577
10578 return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
10579}
10580
10581/*
10582- reginsert - insert an operator in front of already-emitted operand
10583*
10584* Means relocating the operand.
10585*/
10586STATIC void
10587S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
10588{
10589 dVAR;
10590 register regnode *src;
10591 register regnode *dst;
10592 register regnode *place;
10593 const int offset = regarglen[(U8)op];
10594 const int size = NODE_STEP_REGNODE + offset;
10595 GET_RE_DEBUG_FLAGS_DECL;
10596
10597 PERL_ARGS_ASSERT_REGINSERT;
10598 PERL_UNUSED_ARG(depth);
10599/* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
10600 DEBUG_PARSE_FMT("inst"," - %s",PL_reg_name[op]);
10601 if (SIZE_ONLY) {
10602 RExC_size += size;
10603 return;
10604 }
10605
10606 src = RExC_emit;
10607 RExC_emit += size;
10608 dst = RExC_emit;
10609 if (RExC_open_parens) {
10610 int paren;
10611 /*DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);*/
10612 for ( paren=0 ; paren < RExC_npar ; paren++ ) {
10613 if ( RExC_open_parens[paren] >= opnd ) {
10614 /*DEBUG_PARSE_FMT("open"," - %d",size);*/
10615 RExC_open_parens[paren] += size;
10616 } else {
10617 /*DEBUG_PARSE_FMT("open"," - %s","ok");*/
10618 }
10619 if ( RExC_close_parens[paren] >= opnd ) {
10620 /*DEBUG_PARSE_FMT("close"," - %d",size);*/
10621 RExC_close_parens[paren] += size;
10622 } else {
10623 /*DEBUG_PARSE_FMT("close"," - %s","ok");*/
10624 }
10625 }
10626 }
10627
10628 while (src > opnd) {
10629 StructCopy(--src, --dst, regnode);
10630#ifdef RE_TRACK_PATTERN_OFFSETS
10631 if (RExC_offsets) { /* MJD 20010112 */
10632 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
10633 "reg_insert",
10634 __LINE__,
10635 PL_reg_name[op],
10636 (UV)(dst - RExC_emit_start) > RExC_offsets[0]
10637 ? "Overwriting end of array!\n" : "OK",
10638 (UV)(src - RExC_emit_start),
10639 (UV)(dst - RExC_emit_start),
10640 (UV)RExC_offsets[0]));
10641 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
10642 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
10643 }
10644#endif
10645 }
10646
10647
10648 place = opnd; /* Op node, where operand used to be. */
10649#ifdef RE_TRACK_PATTERN_OFFSETS
10650 if (RExC_offsets) { /* MJD */
10651 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
10652 "reginsert",
10653 __LINE__,
10654 PL_reg_name[op],
10655 (UV)(place - RExC_emit_start) > RExC_offsets[0]
10656 ? "Overwriting end of array!\n" : "OK",
10657 (UV)(place - RExC_emit_start),
10658 (UV)(RExC_parse - RExC_start),
10659 (UV)RExC_offsets[0]));
10660 Set_Node_Offset(place, RExC_parse);
10661 Set_Node_Length(place, 1);
10662 }
10663#endif
10664 src = NEXTOPER(place);
10665 FILL_ADVANCE_NODE(place, op);
10666 Zero(src, offset, regnode);
10667}
10668
10669/*
10670- regtail - set the next-pointer at the end of a node chain of p to val.
10671- SEE ALSO: regtail_study
10672*/
10673/* TODO: All three parms should be const */
10674STATIC void
10675S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
10676{
10677 dVAR;
10678 register regnode *scan;
10679 GET_RE_DEBUG_FLAGS_DECL;
10680
10681 PERL_ARGS_ASSERT_REGTAIL;
10682#ifndef DEBUGGING
10683 PERL_UNUSED_ARG(depth);
10684#endif
10685
10686 if (SIZE_ONLY)
10687 return;
10688
10689 /* Find last node. */
10690 scan = p;
10691 for (;;) {
10692 regnode * const temp = regnext(scan);
10693 DEBUG_PARSE_r({
10694 SV * const mysv=sv_newmortal();
10695 DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
10696 regprop(RExC_rx, mysv, scan);
10697 PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n",
10698 SvPV_nolen_const(mysv), REG_NODE_NUM(scan),
10699 (temp == NULL ? "->" : ""),
10700 (temp == NULL ? PL_reg_name[OP(val)] : "")
10701 );
10702 });
10703 if (temp == NULL)
10704 break;
10705 scan = temp;
10706 }
10707
10708 if (reg_off_by_arg[OP(scan)]) {
10709 ARG_SET(scan, val - scan);
10710 }
10711 else {
10712 NEXT_OFF(scan) = val - scan;
10713 }
10714}
10715
10716#ifdef DEBUGGING
10717/*
10718- regtail_study - set the next-pointer at the end of a node chain of p to val.
10719- Look for optimizable sequences at the same time.
10720- currently only looks for EXACT chains.
10721
10722This is experimental code. The idea is to use this routine to perform
10723in place optimizations on branches and groups as they are constructed,
10724with the long term intention of removing optimization from study_chunk so
10725that it is purely analytical.
10726
10727Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
10728to control which is which.
10729
10730*/
10731/* TODO: All four parms should be const */
10732
10733STATIC U8
10734S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
10735{
10736 dVAR;
10737 register regnode *scan;
10738 U8 exact = PSEUDO;
10739#ifdef EXPERIMENTAL_INPLACESCAN
10740 I32 min = 0;
10741#endif
10742 GET_RE_DEBUG_FLAGS_DECL;
10743
10744 PERL_ARGS_ASSERT_REGTAIL_STUDY;
10745
10746
10747 if (SIZE_ONLY)
10748 return exact;
10749
10750 /* Find last node. */
10751
10752 scan = p;
10753 for (;;) {
10754 regnode * const temp = regnext(scan);
10755#ifdef EXPERIMENTAL_INPLACESCAN
10756 if (PL_regkind[OP(scan)] == EXACT)
10757 if (join_exact(pRExC_state,scan,&min,1,val,depth+1))
10758 return EXACT;
10759#endif
10760 if ( exact ) {
10761 switch (OP(scan)) {
10762 case EXACT:
10763 case EXACTF:
10764 case EXACTFA:
10765 case EXACTFU:
10766 case EXACTFL:
10767 if( exact == PSEUDO )
10768 exact= OP(scan);
10769 else if ( exact != OP(scan) )
10770 exact= 0;
10771 case NOTHING:
10772 break;
10773 default:
10774 exact= 0;
10775 }
10776 }
10777 DEBUG_PARSE_r({
10778 SV * const mysv=sv_newmortal();
10779 DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
10780 regprop(RExC_rx, mysv, scan);
10781 PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n",
10782 SvPV_nolen_const(mysv),
10783 REG_NODE_NUM(scan),
10784 PL_reg_name[exact]);
10785 });
10786 if (temp == NULL)
10787 break;
10788 scan = temp;
10789 }
10790 DEBUG_PARSE_r({
10791 SV * const mysv_val=sv_newmortal();
10792 DEBUG_PARSE_MSG("");
10793 regprop(RExC_rx, mysv_val, val);
10794 PerlIO_printf(Perl_debug_log, "~ attach to %s (%"IVdf") offset to %"IVdf"\n",
10795 SvPV_nolen_const(mysv_val),
10796 (IV)REG_NODE_NUM(val),
10797 (IV)(val - scan)
10798 );
10799 });
10800 if (reg_off_by_arg[OP(scan)]) {
10801 ARG_SET(scan, val - scan);
10802 }
10803 else {
10804 NEXT_OFF(scan) = val - scan;
10805 }
10806
10807 return exact;
10808}
10809#endif
10810
10811/*
10812 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
10813 */
10814#ifdef DEBUGGING
10815static void
10816S_regdump_extflags(pTHX_ const char *lead, const U32 flags)
10817{
10818 int bit;
10819 int set=0;
10820 regex_charset cs;
10821
10822 for (bit=0; bit<32; bit++) {
10823 if (flags & (1<<bit)) {
10824 if ((1<<bit) & RXf_PMf_CHARSET) { /* Output separately, below */
10825 continue;
10826 }
10827 if (!set++ && lead)
10828 PerlIO_printf(Perl_debug_log, "%s",lead);
10829 PerlIO_printf(Perl_debug_log, "%s ",PL_reg_extflags_name[bit]);
10830 }
10831 }
10832 if ((cs = get_regex_charset(flags)) != REGEX_DEPENDS_CHARSET) {
10833 if (!set++ && lead) {
10834 PerlIO_printf(Perl_debug_log, "%s",lead);
10835 }
10836 switch (cs) {
10837 case REGEX_UNICODE_CHARSET:
10838 PerlIO_printf(Perl_debug_log, "UNICODE");
10839 break;
10840 case REGEX_LOCALE_CHARSET:
10841 PerlIO_printf(Perl_debug_log, "LOCALE");
10842 break;
10843 case REGEX_ASCII_RESTRICTED_CHARSET:
10844 PerlIO_printf(Perl_debug_log, "ASCII-RESTRICTED");
10845 break;
10846 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
10847 PerlIO_printf(Perl_debug_log, "ASCII-MORE_RESTRICTED");
10848 break;
10849 default:
10850 PerlIO_printf(Perl_debug_log, "UNKNOWN CHARACTER SET");
10851 break;
10852 }
10853 }
10854 if (lead) {
10855 if (set)
10856 PerlIO_printf(Perl_debug_log, "\n");
10857 else
10858 PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
10859 }
10860}
10861#endif
10862
10863void
10864Perl_regdump(pTHX_ const regexp *r)
10865{
10866#ifdef DEBUGGING
10867 dVAR;
10868 SV * const sv = sv_newmortal();
10869 SV *dsv= sv_newmortal();
10870 RXi_GET_DECL(r,ri);
10871 GET_RE_DEBUG_FLAGS_DECL;
10872
10873 PERL_ARGS_ASSERT_REGDUMP;
10874
10875 (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
10876
10877 /* Header fields of interest. */
10878 if (r->anchored_substr) {
10879 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
10880 RE_SV_DUMPLEN(r->anchored_substr), 30);
10881 PerlIO_printf(Perl_debug_log,
10882 "anchored %s%s at %"IVdf" ",
10883 s, RE_SV_TAIL(r->anchored_substr),
10884 (IV)r->anchored_offset);
10885 } else if (r->anchored_utf8) {
10886 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
10887 RE_SV_DUMPLEN(r->anchored_utf8), 30);
10888 PerlIO_printf(Perl_debug_log,
10889 "anchored utf8 %s%s at %"IVdf" ",
10890 s, RE_SV_TAIL(r->anchored_utf8),
10891 (IV)r->anchored_offset);
10892 }
10893 if (r->float_substr) {
10894 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
10895 RE_SV_DUMPLEN(r->float_substr), 30);
10896 PerlIO_printf(Perl_debug_log,
10897 "floating %s%s at %"IVdf"..%"UVuf" ",
10898 s, RE_SV_TAIL(r->float_substr),
10899 (IV)r->float_min_offset, (UV)r->float_max_offset);
10900 } else if (r->float_utf8) {
10901 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
10902 RE_SV_DUMPLEN(r->float_utf8), 30);
10903 PerlIO_printf(Perl_debug_log,
10904 "floating utf8 %s%s at %"IVdf"..%"UVuf" ",
10905 s, RE_SV_TAIL(r->float_utf8),
10906 (IV)r->float_min_offset, (UV)r->float_max_offset);
10907 }
10908 if (r->check_substr || r->check_utf8)
10909 PerlIO_printf(Perl_debug_log,
10910 (const char *)
10911 (r->check_substr == r->float_substr
10912 && r->check_utf8 == r->float_utf8
10913 ? "(checking floating" : "(checking anchored"));
10914 if (r->extflags & RXf_NOSCAN)
10915 PerlIO_printf(Perl_debug_log, " noscan");
10916 if (r->extflags & RXf_CHECK_ALL)
10917 PerlIO_printf(Perl_debug_log, " isall");
10918 if (r->check_substr || r->check_utf8)
10919 PerlIO_printf(Perl_debug_log, ") ");
10920
10921 if (ri->regstclass) {
10922 regprop(r, sv, ri->regstclass);
10923 PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
10924 }
10925 if (r->extflags & RXf_ANCH) {
10926 PerlIO_printf(Perl_debug_log, "anchored");
10927 if (r->extflags & RXf_ANCH_BOL)
10928 PerlIO_printf(Perl_debug_log, "(BOL)");
10929 if (r->extflags & RXf_ANCH_MBOL)
10930 PerlIO_printf(Perl_debug_log, "(MBOL)");
10931 if (r->extflags & RXf_ANCH_SBOL)
10932 PerlIO_printf(Perl_debug_log, "(SBOL)");
10933 if (r->extflags & RXf_ANCH_GPOS)
10934 PerlIO_printf(Perl_debug_log, "(GPOS)");
10935 PerlIO_putc(Perl_debug_log, ' ');
10936 }
10937 if (r->extflags & RXf_GPOS_SEEN)
10938 PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", (UV)r->gofs);
10939 if (r->intflags & PREGf_SKIP)
10940 PerlIO_printf(Perl_debug_log, "plus ");
10941 if (r->intflags & PREGf_IMPLICIT)
10942 PerlIO_printf(Perl_debug_log, "implicit ");
10943 PerlIO_printf(Perl_debug_log, "minlen %"IVdf" ", (IV)r->minlen);
10944 if (r->extflags & RXf_EVAL_SEEN)
10945 PerlIO_printf(Perl_debug_log, "with eval ");
10946 PerlIO_printf(Perl_debug_log, "\n");
10947 DEBUG_FLAGS_r(regdump_extflags("r->extflags: ",r->extflags));
10948#else
10949 PERL_ARGS_ASSERT_REGDUMP;
10950 PERL_UNUSED_CONTEXT;
10951 PERL_UNUSED_ARG(r);
10952#endif /* DEBUGGING */
10953}
10954
10955/*
10956- regprop - printable representation of opcode
10957*/
10958#define EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags) \
10959STMT_START { \
10960 if (do_sep) { \
10961 Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]); \
10962 if (flags & ANYOF_INVERT) \
10963 /*make sure the invert info is in each */ \
10964 sv_catpvs(sv, "^"); \
10965 do_sep = 0; \
10966 } \
10967} STMT_END
10968
10969void
10970Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
10971{
10972#ifdef DEBUGGING
10973 dVAR;
10974 register int k;
10975 RXi_GET_DECL(prog,progi);
10976 GET_RE_DEBUG_FLAGS_DECL;
10977
10978 PERL_ARGS_ASSERT_REGPROP;
10979
10980 sv_setpvs(sv, "");
10981
10982 if (OP(o) > REGNODE_MAX) /* regnode.type is unsigned */
10983 /* It would be nice to FAIL() here, but this may be called from
10984 regexec.c, and it would be hard to supply pRExC_state. */
10985 Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(o), (int)REGNODE_MAX);
10986 sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */
10987
10988 k = PL_regkind[OP(o)];
10989
10990 if (k == EXACT) {
10991 sv_catpvs(sv, " ");
10992 /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
10993 * is a crude hack but it may be the best for now since
10994 * we have no flag "this EXACTish node was UTF-8"
10995 * --jhi */
10996 pv_pretty(sv, STRING(o), STR_LEN(o), 60, PL_colors[0], PL_colors[1],
10997 PERL_PV_ESCAPE_UNI_DETECT |
10998 PERL_PV_ESCAPE_NONASCII |
10999 PERL_PV_PRETTY_ELLIPSES |
11000 PERL_PV_PRETTY_LTGT |
11001 PERL_PV_PRETTY_NOCLEAR
11002 );
11003 } else if (k == TRIE) {
11004 /* print the details of the trie in dumpuntil instead, as
11005 * progi->data isn't available here */
11006 const char op = OP(o);
11007 const U32 n = ARG(o);
11008 const reg_ac_data * const ac = IS_TRIE_AC(op) ?
11009 (reg_ac_data *)progi->data->data[n] :
11010 NULL;
11011 const reg_trie_data * const trie
11012 = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
11013
11014 Perl_sv_catpvf(aTHX_ sv, "-%s",PL_reg_name[o->flags]);
11015 DEBUG_TRIE_COMPILE_r(
11016 Perl_sv_catpvf(aTHX_ sv,
11017 "<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
11018 (UV)trie->startstate,
11019 (IV)trie->statecount-1, /* -1 because of the unused 0 element */
11020 (UV)trie->wordcount,
11021 (UV)trie->minlen,
11022 (UV)trie->maxlen,
11023 (UV)TRIE_CHARCOUNT(trie),
11024 (UV)trie->uniquecharcount
11025 )
11026 );
11027 if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
11028 int i;
11029 int rangestart = -1;
11030 U8* bitmap = IS_ANYOF_TRIE(op) ? (U8*)ANYOF_BITMAP(o) : (U8*)TRIE_BITMAP(trie);
11031 sv_catpvs(sv, "[");
11032 for (i = 0; i <= 256; i++) {
11033 if (i < 256 && BITMAP_TEST(bitmap,i)) {
11034 if (rangestart == -1)
11035 rangestart = i;
11036 } else if (rangestart != -1) {
11037 if (i <= rangestart + 3)
11038 for (; rangestart < i; rangestart++)
11039 put_byte(sv, rangestart);
11040 else {
11041 put_byte(sv, rangestart);
11042 sv_catpvs(sv, "-");
11043 put_byte(sv, i - 1);
11044 }
11045 rangestart = -1;
11046 }
11047 }
11048 sv_catpvs(sv, "]");
11049 }
11050
11051 } else if (k == CURLY) {
11052 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
11053 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
11054 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
11055 }
11056 else if (k == WHILEM && o->flags) /* Ordinal/of */
11057 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
11058 else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP || OP(o)==ACCEPT) {
11059 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
11060 if ( RXp_PAREN_NAMES(prog) ) {
11061 if ( k != REF || (OP(o) < NREF)) {
11062 AV *list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
11063 SV **name= av_fetch(list, ARG(o), 0 );
11064 if (name)
11065 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
11066 }
11067 else {
11068 AV *list= MUTABLE_AV(progi->data->data[ progi->name_list_idx ]);
11069 SV *sv_dat= MUTABLE_SV(progi->data->data[ ARG( o ) ]);
11070 I32 *nums=(I32*)SvPVX(sv_dat);
11071 SV **name= av_fetch(list, nums[0], 0 );
11072 I32 n;
11073 if (name) {
11074 for ( n=0; n<SvIVX(sv_dat); n++ ) {
11075 Perl_sv_catpvf(aTHX_ sv, "%s%"IVdf,
11076 (n ? "," : ""), (IV)nums[n]);
11077 }
11078 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
11079 }
11080 }
11081 }
11082 } else if (k == GOSUB)
11083 Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o)); /* Paren and offset */
11084 else if (k == VERB) {
11085 if (!o->flags)
11086 Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
11087 SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ]))));
11088 } else if (k == LOGICAL)
11089 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
11090 else if (k == FOLDCHAR)
11091 Perl_sv_catpvf(aTHX_ sv, "[0x%"UVXf"]", PTR2UV(ARG(o)) );
11092 else if (k == ANYOF) {
11093 int i, rangestart = -1;
11094 const U8 flags = ANYOF_FLAGS(o);
11095 int do_sep = 0;
11096
11097 /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
11098 static const char * const anyofs[] = {
11099 "\\w",
11100 "\\W",
11101 "\\s",
11102 "\\S",
11103 "\\d",
11104 "\\D",
11105 "[:alnum:]",
11106 "[:^alnum:]",
11107 "[:alpha:]",
11108 "[:^alpha:]",
11109 "[:ascii:]",
11110 "[:^ascii:]",
11111 "[:cntrl:]",
11112 "[:^cntrl:]",
11113 "[:graph:]",
11114 "[:^graph:]",
11115 "[:lower:]",
11116 "[:^lower:]",
11117 "[:print:]",
11118 "[:^print:]",
11119 "[:punct:]",
11120 "[:^punct:]",
11121 "[:upper:]",
11122 "[:^upper:]",
11123 "[:xdigit:]",
11124 "[:^xdigit:]",
11125 "[:space:]",
11126 "[:^space:]",
11127 "[:blank:]",
11128 "[:^blank:]"
11129 };
11130
11131 if (flags & ANYOF_LOCALE)
11132 sv_catpvs(sv, "{loc}");
11133 if (flags & ANYOF_LOC_NONBITMAP_FOLD)
11134 sv_catpvs(sv, "{i}");
11135 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
11136 if (flags & ANYOF_INVERT)
11137 sv_catpvs(sv, "^");
11138
11139 /* output what the standard cp 0-255 bitmap matches */
11140 for (i = 0; i <= 256; i++) {
11141 if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
11142 if (rangestart == -1)
11143 rangestart = i;
11144 } else if (rangestart != -1) {
11145 if (i <= rangestart + 3)
11146 for (; rangestart < i; rangestart++)
11147 put_byte(sv, rangestart);
11148 else {
11149 put_byte(sv, rangestart);
11150 sv_catpvs(sv, "-");
11151 put_byte(sv, i - 1);
11152 }
11153 do_sep = 1;
11154 rangestart = -1;
11155 }
11156 }
11157
11158 EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
11159 /* output any special charclass tests (used entirely under use locale) */
11160 if (ANYOF_CLASS_TEST_ANY_SET(o))
11161 for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++)
11162 if (ANYOF_CLASS_TEST(o,i)) {
11163 sv_catpv(sv, anyofs[i]);
11164 do_sep = 1;
11165 }
11166
11167 EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
11168
11169 if (flags & ANYOF_NON_UTF8_LATIN1_ALL) {
11170 sv_catpvs(sv, "{non-utf8-latin1-all}");
11171 }
11172
11173 /* output information about the unicode matching */
11174 if (flags & ANYOF_UNICODE_ALL)
11175 sv_catpvs(sv, "{unicode_all}");
11176 else if (ANYOF_NONBITMAP(o))
11177 sv_catpvs(sv, "{unicode}");
11178 if (flags & ANYOF_NONBITMAP_NON_UTF8)
11179 sv_catpvs(sv, "{outside bitmap}");
11180
11181 if (ANYOF_NONBITMAP(o)) {
11182 SV *lv;
11183 SV * const sw = regclass_swash(prog, o, FALSE, &lv, 0);
11184
11185 if (lv) {
11186 if (sw) {
11187 U8 s[UTF8_MAXBYTES_CASE+1];
11188
11189 for (i = 0; i <= 256; i++) { /* just the first 256 */
11190 uvchr_to_utf8(s, i);
11191
11192 if (i < 256 && swash_fetch(sw, s, TRUE)) {
11193 if (rangestart == -1)
11194 rangestart = i;
11195 } else if (rangestart != -1) {
11196 if (i <= rangestart + 3)
11197 for (; rangestart < i; rangestart++) {
11198 const U8 * const e = uvchr_to_utf8(s,rangestart);
11199 U8 *p;
11200 for(p = s; p < e; p++)
11201 put_byte(sv, *p);
11202 }
11203 else {
11204 const U8 *e = uvchr_to_utf8(s,rangestart);
11205 U8 *p;
11206 for (p = s; p < e; p++)
11207 put_byte(sv, *p);
11208 sv_catpvs(sv, "-");
11209 e = uvchr_to_utf8(s, i-1);
11210 for (p = s; p < e; p++)
11211 put_byte(sv, *p);
11212 }
11213 rangestart = -1;
11214 }
11215 }
11216
11217 sv_catpvs(sv, "..."); /* et cetera */
11218 }
11219
11220 {
11221 char *s = savesvpv(lv);
11222 char * const origs = s;
11223
11224 while (*s && *s != '\n')
11225 s++;
11226
11227 if (*s == '\n') {
11228 const char * const t = ++s;
11229
11230 while (*s) {
11231 if (*s == '\n')
11232 *s = ' ';
11233 s++;
11234 }
11235 if (s[-1] == ' ')
11236 s[-1] = 0;
11237
11238 sv_catpv(sv, t);
11239 }
11240
11241 Safefree(origs);
11242 }
11243 }
11244 }
11245
11246 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
11247 }
11248 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
11249 Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
11250#else
11251 PERL_UNUSED_CONTEXT;
11252 PERL_UNUSED_ARG(sv);
11253 PERL_UNUSED_ARG(o);
11254 PERL_UNUSED_ARG(prog);
11255#endif /* DEBUGGING */
11256}
11257
11258SV *
11259Perl_re_intuit_string(pTHX_ REGEXP * const r)
11260{ /* Assume that RE_INTUIT is set */
11261 dVAR;
11262 struct regexp *const prog = (struct regexp *)SvANY(r);
11263 GET_RE_DEBUG_FLAGS_DECL;
11264
11265 PERL_ARGS_ASSERT_RE_INTUIT_STRING;
11266 PERL_UNUSED_CONTEXT;
11267
11268 DEBUG_COMPILE_r(
11269 {
11270 const char * const s = SvPV_nolen_const(prog->check_substr
11271 ? prog->check_substr : prog->check_utf8);
11272
11273 if (!PL_colorset) reginitcolors();
11274 PerlIO_printf(Perl_debug_log,
11275 "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
11276 PL_colors[4],
11277 prog->check_substr ? "" : "utf8 ",
11278 PL_colors[5],PL_colors[0],
11279 s,
11280 PL_colors[1],
11281 (strlen(s) > 60 ? "..." : ""));
11282 } );
11283
11284 return prog->check_substr ? prog->check_substr : prog->check_utf8;
11285}
11286
11287/*
11288 pregfree()
11289
11290 handles refcounting and freeing the perl core regexp structure. When
11291 it is necessary to actually free the structure the first thing it
11292 does is call the 'free' method of the regexp_engine associated to
11293 the regexp, allowing the handling of the void *pprivate; member
11294 first. (This routine is not overridable by extensions, which is why
11295 the extensions free is called first.)
11296
11297 See regdupe and regdupe_internal if you change anything here.
11298*/
11299#ifndef PERL_IN_XSUB_RE
11300void
11301Perl_pregfree(pTHX_ REGEXP *r)
11302{
11303 SvREFCNT_dec(r);
11304}
11305
11306void
11307Perl_pregfree2(pTHX_ REGEXP *rx)
11308{
11309 dVAR;
11310 struct regexp *const r = (struct regexp *)SvANY(rx);
11311 GET_RE_DEBUG_FLAGS_DECL;
11312
11313 PERL_ARGS_ASSERT_PREGFREE2;
11314
11315 if (r->mother_re) {
11316 ReREFCNT_dec(r->mother_re);
11317 } else {
11318 CALLREGFREE_PVT(rx); /* free the private data */
11319 SvREFCNT_dec(RXp_PAREN_NAMES(r));
11320 }
11321 if (r->substrs) {
11322 SvREFCNT_dec(r->anchored_substr);
11323 SvREFCNT_dec(r->anchored_utf8);
11324 SvREFCNT_dec(r->float_substr);
11325 SvREFCNT_dec(r->float_utf8);
11326 Safefree(r->substrs);
11327 }
11328 RX_MATCH_COPY_FREE(rx);
11329#ifdef PERL_OLD_COPY_ON_WRITE
11330 SvREFCNT_dec(r->saved_copy);
11331#endif
11332 Safefree(r->offs);
11333}
11334
11335/* reg_temp_copy()
11336
11337 This is a hacky workaround to the structural issue of match results
11338 being stored in the regexp structure which is in turn stored in
11339 PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern
11340 could be PL_curpm in multiple contexts, and could require multiple
11341 result sets being associated with the pattern simultaneously, such
11342 as when doing a recursive match with (??{$qr})
11343
11344 The solution is to make a lightweight copy of the regexp structure
11345 when a qr// is returned from the code executed by (??{$qr}) this
11346 lightweight copy doesn't actually own any of its data except for
11347 the starp/end and the actual regexp structure itself.
11348
11349*/
11350
11351
11352REGEXP *
11353Perl_reg_temp_copy (pTHX_ REGEXP *ret_x, REGEXP *rx)
11354{
11355 struct regexp *ret;
11356 struct regexp *const r = (struct regexp *)SvANY(rx);
11357 register const I32 npar = r->nparens+1;
11358
11359 PERL_ARGS_ASSERT_REG_TEMP_COPY;
11360
11361 if (!ret_x)
11362 ret_x = (REGEXP*) newSV_type(SVt_REGEXP);
11363 ret = (struct regexp *)SvANY(ret_x);
11364
11365 (void)ReREFCNT_inc(rx);
11366 /* We can take advantage of the existing "copied buffer" mechanism in SVs
11367 by pointing directly at the buffer, but flagging that the allocated
11368 space in the copy is zero. As we've just done a struct copy, it's now
11369 a case of zero-ing that, rather than copying the current length. */
11370 SvPV_set(ret_x, RX_WRAPPED(rx));
11371 SvFLAGS(ret_x) |= SvFLAGS(rx) & (SVf_POK|SVp_POK|SVf_UTF8);
11372 memcpy(&(ret->xpv_cur), &(r->xpv_cur),
11373 sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur));
11374 SvLEN_set(ret_x, 0);
11375 SvSTASH_set(ret_x, NULL);
11376 SvMAGIC_set(ret_x, NULL);
11377 Newx(ret->offs, npar, regexp_paren_pair);
11378 Copy(r->offs, ret->offs, npar, regexp_paren_pair);
11379 if (r->substrs) {
11380 Newx(ret->substrs, 1, struct reg_substr_data);
11381 StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
11382
11383 SvREFCNT_inc_void(ret->anchored_substr);
11384 SvREFCNT_inc_void(ret->anchored_utf8);
11385 SvREFCNT_inc_void(ret->float_substr);
11386 SvREFCNT_inc_void(ret->float_utf8);
11387
11388 /* check_substr and check_utf8, if non-NULL, point to either their
11389 anchored or float namesakes, and don't hold a second reference. */
11390 }
11391 RX_MATCH_COPIED_off(ret_x);
11392#ifdef PERL_OLD_COPY_ON_WRITE
11393 ret->saved_copy = NULL;
11394#endif
11395 ret->mother_re = rx;
11396
11397 return ret_x;
11398}
11399#endif
11400
11401/* regfree_internal()
11402
11403 Free the private data in a regexp. This is overloadable by
11404 extensions. Perl takes care of the regexp structure in pregfree(),
11405 this covers the *pprivate pointer which technically perl doesn't
11406 know about, however of course we have to handle the
11407 regexp_internal structure when no extension is in use.
11408
11409 Note this is called before freeing anything in the regexp
11410 structure.
11411 */
11412
11413void
11414Perl_regfree_internal(pTHX_ REGEXP * const rx)
11415{
11416 dVAR;
11417 struct regexp *const r = (struct regexp *)SvANY(rx);
11418 RXi_GET_DECL(r,ri);
11419 GET_RE_DEBUG_FLAGS_DECL;
11420
11421 PERL_ARGS_ASSERT_REGFREE_INTERNAL;
11422
11423 DEBUG_COMPILE_r({
11424 if (!PL_colorset)
11425 reginitcolors();
11426 {
11427 SV *dsv= sv_newmortal();
11428 RE_PV_QUOTED_DECL(s, RX_UTF8(rx),
11429 dsv, RX_PRECOMP(rx), RX_PRELEN(rx), 60);
11430 PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
11431 PL_colors[4],PL_colors[5],s);
11432 }
11433 });
11434#ifdef RE_TRACK_PATTERN_OFFSETS
11435 if (ri->u.offsets)
11436 Safefree(ri->u.offsets); /* 20010421 MJD */
11437#endif
11438 if (ri->data) {
11439 int n = ri->data->count;
11440 PAD* new_comppad = NULL;
11441 PAD* old_comppad;
11442 PADOFFSET refcnt;
11443
11444 while (--n >= 0) {
11445 /* If you add a ->what type here, update the comment in regcomp.h */
11446 switch (ri->data->what[n]) {
11447 case 'a':
11448 case 's':
11449 case 'S':
11450 case 'u':
11451 SvREFCNT_dec(MUTABLE_SV(ri->data->data[n]));
11452 break;
11453 case 'f':
11454 Safefree(ri->data->data[n]);
11455 break;
11456 case 'p':
11457 new_comppad = MUTABLE_AV(ri->data->data[n]);
11458 break;
11459 case 'o':
11460 if (new_comppad == NULL)
11461 Perl_croak(aTHX_ "panic: pregfree comppad");
11462 PAD_SAVE_LOCAL(old_comppad,
11463 /* Watch out for global destruction's random ordering. */
11464 (SvTYPE(new_comppad) == SVt_PVAV) ? new_comppad : NULL
11465 );
11466 OP_REFCNT_LOCK;
11467 refcnt = OpREFCNT_dec((OP_4tree*)ri->data->data[n]);
11468 OP_REFCNT_UNLOCK;
11469 if (!refcnt)
11470 op_free((OP_4tree*)ri->data->data[n]);
11471
11472 PAD_RESTORE_LOCAL(old_comppad);
11473 SvREFCNT_dec(MUTABLE_SV(new_comppad));
11474 new_comppad = NULL;
11475 break;
11476 case 'n':
11477 break;
11478 case 'T':
11479 { /* Aho Corasick add-on structure for a trie node.
11480 Used in stclass optimization only */
11481 U32 refcount;
11482 reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
11483 OP_REFCNT_LOCK;
11484 refcount = --aho->refcount;
11485 OP_REFCNT_UNLOCK;
11486 if ( !refcount ) {
11487 PerlMemShared_free(aho->states);
11488 PerlMemShared_free(aho->fail);
11489 /* do this last!!!! */
11490 PerlMemShared_free(ri->data->data[n]);
11491 PerlMemShared_free(ri->regstclass);
11492 }
11493 }
11494 break;
11495 case 't':
11496 {
11497 /* trie structure. */
11498 U32 refcount;
11499 reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
11500 OP_REFCNT_LOCK;
11501 refcount = --trie->refcount;
11502 OP_REFCNT_UNLOCK;
11503 if ( !refcount ) {
11504 PerlMemShared_free(trie->charmap);
11505 PerlMemShared_free(trie->states);
11506 PerlMemShared_free(trie->trans);
11507 if (trie->bitmap)
11508 PerlMemShared_free(trie->bitmap);
11509 if (trie->jump)
11510 PerlMemShared_free(trie->jump);
11511 PerlMemShared_free(trie->wordinfo);
11512 /* do this last!!!! */
11513 PerlMemShared_free(ri->data->data[n]);
11514 }
11515 }
11516 break;
11517 default:
11518 Perl_croak(aTHX_ "panic: regfree data code '%c'", ri->data->what[n]);
11519 }
11520 }
11521 Safefree(ri->data->what);
11522 Safefree(ri->data);
11523 }
11524
11525 Safefree(ri);
11526}
11527
11528#define av_dup_inc(s,t) MUTABLE_AV(sv_dup_inc((const SV *)s,t))
11529#define hv_dup_inc(s,t) MUTABLE_HV(sv_dup_inc((const SV *)s,t))
11530#define SAVEPVN(p,n) ((p) ? savepvn(p,n) : NULL)
11531
11532/*
11533 re_dup - duplicate a regexp.
11534
11535 This routine is expected to clone a given regexp structure. It is only
11536 compiled under USE_ITHREADS.
11537
11538 After all of the core data stored in struct regexp is duplicated
11539 the regexp_engine.dupe method is used to copy any private data
11540 stored in the *pprivate pointer. This allows extensions to handle
11541 any duplication it needs to do.
11542
11543 See pregfree() and regfree_internal() if you change anything here.
11544*/
11545#if defined(USE_ITHREADS)
11546#ifndef PERL_IN_XSUB_RE
11547void
11548Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param)
11549{
11550 dVAR;
11551 I32 npar;
11552 const struct regexp *r = (const struct regexp *)SvANY(sstr);
11553 struct regexp *ret = (struct regexp *)SvANY(dstr);
11554
11555 PERL_ARGS_ASSERT_RE_DUP_GUTS;
11556
11557 npar = r->nparens+1;
11558 Newx(ret->offs, npar, regexp_paren_pair);
11559 Copy(r->offs, ret->offs, npar, regexp_paren_pair);
11560 if(ret->swap) {
11561 /* no need to copy these */
11562 Newx(ret->swap, npar, regexp_paren_pair);
11563 }
11564
11565 if (ret->substrs) {
11566 /* Do it this way to avoid reading from *r after the StructCopy().
11567 That way, if any of the sv_dup_inc()s dislodge *r from the L1
11568 cache, it doesn't matter. */
11569 const bool anchored = r->check_substr
11570 ? r->check_substr == r->anchored_substr
11571 : r->check_utf8 == r->anchored_utf8;
11572 Newx(ret->substrs, 1, struct reg_substr_data);
11573 StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
11574
11575 ret->anchored_substr = sv_dup_inc(ret->anchored_substr, param);
11576 ret->anchored_utf8 = sv_dup_inc(ret->anchored_utf8, param);
11577 ret->float_substr = sv_dup_inc(ret->float_substr, param);
11578 ret->float_utf8 = sv_dup_inc(ret->float_utf8, param);
11579
11580 /* check_substr and check_utf8, if non-NULL, point to either their
11581 anchored or float namesakes, and don't hold a second reference. */
11582
11583 if (ret->check_substr) {
11584 if (anchored) {
11585 assert(r->check_utf8 == r->anchored_utf8);
11586 ret->check_substr = ret->anchored_substr;
11587 ret->check_utf8 = ret->anchored_utf8;
11588 } else {
11589 assert(r->check_substr == r->float_substr);
11590 assert(r->check_utf8 == r->float_utf8);
11591 ret->check_substr = ret->float_substr;
11592 ret->check_utf8 = ret->float_utf8;
11593 }
11594 } else if (ret->check_utf8) {
11595 if (anchored) {
11596 ret->check_utf8 = ret->anchored_utf8;
11597 } else {
11598 ret->check_utf8 = ret->float_utf8;
11599 }
11600 }
11601 }
11602
11603 RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param);
11604
11605 if (ret->pprivate)
11606 RXi_SET(ret,CALLREGDUPE_PVT(dstr,param));
11607
11608 if (RX_MATCH_COPIED(dstr))
11609 ret->subbeg = SAVEPVN(ret->subbeg, ret->sublen);
11610 else
11611 ret->subbeg = NULL;
11612#ifdef PERL_OLD_COPY_ON_WRITE
11613 ret->saved_copy = NULL;
11614#endif
11615
11616 if (ret->mother_re) {
11617 if (SvPVX_const(dstr) == SvPVX_const(ret->mother_re)) {
11618 /* Our storage points directly to our mother regexp, but that's
11619 1: a buffer in a different thread
11620 2: something we no longer hold a reference on
11621 so we need to copy it locally. */
11622 /* Note we need to sue SvCUR() on our mother_re, because it, in
11623 turn, may well be pointing to its own mother_re. */
11624 SvPV_set(dstr, SAVEPVN(SvPVX_const(ret->mother_re),
11625 SvCUR(ret->mother_re)+1));
11626 SvLEN_set(dstr, SvCUR(ret->mother_re)+1);
11627 }
11628 ret->mother_re = NULL;
11629 }
11630 ret->gofs = 0;
11631}
11632#endif /* PERL_IN_XSUB_RE */
11633
11634/*
11635 regdupe_internal()
11636
11637 This is the internal complement to regdupe() which is used to copy
11638 the structure pointed to by the *pprivate pointer in the regexp.
11639 This is the core version of the extension overridable cloning hook.
11640 The regexp structure being duplicated will be copied by perl prior
11641 to this and will be provided as the regexp *r argument, however
11642 with the /old/ structures pprivate pointer value. Thus this routine
11643 may override any copying normally done by perl.
11644
11645 It returns a pointer to the new regexp_internal structure.
11646*/
11647
11648void *
11649Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param)
11650{
11651 dVAR;
11652 struct regexp *const r = (struct regexp *)SvANY(rx);
11653 regexp_internal *reti;
11654 int len, npar;
11655 RXi_GET_DECL(r,ri);
11656
11657 PERL_ARGS_ASSERT_REGDUPE_INTERNAL;
11658
11659 npar = r->nparens+1;
11660 len = ProgLen(ri);
11661
11662 Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode), char, regexp_internal);
11663 Copy(ri->program, reti->program, len+1, regnode);
11664
11665
11666 reti->regstclass = NULL;
11667
11668 if (ri->data) {
11669 struct reg_data *d;
11670 const int count = ri->data->count;
11671 int i;
11672
11673 Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
11674 char, struct reg_data);
11675 Newx(d->what, count, U8);
11676
11677 d->count = count;
11678 for (i = 0; i < count; i++) {
11679 d->what[i] = ri->data->what[i];
11680 switch (d->what[i]) {
11681 /* legal options are one of: sSfpontTua
11682 see also regcomp.h and pregfree() */
11683 case 'a': /* actually an AV, but the dup function is identical. */
11684 case 's':
11685 case 'S':
11686 case 'p': /* actually an AV, but the dup function is identical. */
11687 case 'u': /* actually an HV, but the dup function is identical. */
11688 d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param);
11689 break;
11690 case 'f':
11691 /* This is cheating. */
11692 Newx(d->data[i], 1, struct regnode_charclass_class);
11693 StructCopy(ri->data->data[i], d->data[i],
11694 struct regnode_charclass_class);
11695 reti->regstclass = (regnode*)d->data[i];
11696 break;
11697 case 'o':
11698 /* Compiled op trees are readonly and in shared memory,
11699 and can thus be shared without duplication. */
11700 OP_REFCNT_LOCK;
11701 d->data[i] = (void*)OpREFCNT_inc((OP*)ri->data->data[i]);
11702 OP_REFCNT_UNLOCK;
11703 break;
11704 case 'T':
11705 /* Trie stclasses are readonly and can thus be shared
11706 * without duplication. We free the stclass in pregfree
11707 * when the corresponding reg_ac_data struct is freed.
11708 */
11709 reti->regstclass= ri->regstclass;
11710 /* Fall through */
11711 case 't':
11712 OP_REFCNT_LOCK;
11713 ((reg_trie_data*)ri->data->data[i])->refcount++;
11714 OP_REFCNT_UNLOCK;
11715 /* Fall through */
11716 case 'n':
11717 d->data[i] = ri->data->data[i];
11718 break;
11719 default:
11720 Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'", ri->data->what[i]);
11721 }
11722 }
11723
11724 reti->data = d;
11725 }
11726 else
11727 reti->data = NULL;
11728
11729 reti->name_list_idx = ri->name_list_idx;
11730
11731#ifdef RE_TRACK_PATTERN_OFFSETS
11732 if (ri->u.offsets) {
11733 Newx(reti->u.offsets, 2*len+1, U32);
11734 Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32);
11735 }
11736#else
11737 SetProgLen(reti,len);
11738#endif
11739
11740 return (void*)reti;
11741}
11742
11743#endif /* USE_ITHREADS */
11744
11745#ifndef PERL_IN_XSUB_RE
11746
11747/*
11748 - regnext - dig the "next" pointer out of a node
11749 */
11750regnode *
11751Perl_regnext(pTHX_ register regnode *p)
11752{
11753 dVAR;
11754 register I32 offset;
11755
11756 if (!p)
11757 return(NULL);
11758
11759 if (OP(p) > REGNODE_MAX) { /* regnode.type is unsigned */
11760 Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(p), (int)REGNODE_MAX);
11761 }
11762
11763 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
11764 if (offset == 0)
11765 return(NULL);
11766
11767 return(p+offset);
11768}
11769#endif
11770
11771STATIC void
11772S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
11773{
11774 va_list args;
11775 STRLEN l1 = strlen(pat1);
11776 STRLEN l2 = strlen(pat2);
11777 char buf[512];
11778 SV *msv;
11779 const char *message;
11780
11781 PERL_ARGS_ASSERT_RE_CROAK2;
11782
11783 if (l1 > 510)
11784 l1 = 510;
11785 if (l1 + l2 > 510)
11786 l2 = 510 - l1;
11787 Copy(pat1, buf, l1 , char);
11788 Copy(pat2, buf + l1, l2 , char);
11789 buf[l1 + l2] = '\n';
11790 buf[l1 + l2 + 1] = '\0';
11791#ifdef I_STDARG
11792 /* ANSI variant takes additional second argument */
11793 va_start(args, pat2);
11794#else
11795 va_start(args);
11796#endif
11797 msv = vmess(buf, &args);
11798 va_end(args);
11799 message = SvPV_const(msv,l1);
11800 if (l1 > 512)
11801 l1 = 512;
11802 Copy(message, buf, l1 , char);
11803 buf[l1-1] = '\0'; /* Overwrite \n */
11804 Perl_croak(aTHX_ "%s", buf);
11805}
11806
11807/* XXX Here's a total kludge. But we need to re-enter for swash routines. */
11808
11809#ifndef PERL_IN_XSUB_RE
11810void
11811Perl_save_re_context(pTHX)
11812{
11813 dVAR;
11814
11815 struct re_save_state *state;
11816
11817 SAVEVPTR(PL_curcop);
11818 SSGROW(SAVESTACK_ALLOC_FOR_RE_SAVE_STATE + 1);
11819
11820 state = (struct re_save_state *)(PL_savestack + PL_savestack_ix);
11821 PL_savestack_ix += SAVESTACK_ALLOC_FOR_RE_SAVE_STATE;
11822 SSPUSHUV(SAVEt_RE_STATE);
11823
11824 Copy(&PL_reg_state, state, 1, struct re_save_state);
11825
11826 PL_reg_start_tmp = 0;
11827 PL_reg_start_tmpl = 0;
11828 PL_reg_oldsaved = NULL;
11829 PL_reg_oldsavedlen = 0;
11830 PL_reg_maxiter = 0;
11831 PL_reg_leftiter = 0;
11832 PL_reg_poscache = NULL;
11833 PL_reg_poscache_size = 0;
11834#ifdef PERL_OLD_COPY_ON_WRITE
11835 PL_nrs = NULL;
11836#endif
11837
11838 /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
11839 if (PL_curpm) {
11840 const REGEXP * const rx = PM_GETRE(PL_curpm);
11841 if (rx) {
11842 U32 i;
11843 for (i = 1; i <= RX_NPARENS(rx); i++) {
11844 char digits[TYPE_CHARS(long)];
11845 const STRLEN len = my_snprintf(digits, sizeof(digits), "%lu", (long)i);
11846 GV *const *const gvp
11847 = (GV**)hv_fetch(PL_defstash, digits, len, 0);
11848
11849 if (gvp) {
11850 GV * const gv = *gvp;
11851 if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
11852 save_scalar(gv);
11853 }
11854 }
11855 }
11856 }
11857}
11858#endif
11859
11860static void
11861clear_re(pTHX_ void *r)
11862{
11863 dVAR;
11864 ReREFCNT_dec((REGEXP *)r);
11865}
11866
11867#ifdef DEBUGGING
11868
11869STATIC void
11870S_put_byte(pTHX_ SV *sv, int c)
11871{
11872 PERL_ARGS_ASSERT_PUT_BYTE;
11873
11874 /* Our definition of isPRINT() ignores locales, so only bytes that are
11875 not part of UTF-8 are considered printable. I assume that the same
11876 holds for UTF-EBCDIC.
11877 Also, code point 255 is not printable in either (it's E0 in EBCDIC,
11878 which Wikipedia says:
11879
11880 EO, or Eight Ones, is an 8-bit EBCDIC character code represented as all
11881 ones (binary 1111 1111, hexadecimal FF). It is similar, but not
11882 identical, to the ASCII delete (DEL) or rubout control character.
11883 ) So the old condition can be simplified to !isPRINT(c) */
11884 if (!isPRINT(c)) {
11885 if (c < 256) {
11886 Perl_sv_catpvf(aTHX_ sv, "\\x%02x", c);
11887 }
11888 else {
11889 Perl_sv_catpvf(aTHX_ sv, "\\x{%x}", c);
11890 }
11891 }
11892 else {
11893 const char string = c;
11894 if (c == '-' || c == ']' || c == '\\' || c == '^')
11895 sv_catpvs(sv, "\\");
11896 sv_catpvn(sv, &string, 1);
11897 }
11898}
11899
11900
11901#define CLEAR_OPTSTART \
11902 if (optstart) STMT_START { \
11903 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, " (%"IVdf" nodes)\n", (IV)(node - optstart))); \
11904 optstart=NULL; \
11905 } STMT_END
11906
11907#define DUMPUNTIL(b,e) CLEAR_OPTSTART; node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
11908
11909STATIC const regnode *
11910S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
11911 const regnode *last, const regnode *plast,
11912 SV* sv, I32 indent, U32 depth)
11913{
11914 dVAR;
11915 register U8 op = PSEUDO; /* Arbitrary non-END op. */
11916 register const regnode *next;
11917 const regnode *optstart= NULL;
11918
11919 RXi_GET_DECL(r,ri);
11920 GET_RE_DEBUG_FLAGS_DECL;
11921
11922 PERL_ARGS_ASSERT_DUMPUNTIL;
11923
11924#ifdef DEBUG_DUMPUNTIL
11925 PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
11926 last ? last-start : 0,plast ? plast-start : 0);
11927#endif
11928
11929 if (plast && plast < last)
11930 last= plast;
11931
11932 while (PL_regkind[op] != END && (!last || node < last)) {
11933 /* While that wasn't END last time... */
11934 NODE_ALIGN(node);
11935 op = OP(node);
11936 if (op == CLOSE || op == WHILEM)
11937 indent--;
11938 next = regnext((regnode *)node);
11939
11940 /* Where, what. */
11941 if (OP(node) == OPTIMIZED) {
11942 if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
11943 optstart = node;
11944 else
11945 goto after_print;
11946 } else
11947 CLEAR_OPTSTART;
11948
11949 regprop(r, sv, node);
11950 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
11951 (int)(2*indent + 1), "", SvPVX_const(sv));
11952
11953 if (OP(node) != OPTIMIZED) {
11954 if (next == NULL) /* Next ptr. */
11955 PerlIO_printf(Perl_debug_log, " (0)");
11956 else if (PL_regkind[(U8)op] == BRANCH && PL_regkind[OP(next)] != BRANCH )
11957 PerlIO_printf(Perl_debug_log, " (FAIL)");
11958 else
11959 PerlIO_printf(Perl_debug_log, " (%"IVdf")", (IV)(next - start));
11960 (void)PerlIO_putc(Perl_debug_log, '\n');
11961 }
11962
11963 after_print:
11964 if (PL_regkind[(U8)op] == BRANCHJ) {
11965 assert(next);
11966 {
11967 register const regnode *nnode = (OP(next) == LONGJMP
11968 ? regnext((regnode *)next)
11969 : next);
11970 if (last && nnode > last)
11971 nnode = last;
11972 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
11973 }
11974 }
11975 else if (PL_regkind[(U8)op] == BRANCH) {
11976 assert(next);
11977 DUMPUNTIL(NEXTOPER(node), next);
11978 }
11979 else if ( PL_regkind[(U8)op] == TRIE ) {
11980 const regnode *this_trie = node;
11981 const char op = OP(node);
11982 const U32 n = ARG(node);
11983 const reg_ac_data * const ac = op>=AHOCORASICK ?
11984 (reg_ac_data *)ri->data->data[n] :
11985 NULL;
11986 const reg_trie_data * const trie =
11987 (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
11988#ifdef DEBUGGING
11989 AV *const trie_words = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]);
11990#endif
11991 const regnode *nextbranch= NULL;
11992 I32 word_idx;
11993 sv_setpvs(sv, "");
11994 for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
11995 SV ** const elem_ptr = av_fetch(trie_words,word_idx,0);
11996
11997 PerlIO_printf(Perl_debug_log, "%*s%s ",
11998 (int)(2*(indent+3)), "",
11999 elem_ptr ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr), SvCUR(*elem_ptr), 60,
12000 PL_colors[0], PL_colors[1],
12001 (SvUTF8(*elem_ptr) ? PERL_PV_ESCAPE_UNI : 0) |
12002 PERL_PV_PRETTY_ELLIPSES |
12003 PERL_PV_PRETTY_LTGT
12004 )
12005 : "???"
12006 );
12007 if (trie->jump) {
12008 U16 dist= trie->jump[word_idx+1];
12009 PerlIO_printf(Perl_debug_log, "(%"UVuf")\n",
12010 (UV)((dist ? this_trie + dist : next) - start));
12011 if (dist) {
12012 if (!nextbranch)
12013 nextbranch= this_trie + trie->jump[0];
12014 DUMPUNTIL(this_trie + dist, nextbranch);
12015 }
12016 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
12017 nextbranch= regnext((regnode *)nextbranch);
12018 } else {
12019 PerlIO_printf(Perl_debug_log, "\n");
12020 }
12021 }
12022 if (last && next > last)
12023 node= last;
12024 else
12025 node= next;
12026 }
12027 else if ( op == CURLY ) { /* "next" might be very big: optimizer */
12028 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
12029 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
12030 }
12031 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
12032 assert(next);
12033 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
12034 }
12035 else if ( op == PLUS || op == STAR) {
12036 DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
12037 }
12038 else if (PL_regkind[(U8)op] == ANYOF) {
12039 /* arglen 1 + class block */
12040 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_CLASS)
12041 ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
12042 node = NEXTOPER(node);
12043 }
12044 else if (PL_regkind[(U8)op] == EXACT) {
12045 /* Literal string, where present. */
12046 node += NODE_SZ_STR(node) - 1;
12047 node = NEXTOPER(node);
12048 }
12049 else {
12050 node = NEXTOPER(node);
12051 node += regarglen[(U8)op];
12052 }
12053 if (op == CURLYX || op == OPEN)
12054 indent++;
12055 }
12056 CLEAR_OPTSTART;
12057#ifdef DEBUG_DUMPUNTIL
12058 PerlIO_printf(Perl_debug_log, "--- %d\n", (int)indent);
12059#endif
12060 return node;
12061}
12062
12063#endif /* DEBUGGING */
12064
12065/*
12066 * Local variables:
12067 * c-indentation-style: bsd
12068 * c-basic-offset: 4
12069 * indent-tabs-mode: t
12070 * End:
12071 *
12072 * ex: set ts=8 sts=4 sw=4 noet:
12073 */