This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Put a cap on op slab sizes
[perl5.git] / regcomp.c
... / ...
CommitLineData
1/* regcomp.c
2 */
3
4/*
5 * 'A fair jaw-cracker dwarf-language must be.' --Samwise Gamgee
6 *
7 * [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
8 */
9
10/* This file contains functions for compiling a regular expression. See
11 * also regexec.c which funnily enough, contains functions for executing
12 * a regular expression.
13 *
14 * This file is also copied at build time to ext/re/re_comp.c, where
15 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
16 * This causes the main functions to be compiled under new names and with
17 * debugging support added, which makes "use re 'debug'" work.
18 */
19
20/* NOTE: this is derived from Henry Spencer's regexp code, and should not
21 * confused with the original package (see point 3 below). Thanks, Henry!
22 */
23
24/* Additional note: this code is very heavily munged from Henry's version
25 * in places. In some spots I've traded clarity for efficiency, so don't
26 * blame Henry for some of the lack of readability.
27 */
28
29/* The names of the functions have been changed from regcomp and
30 * regexec to pregcomp and pregexec in order to avoid conflicts
31 * with the POSIX routines of the same names.
32*/
33
34#ifdef PERL_EXT_RE_BUILD
35#include "re_top.h"
36#endif
37
38/*
39 * pregcomp and pregexec -- regsub and regerror are not used in perl
40 *
41 * Copyright (c) 1986 by University of Toronto.
42 * Written by Henry Spencer. Not derived from licensed software.
43 *
44 * Permission is granted to anyone to use this software for any
45 * purpose on any computer system, and to redistribute it freely,
46 * subject to the following restrictions:
47 *
48 * 1. The author is not responsible for the consequences of use of
49 * this software, no matter how awful, even if they arise
50 * from defects in it.
51 *
52 * 2. The origin of this software must not be misrepresented, either
53 * by explicit claim or by omission.
54 *
55 * 3. Altered versions must be plainly marked as such, and must not
56 * be misrepresented as being the original software.
57 *
58 *
59 **** Alterations to Henry's code are...
60 ****
61 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
62 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
63 **** by Larry Wall and others
64 ****
65 **** You may distribute under the terms of either the GNU General Public
66 **** License or the Artistic License, as specified in the README file.
67
68 *
69 * Beware that some of this code is subtly aware of the way operator
70 * precedence is structured in regular expressions. Serious changes in
71 * regular-expression syntax might require a total rethink.
72 */
73#include "EXTERN.h"
74#define PERL_IN_REGCOMP_C
75#include "perl.h"
76
77#ifndef PERL_IN_XSUB_RE
78# include "INTERN.h"
79#endif
80
81#define REG_COMP_C
82#ifdef PERL_IN_XSUB_RE
83# include "re_comp.h"
84#else
85# include "regcomp.h"
86#endif
87
88#include "dquote_static.c"
89#ifndef PERL_IN_XSUB_RE
90# include "charclass_invlists.h"
91#endif
92
93#define HAS_NONLATIN1_FOLD_CLOSURE(i) _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
94
95#ifdef op
96#undef op
97#endif /* op */
98
99#ifdef MSDOS
100# if defined(BUGGY_MSC6)
101 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
102# pragma optimize("a",off)
103 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
104# pragma optimize("w",on )
105# endif /* BUGGY_MSC6 */
106#endif /* MSDOS */
107
108#ifndef STATIC
109#define STATIC static
110#endif
111
112
113typedef struct RExC_state_t {
114 U32 flags; /* RXf_* are we folding, multilining? */
115 U32 pm_flags; /* PMf_* stuff from the calling PMOP */
116 char *precomp; /* uncompiled string. */
117 REGEXP *rx_sv; /* The SV that is the regexp. */
118 regexp *rx; /* perl core regexp structure */
119 regexp_internal *rxi; /* internal data for regexp object pprivate field */
120 char *start; /* Start of input for compile */
121 char *end; /* End of input for compile */
122 char *parse; /* Input-scan pointer. */
123 I32 whilem_seen; /* number of WHILEM in this expr */
124 regnode *emit_start; /* Start of emitted-code area */
125 regnode *emit_bound; /* First regnode outside of the allocated space */
126 regnode *emit; /* Code-emit pointer; &regdummy = don't = compiling */
127 I32 naughty; /* How bad is this pattern? */
128 I32 sawback; /* Did we see \1, ...? */
129 U32 seen;
130 I32 size; /* Code size. */
131 I32 npar; /* Capture buffer count, (OPEN). */
132 I32 cpar; /* Capture buffer count, (CLOSE). */
133 I32 nestroot; /* root parens we are in - used by accept */
134 I32 extralen;
135 I32 seen_zerolen;
136 regnode **open_parens; /* pointers to open parens */
137 regnode **close_parens; /* pointers to close parens */
138 regnode *opend; /* END node in program */
139 I32 utf8; /* whether the pattern is utf8 or not */
140 I32 orig_utf8; /* whether the pattern was originally in utf8 */
141 /* XXX use this for future optimisation of case
142 * where pattern must be upgraded to utf8. */
143 I32 uni_semantics; /* If a d charset modifier should use unicode
144 rules, even if the pattern is not in
145 utf8 */
146 HV *paren_names; /* Paren names */
147
148 regnode **recurse; /* Recurse regops */
149 I32 recurse_count; /* Number of recurse regops */
150 I32 in_lookbehind;
151 I32 contains_locale;
152 I32 override_recoding;
153 struct reg_code_block *code_blocks; /* positions of literal (?{})
154 within pattern */
155 int num_code_blocks; /* size of code_blocks[] */
156 int code_index; /* next code_blocks[] slot */
157#if ADD_TO_REGEXEC
158 char *starttry; /* -Dr: where regtry was called. */
159#define RExC_starttry (pRExC_state->starttry)
160#endif
161 SV *runtime_code_qr; /* qr with the runtime code blocks */
162#ifdef DEBUGGING
163 const char *lastparse;
164 I32 lastnum;
165 AV *paren_name_list; /* idx -> name */
166#define RExC_lastparse (pRExC_state->lastparse)
167#define RExC_lastnum (pRExC_state->lastnum)
168#define RExC_paren_name_list (pRExC_state->paren_name_list)
169#endif
170} RExC_state_t;
171
172#define RExC_flags (pRExC_state->flags)
173#define RExC_pm_flags (pRExC_state->pm_flags)
174#define RExC_precomp (pRExC_state->precomp)
175#define RExC_rx_sv (pRExC_state->rx_sv)
176#define RExC_rx (pRExC_state->rx)
177#define RExC_rxi (pRExC_state->rxi)
178#define RExC_start (pRExC_state->start)
179#define RExC_end (pRExC_state->end)
180#define RExC_parse (pRExC_state->parse)
181#define RExC_whilem_seen (pRExC_state->whilem_seen)
182#ifdef RE_TRACK_PATTERN_OFFSETS
183#define RExC_offsets (pRExC_state->rxi->u.offsets) /* I am not like the others */
184#endif
185#define RExC_emit (pRExC_state->emit)
186#define RExC_emit_start (pRExC_state->emit_start)
187#define RExC_emit_bound (pRExC_state->emit_bound)
188#define RExC_naughty (pRExC_state->naughty)
189#define RExC_sawback (pRExC_state->sawback)
190#define RExC_seen (pRExC_state->seen)
191#define RExC_size (pRExC_state->size)
192#define RExC_npar (pRExC_state->npar)
193#define RExC_nestroot (pRExC_state->nestroot)
194#define RExC_extralen (pRExC_state->extralen)
195#define RExC_seen_zerolen (pRExC_state->seen_zerolen)
196#define RExC_utf8 (pRExC_state->utf8)
197#define RExC_uni_semantics (pRExC_state->uni_semantics)
198#define RExC_orig_utf8 (pRExC_state->orig_utf8)
199#define RExC_open_parens (pRExC_state->open_parens)
200#define RExC_close_parens (pRExC_state->close_parens)
201#define RExC_opend (pRExC_state->opend)
202#define RExC_paren_names (pRExC_state->paren_names)
203#define RExC_recurse (pRExC_state->recurse)
204#define RExC_recurse_count (pRExC_state->recurse_count)
205#define RExC_in_lookbehind (pRExC_state->in_lookbehind)
206#define RExC_contains_locale (pRExC_state->contains_locale)
207#define RExC_override_recoding (pRExC_state->override_recoding)
208
209
210#define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
211#define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
212 ((*s) == '{' && regcurly(s)))
213
214#ifdef SPSTART
215#undef SPSTART /* dratted cpp namespace... */
216#endif
217/*
218 * Flags to be passed up and down.
219 */
220#define WORST 0 /* Worst case. */
221#define HASWIDTH 0x01 /* Known to match non-null strings. */
222
223/* Simple enough to be STAR/PLUS operand, in an EXACT node must be a single
224 * character, and if utf8, must be invariant. Note that this is not the same
225 * thing as REGNODE_SIMPLE */
226#define SIMPLE 0x02
227#define SPSTART 0x04 /* Starts with * or +. */
228#define TRYAGAIN 0x08 /* Weeded out a declaration. */
229#define POSTPONED 0x10 /* (?1),(?&name), (??{...}) or similar */
230
231#define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
232
233/* whether trie related optimizations are enabled */
234#if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
235#define TRIE_STUDY_OPT
236#define FULL_TRIE_STUDY
237#define TRIE_STCLASS
238#endif
239
240
241
242#define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
243#define PBITVAL(paren) (1 << ((paren) & 7))
244#define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
245#define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
246#define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
247
248/* If not already in utf8, do a longjmp back to the beginning */
249#define UTF8_LONGJMP 42 /* Choose a value not likely to ever conflict */
250#define REQUIRE_UTF8 STMT_START { \
251 if (! UTF) JMPENV_JUMP(UTF8_LONGJMP); \
252 } STMT_END
253
254/* About scan_data_t.
255
256 During optimisation we recurse through the regexp program performing
257 various inplace (keyhole style) optimisations. In addition study_chunk
258 and scan_commit populate this data structure with information about
259 what strings MUST appear in the pattern. We look for the longest
260 string that must appear at a fixed location, and we look for the
261 longest string that may appear at a floating location. So for instance
262 in the pattern:
263
264 /FOO[xX]A.*B[xX]BAR/
265
266 Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
267 strings (because they follow a .* construct). study_chunk will identify
268 both FOO and BAR as being the longest fixed and floating strings respectively.
269
270 The strings can be composites, for instance
271
272 /(f)(o)(o)/
273
274 will result in a composite fixed substring 'foo'.
275
276 For each string some basic information is maintained:
277
278 - offset or min_offset
279 This is the position the string must appear at, or not before.
280 It also implicitly (when combined with minlenp) tells us how many
281 characters must match before the string we are searching for.
282 Likewise when combined with minlenp and the length of the string it
283 tells us how many characters must appear after the string we have
284 found.
285
286 - max_offset
287 Only used for floating strings. This is the rightmost point that
288 the string can appear at. If set to I32 max it indicates that the
289 string can occur infinitely far to the right.
290
291 - minlenp
292 A pointer to the minimum length of the pattern that the string
293 was found inside. This is important as in the case of positive
294 lookahead or positive lookbehind we can have multiple patterns
295 involved. Consider
296
297 /(?=FOO).*F/
298
299 The minimum length of the pattern overall is 3, the minimum length
300 of the lookahead part is 3, but the minimum length of the part that
301 will actually match is 1. So 'FOO's minimum length is 3, but the
302 minimum length for the F is 1. This is important as the minimum length
303 is used to determine offsets in front of and behind the string being
304 looked for. Since strings can be composites this is the length of the
305 pattern at the time it was committed with a scan_commit. Note that
306 the length is calculated by study_chunk, so that the minimum lengths
307 are not known until the full pattern has been compiled, thus the
308 pointer to the value.
309
310 - lookbehind
311
312 In the case of lookbehind the string being searched for can be
313 offset past the start point of the final matching string.
314 If this value was just blithely removed from the min_offset it would
315 invalidate some of the calculations for how many chars must match
316 before or after (as they are derived from min_offset and minlen and
317 the length of the string being searched for).
318 When the final pattern is compiled and the data is moved from the
319 scan_data_t structure into the regexp structure the information
320 about lookbehind is factored in, with the information that would
321 have been lost precalculated in the end_shift field for the
322 associated string.
323
324 The fields pos_min and pos_delta are used to store the minimum offset
325 and the delta to the maximum offset at the current point in the pattern.
326
327*/
328
329typedef struct scan_data_t {
330 /*I32 len_min; unused */
331 /*I32 len_delta; unused */
332 I32 pos_min;
333 I32 pos_delta;
334 SV *last_found;
335 I32 last_end; /* min value, <0 unless valid. */
336 I32 last_start_min;
337 I32 last_start_max;
338 SV **longest; /* Either &l_fixed, or &l_float. */
339 SV *longest_fixed; /* longest fixed string found in pattern */
340 I32 offset_fixed; /* offset where it starts */
341 I32 *minlen_fixed; /* pointer to the minlen relevant to the string */
342 I32 lookbehind_fixed; /* is the position of the string modfied by LB */
343 SV *longest_float; /* longest floating string found in pattern */
344 I32 offset_float_min; /* earliest point in string it can appear */
345 I32 offset_float_max; /* latest point in string it can appear */
346 I32 *minlen_float; /* pointer to the minlen relevant to the string */
347 I32 lookbehind_float; /* is the position of the string modified by LB */
348 I32 flags;
349 I32 whilem_c;
350 I32 *last_closep;
351 struct regnode_charclass_class *start_class;
352} scan_data_t;
353
354/*
355 * Forward declarations for pregcomp()'s friends.
356 */
357
358static const scan_data_t zero_scan_data =
359 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
360
361#define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
362#define SF_BEFORE_SEOL 0x0001
363#define SF_BEFORE_MEOL 0x0002
364#define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
365#define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
366
367#ifdef NO_UNARY_PLUS
368# define SF_FIX_SHIFT_EOL (0+2)
369# define SF_FL_SHIFT_EOL (0+4)
370#else
371# define SF_FIX_SHIFT_EOL (+2)
372# define SF_FL_SHIFT_EOL (+4)
373#endif
374
375#define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
376#define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
377
378#define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
379#define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
380#define SF_IS_INF 0x0040
381#define SF_HAS_PAR 0x0080
382#define SF_IN_PAR 0x0100
383#define SF_HAS_EVAL 0x0200
384#define SCF_DO_SUBSTR 0x0400
385#define SCF_DO_STCLASS_AND 0x0800
386#define SCF_DO_STCLASS_OR 0x1000
387#define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
388#define SCF_WHILEM_VISITED_POS 0x2000
389
390#define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */
391#define SCF_SEEN_ACCEPT 0x8000
392
393#define UTF cBOOL(RExC_utf8)
394
395/* The enums for all these are ordered so things work out correctly */
396#define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET)
397#define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_DEPENDS_CHARSET)
398#define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET)
399#define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags) >= REGEX_UNICODE_CHARSET)
400#define ASCII_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_RESTRICTED_CHARSET)
401#define MORE_ASCII_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_MORE_RESTRICTED_CHARSET)
402#define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags) >= REGEX_ASCII_RESTRICTED_CHARSET)
403
404#define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
405
406#define OOB_UNICODE 12345678
407#define OOB_NAMEDCLASS -1
408
409#define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
410#define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
411
412
413/* length of regex to show in messages that don't mark a position within */
414#define RegexLengthToShowInErrorMessages 127
415
416/*
417 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
418 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
419 * op/pragma/warn/regcomp.
420 */
421#define MARKER1 "<-- HERE" /* marker as it appears in the description */
422#define MARKER2 " <-- HERE " /* marker as it appears within the regex */
423
424#define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
425
426/*
427 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
428 * arg. Show regex, up to a maximum length. If it's too long, chop and add
429 * "...".
430 */
431#define _FAIL(code) STMT_START { \
432 const char *ellipses = ""; \
433 IV len = RExC_end - RExC_precomp; \
434 \
435 if (!SIZE_ONLY) \
436 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
437 if (len > RegexLengthToShowInErrorMessages) { \
438 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
439 len = RegexLengthToShowInErrorMessages - 10; \
440 ellipses = "..."; \
441 } \
442 code; \
443} STMT_END
444
445#define FAIL(msg) _FAIL( \
446 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
447 msg, (int)len, RExC_precomp, ellipses))
448
449#define FAIL2(msg,arg) _FAIL( \
450 Perl_croak(aTHX_ msg " in regex m/%.*s%s/", \
451 arg, (int)len, RExC_precomp, ellipses))
452
453/*
454 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
455 */
456#define Simple_vFAIL(m) STMT_START { \
457 const IV offset = RExC_parse - RExC_precomp; \
458 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
459 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
460} STMT_END
461
462/*
463 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
464 */
465#define vFAIL(m) STMT_START { \
466 if (!SIZE_ONLY) \
467 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
468 Simple_vFAIL(m); \
469} STMT_END
470
471/*
472 * Like Simple_vFAIL(), but accepts two arguments.
473 */
474#define Simple_vFAIL2(m,a1) STMT_START { \
475 const IV offset = RExC_parse - RExC_precomp; \
476 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
477 (int)offset, RExC_precomp, RExC_precomp + offset); \
478} STMT_END
479
480/*
481 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
482 */
483#define vFAIL2(m,a1) STMT_START { \
484 if (!SIZE_ONLY) \
485 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
486 Simple_vFAIL2(m, a1); \
487} STMT_END
488
489
490/*
491 * Like Simple_vFAIL(), but accepts three arguments.
492 */
493#define Simple_vFAIL3(m, a1, a2) STMT_START { \
494 const IV offset = RExC_parse - RExC_precomp; \
495 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
496 (int)offset, RExC_precomp, RExC_precomp + offset); \
497} STMT_END
498
499/*
500 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
501 */
502#define vFAIL3(m,a1,a2) STMT_START { \
503 if (!SIZE_ONLY) \
504 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
505 Simple_vFAIL3(m, a1, a2); \
506} STMT_END
507
508/*
509 * Like Simple_vFAIL(), but accepts four arguments.
510 */
511#define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
512 const IV offset = RExC_parse - RExC_precomp; \
513 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
514 (int)offset, RExC_precomp, RExC_precomp + offset); \
515} STMT_END
516
517#define ckWARNreg(loc,m) STMT_START { \
518 const IV offset = loc - RExC_precomp; \
519 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
520 (int)offset, RExC_precomp, RExC_precomp + offset); \
521} STMT_END
522
523#define ckWARNregdep(loc,m) STMT_START { \
524 const IV offset = loc - RExC_precomp; \
525 Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
526 m REPORT_LOCATION, \
527 (int)offset, RExC_precomp, RExC_precomp + offset); \
528} STMT_END
529
530#define ckWARN2regdep(loc,m, a1) STMT_START { \
531 const IV offset = loc - RExC_precomp; \
532 Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
533 m REPORT_LOCATION, \
534 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
535} STMT_END
536
537#define ckWARN2reg(loc, m, a1) STMT_START { \
538 const IV offset = loc - RExC_precomp; \
539 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
540 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
541} STMT_END
542
543#define vWARN3(loc, m, a1, a2) STMT_START { \
544 const IV offset = loc - RExC_precomp; \
545 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
546 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
547} STMT_END
548
549#define ckWARN3reg(loc, m, a1, a2) STMT_START { \
550 const IV offset = loc - RExC_precomp; \
551 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
552 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
553} STMT_END
554
555#define vWARN4(loc, m, a1, a2, a3) STMT_START { \
556 const IV offset = loc - RExC_precomp; \
557 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
558 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
559} STMT_END
560
561#define ckWARN4reg(loc, m, a1, a2, a3) STMT_START { \
562 const IV offset = loc - RExC_precomp; \
563 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
564 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
565} STMT_END
566
567#define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
568 const IV offset = loc - RExC_precomp; \
569 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
570 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
571} STMT_END
572
573
574/* Allow for side effects in s */
575#define REGC(c,s) STMT_START { \
576 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
577} STMT_END
578
579/* Macros for recording node offsets. 20001227 mjd@plover.com
580 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
581 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
582 * Element 0 holds the number n.
583 * Position is 1 indexed.
584 */
585#ifndef RE_TRACK_PATTERN_OFFSETS
586#define Set_Node_Offset_To_R(node,byte)
587#define Set_Node_Offset(node,byte)
588#define Set_Cur_Node_Offset
589#define Set_Node_Length_To_R(node,len)
590#define Set_Node_Length(node,len)
591#define Set_Node_Cur_Length(node)
592#define Node_Offset(n)
593#define Node_Length(n)
594#define Set_Node_Offset_Length(node,offset,len)
595#define ProgLen(ri) ri->u.proglen
596#define SetProgLen(ri,x) ri->u.proglen = x
597#else
598#define ProgLen(ri) ri->u.offsets[0]
599#define SetProgLen(ri,x) ri->u.offsets[0] = x
600#define Set_Node_Offset_To_R(node,byte) STMT_START { \
601 if (! SIZE_ONLY) { \
602 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
603 __LINE__, (int)(node), (int)(byte))); \
604 if((node) < 0) { \
605 Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
606 } else { \
607 RExC_offsets[2*(node)-1] = (byte); \
608 } \
609 } \
610} STMT_END
611
612#define Set_Node_Offset(node,byte) \
613 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
614#define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
615
616#define Set_Node_Length_To_R(node,len) STMT_START { \
617 if (! SIZE_ONLY) { \
618 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
619 __LINE__, (int)(node), (int)(len))); \
620 if((node) < 0) { \
621 Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
622 } else { \
623 RExC_offsets[2*(node)] = (len); \
624 } \
625 } \
626} STMT_END
627
628#define Set_Node_Length(node,len) \
629 Set_Node_Length_To_R((node)-RExC_emit_start, len)
630#define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
631#define Set_Node_Cur_Length(node) \
632 Set_Node_Length(node, RExC_parse - parse_start)
633
634/* Get offsets and lengths */
635#define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
636#define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
637
638#define Set_Node_Offset_Length(node,offset,len) STMT_START { \
639 Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
640 Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
641} STMT_END
642#endif
643
644#if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
645#define EXPERIMENTAL_INPLACESCAN
646#endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/
647
648#define DEBUG_STUDYDATA(str,data,depth) \
649DEBUG_OPTIMISE_MORE_r(if(data){ \
650 PerlIO_printf(Perl_debug_log, \
651 "%*s" str "Pos:%"IVdf"/%"IVdf \
652 " Flags: 0x%"UVXf" Whilem_c: %"IVdf" Lcp: %"IVdf" %s", \
653 (int)(depth)*2, "", \
654 (IV)((data)->pos_min), \
655 (IV)((data)->pos_delta), \
656 (UV)((data)->flags), \
657 (IV)((data)->whilem_c), \
658 (IV)((data)->last_closep ? *((data)->last_closep) : -1), \
659 is_inf ? "INF " : "" \
660 ); \
661 if ((data)->last_found) \
662 PerlIO_printf(Perl_debug_log, \
663 "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
664 " %sFloat: '%s' @ %"IVdf"/%"IVdf"", \
665 SvPVX_const((data)->last_found), \
666 (IV)((data)->last_end), \
667 (IV)((data)->last_start_min), \
668 (IV)((data)->last_start_max), \
669 ((data)->longest && \
670 (data)->longest==&((data)->longest_fixed)) ? "*" : "", \
671 SvPVX_const((data)->longest_fixed), \
672 (IV)((data)->offset_fixed), \
673 ((data)->longest && \
674 (data)->longest==&((data)->longest_float)) ? "*" : "", \
675 SvPVX_const((data)->longest_float), \
676 (IV)((data)->offset_float_min), \
677 (IV)((data)->offset_float_max) \
678 ); \
679 PerlIO_printf(Perl_debug_log,"\n"); \
680});
681
682static void clear_re(pTHX_ void *r);
683
684/* Mark that we cannot extend a found fixed substring at this point.
685 Update the longest found anchored substring and the longest found
686 floating substrings if needed. */
687
688STATIC void
689S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *minlenp, int is_inf)
690{
691 const STRLEN l = CHR_SVLEN(data->last_found);
692 const STRLEN old_l = CHR_SVLEN(*data->longest);
693 GET_RE_DEBUG_FLAGS_DECL;
694
695 PERL_ARGS_ASSERT_SCAN_COMMIT;
696
697 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
698 SvSetMagicSV(*data->longest, data->last_found);
699 if (*data->longest == data->longest_fixed) {
700 data->offset_fixed = l ? data->last_start_min : data->pos_min;
701 if (data->flags & SF_BEFORE_EOL)
702 data->flags
703 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
704 else
705 data->flags &= ~SF_FIX_BEFORE_EOL;
706 data->minlen_fixed=minlenp;
707 data->lookbehind_fixed=0;
708 }
709 else { /* *data->longest == data->longest_float */
710 data->offset_float_min = l ? data->last_start_min : data->pos_min;
711 data->offset_float_max = (l
712 ? data->last_start_max
713 : data->pos_min + data->pos_delta);
714 if (is_inf || (U32)data->offset_float_max > (U32)I32_MAX)
715 data->offset_float_max = I32_MAX;
716 if (data->flags & SF_BEFORE_EOL)
717 data->flags
718 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
719 else
720 data->flags &= ~SF_FL_BEFORE_EOL;
721 data->minlen_float=minlenp;
722 data->lookbehind_float=0;
723 }
724 }
725 SvCUR_set(data->last_found, 0);
726 {
727 SV * const sv = data->last_found;
728 if (SvUTF8(sv) && SvMAGICAL(sv)) {
729 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
730 if (mg)
731 mg->mg_len = 0;
732 }
733 }
734 data->last_end = -1;
735 data->flags &= ~SF_BEFORE_EOL;
736 DEBUG_STUDYDATA("commit: ",data,0);
737}
738
739/* Can match anything (initialization) */
740STATIC void
741S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
742{
743 PERL_ARGS_ASSERT_CL_ANYTHING;
744
745 ANYOF_BITMAP_SETALL(cl);
746 cl->flags = ANYOF_CLASS|ANYOF_EOS|ANYOF_UNICODE_ALL
747 |ANYOF_LOC_NONBITMAP_FOLD|ANYOF_NON_UTF8_LATIN1_ALL;
748
749 /* If any portion of the regex is to operate under locale rules,
750 * initialization includes it. The reason this isn't done for all regexes
751 * is that the optimizer was written under the assumption that locale was
752 * all-or-nothing. Given the complexity and lack of documentation in the
753 * optimizer, and that there are inadequate test cases for locale, so many
754 * parts of it may not work properly, it is safest to avoid locale unless
755 * necessary. */
756 if (RExC_contains_locale) {
757 ANYOF_CLASS_SETALL(cl); /* /l uses class */
758 cl->flags |= ANYOF_LOCALE;
759 }
760 else {
761 ANYOF_CLASS_ZERO(cl); /* Only /l uses class now */
762 }
763}
764
765/* Can match anything (initialization) */
766STATIC int
767S_cl_is_anything(const struct regnode_charclass_class *cl)
768{
769 int value;
770
771 PERL_ARGS_ASSERT_CL_IS_ANYTHING;
772
773 for (value = 0; value <= ANYOF_MAX; value += 2)
774 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
775 return 1;
776 if (!(cl->flags & ANYOF_UNICODE_ALL))
777 return 0;
778 if (!ANYOF_BITMAP_TESTALLSET((const void*)cl))
779 return 0;
780 return 1;
781}
782
783/* Can match anything (initialization) */
784STATIC void
785S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
786{
787 PERL_ARGS_ASSERT_CL_INIT;
788
789 Zero(cl, 1, struct regnode_charclass_class);
790 cl->type = ANYOF;
791 cl_anything(pRExC_state, cl);
792 ARG_SET(cl, ANYOF_NONBITMAP_EMPTY);
793}
794
795/* These two functions currently do the exact same thing */
796#define cl_init_zero S_cl_init
797
798/* 'AND' a given class with another one. Can create false positives. 'cl'
799 * should not be inverted. 'and_with->flags & ANYOF_CLASS' should be 0 if
800 * 'and_with' is a regnode_charclass instead of a regnode_charclass_class. */
801STATIC void
802S_cl_and(struct regnode_charclass_class *cl,
803 const struct regnode_charclass_class *and_with)
804{
805 PERL_ARGS_ASSERT_CL_AND;
806
807 assert(and_with->type == ANYOF);
808
809 /* I (khw) am not sure all these restrictions are necessary XXX */
810 if (!(ANYOF_CLASS_TEST_ANY_SET(and_with))
811 && !(ANYOF_CLASS_TEST_ANY_SET(cl))
812 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
813 && !(and_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
814 && !(cl->flags & ANYOF_LOC_NONBITMAP_FOLD)) {
815 int i;
816
817 if (and_with->flags & ANYOF_INVERT)
818 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
819 cl->bitmap[i] &= ~and_with->bitmap[i];
820 else
821 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
822 cl->bitmap[i] &= and_with->bitmap[i];
823 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
824
825 if (and_with->flags & ANYOF_INVERT) {
826
827 /* Here, the and'ed node is inverted. Get the AND of the flags that
828 * aren't affected by the inversion. Those that are affected are
829 * handled individually below */
830 U8 affected_flags = cl->flags & ~INVERSION_UNAFFECTED_FLAGS;
831 cl->flags &= (and_with->flags & INVERSION_UNAFFECTED_FLAGS);
832 cl->flags |= affected_flags;
833
834 /* We currently don't know how to deal with things that aren't in the
835 * bitmap, but we know that the intersection is no greater than what
836 * is already in cl, so let there be false positives that get sorted
837 * out after the synthetic start class succeeds, and the node is
838 * matched for real. */
839
840 /* The inversion of these two flags indicate that the resulting
841 * intersection doesn't have them */
842 if (and_with->flags & ANYOF_UNICODE_ALL) {
843 cl->flags &= ~ANYOF_UNICODE_ALL;
844 }
845 if (and_with->flags & ANYOF_NON_UTF8_LATIN1_ALL) {
846 cl->flags &= ~ANYOF_NON_UTF8_LATIN1_ALL;
847 }
848 }
849 else { /* and'd node is not inverted */
850 U8 outside_bitmap_but_not_utf8; /* Temp variable */
851
852 if (! ANYOF_NONBITMAP(and_with)) {
853
854 /* Here 'and_with' doesn't match anything outside the bitmap
855 * (except possibly ANYOF_UNICODE_ALL), which means the
856 * intersection can't either, except for ANYOF_UNICODE_ALL, in
857 * which case we don't know what the intersection is, but it's no
858 * greater than what cl already has, so can just leave it alone,
859 * with possible false positives */
860 if (! (and_with->flags & ANYOF_UNICODE_ALL)) {
861 ARG_SET(cl, ANYOF_NONBITMAP_EMPTY);
862 cl->flags &= ~ANYOF_NONBITMAP_NON_UTF8;
863 }
864 }
865 else if (! ANYOF_NONBITMAP(cl)) {
866
867 /* Here, 'and_with' does match something outside the bitmap, and cl
868 * doesn't have a list of things to match outside the bitmap. If
869 * cl can match all code points above 255, the intersection will
870 * be those above-255 code points that 'and_with' matches. If cl
871 * can't match all Unicode code points, it means that it can't
872 * match anything outside the bitmap (since the 'if' that got us
873 * into this block tested for that), so we leave the bitmap empty.
874 */
875 if (cl->flags & ANYOF_UNICODE_ALL) {
876 ARG_SET(cl, ARG(and_with));
877
878 /* and_with's ARG may match things that don't require UTF8.
879 * And now cl's will too, in spite of this being an 'and'. See
880 * the comments below about the kludge */
881 cl->flags |= and_with->flags & ANYOF_NONBITMAP_NON_UTF8;
882 }
883 }
884 else {
885 /* Here, both 'and_with' and cl match something outside the
886 * bitmap. Currently we do not do the intersection, so just match
887 * whatever cl had at the beginning. */
888 }
889
890
891 /* Take the intersection of the two sets of flags. However, the
892 * ANYOF_NONBITMAP_NON_UTF8 flag is treated as an 'or'. This is a
893 * kludge around the fact that this flag is not treated like the others
894 * which are initialized in cl_anything(). The way the optimizer works
895 * is that the synthetic start class (SSC) is initialized to match
896 * anything, and then the first time a real node is encountered, its
897 * values are AND'd with the SSC's with the result being the values of
898 * the real node. However, there are paths through the optimizer where
899 * the AND never gets called, so those initialized bits are set
900 * inappropriately, which is not usually a big deal, as they just cause
901 * false positives in the SSC, which will just mean a probably
902 * imperceptible slow down in execution. However this bit has a
903 * higher false positive consequence in that it can cause utf8.pm,
904 * utf8_heavy.pl ... to be loaded when not necessary, which is a much
905 * bigger slowdown and also causes significant extra memory to be used.
906 * In order to prevent this, the code now takes a different tack. The
907 * bit isn't set unless some part of the regular expression needs it,
908 * but once set it won't get cleared. This means that these extra
909 * modules won't get loaded unless there was some path through the
910 * pattern that would have required them anyway, and so any false
911 * positives that occur by not ANDing them out when they could be
912 * aren't as severe as they would be if we treated this bit like all
913 * the others */
914 outside_bitmap_but_not_utf8 = (cl->flags | and_with->flags)
915 & ANYOF_NONBITMAP_NON_UTF8;
916 cl->flags &= and_with->flags;
917 cl->flags |= outside_bitmap_but_not_utf8;
918 }
919}
920
921/* 'OR' a given class with another one. Can create false positives. 'cl'
922 * should not be inverted. 'or_with->flags & ANYOF_CLASS' should be 0 if
923 * 'or_with' is a regnode_charclass instead of a regnode_charclass_class. */
924STATIC void
925S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
926{
927 PERL_ARGS_ASSERT_CL_OR;
928
929 if (or_with->flags & ANYOF_INVERT) {
930
931 /* Here, the or'd node is to be inverted. This means we take the
932 * complement of everything not in the bitmap, but currently we don't
933 * know what that is, so give up and match anything */
934 if (ANYOF_NONBITMAP(or_with)) {
935 cl_anything(pRExC_state, cl);
936 }
937 /* We do not use
938 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
939 * <= (B1 | !B2) | (CL1 | !CL2)
940 * which is wasteful if CL2 is small, but we ignore CL2:
941 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
942 * XXXX Can we handle case-fold? Unclear:
943 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
944 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
945 */
946 else if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
947 && !(or_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
948 && !(cl->flags & ANYOF_LOC_NONBITMAP_FOLD) ) {
949 int i;
950
951 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
952 cl->bitmap[i] |= ~or_with->bitmap[i];
953 } /* XXXX: logic is complicated otherwise */
954 else {
955 cl_anything(pRExC_state, cl);
956 }
957
958 /* And, we can just take the union of the flags that aren't affected
959 * by the inversion */
960 cl->flags |= or_with->flags & INVERSION_UNAFFECTED_FLAGS;
961
962 /* For the remaining flags:
963 ANYOF_UNICODE_ALL and inverted means to not match anything above
964 255, which means that the union with cl should just be
965 what cl has in it, so can ignore this flag
966 ANYOF_NON_UTF8_LATIN1_ALL and inverted means if not utf8 and ord
967 is 127-255 to match them, but then invert that, so the
968 union with cl should just be what cl has in it, so can
969 ignore this flag
970 */
971 } else { /* 'or_with' is not inverted */
972 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
973 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
974 && (!(or_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
975 || (cl->flags & ANYOF_LOC_NONBITMAP_FOLD)) ) {
976 int i;
977
978 /* OR char bitmap and class bitmap separately */
979 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
980 cl->bitmap[i] |= or_with->bitmap[i];
981 if (ANYOF_CLASS_TEST_ANY_SET(or_with)) {
982 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
983 cl->classflags[i] |= or_with->classflags[i];
984 cl->flags |= ANYOF_CLASS;
985 }
986 }
987 else { /* XXXX: logic is complicated, leave it along for a moment. */
988 cl_anything(pRExC_state, cl);
989 }
990
991 if (ANYOF_NONBITMAP(or_with)) {
992
993 /* Use the added node's outside-the-bit-map match if there isn't a
994 * conflict. If there is a conflict (both nodes match something
995 * outside the bitmap, but what they match outside is not the same
996 * pointer, and hence not easily compared until XXX we extend
997 * inversion lists this far), give up and allow the start class to
998 * match everything outside the bitmap. If that stuff is all above
999 * 255, can just set UNICODE_ALL, otherwise caould be anything. */
1000 if (! ANYOF_NONBITMAP(cl)) {
1001 ARG_SET(cl, ARG(or_with));
1002 }
1003 else if (ARG(cl) != ARG(or_with)) {
1004
1005 if ((or_with->flags & ANYOF_NONBITMAP_NON_UTF8)) {
1006 cl_anything(pRExC_state, cl);
1007 }
1008 else {
1009 cl->flags |= ANYOF_UNICODE_ALL;
1010 }
1011 }
1012 }
1013
1014 /* Take the union */
1015 cl->flags |= or_with->flags;
1016 }
1017}
1018
1019#define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
1020#define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
1021#define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
1022#define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
1023
1024
1025#ifdef DEBUGGING
1026/*
1027 dump_trie(trie,widecharmap,revcharmap)
1028 dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
1029 dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
1030
1031 These routines dump out a trie in a somewhat readable format.
1032 The _interim_ variants are used for debugging the interim
1033 tables that are used to generate the final compressed
1034 representation which is what dump_trie expects.
1035
1036 Part of the reason for their existence is to provide a form
1037 of documentation as to how the different representations function.
1038
1039*/
1040
1041/*
1042 Dumps the final compressed table form of the trie to Perl_debug_log.
1043 Used for debugging make_trie().
1044*/
1045
1046STATIC void
1047S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
1048 AV *revcharmap, U32 depth)
1049{
1050 U32 state;
1051 SV *sv=sv_newmortal();
1052 int colwidth= widecharmap ? 6 : 4;
1053 U16 word;
1054 GET_RE_DEBUG_FLAGS_DECL;
1055
1056 PERL_ARGS_ASSERT_DUMP_TRIE;
1057
1058 PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
1059 (int)depth * 2 + 2,"",
1060 "Match","Base","Ofs" );
1061
1062 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
1063 SV ** const tmp = av_fetch( revcharmap, state, 0);
1064 if ( tmp ) {
1065 PerlIO_printf( Perl_debug_log, "%*s",
1066 colwidth,
1067 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1068 PL_colors[0], PL_colors[1],
1069 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1070 PERL_PV_ESCAPE_FIRSTCHAR
1071 )
1072 );
1073 }
1074 }
1075 PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
1076 (int)depth * 2 + 2,"");
1077
1078 for( state = 0 ; state < trie->uniquecharcount ; state++ )
1079 PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
1080 PerlIO_printf( Perl_debug_log, "\n");
1081
1082 for( state = 1 ; state < trie->statecount ; state++ ) {
1083 const U32 base = trie->states[ state ].trans.base;
1084
1085 PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
1086
1087 if ( trie->states[ state ].wordnum ) {
1088 PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
1089 } else {
1090 PerlIO_printf( Perl_debug_log, "%6s", "" );
1091 }
1092
1093 PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
1094
1095 if ( base ) {
1096 U32 ofs = 0;
1097
1098 while( ( base + ofs < trie->uniquecharcount ) ||
1099 ( base + ofs - trie->uniquecharcount < trie->lasttrans
1100 && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
1101 ofs++;
1102
1103 PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
1104
1105 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
1106 if ( ( base + ofs >= trie->uniquecharcount ) &&
1107 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
1108 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1109 {
1110 PerlIO_printf( Perl_debug_log, "%*"UVXf,
1111 colwidth,
1112 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
1113 } else {
1114 PerlIO_printf( Perl_debug_log, "%*s",colwidth," ." );
1115 }
1116 }
1117
1118 PerlIO_printf( Perl_debug_log, "]");
1119
1120 }
1121 PerlIO_printf( Perl_debug_log, "\n" );
1122 }
1123 PerlIO_printf(Perl_debug_log, "%*sword_info N:(prev,len)=", (int)depth*2, "");
1124 for (word=1; word <= trie->wordcount; word++) {
1125 PerlIO_printf(Perl_debug_log, " %d:(%d,%d)",
1126 (int)word, (int)(trie->wordinfo[word].prev),
1127 (int)(trie->wordinfo[word].len));
1128 }
1129 PerlIO_printf(Perl_debug_log, "\n" );
1130}
1131/*
1132 Dumps a fully constructed but uncompressed trie in list form.
1133 List tries normally only are used for construction when the number of
1134 possible chars (trie->uniquecharcount) is very high.
1135 Used for debugging make_trie().
1136*/
1137STATIC void
1138S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
1139 HV *widecharmap, AV *revcharmap, U32 next_alloc,
1140 U32 depth)
1141{
1142 U32 state;
1143 SV *sv=sv_newmortal();
1144 int colwidth= widecharmap ? 6 : 4;
1145 GET_RE_DEBUG_FLAGS_DECL;
1146
1147 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
1148
1149 /* print out the table precompression. */
1150 PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
1151 (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
1152 "------:-----+-----------------\n" );
1153
1154 for( state=1 ; state < next_alloc ; state ++ ) {
1155 U16 charid;
1156
1157 PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
1158 (int)depth * 2 + 2,"", (UV)state );
1159 if ( ! trie->states[ state ].wordnum ) {
1160 PerlIO_printf( Perl_debug_log, "%5s| ","");
1161 } else {
1162 PerlIO_printf( Perl_debug_log, "W%4x| ",
1163 trie->states[ state ].wordnum
1164 );
1165 }
1166 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
1167 SV ** const tmp = av_fetch( revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
1168 if ( tmp ) {
1169 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
1170 colwidth,
1171 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1172 PL_colors[0], PL_colors[1],
1173 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1174 PERL_PV_ESCAPE_FIRSTCHAR
1175 ) ,
1176 TRIE_LIST_ITEM(state,charid).forid,
1177 (UV)TRIE_LIST_ITEM(state,charid).newstate
1178 );
1179 if (!(charid % 10))
1180 PerlIO_printf(Perl_debug_log, "\n%*s| ",
1181 (int)((depth * 2) + 14), "");
1182 }
1183 }
1184 PerlIO_printf( Perl_debug_log, "\n");
1185 }
1186}
1187
1188/*
1189 Dumps a fully constructed but uncompressed trie in table form.
1190 This is the normal DFA style state transition table, with a few
1191 twists to facilitate compression later.
1192 Used for debugging make_trie().
1193*/
1194STATIC void
1195S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
1196 HV *widecharmap, AV *revcharmap, U32 next_alloc,
1197 U32 depth)
1198{
1199 U32 state;
1200 U16 charid;
1201 SV *sv=sv_newmortal();
1202 int colwidth= widecharmap ? 6 : 4;
1203 GET_RE_DEBUG_FLAGS_DECL;
1204
1205 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
1206
1207 /*
1208 print out the table precompression so that we can do a visual check
1209 that they are identical.
1210 */
1211
1212 PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
1213
1214 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1215 SV ** const tmp = av_fetch( revcharmap, charid, 0);
1216 if ( tmp ) {
1217 PerlIO_printf( Perl_debug_log, "%*s",
1218 colwidth,
1219 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1220 PL_colors[0], PL_colors[1],
1221 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1222 PERL_PV_ESCAPE_FIRSTCHAR
1223 )
1224 );
1225 }
1226 }
1227
1228 PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
1229
1230 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
1231 PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
1232 }
1233
1234 PerlIO_printf( Perl_debug_log, "\n" );
1235
1236 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1237
1238 PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
1239 (int)depth * 2 + 2,"",
1240 (UV)TRIE_NODENUM( state ) );
1241
1242 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1243 UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1244 if (v)
1245 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1246 else
1247 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
1248 }
1249 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1250 PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
1251 } else {
1252 PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
1253 trie->states[ TRIE_NODENUM( state ) ].wordnum );
1254 }
1255 }
1256}
1257
1258#endif
1259
1260
1261/* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1262 startbranch: the first branch in the whole branch sequence
1263 first : start branch of sequence of branch-exact nodes.
1264 May be the same as startbranch
1265 last : Thing following the last branch.
1266 May be the same as tail.
1267 tail : item following the branch sequence
1268 count : words in the sequence
1269 flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/
1270 depth : indent depth
1271
1272Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
1273
1274A trie is an N'ary tree where the branches are determined by digital
1275decomposition of the key. IE, at the root node you look up the 1st character and
1276follow that branch repeat until you find the end of the branches. Nodes can be
1277marked as "accepting" meaning they represent a complete word. Eg:
1278
1279 /he|she|his|hers/
1280
1281would convert into the following structure. Numbers represent states, letters
1282following numbers represent valid transitions on the letter from that state, if
1283the number is in square brackets it represents an accepting state, otherwise it
1284will be in parenthesis.
1285
1286 +-h->+-e->[3]-+-r->(8)-+-s->[9]
1287 | |
1288 | (2)
1289 | |
1290 (1) +-i->(6)-+-s->[7]
1291 |
1292 +-s->(3)-+-h->(4)-+-e->[5]
1293
1294 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1295
1296This shows that when matching against the string 'hers' we will begin at state 1
1297read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1298then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1299is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1300single traverse. We store a mapping from accepting to state to which word was
1301matched, and then when we have multiple possibilities we try to complete the
1302rest of the regex in the order in which they occured in the alternation.
1303
1304The only prior NFA like behaviour that would be changed by the TRIE support is
1305the silent ignoring of duplicate alternations which are of the form:
1306
1307 / (DUPE|DUPE) X? (?{ ... }) Y /x
1308
1309Thus EVAL blocks following a trie may be called a different number of times with
1310and without the optimisation. With the optimisations dupes will be silently
1311ignored. This inconsistent behaviour of EVAL type nodes is well established as
1312the following demonstrates:
1313
1314 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1315
1316which prints out 'word' three times, but
1317
1318 'words'=~/(word|word|word)(?{ print $1 })S/
1319
1320which doesnt print it out at all. This is due to other optimisations kicking in.
1321
1322Example of what happens on a structural level:
1323
1324The regexp /(ac|ad|ab)+/ will produce the following debug output:
1325
1326 1: CURLYM[1] {1,32767}(18)
1327 5: BRANCH(8)
1328 6: EXACT <ac>(16)
1329 8: BRANCH(11)
1330 9: EXACT <ad>(16)
1331 11: BRANCH(14)
1332 12: EXACT <ab>(16)
1333 16: SUCCEED(0)
1334 17: NOTHING(18)
1335 18: END(0)
1336
1337This would be optimizable with startbranch=5, first=5, last=16, tail=16
1338and should turn into:
1339
1340 1: CURLYM[1] {1,32767}(18)
1341 5: TRIE(16)
1342 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1343 <ac>
1344 <ad>
1345 <ab>
1346 16: SUCCEED(0)
1347 17: NOTHING(18)
1348 18: END(0)
1349
1350Cases where tail != last would be like /(?foo|bar)baz/:
1351
1352 1: BRANCH(4)
1353 2: EXACT <foo>(8)
1354 4: BRANCH(7)
1355 5: EXACT <bar>(8)
1356 7: TAIL(8)
1357 8: EXACT <baz>(10)
1358 10: END(0)
1359
1360which would be optimizable with startbranch=1, first=1, last=7, tail=8
1361and would end up looking like:
1362
1363 1: TRIE(8)
1364 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1365 <foo>
1366 <bar>
1367 7: TAIL(8)
1368 8: EXACT <baz>(10)
1369 10: END(0)
1370
1371 d = uvuni_to_utf8_flags(d, uv, 0);
1372
1373is the recommended Unicode-aware way of saying
1374
1375 *(d++) = uv;
1376*/
1377
1378#define TRIE_STORE_REVCHAR(val) \
1379 STMT_START { \
1380 if (UTF) { \
1381 SV *zlopp = newSV(7); /* XXX: optimize me */ \
1382 unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp); \
1383 unsigned const char *const kapow = uvuni_to_utf8(flrbbbbb, val); \
1384 SvCUR_set(zlopp, kapow - flrbbbbb); \
1385 SvPOK_on(zlopp); \
1386 SvUTF8_on(zlopp); \
1387 av_push(revcharmap, zlopp); \
1388 } else { \
1389 char ooooff = (char)val; \
1390 av_push(revcharmap, newSVpvn(&ooooff, 1)); \
1391 } \
1392 } STMT_END
1393
1394#define TRIE_READ_CHAR STMT_START { \
1395 wordlen++; \
1396 if ( UTF ) { \
1397 /* if it is UTF then it is either already folded, or does not need folding */ \
1398 uvc = utf8n_to_uvuni( (const U8*) uc, UTF8_MAXLEN, &len, uniflags); \
1399 } \
1400 else if (folder == PL_fold_latin1) { \
1401 /* if we use this folder we have to obey unicode rules on latin-1 data */ \
1402 if ( foldlen > 0 ) { \
1403 uvc = utf8n_to_uvuni( (const U8*) scan, UTF8_MAXLEN, &len, uniflags ); \
1404 foldlen -= len; \
1405 scan += len; \
1406 len = 0; \
1407 } else { \
1408 len = 1; \
1409 uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, 1); \
1410 skiplen = UNISKIP(uvc); \
1411 foldlen -= skiplen; \
1412 scan = foldbuf + skiplen; \
1413 } \
1414 } else { \
1415 /* raw data, will be folded later if needed */ \
1416 uvc = (U32)*uc; \
1417 len = 1; \
1418 } \
1419} STMT_END
1420
1421
1422
1423#define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
1424 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
1425 U32 ging = TRIE_LIST_LEN( state ) *= 2; \
1426 Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
1427 } \
1428 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
1429 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
1430 TRIE_LIST_CUR( state )++; \
1431} STMT_END
1432
1433#define TRIE_LIST_NEW(state) STMT_START { \
1434 Newxz( trie->states[ state ].trans.list, \
1435 4, reg_trie_trans_le ); \
1436 TRIE_LIST_CUR( state ) = 1; \
1437 TRIE_LIST_LEN( state ) = 4; \
1438} STMT_END
1439
1440#define TRIE_HANDLE_WORD(state) STMT_START { \
1441 U16 dupe= trie->states[ state ].wordnum; \
1442 regnode * const noper_next = regnext( noper ); \
1443 \
1444 DEBUG_r({ \
1445 /* store the word for dumping */ \
1446 SV* tmp; \
1447 if (OP(noper) != NOTHING) \
1448 tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF); \
1449 else \
1450 tmp = newSVpvn_utf8( "", 0, UTF ); \
1451 av_push( trie_words, tmp ); \
1452 }); \
1453 \
1454 curword++; \
1455 trie->wordinfo[curword].prev = 0; \
1456 trie->wordinfo[curword].len = wordlen; \
1457 trie->wordinfo[curword].accept = state; \
1458 \
1459 if ( noper_next < tail ) { \
1460 if (!trie->jump) \
1461 trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, sizeof(U16) ); \
1462 trie->jump[curword] = (U16)(noper_next - convert); \
1463 if (!jumper) \
1464 jumper = noper_next; \
1465 if (!nextbranch) \
1466 nextbranch= regnext(cur); \
1467 } \
1468 \
1469 if ( dupe ) { \
1470 /* It's a dupe. Pre-insert into the wordinfo[].prev */\
1471 /* chain, so that when the bits of chain are later */\
1472 /* linked together, the dups appear in the chain */\
1473 trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
1474 trie->wordinfo[dupe].prev = curword; \
1475 } else { \
1476 /* we haven't inserted this word yet. */ \
1477 trie->states[ state ].wordnum = curword; \
1478 } \
1479} STMT_END
1480
1481
1482#define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
1483 ( ( base + charid >= ucharcount \
1484 && base + charid < ubound \
1485 && state == trie->trans[ base - ucharcount + charid ].check \
1486 && trie->trans[ base - ucharcount + charid ].next ) \
1487 ? trie->trans[ base - ucharcount + charid ].next \
1488 : ( state==1 ? special : 0 ) \
1489 )
1490
1491#define MADE_TRIE 1
1492#define MADE_JUMP_TRIE 2
1493#define MADE_EXACT_TRIE 4
1494
1495STATIC I32
1496S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth)
1497{
1498 dVAR;
1499 /* first pass, loop through and scan words */
1500 reg_trie_data *trie;
1501 HV *widecharmap = NULL;
1502 AV *revcharmap = newAV();
1503 regnode *cur;
1504 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1505 STRLEN len = 0;
1506 UV uvc = 0;
1507 U16 curword = 0;
1508 U32 next_alloc = 0;
1509 regnode *jumper = NULL;
1510 regnode *nextbranch = NULL;
1511 regnode *convert = NULL;
1512 U32 *prev_states; /* temp array mapping each state to previous one */
1513 /* we just use folder as a flag in utf8 */
1514 const U8 * folder = NULL;
1515
1516#ifdef DEBUGGING
1517 const U32 data_slot = add_data( pRExC_state, 4, "tuuu" );
1518 AV *trie_words = NULL;
1519 /* along with revcharmap, this only used during construction but both are
1520 * useful during debugging so we store them in the struct when debugging.
1521 */
1522#else
1523 const U32 data_slot = add_data( pRExC_state, 2, "tu" );
1524 STRLEN trie_charcount=0;
1525#endif
1526 SV *re_trie_maxbuff;
1527 GET_RE_DEBUG_FLAGS_DECL;
1528
1529 PERL_ARGS_ASSERT_MAKE_TRIE;
1530#ifndef DEBUGGING
1531 PERL_UNUSED_ARG(depth);
1532#endif
1533
1534 switch (flags) {
1535 case EXACT: break;
1536 case EXACTFA:
1537 case EXACTFU_SS:
1538 case EXACTFU_TRICKYFOLD:
1539 case EXACTFU: folder = PL_fold_latin1; break;
1540 case EXACTF: folder = PL_fold; break;
1541 case EXACTFL: folder = PL_fold_locale; break;
1542 default: Perl_croak( aTHX_ "panic! In trie construction, unknown node type %u %s", (unsigned) flags, PL_reg_name[flags] );
1543 }
1544
1545 trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
1546 trie->refcount = 1;
1547 trie->startstate = 1;
1548 trie->wordcount = word_count;
1549 RExC_rxi->data->data[ data_slot ] = (void*)trie;
1550 trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
1551 if (flags == EXACT)
1552 trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
1553 trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
1554 trie->wordcount+1, sizeof(reg_trie_wordinfo));
1555
1556 DEBUG_r({
1557 trie_words = newAV();
1558 });
1559
1560 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
1561 if (!SvIOK(re_trie_maxbuff)) {
1562 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
1563 }
1564 DEBUG_TRIE_COMPILE_r({
1565 PerlIO_printf( Perl_debug_log,
1566 "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
1567 (int)depth * 2 + 2, "",
1568 REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
1569 REG_NODE_NUM(last), REG_NODE_NUM(tail),
1570 (int)depth);
1571 });
1572
1573 /* Find the node we are going to overwrite */
1574 if ( first == startbranch && OP( last ) != BRANCH ) {
1575 /* whole branch chain */
1576 convert = first;
1577 } else {
1578 /* branch sub-chain */
1579 convert = NEXTOPER( first );
1580 }
1581
1582 /* -- First loop and Setup --
1583
1584 We first traverse the branches and scan each word to determine if it
1585 contains widechars, and how many unique chars there are, this is
1586 important as we have to build a table with at least as many columns as we
1587 have unique chars.
1588
1589 We use an array of integers to represent the character codes 0..255
1590 (trie->charmap) and we use a an HV* to store Unicode characters. We use the
1591 native representation of the character value as the key and IV's for the
1592 coded index.
1593
1594 *TODO* If we keep track of how many times each character is used we can
1595 remap the columns so that the table compression later on is more
1596 efficient in terms of memory by ensuring the most common value is in the
1597 middle and the least common are on the outside. IMO this would be better
1598 than a most to least common mapping as theres a decent chance the most
1599 common letter will share a node with the least common, meaning the node
1600 will not be compressible. With a middle is most common approach the worst
1601 case is when we have the least common nodes twice.
1602
1603 */
1604
1605 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1606 regnode *noper = NEXTOPER( cur );
1607 const U8 *uc = (U8*)STRING( noper );
1608 const U8 *e = uc + STR_LEN( noper );
1609 STRLEN foldlen = 0;
1610 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1611 STRLEN skiplen = 0;
1612 const U8 *scan = (U8*)NULL;
1613 U32 wordlen = 0; /* required init */
1614 STRLEN chars = 0;
1615 bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the bitmap?*/
1616
1617 if (OP(noper) == NOTHING) {
1618 regnode *noper_next= regnext(noper);
1619 if (noper_next != tail && OP(noper_next) == flags) {
1620 noper = noper_next;
1621 uc= (U8*)STRING(noper);
1622 e= uc + STR_LEN(noper);
1623 trie->minlen= STR_LEN(noper);
1624 } else {
1625 trie->minlen= 0;
1626 continue;
1627 }
1628 }
1629
1630 if ( set_bit ) { /* bitmap only alloced when !(UTF&&Folding) */
1631 TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
1632 regardless of encoding */
1633 if (OP( noper ) == EXACTFU_SS) {
1634 /* false positives are ok, so just set this */
1635 TRIE_BITMAP_SET(trie,0xDF);
1636 }
1637 }
1638 for ( ; uc < e ; uc += len ) {
1639 TRIE_CHARCOUNT(trie)++;
1640 TRIE_READ_CHAR;
1641 chars++;
1642 if ( uvc < 256 ) {
1643 if ( folder ) {
1644 U8 folded= folder[ (U8) uvc ];
1645 if ( !trie->charmap[ folded ] ) {
1646 trie->charmap[ folded ]=( ++trie->uniquecharcount );
1647 TRIE_STORE_REVCHAR( folded );
1648 }
1649 }
1650 if ( !trie->charmap[ uvc ] ) {
1651 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
1652 TRIE_STORE_REVCHAR( uvc );
1653 }
1654 if ( set_bit ) {
1655 /* store the codepoint in the bitmap, and its folded
1656 * equivalent. */
1657 TRIE_BITMAP_SET(trie, uvc);
1658
1659 /* store the folded codepoint */
1660 if ( folder ) TRIE_BITMAP_SET(trie, folder[(U8) uvc ]);
1661
1662 if ( !UTF ) {
1663 /* store first byte of utf8 representation of
1664 variant codepoints */
1665 if (! UNI_IS_INVARIANT(uvc)) {
1666 TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc));
1667 }
1668 }
1669 set_bit = 0; /* We've done our bit :-) */
1670 }
1671 } else {
1672 SV** svpp;
1673 if ( !widecharmap )
1674 widecharmap = newHV();
1675
1676 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
1677
1678 if ( !svpp )
1679 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
1680
1681 if ( !SvTRUE( *svpp ) ) {
1682 sv_setiv( *svpp, ++trie->uniquecharcount );
1683 TRIE_STORE_REVCHAR(uvc);
1684 }
1685 }
1686 }
1687 if( cur == first ) {
1688 trie->minlen = chars;
1689 trie->maxlen = chars;
1690 } else if (chars < trie->minlen) {
1691 trie->minlen = chars;
1692 } else if (chars > trie->maxlen) {
1693 trie->maxlen = chars;
1694 }
1695 if (OP( noper ) == EXACTFU_SS) {
1696 /* XXX: workaround - 'ss' could match "\x{DF}" so minlen could be 1 and not 2*/
1697 if (trie->minlen > 1)
1698 trie->minlen= 1;
1699 }
1700 if (OP( noper ) == EXACTFU_TRICKYFOLD) {
1701 /* XXX: workround - things like "\x{1FBE}\x{0308}\x{0301}" can match "\x{0390}"
1702 * - We assume that any such sequence might match a 2 byte string */
1703 if (trie->minlen > 2 )
1704 trie->minlen= 2;
1705 }
1706
1707 } /* end first pass */
1708 DEBUG_TRIE_COMPILE_r(
1709 PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1710 (int)depth * 2 + 2,"",
1711 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
1712 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
1713 (int)trie->minlen, (int)trie->maxlen )
1714 );
1715
1716 /*
1717 We now know what we are dealing with in terms of unique chars and
1718 string sizes so we can calculate how much memory a naive
1719 representation using a flat table will take. If it's over a reasonable
1720 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
1721 conservative but potentially much slower representation using an array
1722 of lists.
1723
1724 At the end we convert both representations into the same compressed
1725 form that will be used in regexec.c for matching with. The latter
1726 is a form that cannot be used to construct with but has memory
1727 properties similar to the list form and access properties similar
1728 to the table form making it both suitable for fast searches and
1729 small enough that its feasable to store for the duration of a program.
1730
1731 See the comment in the code where the compressed table is produced
1732 inplace from the flat tabe representation for an explanation of how
1733 the compression works.
1734
1735 */
1736
1737
1738 Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
1739 prev_states[1] = 0;
1740
1741 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
1742 /*
1743 Second Pass -- Array Of Lists Representation
1744
1745 Each state will be represented by a list of charid:state records
1746 (reg_trie_trans_le) the first such element holds the CUR and LEN
1747 points of the allocated array. (See defines above).
1748
1749 We build the initial structure using the lists, and then convert
1750 it into the compressed table form which allows faster lookups
1751 (but cant be modified once converted).
1752 */
1753
1754 STRLEN transcount = 1;
1755
1756 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1757 "%*sCompiling trie using list compiler\n",
1758 (int)depth * 2 + 2, ""));
1759
1760 trie->states = (reg_trie_state *)
1761 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1762 sizeof(reg_trie_state) );
1763 TRIE_LIST_NEW(1);
1764 next_alloc = 2;
1765
1766 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1767
1768 regnode *noper = NEXTOPER( cur );
1769 U8 *uc = (U8*)STRING( noper );
1770 const U8 *e = uc + STR_LEN( noper );
1771 U32 state = 1; /* required init */
1772 U16 charid = 0; /* sanity init */
1773 U8 *scan = (U8*)NULL; /* sanity init */
1774 STRLEN foldlen = 0; /* required init */
1775 U32 wordlen = 0; /* required init */
1776 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1777 STRLEN skiplen = 0;
1778
1779 if (OP(noper) == NOTHING) {
1780 regnode *noper_next= regnext(noper);
1781 if (noper_next != tail && OP(noper_next) == flags) {
1782 noper = noper_next;
1783 uc= (U8*)STRING(noper);
1784 e= uc + STR_LEN(noper);
1785 }
1786 }
1787
1788 if (OP(noper) != NOTHING) {
1789 for ( ; uc < e ; uc += len ) {
1790
1791 TRIE_READ_CHAR;
1792
1793 if ( uvc < 256 ) {
1794 charid = trie->charmap[ uvc ];
1795 } else {
1796 SV** const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
1797 if ( !svpp ) {
1798 charid = 0;
1799 } else {
1800 charid=(U16)SvIV( *svpp );
1801 }
1802 }
1803 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1804 if ( charid ) {
1805
1806 U16 check;
1807 U32 newstate = 0;
1808
1809 charid--;
1810 if ( !trie->states[ state ].trans.list ) {
1811 TRIE_LIST_NEW( state );
1812 }
1813 for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1814 if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1815 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1816 break;
1817 }
1818 }
1819 if ( ! newstate ) {
1820 newstate = next_alloc++;
1821 prev_states[newstate] = state;
1822 TRIE_LIST_PUSH( state, charid, newstate );
1823 transcount++;
1824 }
1825 state = newstate;
1826 } else {
1827 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1828 }
1829 }
1830 }
1831 TRIE_HANDLE_WORD(state);
1832
1833 } /* end second pass */
1834
1835 /* next alloc is the NEXT state to be allocated */
1836 trie->statecount = next_alloc;
1837 trie->states = (reg_trie_state *)
1838 PerlMemShared_realloc( trie->states,
1839 next_alloc
1840 * sizeof(reg_trie_state) );
1841
1842 /* and now dump it out before we compress it */
1843 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
1844 revcharmap, next_alloc,
1845 depth+1)
1846 );
1847
1848 trie->trans = (reg_trie_trans *)
1849 PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
1850 {
1851 U32 state;
1852 U32 tp = 0;
1853 U32 zp = 0;
1854
1855
1856 for( state=1 ; state < next_alloc ; state ++ ) {
1857 U32 base=0;
1858
1859 /*
1860 DEBUG_TRIE_COMPILE_MORE_r(
1861 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1862 );
1863 */
1864
1865 if (trie->states[state].trans.list) {
1866 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1867 U16 maxid=minid;
1868 U16 idx;
1869
1870 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1871 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1872 if ( forid < minid ) {
1873 minid=forid;
1874 } else if ( forid > maxid ) {
1875 maxid=forid;
1876 }
1877 }
1878 if ( transcount < tp + maxid - minid + 1) {
1879 transcount *= 2;
1880 trie->trans = (reg_trie_trans *)
1881 PerlMemShared_realloc( trie->trans,
1882 transcount
1883 * sizeof(reg_trie_trans) );
1884 Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1885 }
1886 base = trie->uniquecharcount + tp - minid;
1887 if ( maxid == minid ) {
1888 U32 set = 0;
1889 for ( ; zp < tp ; zp++ ) {
1890 if ( ! trie->trans[ zp ].next ) {
1891 base = trie->uniquecharcount + zp - minid;
1892 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1893 trie->trans[ zp ].check = state;
1894 set = 1;
1895 break;
1896 }
1897 }
1898 if ( !set ) {
1899 trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1900 trie->trans[ tp ].check = state;
1901 tp++;
1902 zp = tp;
1903 }
1904 } else {
1905 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1906 const U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
1907 trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1908 trie->trans[ tid ].check = state;
1909 }
1910 tp += ( maxid - minid + 1 );
1911 }
1912 Safefree(trie->states[ state ].trans.list);
1913 }
1914 /*
1915 DEBUG_TRIE_COMPILE_MORE_r(
1916 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1917 );
1918 */
1919 trie->states[ state ].trans.base=base;
1920 }
1921 trie->lasttrans = tp + 1;
1922 }
1923 } else {
1924 /*
1925 Second Pass -- Flat Table Representation.
1926
1927 we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1928 We know that we will need Charcount+1 trans at most to store the data
1929 (one row per char at worst case) So we preallocate both structures
1930 assuming worst case.
1931
1932 We then construct the trie using only the .next slots of the entry
1933 structs.
1934
1935 We use the .check field of the first entry of the node temporarily to
1936 make compression both faster and easier by keeping track of how many non
1937 zero fields are in the node.
1938
1939 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1940 transition.
1941
1942 There are two terms at use here: state as a TRIE_NODEIDX() which is a
1943 number representing the first entry of the node, and state as a
1944 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1945 TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1946 are 2 entrys per node. eg:
1947
1948 A B A B
1949 1. 2 4 1. 3 7
1950 2. 0 3 3. 0 5
1951 3. 0 0 5. 0 0
1952 4. 0 0 7. 0 0
1953
1954 The table is internally in the right hand, idx form. However as we also
1955 have to deal with the states array which is indexed by nodenum we have to
1956 use TRIE_NODENUM() to convert.
1957
1958 */
1959 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1960 "%*sCompiling trie using table compiler\n",
1961 (int)depth * 2 + 2, ""));
1962
1963 trie->trans = (reg_trie_trans *)
1964 PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
1965 * trie->uniquecharcount + 1,
1966 sizeof(reg_trie_trans) );
1967 trie->states = (reg_trie_state *)
1968 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1969 sizeof(reg_trie_state) );
1970 next_alloc = trie->uniquecharcount + 1;
1971
1972
1973 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1974
1975 regnode *noper = NEXTOPER( cur );
1976 const U8 *uc = (U8*)STRING( noper );
1977 const U8 *e = uc + STR_LEN( noper );
1978
1979 U32 state = 1; /* required init */
1980
1981 U16 charid = 0; /* sanity init */
1982 U32 accept_state = 0; /* sanity init */
1983 U8 *scan = (U8*)NULL; /* sanity init */
1984
1985 STRLEN foldlen = 0; /* required init */
1986 U32 wordlen = 0; /* required init */
1987 STRLEN skiplen = 0;
1988 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1989
1990 if (OP(noper) == NOTHING) {
1991 regnode *noper_next= regnext(noper);
1992 if (noper_next != tail && OP(noper_next) == flags) {
1993 noper = noper_next;
1994 uc= (U8*)STRING(noper);
1995 e= uc + STR_LEN(noper);
1996 }
1997 }
1998
1999 if ( OP(noper) != NOTHING ) {
2000 for ( ; uc < e ; uc += len ) {
2001
2002 TRIE_READ_CHAR;
2003
2004 if ( uvc < 256 ) {
2005 charid = trie->charmap[ uvc ];
2006 } else {
2007 SV* const * const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
2008 charid = svpp ? (U16)SvIV(*svpp) : 0;
2009 }
2010 if ( charid ) {
2011 charid--;
2012 if ( !trie->trans[ state + charid ].next ) {
2013 trie->trans[ state + charid ].next = next_alloc;
2014 trie->trans[ state ].check++;
2015 prev_states[TRIE_NODENUM(next_alloc)]
2016 = TRIE_NODENUM(state);
2017 next_alloc += trie->uniquecharcount;
2018 }
2019 state = trie->trans[ state + charid ].next;
2020 } else {
2021 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
2022 }
2023 /* charid is now 0 if we dont know the char read, or nonzero if we do */
2024 }
2025 }
2026 accept_state = TRIE_NODENUM( state );
2027 TRIE_HANDLE_WORD(accept_state);
2028
2029 } /* end second pass */
2030
2031 /* and now dump it out before we compress it */
2032 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
2033 revcharmap,
2034 next_alloc, depth+1));
2035
2036 {
2037 /*
2038 * Inplace compress the table.*
2039
2040 For sparse data sets the table constructed by the trie algorithm will
2041 be mostly 0/FAIL transitions or to put it another way mostly empty.
2042 (Note that leaf nodes will not contain any transitions.)
2043
2044 This algorithm compresses the tables by eliminating most such
2045 transitions, at the cost of a modest bit of extra work during lookup:
2046
2047 - Each states[] entry contains a .base field which indicates the
2048 index in the state[] array wheres its transition data is stored.
2049
2050 - If .base is 0 there are no valid transitions from that node.
2051
2052 - If .base is nonzero then charid is added to it to find an entry in
2053 the trans array.
2054
2055 -If trans[states[state].base+charid].check!=state then the
2056 transition is taken to be a 0/Fail transition. Thus if there are fail
2057 transitions at the front of the node then the .base offset will point
2058 somewhere inside the previous nodes data (or maybe even into a node
2059 even earlier), but the .check field determines if the transition is
2060 valid.
2061
2062 XXX - wrong maybe?
2063 The following process inplace converts the table to the compressed
2064 table: We first do not compress the root node 1,and mark all its
2065 .check pointers as 1 and set its .base pointer as 1 as well. This
2066 allows us to do a DFA construction from the compressed table later,
2067 and ensures that any .base pointers we calculate later are greater
2068 than 0.
2069
2070 - We set 'pos' to indicate the first entry of the second node.
2071
2072 - We then iterate over the columns of the node, finding the first and
2073 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
2074 and set the .check pointers accordingly, and advance pos
2075 appropriately and repreat for the next node. Note that when we copy
2076 the next pointers we have to convert them from the original
2077 NODEIDX form to NODENUM form as the former is not valid post
2078 compression.
2079
2080 - If a node has no transitions used we mark its base as 0 and do not
2081 advance the pos pointer.
2082
2083 - If a node only has one transition we use a second pointer into the
2084 structure to fill in allocated fail transitions from other states.
2085 This pointer is independent of the main pointer and scans forward
2086 looking for null transitions that are allocated to a state. When it
2087 finds one it writes the single transition into the "hole". If the
2088 pointer doesnt find one the single transition is appended as normal.
2089
2090 - Once compressed we can Renew/realloc the structures to release the
2091 excess space.
2092
2093 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
2094 specifically Fig 3.47 and the associated pseudocode.
2095
2096 demq
2097 */
2098 const U32 laststate = TRIE_NODENUM( next_alloc );
2099 U32 state, charid;
2100 U32 pos = 0, zp=0;
2101 trie->statecount = laststate;
2102
2103 for ( state = 1 ; state < laststate ; state++ ) {
2104 U8 flag = 0;
2105 const U32 stateidx = TRIE_NODEIDX( state );
2106 const U32 o_used = trie->trans[ stateidx ].check;
2107 U32 used = trie->trans[ stateidx ].check;
2108 trie->trans[ stateidx ].check = 0;
2109
2110 for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
2111 if ( flag || trie->trans[ stateidx + charid ].next ) {
2112 if ( trie->trans[ stateidx + charid ].next ) {
2113 if (o_used == 1) {
2114 for ( ; zp < pos ; zp++ ) {
2115 if ( ! trie->trans[ zp ].next ) {
2116 break;
2117 }
2118 }
2119 trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
2120 trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
2121 trie->trans[ zp ].check = state;
2122 if ( ++zp > pos ) pos = zp;
2123 break;
2124 }
2125 used--;
2126 }
2127 if ( !flag ) {
2128 flag = 1;
2129 trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
2130 }
2131 trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
2132 trie->trans[ pos ].check = state;
2133 pos++;
2134 }
2135 }
2136 }
2137 trie->lasttrans = pos + 1;
2138 trie->states = (reg_trie_state *)
2139 PerlMemShared_realloc( trie->states, laststate
2140 * sizeof(reg_trie_state) );
2141 DEBUG_TRIE_COMPILE_MORE_r(
2142 PerlIO_printf( Perl_debug_log,
2143 "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
2144 (int)depth * 2 + 2,"",
2145 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
2146 (IV)next_alloc,
2147 (IV)pos,
2148 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
2149 );
2150
2151 } /* end table compress */
2152 }
2153 DEBUG_TRIE_COMPILE_MORE_r(
2154 PerlIO_printf(Perl_debug_log, "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
2155 (int)depth * 2 + 2, "",
2156 (UV)trie->statecount,
2157 (UV)trie->lasttrans)
2158 );
2159 /* resize the trans array to remove unused space */
2160 trie->trans = (reg_trie_trans *)
2161 PerlMemShared_realloc( trie->trans, trie->lasttrans
2162 * sizeof(reg_trie_trans) );
2163
2164 { /* Modify the program and insert the new TRIE node */
2165 U8 nodetype =(U8)(flags & 0xFF);
2166 char *str=NULL;
2167
2168#ifdef DEBUGGING
2169 regnode *optimize = NULL;
2170#ifdef RE_TRACK_PATTERN_OFFSETS
2171
2172 U32 mjd_offset = 0;
2173 U32 mjd_nodelen = 0;
2174#endif /* RE_TRACK_PATTERN_OFFSETS */
2175#endif /* DEBUGGING */
2176 /*
2177 This means we convert either the first branch or the first Exact,
2178 depending on whether the thing following (in 'last') is a branch
2179 or not and whther first is the startbranch (ie is it a sub part of
2180 the alternation or is it the whole thing.)
2181 Assuming its a sub part we convert the EXACT otherwise we convert
2182 the whole branch sequence, including the first.
2183 */
2184 /* Find the node we are going to overwrite */
2185 if ( first != startbranch || OP( last ) == BRANCH ) {
2186 /* branch sub-chain */
2187 NEXT_OFF( first ) = (U16)(last - first);
2188#ifdef RE_TRACK_PATTERN_OFFSETS
2189 DEBUG_r({
2190 mjd_offset= Node_Offset((convert));
2191 mjd_nodelen= Node_Length((convert));
2192 });
2193#endif
2194 /* whole branch chain */
2195 }
2196#ifdef RE_TRACK_PATTERN_OFFSETS
2197 else {
2198 DEBUG_r({
2199 const regnode *nop = NEXTOPER( convert );
2200 mjd_offset= Node_Offset((nop));
2201 mjd_nodelen= Node_Length((nop));
2202 });
2203 }
2204 DEBUG_OPTIMISE_r(
2205 PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
2206 (int)depth * 2 + 2, "",
2207 (UV)mjd_offset, (UV)mjd_nodelen)
2208 );
2209#endif
2210 /* But first we check to see if there is a common prefix we can
2211 split out as an EXACT and put in front of the TRIE node. */
2212 trie->startstate= 1;
2213 if ( trie->bitmap && !widecharmap && !trie->jump ) {
2214 U32 state;
2215 for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
2216 U32 ofs = 0;
2217 I32 idx = -1;
2218 U32 count = 0;
2219 const U32 base = trie->states[ state ].trans.base;
2220
2221 if ( trie->states[state].wordnum )
2222 count = 1;
2223
2224 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
2225 if ( ( base + ofs >= trie->uniquecharcount ) &&
2226 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
2227 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
2228 {
2229 if ( ++count > 1 ) {
2230 SV **tmp = av_fetch( revcharmap, ofs, 0);
2231 const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
2232 if ( state == 1 ) break;
2233 if ( count == 2 ) {
2234 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
2235 DEBUG_OPTIMISE_r(
2236 PerlIO_printf(Perl_debug_log,
2237 "%*sNew Start State=%"UVuf" Class: [",
2238 (int)depth * 2 + 2, "",
2239 (UV)state));
2240 if (idx >= 0) {
2241 SV ** const tmp = av_fetch( revcharmap, idx, 0);
2242 const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
2243
2244 TRIE_BITMAP_SET(trie,*ch);
2245 if ( folder )
2246 TRIE_BITMAP_SET(trie, folder[ *ch ]);
2247 DEBUG_OPTIMISE_r(
2248 PerlIO_printf(Perl_debug_log, "%s", (char*)ch)
2249 );
2250 }
2251 }
2252 TRIE_BITMAP_SET(trie,*ch);
2253 if ( folder )
2254 TRIE_BITMAP_SET(trie,folder[ *ch ]);
2255 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
2256 }
2257 idx = ofs;
2258 }
2259 }
2260 if ( count == 1 ) {
2261 SV **tmp = av_fetch( revcharmap, idx, 0);
2262 STRLEN len;
2263 char *ch = SvPV( *tmp, len );
2264 DEBUG_OPTIMISE_r({
2265 SV *sv=sv_newmortal();
2266 PerlIO_printf( Perl_debug_log,
2267 "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
2268 (int)depth * 2 + 2, "",
2269 (UV)state, (UV)idx,
2270 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
2271 PL_colors[0], PL_colors[1],
2272 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2273 PERL_PV_ESCAPE_FIRSTCHAR
2274 )
2275 );
2276 });
2277 if ( state==1 ) {
2278 OP( convert ) = nodetype;
2279 str=STRING(convert);
2280 STR_LEN(convert)=0;
2281 }
2282 STR_LEN(convert) += len;
2283 while (len--)
2284 *str++ = *ch++;
2285 } else {
2286#ifdef DEBUGGING
2287 if (state>1)
2288 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
2289#endif
2290 break;
2291 }
2292 }
2293 trie->prefixlen = (state-1);
2294 if (str) {
2295 regnode *n = convert+NODE_SZ_STR(convert);
2296 NEXT_OFF(convert) = NODE_SZ_STR(convert);
2297 trie->startstate = state;
2298 trie->minlen -= (state - 1);
2299 trie->maxlen -= (state - 1);
2300#ifdef DEBUGGING
2301 /* At least the UNICOS C compiler choked on this
2302 * being argument to DEBUG_r(), so let's just have
2303 * it right here. */
2304 if (
2305#ifdef PERL_EXT_RE_BUILD
2306 1
2307#else
2308 DEBUG_r_TEST
2309#endif
2310 ) {
2311 regnode *fix = convert;
2312 U32 word = trie->wordcount;
2313 mjd_nodelen++;
2314 Set_Node_Offset_Length(convert, mjd_offset, state - 1);
2315 while( ++fix < n ) {
2316 Set_Node_Offset_Length(fix, 0, 0);
2317 }
2318 while (word--) {
2319 SV ** const tmp = av_fetch( trie_words, word, 0 );
2320 if (tmp) {
2321 if ( STR_LEN(convert) <= SvCUR(*tmp) )
2322 sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
2323 else
2324 sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
2325 }
2326 }
2327 }
2328#endif
2329 if (trie->maxlen) {
2330 convert = n;
2331 } else {
2332 NEXT_OFF(convert) = (U16)(tail - convert);
2333 DEBUG_r(optimize= n);
2334 }
2335 }
2336 }
2337 if (!jumper)
2338 jumper = last;
2339 if ( trie->maxlen ) {
2340 NEXT_OFF( convert ) = (U16)(tail - convert);
2341 ARG_SET( convert, data_slot );
2342 /* Store the offset to the first unabsorbed branch in
2343 jump[0], which is otherwise unused by the jump logic.
2344 We use this when dumping a trie and during optimisation. */
2345 if (trie->jump)
2346 trie->jump[0] = (U16)(nextbranch - convert);
2347
2348 /* If the start state is not accepting (meaning there is no empty string/NOTHING)
2349 * and there is a bitmap
2350 * and the first "jump target" node we found leaves enough room
2351 * then convert the TRIE node into a TRIEC node, with the bitmap
2352 * embedded inline in the opcode - this is hypothetically faster.
2353 */
2354 if ( !trie->states[trie->startstate].wordnum
2355 && trie->bitmap
2356 && ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
2357 {
2358 OP( convert ) = TRIEC;
2359 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
2360 PerlMemShared_free(trie->bitmap);
2361 trie->bitmap= NULL;
2362 } else
2363 OP( convert ) = TRIE;
2364
2365 /* store the type in the flags */
2366 convert->flags = nodetype;
2367 DEBUG_r({
2368 optimize = convert
2369 + NODE_STEP_REGNODE
2370 + regarglen[ OP( convert ) ];
2371 });
2372 /* XXX We really should free up the resource in trie now,
2373 as we won't use them - (which resources?) dmq */
2374 }
2375 /* needed for dumping*/
2376 DEBUG_r(if (optimize) {
2377 regnode *opt = convert;
2378
2379 while ( ++opt < optimize) {
2380 Set_Node_Offset_Length(opt,0,0);
2381 }
2382 /*
2383 Try to clean up some of the debris left after the
2384 optimisation.
2385 */
2386 while( optimize < jumper ) {
2387 mjd_nodelen += Node_Length((optimize));
2388 OP( optimize ) = OPTIMIZED;
2389 Set_Node_Offset_Length(optimize,0,0);
2390 optimize++;
2391 }
2392 Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
2393 });
2394 } /* end node insert */
2395
2396 /* Finish populating the prev field of the wordinfo array. Walk back
2397 * from each accept state until we find another accept state, and if
2398 * so, point the first word's .prev field at the second word. If the
2399 * second already has a .prev field set, stop now. This will be the
2400 * case either if we've already processed that word's accept state,
2401 * or that state had multiple words, and the overspill words were
2402 * already linked up earlier.
2403 */
2404 {
2405 U16 word;
2406 U32 state;
2407 U16 prev;
2408
2409 for (word=1; word <= trie->wordcount; word++) {
2410 prev = 0;
2411 if (trie->wordinfo[word].prev)
2412 continue;
2413 state = trie->wordinfo[word].accept;
2414 while (state) {
2415 state = prev_states[state];
2416 if (!state)
2417 break;
2418 prev = trie->states[state].wordnum;
2419 if (prev)
2420 break;
2421 }
2422 trie->wordinfo[word].prev = prev;
2423 }
2424 Safefree(prev_states);
2425 }
2426
2427
2428 /* and now dump out the compressed format */
2429 DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
2430
2431 RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
2432#ifdef DEBUGGING
2433 RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
2434 RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
2435#else
2436 SvREFCNT_dec(revcharmap);
2437#endif
2438 return trie->jump
2439 ? MADE_JUMP_TRIE
2440 : trie->startstate>1
2441 ? MADE_EXACT_TRIE
2442 : MADE_TRIE;
2443}
2444
2445STATIC void
2446S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode *stclass, U32 depth)
2447{
2448/* The Trie is constructed and compressed now so we can build a fail array if it's needed
2449
2450 This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
2451 "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
2452 ISBN 0-201-10088-6
2453
2454 We find the fail state for each state in the trie, this state is the longest proper
2455 suffix of the current state's 'word' that is also a proper prefix of another word in our
2456 trie. State 1 represents the word '' and is thus the default fail state. This allows
2457 the DFA not to have to restart after its tried and failed a word at a given point, it
2458 simply continues as though it had been matching the other word in the first place.
2459 Consider
2460 'abcdgu'=~/abcdefg|cdgu/
2461 When we get to 'd' we are still matching the first word, we would encounter 'g' which would
2462 fail, which would bring us to the state representing 'd' in the second word where we would
2463 try 'g' and succeed, proceeding to match 'cdgu'.
2464 */
2465 /* add a fail transition */
2466 const U32 trie_offset = ARG(source);
2467 reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
2468 U32 *q;
2469 const U32 ucharcount = trie->uniquecharcount;
2470 const U32 numstates = trie->statecount;
2471 const U32 ubound = trie->lasttrans + ucharcount;
2472 U32 q_read = 0;
2473 U32 q_write = 0;
2474 U32 charid;
2475 U32 base = trie->states[ 1 ].trans.base;
2476 U32 *fail;
2477 reg_ac_data *aho;
2478 const U32 data_slot = add_data( pRExC_state, 1, "T" );
2479 GET_RE_DEBUG_FLAGS_DECL;
2480
2481 PERL_ARGS_ASSERT_MAKE_TRIE_FAILTABLE;
2482#ifndef DEBUGGING
2483 PERL_UNUSED_ARG(depth);
2484#endif
2485
2486
2487 ARG_SET( stclass, data_slot );
2488 aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
2489 RExC_rxi->data->data[ data_slot ] = (void*)aho;
2490 aho->trie=trie_offset;
2491 aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
2492 Copy( trie->states, aho->states, numstates, reg_trie_state );
2493 Newxz( q, numstates, U32);
2494 aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
2495 aho->refcount = 1;
2496 fail = aho->fail;
2497 /* initialize fail[0..1] to be 1 so that we always have
2498 a valid final fail state */
2499 fail[ 0 ] = fail[ 1 ] = 1;
2500
2501 for ( charid = 0; charid < ucharcount ; charid++ ) {
2502 const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
2503 if ( newstate ) {
2504 q[ q_write ] = newstate;
2505 /* set to point at the root */
2506 fail[ q[ q_write++ ] ]=1;
2507 }
2508 }
2509 while ( q_read < q_write) {
2510 const U32 cur = q[ q_read++ % numstates ];
2511 base = trie->states[ cur ].trans.base;
2512
2513 for ( charid = 0 ; charid < ucharcount ; charid++ ) {
2514 const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
2515 if (ch_state) {
2516 U32 fail_state = cur;
2517 U32 fail_base;
2518 do {
2519 fail_state = fail[ fail_state ];
2520 fail_base = aho->states[ fail_state ].trans.base;
2521 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
2522
2523 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
2524 fail[ ch_state ] = fail_state;
2525 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
2526 {
2527 aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum;
2528 }
2529 q[ q_write++ % numstates] = ch_state;
2530 }
2531 }
2532 }
2533 /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
2534 when we fail in state 1, this allows us to use the
2535 charclass scan to find a valid start char. This is based on the principle
2536 that theres a good chance the string being searched contains lots of stuff
2537 that cant be a start char.
2538 */
2539 fail[ 0 ] = fail[ 1 ] = 0;
2540 DEBUG_TRIE_COMPILE_r({
2541 PerlIO_printf(Perl_debug_log,
2542 "%*sStclass Failtable (%"UVuf" states): 0",
2543 (int)(depth * 2), "", (UV)numstates
2544 );
2545 for( q_read=1; q_read<numstates; q_read++ ) {
2546 PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
2547 }
2548 PerlIO_printf(Perl_debug_log, "\n");
2549 });
2550 Safefree(q);
2551 /*RExC_seen |= REG_SEEN_TRIEDFA;*/
2552}
2553
2554
2555/*
2556 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
2557 * These need to be revisited when a newer toolchain becomes available.
2558 */
2559#if defined(__sparc64__) && defined(__GNUC__)
2560# if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
2561# undef SPARC64_GCC_WORKAROUND
2562# define SPARC64_GCC_WORKAROUND 1
2563# endif
2564#endif
2565
2566#define DEBUG_PEEP(str,scan,depth) \
2567 DEBUG_OPTIMISE_r({if (scan){ \
2568 SV * const mysv=sv_newmortal(); \
2569 regnode *Next = regnext(scan); \
2570 regprop(RExC_rx, mysv, scan); \
2571 PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
2572 (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
2573 Next ? (REG_NODE_NUM(Next)) : 0 ); \
2574 }});
2575
2576
2577/* The below joins as many adjacent EXACTish nodes as possible into a single
2578 * one, and looks for problematic sequences of characters whose folds vs.
2579 * non-folds have sufficiently different lengths, that the optimizer would be
2580 * fooled into rejecting legitimate matches of them, and the trie construction
2581 * code can't cope with them. The joining is only done if:
2582 * 1) there is room in the current conglomerated node to entirely contain the
2583 * next one.
2584 * 2) they are the exact same node type
2585 *
2586 * The adjacent nodes actually may be separated by NOTHING kind nodes, and
2587 * these get optimized out
2588 *
2589 * If there are problematic code sequences, *min_subtract is set to the delta
2590 * that the minimum size of the node can be less than its actual size. And,
2591 * the node type of the result is changed to reflect that it contains these
2592 * sequences.
2593 *
2594 * And *has_exactf_sharp_s is set to indicate whether or not the node is EXACTF
2595 * and contains LATIN SMALL LETTER SHARP S
2596 *
2597 * This is as good a place as any to discuss the design of handling these
2598 * problematic sequences. It's been wrong in Perl for a very long time. There
2599 * are three code points in Unicode whose folded lengths differ so much from
2600 * the un-folded lengths that it causes problems for the optimizer and trie
2601 * construction. Why only these are problematic, and not others where lengths
2602 * also differ is something I (khw) do not understand. New versions of Unicode
2603 * might add more such code points. Hopefully the logic in fold_grind.t that
2604 * figures out what to test (in part by verifying that each size-combination
2605 * gets tested) will catch any that do come along, so they can be added to the
2606 * special handling below. The chances of new ones are actually rather small,
2607 * as most, if not all, of the world's scripts that have casefolding have
2608 * already been encoded by Unicode. Also, a number of Unicode's decisions were
2609 * made to allow compatibility with pre-existing standards, and almost all of
2610 * those have already been dealt with. These would otherwise be the most
2611 * likely candidates for generating further tricky sequences. In other words,
2612 * Unicode by itself is unlikely to add new ones unless it is for compatibility
2613 * with pre-existing standards, and there aren't many of those left.
2614 *
2615 * The previous designs for dealing with these involved assigning a special
2616 * node for them. This approach doesn't work, as evidenced by this example:
2617 * "\xDFs" =~ /s\xDF/ui # Used to fail before these patches
2618 * Both these fold to "sss", but if the pattern is parsed to create a node of
2619 * that would match just the \xDF, it won't be able to handle the case where a
2620 * successful match would have to cross the node's boundary. The new approach
2621 * that hopefully generally solves the problem generates an EXACTFU_SS node
2622 * that is "sss".
2623 *
2624 * There are a number of components to the approach (a lot of work for just
2625 * three code points!):
2626 * 1) This routine examines each EXACTFish node that could contain the
2627 * problematic sequences. It returns in *min_subtract how much to
2628 * subtract from the the actual length of the string to get a real minimum
2629 * for one that could match it. This number is usually 0 except for the
2630 * problematic sequences. This delta is used by the caller to adjust the
2631 * min length of the match, and the delta between min and max, so that the
2632 * optimizer doesn't reject these possibilities based on size constraints.
2633 * 2) These sequences are not currently correctly handled by the trie code
2634 * either, so it changes the joined node type to ops that are not handled
2635 * by trie's, those new ops being EXACTFU_SS and EXACTFU_TRICKYFOLD.
2636 * 3) This is sufficient for the two Greek sequences (described below), but
2637 * the one involving the Sharp s (\xDF) needs more. The node type
2638 * EXACTFU_SS is used for an EXACTFU node that contains at least one "ss"
2639 * sequence in it. For non-UTF-8 patterns and strings, this is the only
2640 * case where there is a possible fold length change. That means that a
2641 * regular EXACTFU node without UTF-8 involvement doesn't have to concern
2642 * itself with length changes, and so can be processed faster. regexec.c
2643 * takes advantage of this. Generally, an EXACTFish node that is in UTF-8
2644 * is pre-folded by regcomp.c. This saves effort in regex matching.
2645 * However, probably mostly for historical reasons, the pre-folding isn't
2646 * done for non-UTF8 patterns (and it can't be for EXACTF and EXACTFL
2647 * nodes, as what they fold to isn't known until runtime.) The fold
2648 * possibilities for the non-UTF8 patterns are quite simple, except for
2649 * the sharp s. All the ones that don't involve a UTF-8 target string
2650 * are members of a fold-pair, and arrays are set up for all of them
2651 * that quickly find the other member of the pair. It might actually
2652 * be faster to pre-fold these, but it isn't currently done, except for
2653 * the sharp s. Code elsewhere in this file makes sure that it gets
2654 * folded to 'ss', even if the pattern isn't UTF-8. This avoids the
2655 * issues described in the next item.
2656 * 4) A problem remains for the sharp s in EXACTF nodes. Whether it matches
2657 * 'ss' or not is not knowable at compile time. It will match iff the
2658 * target string is in UTF-8, unlike the EXACTFU nodes, where it always
2659 * matches; and the EXACTFL and EXACTFA nodes where it never does. Thus
2660 * it can't be folded to "ss" at compile time, unlike EXACTFU does as
2661 * described in item 3). An assumption that the optimizer part of
2662 * regexec.c (probably unwittingly) makes is that a character in the
2663 * pattern corresponds to at most a single character in the target string.
2664 * (And I do mean character, and not byte here, unlike other parts of the
2665 * documentation that have never been updated to account for multibyte
2666 * Unicode.) This assumption is wrong only in this case, as all other
2667 * cases are either 1-1 folds when no UTF-8 is involved; or is true by
2668 * virtue of having this file pre-fold UTF-8 patterns. I'm
2669 * reluctant to try to change this assumption, so instead the code punts.
2670 * This routine examines EXACTF nodes for the sharp s, and returns a
2671 * boolean indicating whether or not the node is an EXACTF node that
2672 * contains a sharp s. When it is true, the caller sets a flag that later
2673 * causes the optimizer in this file to not set values for the floating
2674 * and fixed string lengths, and thus avoids the optimizer code in
2675 * regexec.c that makes the invalid assumption. Thus, there is no
2676 * optimization based on string lengths for EXACTF nodes that contain the
2677 * sharp s. This only happens for /id rules (which means the pattern
2678 * isn't in UTF-8).
2679 */
2680
2681#define JOIN_EXACT(scan,min_subtract,has_exactf_sharp_s, flags) \
2682 if (PL_regkind[OP(scan)] == EXACT) \
2683 join_exact(pRExC_state,(scan),(min_subtract),has_exactf_sharp_s, (flags),NULL,depth+1)
2684
2685STATIC U32
2686S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, UV *min_subtract, bool *has_exactf_sharp_s, U32 flags,regnode *val, U32 depth) {
2687 /* Merge several consecutive EXACTish nodes into one. */
2688 regnode *n = regnext(scan);
2689 U32 stringok = 1;
2690 regnode *next = scan + NODE_SZ_STR(scan);
2691 U32 merged = 0;
2692 U32 stopnow = 0;
2693#ifdef DEBUGGING
2694 regnode *stop = scan;
2695 GET_RE_DEBUG_FLAGS_DECL;
2696#else
2697 PERL_UNUSED_ARG(depth);
2698#endif
2699
2700 PERL_ARGS_ASSERT_JOIN_EXACT;
2701#ifndef EXPERIMENTAL_INPLACESCAN
2702 PERL_UNUSED_ARG(flags);
2703 PERL_UNUSED_ARG(val);
2704#endif
2705 DEBUG_PEEP("join",scan,depth);
2706
2707 /* Look through the subsequent nodes in the chain. Skip NOTHING, merge
2708 * EXACT ones that are mergeable to the current one. */
2709 while (n
2710 && (PL_regkind[OP(n)] == NOTHING
2711 || (stringok && OP(n) == OP(scan)))
2712 && NEXT_OFF(n)
2713 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX)
2714 {
2715
2716 if (OP(n) == TAIL || n > next)
2717 stringok = 0;
2718 if (PL_regkind[OP(n)] == NOTHING) {
2719 DEBUG_PEEP("skip:",n,depth);
2720 NEXT_OFF(scan) += NEXT_OFF(n);
2721 next = n + NODE_STEP_REGNODE;
2722#ifdef DEBUGGING
2723 if (stringok)
2724 stop = n;
2725#endif
2726 n = regnext(n);
2727 }
2728 else if (stringok) {
2729 const unsigned int oldl = STR_LEN(scan);
2730 regnode * const nnext = regnext(n);
2731
2732 if (oldl + STR_LEN(n) > U8_MAX)
2733 break;
2734
2735 DEBUG_PEEP("merg",n,depth);
2736 merged++;
2737
2738 NEXT_OFF(scan) += NEXT_OFF(n);
2739 STR_LEN(scan) += STR_LEN(n);
2740 next = n + NODE_SZ_STR(n);
2741 /* Now we can overwrite *n : */
2742 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
2743#ifdef DEBUGGING
2744 stop = next - 1;
2745#endif
2746 n = nnext;
2747 if (stopnow) break;
2748 }
2749
2750#ifdef EXPERIMENTAL_INPLACESCAN
2751 if (flags && !NEXT_OFF(n)) {
2752 DEBUG_PEEP("atch", val, depth);
2753 if (reg_off_by_arg[OP(n)]) {
2754 ARG_SET(n, val - n);
2755 }
2756 else {
2757 NEXT_OFF(n) = val - n;
2758 }
2759 stopnow = 1;
2760 }
2761#endif
2762 }
2763
2764 *min_subtract = 0;
2765 *has_exactf_sharp_s = FALSE;
2766
2767 /* Here, all the adjacent mergeable EXACTish nodes have been merged. We
2768 * can now analyze for sequences of problematic code points. (Prior to
2769 * this final joining, sequences could have been split over boundaries, and
2770 * hence missed). The sequences only happen in folding, hence for any
2771 * non-EXACT EXACTish node */
2772 if (OP(scan) != EXACT) {
2773 U8 *s;
2774 U8 * s0 = (U8*) STRING(scan);
2775 U8 * const s_end = s0 + STR_LEN(scan);
2776
2777 /* The below is perhaps overboard, but this allows us to save a test
2778 * each time through the loop at the expense of a mask. This is
2779 * because on both EBCDIC and ASCII machines, 'S' and 's' differ by a
2780 * single bit. On ASCII they are 32 apart; on EBCDIC, they are 64.
2781 * This uses an exclusive 'or' to find that bit and then inverts it to
2782 * form a mask, with just a single 0, in the bit position where 'S' and
2783 * 's' differ. */
2784 const U8 S_or_s_mask = (U8) ~ ('S' ^ 's');
2785 const U8 s_masked = 's' & S_or_s_mask;
2786
2787 /* One pass is made over the node's string looking for all the
2788 * possibilities. to avoid some tests in the loop, there are two main
2789 * cases, for UTF-8 patterns (which can't have EXACTF nodes) and
2790 * non-UTF-8 */
2791 if (UTF) {
2792
2793 /* There are two problematic Greek code points in Unicode
2794 * casefolding
2795 *
2796 * U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
2797 * U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
2798 *
2799 * which casefold to
2800 *
2801 * Unicode UTF-8
2802 *
2803 * U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
2804 * U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
2805 *
2806 * This means that in case-insensitive matching (or "loose
2807 * matching", as Unicode calls it), an EXACTF of length six (the
2808 * UTF-8 encoded byte length of the above casefolded versions) can
2809 * match a target string of length two (the byte length of UTF-8
2810 * encoded U+0390 or U+03B0). This would rather mess up the
2811 * minimum length computation. (there are other code points that
2812 * also fold to these two sequences, but the delta is smaller)
2813 *
2814 * If these sequences are found, the minimum length is decreased by
2815 * four (six minus two).
2816 *
2817 * Similarly, 'ss' may match the single char and byte LATIN SMALL
2818 * LETTER SHARP S. We decrease the min length by 1 for each
2819 * occurrence of 'ss' found */
2820
2821#ifdef EBCDIC /* RD tunifold greek 0390 and 03B0 */
2822# define U390_first_byte 0xb4
2823 const U8 U390_tail[] = "\x68\xaf\x49\xaf\x42";
2824# define U3B0_first_byte 0xb5
2825 const U8 U3B0_tail[] = "\x46\xaf\x49\xaf\x42";
2826#else
2827# define U390_first_byte 0xce
2828 const U8 U390_tail[] = "\xb9\xcc\x88\xcc\x81";
2829# define U3B0_first_byte 0xcf
2830 const U8 U3B0_tail[] = "\x85\xcc\x88\xcc\x81";
2831#endif
2832 const U8 len = sizeof(U390_tail); /* (-1 for NUL; +1 for 1st byte;
2833 yields a net of 0 */
2834 /* Examine the string for one of the problematic sequences */
2835 for (s = s0;
2836 s < s_end - 1; /* Can stop 1 before the end, as minimum length
2837 * sequence we are looking for is 2 */
2838 s += UTF8SKIP(s))
2839 {
2840
2841 /* Look for the first byte in each problematic sequence */
2842 switch (*s) {
2843 /* We don't have to worry about other things that fold to
2844 * 's' (such as the long s, U+017F), as all above-latin1
2845 * code points have been pre-folded */
2846 case 's':
2847 case 'S':
2848
2849 /* Current character is an 's' or 'S'. If next one is
2850 * as well, we have the dreaded sequence */
2851 if (((*(s+1) & S_or_s_mask) == s_masked)
2852 /* These two node types don't have special handling
2853 * for 'ss' */
2854 && OP(scan) != EXACTFL && OP(scan) != EXACTFA)
2855 {
2856 *min_subtract += 1;
2857 OP(scan) = EXACTFU_SS;
2858 s++; /* No need to look at this character again */
2859 }
2860 break;
2861
2862 case U390_first_byte:
2863 if (s_end - s >= len
2864
2865 /* The 1's are because are skipping comparing the
2866 * first byte */
2867 && memEQ(s + 1, U390_tail, len - 1))
2868 {
2869 goto greek_sequence;
2870 }
2871 break;
2872
2873 case U3B0_first_byte:
2874 if (! (s_end - s >= len
2875 && memEQ(s + 1, U3B0_tail, len - 1)))
2876 {
2877 break;
2878 }
2879 greek_sequence:
2880 *min_subtract += 4;
2881
2882 /* This can't currently be handled by trie's, so change
2883 * the node type to indicate this. If EXACTFA and
2884 * EXACTFL were ever to be handled by trie's, this
2885 * would have to be changed. If this node has already
2886 * been changed to EXACTFU_SS in this loop, leave it as
2887 * is. (I (khw) think it doesn't matter in regexec.c
2888 * for UTF patterns, but no need to change it */
2889 if (OP(scan) == EXACTFU) {
2890 OP(scan) = EXACTFU_TRICKYFOLD;
2891 }
2892 s += 6; /* We already know what this sequence is. Skip
2893 the rest of it */
2894 break;
2895 }
2896 }
2897 }
2898 else if (OP(scan) != EXACTFL && OP(scan) != EXACTFA) {
2899
2900 /* Here, the pattern is not UTF-8. We need to look only for the
2901 * 'ss' sequence, and in the EXACTF case, the sharp s, which can be
2902 * in the final position. Otherwise we can stop looking 1 byte
2903 * earlier because have to find both the first and second 's' */
2904 const U8* upper = (OP(scan) == EXACTF) ? s_end : s_end -1;
2905
2906 for (s = s0; s < upper; s++) {
2907 switch (*s) {
2908 case 'S':
2909 case 's':
2910 if (s_end - s > 1
2911 && ((*(s+1) & S_or_s_mask) == s_masked))
2912 {
2913 *min_subtract += 1;
2914
2915 /* EXACTF nodes need to know that the minimum
2916 * length changed so that a sharp s in the string
2917 * can match this ss in the pattern, but they
2918 * remain EXACTF nodes, as they are not trie'able,
2919 * so don't have to invent a new node type to
2920 * exclude them from the trie code */
2921 if (OP(scan) != EXACTF) {
2922 OP(scan) = EXACTFU_SS;
2923 }
2924 s++;
2925 }
2926 break;
2927 case LATIN_SMALL_LETTER_SHARP_S:
2928 if (OP(scan) == EXACTF) {
2929 *has_exactf_sharp_s = TRUE;
2930 }
2931 break;
2932 }
2933 }
2934 }
2935 }
2936
2937#ifdef DEBUGGING
2938 /* Allow dumping but overwriting the collection of skipped
2939 * ops and/or strings with fake optimized ops */
2940 n = scan + NODE_SZ_STR(scan);
2941 while (n <= stop) {
2942 OP(n) = OPTIMIZED;
2943 FLAGS(n) = 0;
2944 NEXT_OFF(n) = 0;
2945 n++;
2946 }
2947#endif
2948 DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
2949 return stopnow;
2950}
2951
2952/* REx optimizer. Converts nodes into quicker variants "in place".
2953 Finds fixed substrings. */
2954
2955/* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
2956 to the position after last scanned or to NULL. */
2957
2958#define INIT_AND_WITHP \
2959 assert(!and_withp); \
2960 Newx(and_withp,1,struct regnode_charclass_class); \
2961 SAVEFREEPV(and_withp)
2962
2963/* this is a chain of data about sub patterns we are processing that
2964 need to be handled separately/specially in study_chunk. Its so
2965 we can simulate recursion without losing state. */
2966struct scan_frame;
2967typedef struct scan_frame {
2968 regnode *last; /* last node to process in this frame */
2969 regnode *next; /* next node to process when last is reached */
2970 struct scan_frame *prev; /*previous frame*/
2971 I32 stop; /* what stopparen do we use */
2972} scan_frame;
2973
2974
2975#define SCAN_COMMIT(s, data, m) scan_commit(s, data, m, is_inf)
2976
2977#define CASE_SYNST_FNC(nAmE) \
2978case nAmE: \
2979 if (flags & SCF_DO_STCLASS_AND) { \
2980 for (value = 0; value < 256; value++) \
2981 if (!is_ ## nAmE ## _cp(value)) \
2982 ANYOF_BITMAP_CLEAR(data->start_class, value); \
2983 } \
2984 else { \
2985 for (value = 0; value < 256; value++) \
2986 if (is_ ## nAmE ## _cp(value)) \
2987 ANYOF_BITMAP_SET(data->start_class, value); \
2988 } \
2989 break; \
2990case N ## nAmE: \
2991 if (flags & SCF_DO_STCLASS_AND) { \
2992 for (value = 0; value < 256; value++) \
2993 if (is_ ## nAmE ## _cp(value)) \
2994 ANYOF_BITMAP_CLEAR(data->start_class, value); \
2995 } \
2996 else { \
2997 for (value = 0; value < 256; value++) \
2998 if (!is_ ## nAmE ## _cp(value)) \
2999 ANYOF_BITMAP_SET(data->start_class, value); \
3000 } \
3001 break
3002
3003
3004
3005STATIC I32
3006S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
3007 I32 *minlenp, I32 *deltap,
3008 regnode *last,
3009 scan_data_t *data,
3010 I32 stopparen,
3011 U8* recursed,
3012 struct regnode_charclass_class *and_withp,
3013 U32 flags, U32 depth)
3014 /* scanp: Start here (read-write). */
3015 /* deltap: Write maxlen-minlen here. */
3016 /* last: Stop before this one. */
3017 /* data: string data about the pattern */
3018 /* stopparen: treat close N as END */
3019 /* recursed: which subroutines have we recursed into */
3020 /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
3021{
3022 dVAR;
3023 I32 min = 0, pars = 0, code;
3024 regnode *scan = *scanp, *next;
3025 I32 delta = 0;
3026 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
3027 int is_inf_internal = 0; /* The studied chunk is infinite */
3028 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
3029 scan_data_t data_fake;
3030 SV *re_trie_maxbuff = NULL;
3031 regnode *first_non_open = scan;
3032 I32 stopmin = I32_MAX;
3033 scan_frame *frame = NULL;
3034 GET_RE_DEBUG_FLAGS_DECL;
3035
3036 PERL_ARGS_ASSERT_STUDY_CHUNK;
3037
3038#ifdef DEBUGGING
3039 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
3040#endif
3041
3042 if ( depth == 0 ) {
3043 while (first_non_open && OP(first_non_open) == OPEN)
3044 first_non_open=regnext(first_non_open);
3045 }
3046
3047
3048 fake_study_recurse:
3049 while ( scan && OP(scan) != END && scan < last ){
3050 UV min_subtract = 0; /* How much to subtract from the minimum node
3051 length to get a real minimum (because the
3052 folded version may be shorter) */
3053 bool has_exactf_sharp_s = FALSE;
3054 /* Peephole optimizer: */
3055 DEBUG_STUDYDATA("Peep:", data,depth);
3056 DEBUG_PEEP("Peep",scan,depth);
3057
3058 /* Its not clear to khw or hv why this is done here, and not in the
3059 * clauses that deal with EXACT nodes. khw's guess is that it's
3060 * because of a previous design */
3061 JOIN_EXACT(scan,&min_subtract, &has_exactf_sharp_s, 0);
3062
3063 /* Follow the next-chain of the current node and optimize
3064 away all the NOTHINGs from it. */
3065 if (OP(scan) != CURLYX) {
3066 const int max = (reg_off_by_arg[OP(scan)]
3067 ? I32_MAX
3068 /* I32 may be smaller than U16 on CRAYs! */
3069 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
3070 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
3071 int noff;
3072 regnode *n = scan;
3073
3074 /* Skip NOTHING and LONGJMP. */
3075 while ((n = regnext(n))
3076 && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
3077 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
3078 && off + noff < max)
3079 off += noff;
3080 if (reg_off_by_arg[OP(scan)])
3081 ARG(scan) = off;
3082 else
3083 NEXT_OFF(scan) = off;
3084 }
3085
3086
3087
3088 /* The principal pseudo-switch. Cannot be a switch, since we
3089 look into several different things. */
3090 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
3091 || OP(scan) == IFTHEN) {
3092 next = regnext(scan);
3093 code = OP(scan);
3094 /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
3095
3096 if (OP(next) == code || code == IFTHEN) {
3097 /* NOTE - There is similar code to this block below for handling
3098 TRIE nodes on a re-study. If you change stuff here check there
3099 too. */
3100 I32 max1 = 0, min1 = I32_MAX, num = 0;
3101 struct regnode_charclass_class accum;
3102 regnode * const startbranch=scan;
3103
3104 if (flags & SCF_DO_SUBSTR)
3105 SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
3106 if (flags & SCF_DO_STCLASS)
3107 cl_init_zero(pRExC_state, &accum);
3108
3109 while (OP(scan) == code) {
3110 I32 deltanext, minnext, f = 0, fake;
3111 struct regnode_charclass_class this_class;
3112
3113 num++;
3114 data_fake.flags = 0;
3115 if (data) {
3116 data_fake.whilem_c = data->whilem_c;
3117 data_fake.last_closep = data->last_closep;
3118 }
3119 else
3120 data_fake.last_closep = &fake;
3121
3122 data_fake.pos_delta = delta;
3123 next = regnext(scan);
3124 scan = NEXTOPER(scan);
3125 if (code != BRANCH)
3126 scan = NEXTOPER(scan);
3127 if (flags & SCF_DO_STCLASS) {
3128 cl_init(pRExC_state, &this_class);
3129 data_fake.start_class = &this_class;
3130 f = SCF_DO_STCLASS_AND;
3131 }
3132 if (flags & SCF_WHILEM_VISITED_POS)
3133 f |= SCF_WHILEM_VISITED_POS;
3134
3135 /* we suppose the run is continuous, last=next...*/
3136 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
3137 next, &data_fake,
3138 stopparen, recursed, NULL, f,depth+1);
3139 if (min1 > minnext)
3140 min1 = minnext;
3141 if (max1 < minnext + deltanext)
3142 max1 = minnext + deltanext;
3143 if (deltanext == I32_MAX)
3144 is_inf = is_inf_internal = 1;
3145 scan = next;
3146 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3147 pars++;
3148 if (data_fake.flags & SCF_SEEN_ACCEPT) {
3149 if ( stopmin > minnext)
3150 stopmin = min + min1;
3151 flags &= ~SCF_DO_SUBSTR;
3152 if (data)
3153 data->flags |= SCF_SEEN_ACCEPT;
3154 }
3155 if (data) {
3156 if (data_fake.flags & SF_HAS_EVAL)
3157 data->flags |= SF_HAS_EVAL;
3158 data->whilem_c = data_fake.whilem_c;
3159 }
3160 if (flags & SCF_DO_STCLASS)
3161 cl_or(pRExC_state, &accum, &this_class);
3162 }
3163 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
3164 min1 = 0;
3165 if (flags & SCF_DO_SUBSTR) {
3166 data->pos_min += min1;
3167 data->pos_delta += max1 - min1;
3168 if (max1 != min1 || is_inf)
3169 data->longest = &(data->longest_float);
3170 }
3171 min += min1;
3172 delta += max1 - min1;
3173 if (flags & SCF_DO_STCLASS_OR) {
3174 cl_or(pRExC_state, data->start_class, &accum);
3175 if (min1) {
3176 cl_and(data->start_class, and_withp);
3177 flags &= ~SCF_DO_STCLASS;
3178 }
3179 }
3180 else if (flags & SCF_DO_STCLASS_AND) {
3181 if (min1) {
3182 cl_and(data->start_class, &accum);
3183 flags &= ~SCF_DO_STCLASS;
3184 }
3185 else {
3186 /* Switch to OR mode: cache the old value of
3187 * data->start_class */
3188 INIT_AND_WITHP;
3189 StructCopy(data->start_class, and_withp,
3190 struct regnode_charclass_class);
3191 flags &= ~SCF_DO_STCLASS_AND;
3192 StructCopy(&accum, data->start_class,
3193 struct regnode_charclass_class);
3194 flags |= SCF_DO_STCLASS_OR;
3195 data->start_class->flags |= ANYOF_EOS;
3196 }
3197 }
3198
3199 if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) {
3200 /* demq.
3201
3202 Assuming this was/is a branch we are dealing with: 'scan' now
3203 points at the item that follows the branch sequence, whatever
3204 it is. We now start at the beginning of the sequence and look
3205 for subsequences of
3206
3207 BRANCH->EXACT=>x1
3208 BRANCH->EXACT=>x2
3209 tail
3210
3211 which would be constructed from a pattern like /A|LIST|OF|WORDS/
3212
3213 If we can find such a subsequence we need to turn the first
3214 element into a trie and then add the subsequent branch exact
3215 strings to the trie.
3216
3217 We have two cases
3218
3219 1. patterns where the whole set of branches can be converted.
3220
3221 2. patterns where only a subset can be converted.
3222
3223 In case 1 we can replace the whole set with a single regop
3224 for the trie. In case 2 we need to keep the start and end
3225 branches so
3226
3227 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
3228 becomes BRANCH TRIE; BRANCH X;
3229
3230 There is an additional case, that being where there is a
3231 common prefix, which gets split out into an EXACT like node
3232 preceding the TRIE node.
3233
3234 If x(1..n)==tail then we can do a simple trie, if not we make
3235 a "jump" trie, such that when we match the appropriate word
3236 we "jump" to the appropriate tail node. Essentially we turn
3237 a nested if into a case structure of sorts.
3238
3239 */
3240
3241 int made=0;
3242 if (!re_trie_maxbuff) {
3243 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
3244 if (!SvIOK(re_trie_maxbuff))
3245 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
3246 }
3247 if ( SvIV(re_trie_maxbuff)>=0 ) {
3248 regnode *cur;
3249 regnode *first = (regnode *)NULL;
3250 regnode *last = (regnode *)NULL;
3251 regnode *tail = scan;
3252 U8 trietype = 0;
3253 U32 count=0;
3254
3255#ifdef DEBUGGING
3256 SV * const mysv = sv_newmortal(); /* for dumping */
3257#endif
3258 /* var tail is used because there may be a TAIL
3259 regop in the way. Ie, the exacts will point to the
3260 thing following the TAIL, but the last branch will
3261 point at the TAIL. So we advance tail. If we
3262 have nested (?:) we may have to move through several
3263 tails.
3264 */
3265
3266 while ( OP( tail ) == TAIL ) {
3267 /* this is the TAIL generated by (?:) */
3268 tail = regnext( tail );
3269 }
3270
3271
3272 DEBUG_TRIE_COMPILE_r({
3273 regprop(RExC_rx, mysv, tail );
3274 PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
3275 (int)depth * 2 + 2, "",
3276 "Looking for TRIE'able sequences. Tail node is: ",
3277 SvPV_nolen_const( mysv )
3278 );
3279 });
3280
3281 /*
3282
3283 Step through the branches
3284 cur represents each branch,
3285 noper is the first thing to be matched as part of that branch
3286 noper_next is the regnext() of that node.
3287
3288 We normally handle a case like this /FOO[xyz]|BAR[pqr]/
3289 via a "jump trie" but we also support building with NOJUMPTRIE,
3290 which restricts the trie logic to structures like /FOO|BAR/.
3291
3292 If noper is a trieable nodetype then the branch is a possible optimization
3293 target. If we are building under NOJUMPTRIE then we require that noper_next
3294 is the same as scan (our current position in the regex program).
3295
3296 Once we have two or more consecutive such branches we can create a
3297 trie of the EXACT's contents and stitch it in place into the program.
3298
3299 If the sequence represents all of the branches in the alternation we
3300 replace the entire thing with a single TRIE node.
3301
3302 Otherwise when it is a subsequence we need to stitch it in place and
3303 replace only the relevant branches. This means the first branch has
3304 to remain as it is used by the alternation logic, and its next pointer,
3305 and needs to be repointed at the item on the branch chain following
3306 the last branch we have optimized away.
3307
3308 This could be either a BRANCH, in which case the subsequence is internal,
3309 or it could be the item following the branch sequence in which case the
3310 subsequence is at the end (which does not necessarily mean the first node
3311 is the start of the alternation).
3312
3313 TRIE_TYPE(X) is a define which maps the optype to a trietype.
3314
3315 optype | trietype
3316 ----------------+-----------
3317 NOTHING | NOTHING
3318 EXACT | EXACT
3319 EXACTFU | EXACTFU
3320 EXACTFU_SS | EXACTFU
3321 EXACTFU_TRICKYFOLD | EXACTFU
3322 EXACTFA | 0
3323
3324
3325 */
3326#define TRIE_TYPE(X) ( ( NOTHING == (X) ) ? NOTHING : \
3327 ( EXACT == (X) ) ? EXACT : \
3328 ( EXACTFU == (X) || EXACTFU_SS == (X) || EXACTFU_TRICKYFOLD == (X) ) ? EXACTFU : \
3329 0 )
3330
3331 /* dont use tail as the end marker for this traverse */
3332 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
3333 regnode * const noper = NEXTOPER( cur );
3334 U8 noper_type = OP( noper );
3335 U8 noper_trietype = TRIE_TYPE( noper_type );
3336#if defined(DEBUGGING) || defined(NOJUMPTRIE)
3337 regnode * const noper_next = regnext( noper );
3338 U8 noper_next_type = (noper_next && noper_next != tail) ? OP(noper_next) : 0;
3339 U8 noper_next_trietype = (noper_next && noper_next != tail) ? TRIE_TYPE( noper_next_type ) :0;
3340#endif
3341
3342 DEBUG_TRIE_COMPILE_r({
3343 regprop(RExC_rx, mysv, cur);
3344 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
3345 (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
3346
3347 regprop(RExC_rx, mysv, noper);
3348 PerlIO_printf( Perl_debug_log, " -> %s",
3349 SvPV_nolen_const(mysv));
3350
3351 if ( noper_next ) {
3352 regprop(RExC_rx, mysv, noper_next );
3353 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
3354 SvPV_nolen_const(mysv));
3355 }
3356 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d,tt==%s,nt==%s,nnt==%s)\n",
3357 REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur),
3358 PL_reg_name[trietype], PL_reg_name[noper_trietype], PL_reg_name[noper_next_trietype]
3359 );
3360 });
3361
3362 /* Is noper a trieable nodetype that can be merged with the
3363 * current trie (if there is one)? */
3364 if ( noper_trietype
3365 &&
3366 (
3367 ( noper_trietype == NOTHING)
3368 || ( trietype == NOTHING )
3369 || ( trietype == noper_trietype )
3370 )
3371#ifdef NOJUMPTRIE
3372 && noper_next == tail
3373#endif
3374 && count < U16_MAX)
3375 {
3376 /* Handle mergable triable node
3377 * Either we are the first node in a new trieable sequence,
3378 * in which case we do some bookkeeping, otherwise we update
3379 * the end pointer. */
3380 if ( !first ) {
3381 first = cur;
3382 if ( noper_trietype == NOTHING ) {
3383#if !defined(DEBUGGING) && !defined(NOJUMPTRIE)
3384 regnode * const noper_next = regnext( noper );
3385 U8 noper_next_type = (noper_next && noper_next!=tail) ? OP(noper_next) : 0;
3386 U8 noper_next_trietype = noper_next_type ? TRIE_TYPE( noper_next_type ) :0;
3387#endif
3388
3389 if ( noper_next_trietype ) {
3390 trietype = noper_next_trietype;
3391 } else if (noper_next_type) {
3392 /* a NOTHING regop is 1 regop wide. We need at least two
3393 * for a trie so we can't merge this in */
3394 first = NULL;
3395 }
3396 } else {
3397 trietype = noper_trietype;
3398 }
3399 } else {
3400 if ( trietype == NOTHING )
3401 trietype = noper_trietype;
3402 last = cur;
3403 }
3404 if (first)
3405 count++;
3406 } /* end handle mergable triable node */
3407 else {
3408 /* handle unmergable node -
3409 * noper may either be a triable node which can not be tried
3410 * together with the current trie, or a non triable node */
3411 if ( last ) {
3412 /* If last is set and trietype is not NOTHING then we have found
3413 * at least two triable branch sequences in a row of a similar
3414 * trietype so we can turn them into a trie. If/when we
3415 * allow NOTHING to start a trie sequence this condition will be
3416 * required, and it isn't expensive so we leave it in for now. */
3417 if ( trietype != NOTHING )
3418 make_trie( pRExC_state,
3419 startbranch, first, cur, tail, count,
3420 trietype, depth+1 );
3421 last = NULL; /* note: we clear/update first, trietype etc below, so we dont do it here */
3422 }
3423 if ( noper_trietype
3424#ifdef NOJUMPTRIE
3425 && noper_next == tail
3426#endif
3427 ){
3428 /* noper is triable, so we can start a new trie sequence */
3429 count = 1;
3430 first = cur;
3431 trietype = noper_trietype;
3432 } else if (first) {
3433 /* if we already saw a first but the current node is not triable then we have
3434 * to reset the first information. */
3435 count = 0;
3436 first = NULL;
3437 trietype = 0;
3438 }
3439 } /* end handle unmergable node */
3440 } /* loop over branches */
3441 DEBUG_TRIE_COMPILE_r({
3442 regprop(RExC_rx, mysv, cur);
3443 PerlIO_printf( Perl_debug_log,
3444 "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
3445 "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
3446
3447 });
3448 if ( last ) {
3449 if ( trietype != NOTHING ) {
3450 /* the last branch of the sequence was part of a trie,
3451 * so we have to construct it here outside of the loop
3452 */
3453 made= make_trie( pRExC_state, startbranch, first, scan, tail, count, trietype, depth+1 );
3454#ifdef TRIE_STUDY_OPT
3455 if ( ((made == MADE_EXACT_TRIE &&
3456 startbranch == first)
3457 || ( first_non_open == first )) &&
3458 depth==0 ) {
3459 flags |= SCF_TRIE_RESTUDY;
3460 if ( startbranch == first
3461 && scan == tail )
3462 {
3463 RExC_seen &=~REG_TOP_LEVEL_BRANCHES;
3464 }
3465 }
3466#endif
3467 } else {
3468 /* at this point we know whatever we have is a NOTHING sequence/branch
3469 * AND if 'startbranch' is 'first' then we can turn the whole thing into a NOTHING
3470 */
3471 if ( startbranch == first ) {
3472 regnode *opt;
3473 /* the entire thing is a NOTHING sequence, something like this:
3474 * (?:|) So we can turn it into a plain NOTHING op. */
3475 DEBUG_TRIE_COMPILE_r({
3476 regprop(RExC_rx, mysv, cur);
3477 PerlIO_printf( Perl_debug_log,
3478 "%*s- %s (%d) <NOTHING BRANCH SEQUENCE>\n", (int)depth * 2 + 2,
3479 "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
3480
3481 });
3482 OP(startbranch)= NOTHING;
3483 NEXT_OFF(startbranch)= tail - startbranch;
3484 for ( opt= startbranch + 1; opt < tail ; opt++ )
3485 OP(opt)= OPTIMIZED;
3486 }
3487 }
3488 } /* end if ( last) */
3489 } /* TRIE_MAXBUF is non zero */
3490
3491 } /* do trie */
3492
3493 }
3494 else if ( code == BRANCHJ ) { /* single branch is optimized. */
3495 scan = NEXTOPER(NEXTOPER(scan));
3496 } else /* single branch is optimized. */
3497 scan = NEXTOPER(scan);
3498 continue;
3499 } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
3500 scan_frame *newframe = NULL;
3501 I32 paren;
3502 regnode *start;
3503 regnode *end;
3504
3505 if (OP(scan) != SUSPEND) {
3506 /* set the pointer */
3507 if (OP(scan) == GOSUB) {
3508 paren = ARG(scan);
3509 RExC_recurse[ARG2L(scan)] = scan;
3510 start = RExC_open_parens[paren-1];
3511 end = RExC_close_parens[paren-1];
3512 } else {
3513 paren = 0;
3514 start = RExC_rxi->program + 1;
3515 end = RExC_opend;
3516 }
3517 if (!recursed) {
3518 Newxz(recursed, (((RExC_npar)>>3) +1), U8);
3519 SAVEFREEPV(recursed);
3520 }
3521 if (!PAREN_TEST(recursed,paren+1)) {
3522 PAREN_SET(recursed,paren+1);
3523 Newx(newframe,1,scan_frame);
3524 } else {
3525 if (flags & SCF_DO_SUBSTR) {
3526 SCAN_COMMIT(pRExC_state,data,minlenp);
3527 data->longest = &(data->longest_float);
3528 }
3529 is_inf = is_inf_internal = 1;
3530 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3531 cl_anything(pRExC_state, data->start_class);
3532 flags &= ~SCF_DO_STCLASS;
3533 }
3534 } else {
3535 Newx(newframe,1,scan_frame);
3536 paren = stopparen;
3537 start = scan+2;
3538 end = regnext(scan);
3539 }
3540 if (newframe) {
3541 assert(start);
3542 assert(end);
3543 SAVEFREEPV(newframe);
3544 newframe->next = regnext(scan);
3545 newframe->last = last;
3546 newframe->stop = stopparen;
3547 newframe->prev = frame;
3548
3549 frame = newframe;
3550 scan = start;
3551 stopparen = paren;
3552 last = end;
3553
3554 continue;
3555 }
3556 }
3557 else if (OP(scan) == EXACT) {
3558 I32 l = STR_LEN(scan);
3559 UV uc;
3560 if (UTF) {
3561 const U8 * const s = (U8*)STRING(scan);
3562 uc = utf8_to_uvchr_buf(s, s + l, NULL);
3563 l = utf8_length(s, s + l);
3564 } else {
3565 uc = *((U8*)STRING(scan));
3566 }
3567 min += l;
3568 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
3569 /* The code below prefers earlier match for fixed
3570 offset, later match for variable offset. */
3571 if (data->last_end == -1) { /* Update the start info. */
3572 data->last_start_min = data->pos_min;
3573 data->last_start_max = is_inf
3574 ? I32_MAX : data->pos_min + data->pos_delta;
3575 }
3576 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
3577 if (UTF)
3578 SvUTF8_on(data->last_found);
3579 {
3580 SV * const sv = data->last_found;
3581 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3582 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3583 if (mg && mg->mg_len >= 0)
3584 mg->mg_len += utf8_length((U8*)STRING(scan),
3585 (U8*)STRING(scan)+STR_LEN(scan));
3586 }
3587 data->last_end = data->pos_min + l;
3588 data->pos_min += l; /* As in the first entry. */
3589 data->flags &= ~SF_BEFORE_EOL;
3590 }
3591 if (flags & SCF_DO_STCLASS_AND) {
3592 /* Check whether it is compatible with what we know already! */
3593 int compat = 1;
3594
3595
3596 /* If compatible, we or it in below. It is compatible if is
3597 * in the bitmp and either 1) its bit or its fold is set, or 2)
3598 * it's for a locale. Even if there isn't unicode semantics
3599 * here, at runtime there may be because of matching against a
3600 * utf8 string, so accept a possible false positive for
3601 * latin1-range folds */
3602 if (uc >= 0x100 ||
3603 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
3604 && !ANYOF_BITMAP_TEST(data->start_class, uc)
3605 && (!(data->start_class->flags & ANYOF_LOC_NONBITMAP_FOLD)
3606 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc])))
3607 )
3608 {
3609 compat = 0;
3610 }
3611 ANYOF_CLASS_ZERO(data->start_class);
3612 ANYOF_BITMAP_ZERO(data->start_class);
3613 if (compat)
3614 ANYOF_BITMAP_SET(data->start_class, uc);
3615 else if (uc >= 0x100) {
3616 int i;
3617
3618 /* Some Unicode code points fold to the Latin1 range; as
3619 * XXX temporary code, instead of figuring out if this is
3620 * one, just assume it is and set all the start class bits
3621 * that could be some such above 255 code point's fold
3622 * which will generate fals positives. As the code
3623 * elsewhere that does compute the fold settles down, it
3624 * can be extracted out and re-used here */
3625 for (i = 0; i < 256; i++){
3626 if (HAS_NONLATIN1_FOLD_CLOSURE(i)) {
3627 ANYOF_BITMAP_SET(data->start_class, i);
3628 }
3629 }
3630 }
3631 data->start_class->flags &= ~ANYOF_EOS;
3632 if (uc < 0x100)
3633 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
3634 }
3635 else if (flags & SCF_DO_STCLASS_OR) {
3636 /* false positive possible if the class is case-folded */
3637 if (uc < 0x100)
3638 ANYOF_BITMAP_SET(data->start_class, uc);
3639 else
3640 data->start_class->flags |= ANYOF_UNICODE_ALL;
3641 data->start_class->flags &= ~ANYOF_EOS;
3642 cl_and(data->start_class, and_withp);
3643 }
3644 flags &= ~SCF_DO_STCLASS;
3645 }
3646 else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
3647 I32 l = STR_LEN(scan);
3648 UV uc = *((U8*)STRING(scan));
3649
3650 /* Search for fixed substrings supports EXACT only. */
3651 if (flags & SCF_DO_SUBSTR) {
3652 assert(data);
3653 SCAN_COMMIT(pRExC_state, data, minlenp);
3654 }
3655 if (UTF) {
3656 const U8 * const s = (U8 *)STRING(scan);
3657 uc = utf8_to_uvchr_buf(s, s + l, NULL);
3658 l = utf8_length(s, s + l);
3659 }
3660 else if (has_exactf_sharp_s) {
3661 RExC_seen |= REG_SEEN_EXACTF_SHARP_S;
3662 }
3663 min += l - min_subtract;
3664 if (min < 0) {
3665 min = 0;
3666 }
3667 delta += min_subtract;
3668 if (flags & SCF_DO_SUBSTR) {
3669 data->pos_min += l - min_subtract;
3670 if (data->pos_min < 0) {
3671 data->pos_min = 0;
3672 }
3673 data->pos_delta += min_subtract;
3674 if (min_subtract) {
3675 data->longest = &(data->longest_float);
3676 }
3677 }
3678 if (flags & SCF_DO_STCLASS_AND) {
3679 /* Check whether it is compatible with what we know already! */
3680 int compat = 1;
3681 if (uc >= 0x100 ||
3682 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
3683 && !ANYOF_BITMAP_TEST(data->start_class, uc)
3684 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc])))
3685 {
3686 compat = 0;
3687 }
3688 ANYOF_CLASS_ZERO(data->start_class);
3689 ANYOF_BITMAP_ZERO(data->start_class);
3690 if (compat) {
3691 ANYOF_BITMAP_SET(data->start_class, uc);
3692 data->start_class->flags &= ~ANYOF_EOS;
3693 data->start_class->flags |= ANYOF_LOC_NONBITMAP_FOLD;
3694 if (OP(scan) == EXACTFL) {
3695 /* XXX This set is probably no longer necessary, and
3696 * probably wrong as LOCALE now is on in the initial
3697 * state */
3698 data->start_class->flags |= ANYOF_LOCALE;
3699 }
3700 else {
3701
3702 /* Also set the other member of the fold pair. In case
3703 * that unicode semantics is called for at runtime, use
3704 * the full latin1 fold. (Can't do this for locale,
3705 * because not known until runtime) */
3706 ANYOF_BITMAP_SET(data->start_class, PL_fold_latin1[uc]);
3707
3708 /* All other (EXACTFL handled above) folds except under
3709 * /iaa that include s, S, and sharp_s also may include
3710 * the others */
3711 if (OP(scan) != EXACTFA) {
3712 if (uc == 's' || uc == 'S') {
3713 ANYOF_BITMAP_SET(data->start_class,
3714 LATIN_SMALL_LETTER_SHARP_S);
3715 }
3716 else if (uc == LATIN_SMALL_LETTER_SHARP_S) {
3717 ANYOF_BITMAP_SET(data->start_class, 's');
3718 ANYOF_BITMAP_SET(data->start_class, 'S');
3719 }
3720 }
3721 }
3722 }
3723 else if (uc >= 0x100) {
3724 int i;
3725 for (i = 0; i < 256; i++){
3726 if (_HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)) {
3727 ANYOF_BITMAP_SET(data->start_class, i);
3728 }
3729 }
3730 }
3731 }
3732 else if (flags & SCF_DO_STCLASS_OR) {
3733 if (data->start_class->flags & ANYOF_LOC_NONBITMAP_FOLD) {
3734 /* false positive possible if the class is case-folded.
3735 Assume that the locale settings are the same... */
3736 if (uc < 0x100) {
3737 ANYOF_BITMAP_SET(data->start_class, uc);
3738 if (OP(scan) != EXACTFL) {
3739
3740 /* And set the other member of the fold pair, but
3741 * can't do that in locale because not known until
3742 * run-time */
3743 ANYOF_BITMAP_SET(data->start_class,
3744 PL_fold_latin1[uc]);
3745
3746 /* All folds except under /iaa that include s, S,
3747 * and sharp_s also may include the others */
3748 if (OP(scan) != EXACTFA) {
3749 if (uc == 's' || uc == 'S') {
3750 ANYOF_BITMAP_SET(data->start_class,
3751 LATIN_SMALL_LETTER_SHARP_S);
3752 }
3753 else if (uc == LATIN_SMALL_LETTER_SHARP_S) {
3754 ANYOF_BITMAP_SET(data->start_class, 's');
3755 ANYOF_BITMAP_SET(data->start_class, 'S');
3756 }
3757 }
3758 }
3759 }
3760 data->start_class->flags &= ~ANYOF_EOS;
3761 }
3762 cl_and(data->start_class, and_withp);
3763 }
3764 flags &= ~SCF_DO_STCLASS;
3765 }
3766 else if (REGNODE_VARIES(OP(scan))) {
3767 I32 mincount, maxcount, minnext, deltanext, fl = 0;
3768 I32 f = flags, pos_before = 0;
3769 regnode * const oscan = scan;
3770 struct regnode_charclass_class this_class;
3771 struct regnode_charclass_class *oclass = NULL;
3772 I32 next_is_eval = 0;
3773
3774 switch (PL_regkind[OP(scan)]) {
3775 case WHILEM: /* End of (?:...)* . */
3776 scan = NEXTOPER(scan);
3777 goto finish;
3778 case PLUS:
3779 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
3780 next = NEXTOPER(scan);
3781 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
3782 mincount = 1;
3783 maxcount = REG_INFTY;
3784 next = regnext(scan);
3785 scan = NEXTOPER(scan);
3786 goto do_curly;
3787 }
3788 }
3789 if (flags & SCF_DO_SUBSTR)
3790 data->pos_min++;
3791 min++;
3792 /* Fall through. */
3793 case STAR:
3794 if (flags & SCF_DO_STCLASS) {
3795 mincount = 0;
3796 maxcount = REG_INFTY;
3797 next = regnext(scan);
3798 scan = NEXTOPER(scan);
3799 goto do_curly;
3800 }
3801 is_inf = is_inf_internal = 1;
3802 scan = regnext(scan);
3803 if (flags & SCF_DO_SUBSTR) {
3804 SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */
3805 data->longest = &(data->longest_float);
3806 }
3807 goto optimize_curly_tail;
3808 case CURLY:
3809 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
3810 && (scan->flags == stopparen))
3811 {
3812 mincount = 1;
3813 maxcount = 1;
3814 } else {
3815 mincount = ARG1(scan);
3816 maxcount = ARG2(scan);
3817 }
3818 next = regnext(scan);
3819 if (OP(scan) == CURLYX) {
3820 I32 lp = (data ? *(data->last_closep) : 0);
3821 scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
3822 }
3823 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
3824 next_is_eval = (OP(scan) == EVAL);
3825 do_curly:
3826 if (flags & SCF_DO_SUBSTR) {
3827 if (mincount == 0) SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */
3828 pos_before = data->pos_min;
3829 }
3830 if (data) {
3831 fl = data->flags;
3832 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
3833 if (is_inf)
3834 data->flags |= SF_IS_INF;
3835 }
3836 if (flags & SCF_DO_STCLASS) {
3837 cl_init(pRExC_state, &this_class);
3838 oclass = data->start_class;
3839 data->start_class = &this_class;
3840 f |= SCF_DO_STCLASS_AND;
3841 f &= ~SCF_DO_STCLASS_OR;
3842 }
3843 /* Exclude from super-linear cache processing any {n,m}
3844 regops for which the combination of input pos and regex
3845 pos is not enough information to determine if a match
3846 will be possible.
3847
3848 For example, in the regex /foo(bar\s*){4,8}baz/ with the
3849 regex pos at the \s*, the prospects for a match depend not
3850 only on the input position but also on how many (bar\s*)
3851 repeats into the {4,8} we are. */
3852 if ((mincount > 1) || (maxcount > 1 && maxcount != REG_INFTY))
3853 f &= ~SCF_WHILEM_VISITED_POS;
3854
3855 /* This will finish on WHILEM, setting scan, or on NULL: */
3856 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
3857 last, data, stopparen, recursed, NULL,
3858 (mincount == 0
3859 ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
3860
3861 if (flags & SCF_DO_STCLASS)
3862 data->start_class = oclass;
3863 if (mincount == 0 || minnext == 0) {
3864 if (flags & SCF_DO_STCLASS_OR) {
3865 cl_or(pRExC_state, data->start_class, &this_class);
3866 }
3867 else if (flags & SCF_DO_STCLASS_AND) {
3868 /* Switch to OR mode: cache the old value of
3869 * data->start_class */
3870 INIT_AND_WITHP;
3871 StructCopy(data->start_class, and_withp,
3872 struct regnode_charclass_class);
3873 flags &= ~SCF_DO_STCLASS_AND;
3874 StructCopy(&this_class, data->start_class,
3875 struct regnode_charclass_class);
3876 flags |= SCF_DO_STCLASS_OR;
3877 data->start_class->flags |= ANYOF_EOS;
3878 }
3879 } else { /* Non-zero len */
3880 if (flags & SCF_DO_STCLASS_OR) {
3881 cl_or(pRExC_state, data->start_class, &this_class);
3882 cl_and(data->start_class, and_withp);
3883 }
3884 else if (flags & SCF_DO_STCLASS_AND)
3885 cl_and(data->start_class, &this_class);
3886 flags &= ~SCF_DO_STCLASS;
3887 }
3888 if (!scan) /* It was not CURLYX, but CURLY. */
3889 scan = next;
3890 if ( /* ? quantifier ok, except for (?{ ... }) */
3891 (next_is_eval || !(mincount == 0 && maxcount == 1))
3892 && (minnext == 0) && (deltanext == 0)
3893 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
3894 && maxcount <= REG_INFTY/3) /* Complement check for big count */
3895 {
3896 ckWARNreg(RExC_parse,
3897 "Quantifier unexpected on zero-length expression");
3898 }
3899
3900 min += minnext * mincount;
3901 is_inf_internal |= ((maxcount == REG_INFTY
3902 && (minnext + deltanext) > 0)
3903 || deltanext == I32_MAX);
3904 is_inf |= is_inf_internal;
3905 delta += (minnext + deltanext) * maxcount - minnext * mincount;
3906
3907 /* Try powerful optimization CURLYX => CURLYN. */
3908 if ( OP(oscan) == CURLYX && data
3909 && data->flags & SF_IN_PAR
3910 && !(data->flags & SF_HAS_EVAL)
3911 && !deltanext && minnext == 1 ) {
3912 /* Try to optimize to CURLYN. */
3913 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
3914 regnode * const nxt1 = nxt;
3915#ifdef DEBUGGING
3916 regnode *nxt2;
3917#endif
3918
3919 /* Skip open. */
3920 nxt = regnext(nxt);
3921 if (!REGNODE_SIMPLE(OP(nxt))
3922 && !(PL_regkind[OP(nxt)] == EXACT
3923 && STR_LEN(nxt) == 1))
3924 goto nogo;
3925#ifdef DEBUGGING
3926 nxt2 = nxt;
3927#endif
3928 nxt = regnext(nxt);
3929 if (OP(nxt) != CLOSE)
3930 goto nogo;
3931 if (RExC_open_parens) {
3932 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3933 RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
3934 }
3935 /* Now we know that nxt2 is the only contents: */
3936 oscan->flags = (U8)ARG(nxt);
3937 OP(oscan) = CURLYN;
3938 OP(nxt1) = NOTHING; /* was OPEN. */
3939
3940#ifdef DEBUGGING
3941 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3942 NEXT_OFF(nxt1+ 1) = 0; /* just for consistency. */
3943 NEXT_OFF(nxt2) = 0; /* just for consistency with CURLY. */
3944 OP(nxt) = OPTIMIZED; /* was CLOSE. */
3945 OP(nxt + 1) = OPTIMIZED; /* was count. */
3946 NEXT_OFF(nxt+ 1) = 0; /* just for consistency. */
3947#endif
3948 }
3949 nogo:
3950
3951 /* Try optimization CURLYX => CURLYM. */
3952 if ( OP(oscan) == CURLYX && data
3953 && !(data->flags & SF_HAS_PAR)
3954 && !(data->flags & SF_HAS_EVAL)
3955 && !deltanext /* atom is fixed width */
3956 && minnext != 0 /* CURLYM can't handle zero width */
3957 ) {
3958 /* XXXX How to optimize if data == 0? */
3959 /* Optimize to a simpler form. */
3960 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
3961 regnode *nxt2;
3962
3963 OP(oscan) = CURLYM;
3964 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
3965 && (OP(nxt2) != WHILEM))
3966 nxt = nxt2;
3967 OP(nxt2) = SUCCEED; /* Whas WHILEM */
3968 /* Need to optimize away parenths. */
3969 if ((data->flags & SF_IN_PAR) && OP(nxt) == CLOSE) {
3970 /* Set the parenth number. */
3971 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
3972
3973 oscan->flags = (U8)ARG(nxt);
3974 if (RExC_open_parens) {
3975 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3976 RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
3977 }
3978 OP(nxt1) = OPTIMIZED; /* was OPEN. */
3979 OP(nxt) = OPTIMIZED; /* was CLOSE. */
3980
3981#ifdef DEBUGGING
3982 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3983 OP(nxt + 1) = OPTIMIZED; /* was count. */
3984 NEXT_OFF(nxt1 + 1) = 0; /* just for consistency. */
3985 NEXT_OFF(nxt + 1) = 0; /* just for consistency. */
3986#endif
3987#if 0
3988 while ( nxt1 && (OP(nxt1) != WHILEM)) {
3989 regnode *nnxt = regnext(nxt1);
3990 if (nnxt == nxt) {
3991 if (reg_off_by_arg[OP(nxt1)])
3992 ARG_SET(nxt1, nxt2 - nxt1);
3993 else if (nxt2 - nxt1 < U16_MAX)
3994 NEXT_OFF(nxt1) = nxt2 - nxt1;
3995 else
3996 OP(nxt) = NOTHING; /* Cannot beautify */
3997 }
3998 nxt1 = nnxt;
3999 }
4000#endif
4001 /* Optimize again: */
4002 study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
4003 NULL, stopparen, recursed, NULL, 0,depth+1);
4004 }
4005 else
4006 oscan->flags = 0;
4007 }
4008 else if ((OP(oscan) == CURLYX)
4009 && (flags & SCF_WHILEM_VISITED_POS)
4010 /* See the comment on a similar expression above.
4011 However, this time it's not a subexpression
4012 we care about, but the expression itself. */
4013 && (maxcount == REG_INFTY)
4014 && data && ++data->whilem_c < 16) {
4015 /* This stays as CURLYX, we can put the count/of pair. */
4016 /* Find WHILEM (as in regexec.c) */
4017 regnode *nxt = oscan + NEXT_OFF(oscan);
4018
4019 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
4020 nxt += ARG(nxt);
4021 PREVOPER(nxt)->flags = (U8)(data->whilem_c
4022 | (RExC_whilem_seen << 4)); /* On WHILEM */
4023 }
4024 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
4025 pars++;
4026 if (flags & SCF_DO_SUBSTR) {
4027 SV *last_str = NULL;
4028 int counted = mincount != 0;
4029
4030 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
4031#if defined(SPARC64_GCC_WORKAROUND)
4032 I32 b = 0;
4033 STRLEN l = 0;
4034 const char *s = NULL;
4035 I32 old = 0;
4036
4037 if (pos_before >= data->last_start_min)
4038 b = pos_before;
4039 else
4040 b = data->last_start_min;
4041
4042 l = 0;
4043 s = SvPV_const(data->last_found, l);
4044 old = b - data->last_start_min;
4045
4046#else
4047 I32 b = pos_before >= data->last_start_min
4048 ? pos_before : data->last_start_min;
4049 STRLEN l;
4050 const char * const s = SvPV_const(data->last_found, l);
4051 I32 old = b - data->last_start_min;
4052#endif
4053
4054 if (UTF)
4055 old = utf8_hop((U8*)s, old) - (U8*)s;
4056 l -= old;
4057 /* Get the added string: */
4058 last_str = newSVpvn_utf8(s + old, l, UTF);
4059 if (deltanext == 0 && pos_before == b) {
4060 /* What was added is a constant string */
4061 if (mincount > 1) {
4062 SvGROW(last_str, (mincount * l) + 1);
4063 repeatcpy(SvPVX(last_str) + l,
4064 SvPVX_const(last_str), l, mincount - 1);
4065 SvCUR_set(last_str, SvCUR(last_str) * mincount);
4066 /* Add additional parts. */
4067 SvCUR_set(data->last_found,
4068 SvCUR(data->last_found) - l);
4069 sv_catsv(data->last_found, last_str);
4070 {
4071 SV * sv = data->last_found;
4072 MAGIC *mg =
4073 SvUTF8(sv) && SvMAGICAL(sv) ?
4074 mg_find(sv, PERL_MAGIC_utf8) : NULL;
4075 if (mg && mg->mg_len >= 0)
4076 mg->mg_len += CHR_SVLEN(last_str) - l;
4077 }
4078 data->last_end += l * (mincount - 1);
4079 }
4080 } else {
4081 /* start offset must point into the last copy */
4082 data->last_start_min += minnext * (mincount - 1);
4083 data->last_start_max += is_inf ? I32_MAX
4084 : (maxcount - 1) * (minnext + data->pos_delta);
4085 }
4086 }
4087 /* It is counted once already... */
4088 data->pos_min += minnext * (mincount - counted);
4089 data->pos_delta += - counted * deltanext +
4090 (minnext + deltanext) * maxcount - minnext * mincount;
4091 if (mincount != maxcount) {
4092 /* Cannot extend fixed substrings found inside
4093 the group. */
4094 SCAN_COMMIT(pRExC_state,data,minlenp);
4095 if (mincount && last_str) {
4096 SV * const sv = data->last_found;
4097 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
4098 mg_find(sv, PERL_MAGIC_utf8) : NULL;
4099
4100 if (mg)
4101 mg->mg_len = -1;
4102 sv_setsv(sv, last_str);
4103 data->last_end = data->pos_min;
4104 data->last_start_min =
4105 data->pos_min - CHR_SVLEN(last_str);
4106 data->last_start_max = is_inf
4107 ? I32_MAX
4108 : data->pos_min + data->pos_delta
4109 - CHR_SVLEN(last_str);
4110 }
4111 data->longest = &(data->longest_float);
4112 }
4113 SvREFCNT_dec(last_str);
4114 }
4115 if (data && (fl & SF_HAS_EVAL))
4116 data->flags |= SF_HAS_EVAL;
4117 optimize_curly_tail:
4118 if (OP(oscan) != CURLYX) {
4119 while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
4120 && NEXT_OFF(next))
4121 NEXT_OFF(oscan) += NEXT_OFF(next);
4122 }
4123 continue;
4124 default: /* REF, ANYOFV, and CLUMP only? */
4125 if (flags & SCF_DO_SUBSTR) {
4126 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
4127 data->longest = &(data->longest_float);
4128 }
4129 is_inf = is_inf_internal = 1;
4130 if (flags & SCF_DO_STCLASS_OR)
4131 cl_anything(pRExC_state, data->start_class);
4132 flags &= ~SCF_DO_STCLASS;
4133 break;
4134 }
4135 }
4136 else if (OP(scan) == LNBREAK) {
4137 if (flags & SCF_DO_STCLASS) {
4138 int value = 0;
4139 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
4140 if (flags & SCF_DO_STCLASS_AND) {
4141 for (value = 0; value < 256; value++)
4142 if (!is_VERTWS_cp(value))
4143 ANYOF_BITMAP_CLEAR(data->start_class, value);
4144 }
4145 else {
4146 for (value = 0; value < 256; value++)
4147 if (is_VERTWS_cp(value))
4148 ANYOF_BITMAP_SET(data->start_class, value);
4149 }
4150 if (flags & SCF_DO_STCLASS_OR)
4151 cl_and(data->start_class, and_withp);
4152 flags &= ~SCF_DO_STCLASS;
4153 }
4154 min += 1;
4155 delta += 1;
4156 if (flags & SCF_DO_SUBSTR) {
4157 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
4158 data->pos_min += 1;
4159 data->pos_delta += 1;
4160 data->longest = &(data->longest_float);
4161 }
4162 }
4163 else if (REGNODE_SIMPLE(OP(scan))) {
4164 int value = 0;
4165
4166 if (flags & SCF_DO_SUBSTR) {
4167 SCAN_COMMIT(pRExC_state,data,minlenp);
4168 data->pos_min++;
4169 }
4170 min++;
4171 if (flags & SCF_DO_STCLASS) {
4172 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
4173
4174 /* Some of the logic below assumes that switching
4175 locale on will only add false positives. */
4176 switch (PL_regkind[OP(scan)]) {
4177 case SANY:
4178 default:
4179 do_default:
4180 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
4181 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
4182 cl_anything(pRExC_state, data->start_class);
4183 break;
4184 case REG_ANY:
4185 if (OP(scan) == SANY)
4186 goto do_default;
4187 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
4188 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
4189 || ANYOF_CLASS_TEST_ANY_SET(data->start_class));
4190 cl_anything(pRExC_state, data->start_class);
4191 }
4192 if (flags & SCF_DO_STCLASS_AND || !value)
4193 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
4194 break;
4195 case ANYOF:
4196 if (flags & SCF_DO_STCLASS_AND)
4197 cl_and(data->start_class,
4198 (struct regnode_charclass_class*)scan);
4199 else
4200 cl_or(pRExC_state, data->start_class,
4201 (struct regnode_charclass_class*)scan);
4202 break;
4203 case ALNUM:
4204 if (flags & SCF_DO_STCLASS_AND) {
4205 if (!(data->start_class->flags & ANYOF_LOCALE)) {
4206 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
4207 if (OP(scan) == ALNUMU) {
4208 for (value = 0; value < 256; value++) {
4209 if (!isWORDCHAR_L1(value)) {
4210 ANYOF_BITMAP_CLEAR(data->start_class, value);
4211 }
4212 }
4213 } else {
4214 for (value = 0; value < 256; value++) {
4215 if (!isALNUM(value)) {
4216 ANYOF_BITMAP_CLEAR(data->start_class, value);
4217 }
4218 }
4219 }
4220 }
4221 }
4222 else {
4223 if (data->start_class->flags & ANYOF_LOCALE)
4224 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
4225
4226 /* Even if under locale, set the bits for non-locale
4227 * in case it isn't a true locale-node. This will
4228 * create false positives if it truly is locale */
4229 if (OP(scan) == ALNUMU) {
4230 for (value = 0; value < 256; value++) {
4231 if (isWORDCHAR_L1(value)) {
4232 ANYOF_BITMAP_SET(data->start_class, value);
4233 }
4234 }
4235 } else {
4236 for (value = 0; value < 256; value++) {
4237 if (isALNUM(value)) {
4238 ANYOF_BITMAP_SET(data->start_class, value);
4239 }
4240 }
4241 }
4242 }
4243 break;
4244 case NALNUM:
4245 if (flags & SCF_DO_STCLASS_AND) {
4246 if (!(data->start_class->flags & ANYOF_LOCALE)) {
4247 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
4248 if (OP(scan) == NALNUMU) {
4249 for (value = 0; value < 256; value++) {
4250 if (isWORDCHAR_L1(value)) {
4251 ANYOF_BITMAP_CLEAR(data->start_class, value);
4252 }
4253 }
4254 } else {
4255 for (value = 0; value < 256; value++) {
4256 if (isALNUM(value)) {
4257 ANYOF_BITMAP_CLEAR(data->start_class, value);
4258 }
4259 }
4260 }
4261 }
4262 }
4263 else {
4264 if (data->start_class->flags & ANYOF_LOCALE)
4265 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
4266
4267 /* Even if under locale, set the bits for non-locale in
4268 * case it isn't a true locale-node. This will create
4269 * false positives if it truly is locale */
4270 if (OP(scan) == NALNUMU) {
4271 for (value = 0; value < 256; value++) {
4272 if (! isWORDCHAR_L1(value)) {
4273 ANYOF_BITMAP_SET(data->start_class, value);
4274 }
4275 }
4276 } else {
4277 for (value = 0; value < 256; value++) {
4278 if (! isALNUM(value)) {
4279 ANYOF_BITMAP_SET(data->start_class, value);
4280 }
4281 }
4282 }
4283 }
4284 break;
4285 case SPACE:
4286 if (flags & SCF_DO_STCLASS_AND) {
4287 if (!(data->start_class->flags & ANYOF_LOCALE)) {
4288 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
4289 if (OP(scan) == SPACEU) {
4290 for (value = 0; value < 256; value++) {
4291 if (!isSPACE_L1(value)) {
4292 ANYOF_BITMAP_CLEAR(data->start_class, value);
4293 }
4294 }
4295 } else {
4296 for (value = 0; value < 256; value++) {
4297 if (!isSPACE(value)) {
4298 ANYOF_BITMAP_CLEAR(data->start_class, value);
4299 }
4300 }
4301 }
4302 }
4303 }
4304 else {
4305 if (data->start_class->flags & ANYOF_LOCALE) {
4306 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
4307 }
4308 if (OP(scan) == SPACEU) {
4309 for (value = 0; value < 256; value++) {
4310 if (isSPACE_L1(value)) {
4311 ANYOF_BITMAP_SET(data->start_class, value);
4312 }
4313 }
4314 } else {
4315 for (value = 0; value < 256; value++) {
4316 if (isSPACE(value)) {
4317 ANYOF_BITMAP_SET(data->start_class, value);
4318 }
4319 }
4320 }
4321 }
4322 break;
4323 case NSPACE:
4324 if (flags & SCF_DO_STCLASS_AND) {
4325 if (!(data->start_class->flags & ANYOF_LOCALE)) {
4326 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
4327 if (OP(scan) == NSPACEU) {
4328 for (value = 0; value < 256; value++) {
4329 if (isSPACE_L1(value)) {
4330 ANYOF_BITMAP_CLEAR(data->start_class, value);
4331 }
4332 }
4333 } else {
4334 for (value = 0; value < 256; value++) {
4335 if (isSPACE(value)) {
4336 ANYOF_BITMAP_CLEAR(data->start_class, value);
4337 }
4338 }
4339 }
4340 }
4341 }
4342 else {
4343 if (data->start_class->flags & ANYOF_LOCALE)
4344 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
4345 if (OP(scan) == NSPACEU) {
4346 for (value = 0; value < 256; value++) {
4347 if (!isSPACE_L1(value)) {
4348 ANYOF_BITMAP_SET(data->start_class, value);
4349 }
4350 }
4351 }
4352 else {
4353 for (value = 0; value < 256; value++) {
4354 if (!isSPACE(value)) {
4355 ANYOF_BITMAP_SET(data->start_class, value);
4356 }
4357 }
4358 }
4359 }
4360 break;
4361 case DIGIT:
4362 if (flags & SCF_DO_STCLASS_AND) {
4363 if (!(data->start_class->flags & ANYOF_LOCALE)) {
4364 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
4365 for (value = 0; value < 256; value++)
4366 if (!isDIGIT(value))
4367 ANYOF_BITMAP_CLEAR(data->start_class, value);
4368 }
4369 }
4370 else {
4371 if (data->start_class->flags & ANYOF_LOCALE)
4372 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
4373 for (value = 0; value < 256; value++)
4374 if (isDIGIT(value))
4375 ANYOF_BITMAP_SET(data->start_class, value);
4376 }
4377 break;
4378 case NDIGIT:
4379 if (flags & SCF_DO_STCLASS_AND) {
4380 if (!(data->start_class->flags & ANYOF_LOCALE))
4381 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
4382 for (value = 0; value < 256; value++)
4383 if (isDIGIT(value))
4384 ANYOF_BITMAP_CLEAR(data->start_class, value);
4385 }
4386 else {
4387 if (data->start_class->flags & ANYOF_LOCALE)
4388 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
4389 for (value = 0; value < 256; value++)
4390 if (!isDIGIT(value))
4391 ANYOF_BITMAP_SET(data->start_class, value);
4392 }
4393 break;
4394 CASE_SYNST_FNC(VERTWS);
4395 CASE_SYNST_FNC(HORIZWS);
4396
4397 }
4398 if (flags & SCF_DO_STCLASS_OR)
4399 cl_and(data->start_class, and_withp);
4400 flags &= ~SCF_DO_STCLASS;
4401 }
4402 }
4403 else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
4404 data->flags |= (OP(scan) == MEOL
4405 ? SF_BEFORE_MEOL
4406 : SF_BEFORE_SEOL);
4407 }
4408 else if ( PL_regkind[OP(scan)] == BRANCHJ
4409 /* Lookbehind, or need to calculate parens/evals/stclass: */
4410 && (scan->flags || data || (flags & SCF_DO_STCLASS))
4411 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
4412 if ( OP(scan) == UNLESSM &&
4413 scan->flags == 0 &&
4414 OP(NEXTOPER(NEXTOPER(scan))) == NOTHING &&
4415 OP(regnext(NEXTOPER(NEXTOPER(scan)))) == SUCCEED
4416 ) {
4417 regnode *opt;
4418 regnode *upto= regnext(scan);
4419 DEBUG_PARSE_r({
4420 SV * const mysv_val=sv_newmortal();
4421 DEBUG_STUDYDATA("OPFAIL",data,depth);
4422
4423 /*DEBUG_PARSE_MSG("opfail");*/
4424 regprop(RExC_rx, mysv_val, upto);
4425 PerlIO_printf(Perl_debug_log, "~ replace with OPFAIL pointed at %s (%"IVdf") offset %"IVdf"\n",
4426 SvPV_nolen_const(mysv_val),
4427 (IV)REG_NODE_NUM(upto),
4428 (IV)(upto - scan)
4429 );
4430 });
4431 OP(scan) = OPFAIL;
4432 NEXT_OFF(scan) = upto - scan;
4433 for (opt= scan + 1; opt < upto ; opt++)
4434 OP(opt) = OPTIMIZED;
4435 scan= upto;
4436 continue;
4437 }
4438 if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
4439 || OP(scan) == UNLESSM )
4440 {
4441 /* Negative Lookahead/lookbehind
4442 In this case we can't do fixed string optimisation.
4443 */
4444
4445 I32 deltanext, minnext, fake = 0;
4446 regnode *nscan;
4447 struct regnode_charclass_class intrnl;
4448 int f = 0;
4449
4450 data_fake.flags = 0;
4451 if (data) {
4452 data_fake.whilem_c = data->whilem_c;
4453 data_fake.last_closep = data->last_closep;
4454 }
4455 else
4456 data_fake.last_closep = &fake;
4457 data_fake.pos_delta = delta;
4458 if ( flags & SCF_DO_STCLASS && !scan->flags
4459 && OP(scan) == IFMATCH ) { /* Lookahead */
4460 cl_init(pRExC_state, &intrnl);
4461 data_fake.start_class = &intrnl;
4462 f |= SCF_DO_STCLASS_AND;
4463 }
4464 if (flags & SCF_WHILEM_VISITED_POS)
4465 f |= SCF_WHILEM_VISITED_POS;
4466 next = regnext(scan);
4467 nscan = NEXTOPER(NEXTOPER(scan));
4468 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
4469 last, &data_fake, stopparen, recursed, NULL, f, depth+1);
4470 if (scan->flags) {
4471 if (deltanext) {
4472 FAIL("Variable length lookbehind not implemented");
4473 }
4474 else if (minnext > (I32)U8_MAX) {
4475 FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
4476 }
4477 scan->flags = (U8)minnext;
4478 }
4479 if (data) {
4480 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4481 pars++;
4482 if (data_fake.flags & SF_HAS_EVAL)
4483 data->flags |= SF_HAS_EVAL;
4484 data->whilem_c = data_fake.whilem_c;
4485 }
4486 if (f & SCF_DO_STCLASS_AND) {
4487 if (flags & SCF_DO_STCLASS_OR) {
4488 /* OR before, AND after: ideally we would recurse with
4489 * data_fake to get the AND applied by study of the
4490 * remainder of the pattern, and then derecurse;
4491 * *** HACK *** for now just treat as "no information".
4492 * See [perl #56690].
4493 */
4494 cl_init(pRExC_state, data->start_class);
4495 } else {
4496 /* AND before and after: combine and continue */
4497 const int was = (data->start_class->flags & ANYOF_EOS);
4498
4499 cl_and(data->start_class, &intrnl);
4500 if (was)
4501 data->start_class->flags |= ANYOF_EOS;
4502 }
4503 }
4504 }
4505#if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
4506 else {
4507 /* Positive Lookahead/lookbehind
4508 In this case we can do fixed string optimisation,
4509 but we must be careful about it. Note in the case of
4510 lookbehind the positions will be offset by the minimum
4511 length of the pattern, something we won't know about
4512 until after the recurse.
4513 */
4514 I32 deltanext, fake = 0;
4515 regnode *nscan;
4516 struct regnode_charclass_class intrnl;
4517 int f = 0;
4518 /* We use SAVEFREEPV so that when the full compile
4519 is finished perl will clean up the allocated
4520 minlens when it's all done. This way we don't
4521 have to worry about freeing them when we know
4522 they wont be used, which would be a pain.
4523 */
4524 I32 *minnextp;
4525 Newx( minnextp, 1, I32 );
4526 SAVEFREEPV(minnextp);
4527
4528 if (data) {
4529 StructCopy(data, &data_fake, scan_data_t);
4530 if ((flags & SCF_DO_SUBSTR) && data->last_found) {
4531 f |= SCF_DO_SUBSTR;
4532 if (scan->flags)
4533 SCAN_COMMIT(pRExC_state, &data_fake,minlenp);
4534 data_fake.last_found=newSVsv(data->last_found);
4535 }
4536 }
4537 else
4538 data_fake.last_closep = &fake;
4539 data_fake.flags = 0;
4540 data_fake.pos_delta = delta;
4541 if (is_inf)
4542 data_fake.flags |= SF_IS_INF;
4543 if ( flags & SCF_DO_STCLASS && !scan->flags
4544 && OP(scan) == IFMATCH ) { /* Lookahead */
4545 cl_init(pRExC_state, &intrnl);
4546 data_fake.start_class = &intrnl;
4547 f |= SCF_DO_STCLASS_AND;
4548 }
4549 if (flags & SCF_WHILEM_VISITED_POS)
4550 f |= SCF_WHILEM_VISITED_POS;
4551 next = regnext(scan);
4552 nscan = NEXTOPER(NEXTOPER(scan));
4553
4554 *minnextp = study_chunk(pRExC_state, &nscan, minnextp, &deltanext,
4555 last, &data_fake, stopparen, recursed, NULL, f,depth+1);
4556 if (scan->flags) {
4557 if (deltanext) {
4558 FAIL("Variable length lookbehind not implemented");
4559 }
4560 else if (*minnextp > (I32)U8_MAX) {
4561 FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
4562 }
4563 scan->flags = (U8)*minnextp;
4564 }
4565
4566 *minnextp += min;
4567
4568 if (f & SCF_DO_STCLASS_AND) {
4569 const int was = (data->start_class->flags & ANYOF_EOS);
4570
4571 cl_and(data->start_class, &intrnl);
4572 if (was)
4573 data->start_class->flags |= ANYOF_EOS;
4574 }
4575 if (data) {
4576 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4577 pars++;
4578 if (data_fake.flags & SF_HAS_EVAL)
4579 data->flags |= SF_HAS_EVAL;
4580 data->whilem_c = data_fake.whilem_c;
4581 if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
4582 if (RExC_rx->minlen<*minnextp)
4583 RExC_rx->minlen=*minnextp;
4584 SCAN_COMMIT(pRExC_state, &data_fake, minnextp);
4585 SvREFCNT_dec(data_fake.last_found);
4586
4587 if ( data_fake.minlen_fixed != minlenp )
4588 {
4589 data->offset_fixed= data_fake.offset_fixed;
4590 data->minlen_fixed= data_fake.minlen_fixed;
4591 data->lookbehind_fixed+= scan->flags;
4592 }
4593 if ( data_fake.minlen_float != minlenp )
4594 {
4595 data->minlen_float= data_fake.minlen_float;
4596 data->offset_float_min=data_fake.offset_float_min;
4597 data->offset_float_max=data_fake.offset_float_max;
4598 data->lookbehind_float+= scan->flags;
4599 }
4600 }
4601 }
4602 }
4603#endif
4604 }
4605 else if (OP(scan) == OPEN) {
4606 if (stopparen != (I32)ARG(scan))
4607 pars++;
4608 }
4609 else if (OP(scan) == CLOSE) {
4610 if (stopparen == (I32)ARG(scan)) {
4611 break;
4612 }
4613 if ((I32)ARG(scan) == is_par) {
4614 next = regnext(scan);
4615
4616 if ( next && (OP(next) != WHILEM) && next < last)
4617 is_par = 0; /* Disable optimization */
4618 }
4619 if (data)
4620 *(data->last_closep) = ARG(scan);
4621 }
4622 else if (OP(scan) == EVAL) {
4623 if (data)
4624 data->flags |= SF_HAS_EVAL;
4625 }
4626 else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
4627 if (flags & SCF_DO_SUBSTR) {
4628 SCAN_COMMIT(pRExC_state,data,minlenp);
4629 flags &= ~SCF_DO_SUBSTR;
4630 }
4631 if (data && OP(scan)==ACCEPT) {
4632 data->flags |= SCF_SEEN_ACCEPT;
4633 if (stopmin > min)
4634 stopmin = min;
4635 }
4636 }
4637 else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
4638 {
4639 if (flags & SCF_DO_SUBSTR) {
4640 SCAN_COMMIT(pRExC_state,data,minlenp);
4641 data->longest = &(data->longest_float);
4642 }
4643 is_inf = is_inf_internal = 1;
4644 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
4645 cl_anything(pRExC_state, data->start_class);
4646 flags &= ~SCF_DO_STCLASS;
4647 }
4648 else if (OP(scan) == GPOS) {
4649 if (!(RExC_rx->extflags & RXf_GPOS_FLOAT) &&
4650 !(delta || is_inf || (data && data->pos_delta)))
4651 {
4652 if (!(RExC_rx->extflags & RXf_ANCH) && (flags & SCF_DO_SUBSTR))
4653 RExC_rx->extflags |= RXf_ANCH_GPOS;
4654 if (RExC_rx->gofs < (U32)min)
4655 RExC_rx->gofs = min;
4656 } else {
4657 RExC_rx->extflags |= RXf_GPOS_FLOAT;
4658 RExC_rx->gofs = 0;
4659 }
4660 }
4661#ifdef TRIE_STUDY_OPT
4662#ifdef FULL_TRIE_STUDY
4663 else if (PL_regkind[OP(scan)] == TRIE) {
4664 /* NOTE - There is similar code to this block above for handling
4665 BRANCH nodes on the initial study. If you change stuff here
4666 check there too. */
4667 regnode *trie_node= scan;
4668 regnode *tail= regnext(scan);
4669 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
4670 I32 max1 = 0, min1 = I32_MAX;
4671 struct regnode_charclass_class accum;
4672
4673 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
4674 SCAN_COMMIT(pRExC_state, data,minlenp); /* Cannot merge strings after this. */
4675 if (flags & SCF_DO_STCLASS)
4676 cl_init_zero(pRExC_state, &accum);
4677
4678 if (!trie->jump) {
4679 min1= trie->minlen;
4680 max1= trie->maxlen;
4681 } else {
4682 const regnode *nextbranch= NULL;
4683 U32 word;
4684
4685 for ( word=1 ; word <= trie->wordcount ; word++)
4686 {
4687 I32 deltanext=0, minnext=0, f = 0, fake;
4688 struct regnode_charclass_class this_class;
4689
4690 data_fake.flags = 0;
4691 if (data) {
4692 data_fake.whilem_c = data->whilem_c;
4693 data_fake.last_closep = data->last_closep;
4694 }
4695 else
4696 data_fake.last_closep = &fake;
4697 data_fake.pos_delta = delta;
4698 if (flags & SCF_DO_STCLASS) {
4699 cl_init(pRExC_state, &this_class);
4700 data_fake.start_class = &this_class;
4701 f = SCF_DO_STCLASS_AND;
4702 }
4703 if (flags & SCF_WHILEM_VISITED_POS)
4704 f |= SCF_WHILEM_VISITED_POS;
4705
4706 if (trie->jump[word]) {
4707 if (!nextbranch)
4708 nextbranch = trie_node + trie->jump[0];
4709 scan= trie_node + trie->jump[word];
4710 /* We go from the jump point to the branch that follows
4711 it. Note this means we need the vestigal unused branches
4712 even though they arent otherwise used.
4713 */
4714 minnext = study_chunk(pRExC_state, &scan, minlenp,
4715 &deltanext, (regnode *)nextbranch, &data_fake,
4716 stopparen, recursed, NULL, f,depth+1);
4717 }
4718 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
4719 nextbranch= regnext((regnode*)nextbranch);
4720
4721 if (min1 > (I32)(minnext + trie->minlen))
4722 min1 = minnext + trie->minlen;
4723 if (max1 < (I32)(minnext + deltanext + trie->maxlen))
4724 max1 = minnext + deltanext + trie->maxlen;
4725 if (deltanext == I32_MAX)
4726 is_inf = is_inf_internal = 1;
4727
4728 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4729 pars++;
4730 if (data_fake.flags & SCF_SEEN_ACCEPT) {
4731 if ( stopmin > min + min1)
4732 stopmin = min + min1;
4733 flags &= ~SCF_DO_SUBSTR;
4734 if (data)
4735 data->flags |= SCF_SEEN_ACCEPT;
4736 }
4737 if (data) {
4738 if (data_fake.flags & SF_HAS_EVAL)
4739 data->flags |= SF_HAS_EVAL;
4740 data->whilem_c = data_fake.whilem_c;
4741 }
4742 if (flags & SCF_DO_STCLASS)
4743 cl_or(pRExC_state, &accum, &this_class);
4744 }
4745 }
4746 if (flags & SCF_DO_SUBSTR) {
4747 data->pos_min += min1;
4748 data->pos_delta += max1 - min1;
4749 if (max1 != min1 || is_inf)
4750 data->longest = &(data->longest_float);
4751 }
4752 min += min1;
4753 delta += max1 - min1;
4754 if (flags & SCF_DO_STCLASS_OR) {
4755 cl_or(pRExC_state, data->start_class, &accum);
4756 if (min1) {
4757 cl_and(data->start_class, and_withp);
4758 flags &= ~SCF_DO_STCLASS;
4759 }
4760 }
4761 else if (flags & SCF_DO_STCLASS_AND) {
4762 if (min1) {
4763 cl_and(data->start_class, &accum);
4764 flags &= ~SCF_DO_STCLASS;
4765 }
4766 else {
4767 /* Switch to OR mode: cache the old value of
4768 * data->start_class */
4769 INIT_AND_WITHP;
4770 StructCopy(data->start_class, and_withp,
4771 struct regnode_charclass_class);
4772 flags &= ~SCF_DO_STCLASS_AND;
4773 StructCopy(&accum, data->start_class,
4774 struct regnode_charclass_class);
4775 flags |= SCF_DO_STCLASS_OR;
4776 data->start_class->flags |= ANYOF_EOS;
4777 }
4778 }
4779 scan= tail;
4780 continue;
4781 }
4782#else
4783 else if (PL_regkind[OP(scan)] == TRIE) {
4784 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
4785 U8*bang=NULL;
4786
4787 min += trie->minlen;
4788 delta += (trie->maxlen - trie->minlen);
4789 flags &= ~SCF_DO_STCLASS; /* xxx */
4790 if (flags & SCF_DO_SUBSTR) {
4791 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
4792 data->pos_min += trie->minlen;
4793 data->pos_delta += (trie->maxlen - trie->minlen);
4794 if (trie->maxlen != trie->minlen)
4795 data->longest = &(data->longest_float);
4796 }
4797 if (trie->jump) /* no more substrings -- for now /grr*/
4798 flags &= ~SCF_DO_SUBSTR;
4799 }
4800#endif /* old or new */
4801#endif /* TRIE_STUDY_OPT */
4802
4803 /* Else: zero-length, ignore. */
4804 scan = regnext(scan);
4805 }
4806 if (frame) {
4807 last = frame->last;
4808 scan = frame->next;
4809 stopparen = frame->stop;
4810 frame = frame->prev;
4811 goto fake_study_recurse;
4812 }
4813
4814 finish:
4815 assert(!frame);
4816 DEBUG_STUDYDATA("pre-fin:",data,depth);
4817
4818 *scanp = scan;
4819 *deltap = is_inf_internal ? I32_MAX : delta;
4820 if (flags & SCF_DO_SUBSTR && is_inf)
4821 data->pos_delta = I32_MAX - data->pos_min;
4822 if (is_par > (I32)U8_MAX)
4823 is_par = 0;
4824 if (is_par && pars==1 && data) {
4825 data->flags |= SF_IN_PAR;
4826 data->flags &= ~SF_HAS_PAR;
4827 }
4828 else if (pars && data) {
4829 data->flags |= SF_HAS_PAR;
4830 data->flags &= ~SF_IN_PAR;
4831 }
4832 if (flags & SCF_DO_STCLASS_OR)
4833 cl_and(data->start_class, and_withp);
4834 if (flags & SCF_TRIE_RESTUDY)
4835 data->flags |= SCF_TRIE_RESTUDY;
4836
4837 DEBUG_STUDYDATA("post-fin:",data,depth);
4838
4839 return min < stopmin ? min : stopmin;
4840}
4841
4842STATIC U32
4843S_add_data(RExC_state_t *pRExC_state, U32 n, const char *s)
4844{
4845 U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0;
4846
4847 PERL_ARGS_ASSERT_ADD_DATA;
4848
4849 Renewc(RExC_rxi->data,
4850 sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1),
4851 char, struct reg_data);
4852 if(count)
4853 Renew(RExC_rxi->data->what, count + n, U8);
4854 else
4855 Newx(RExC_rxi->data->what, n, U8);
4856 RExC_rxi->data->count = count + n;
4857 Copy(s, RExC_rxi->data->what + count, n, U8);
4858 return count;
4859}
4860
4861/*XXX: todo make this not included in a non debugging perl */
4862#ifndef PERL_IN_XSUB_RE
4863void
4864Perl_reginitcolors(pTHX)
4865{
4866 dVAR;
4867 const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
4868 if (s) {
4869 char *t = savepv(s);
4870 int i = 0;
4871 PL_colors[0] = t;
4872 while (++i < 6) {
4873 t = strchr(t, '\t');
4874 if (t) {
4875 *t = '\0';
4876 PL_colors[i] = ++t;
4877 }
4878 else
4879 PL_colors[i] = t = (char *)"";
4880 }
4881 } else {
4882 int i = 0;
4883 while (i < 6)
4884 PL_colors[i++] = (char *)"";
4885 }
4886 PL_colorset = 1;
4887}
4888#endif
4889
4890
4891#ifdef TRIE_STUDY_OPT
4892#define CHECK_RESTUDY_GOTO \
4893 if ( \
4894 (data.flags & SCF_TRIE_RESTUDY) \
4895 && ! restudied++ \
4896 ) goto reStudy
4897#else
4898#define CHECK_RESTUDY_GOTO
4899#endif
4900
4901/*
4902 * pregcomp - compile a regular expression into internal code
4903 *
4904 * Decides which engine's compiler to call based on the hint currently in
4905 * scope
4906 */
4907
4908#ifndef PERL_IN_XSUB_RE
4909
4910/* return the currently in-scope regex engine (or the default if none) */
4911
4912regexp_engine const *
4913Perl_current_re_engine(pTHX)
4914{
4915 dVAR;
4916
4917 if (IN_PERL_COMPILETIME) {
4918 HV * const table = GvHV(PL_hintgv);
4919 SV **ptr;
4920
4921 if (!table)
4922 return &PL_core_reg_engine;
4923 ptr = hv_fetchs(table, "regcomp", FALSE);
4924 if ( !(ptr && SvIOK(*ptr) && SvIV(*ptr)))
4925 return &PL_core_reg_engine;
4926 return INT2PTR(regexp_engine*,SvIV(*ptr));
4927 }
4928 else {
4929 SV *ptr;
4930 if (!PL_curcop->cop_hints_hash)
4931 return &PL_core_reg_engine;
4932 ptr = cop_hints_fetch_pvs(PL_curcop, "regcomp", 0);
4933 if ( !(ptr && SvIOK(ptr) && SvIV(ptr)))
4934 return &PL_core_reg_engine;
4935 return INT2PTR(regexp_engine*,SvIV(ptr));
4936 }
4937}
4938
4939
4940REGEXP *
4941Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags)
4942{
4943 dVAR;
4944 regexp_engine const *eng = current_re_engine();
4945 GET_RE_DEBUG_FLAGS_DECL;
4946
4947 PERL_ARGS_ASSERT_PREGCOMP;
4948
4949 /* Dispatch a request to compile a regexp to correct regexp engine. */
4950 DEBUG_COMPILE_r({
4951 PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
4952 PTR2UV(eng));
4953 });
4954 return CALLREGCOMP_ENG(eng, pattern, flags);
4955}
4956#endif
4957
4958/* public(ish) wrapper for Perl_re_op_compile that only takes an SV
4959 * pattern rather than a list of OPs */
4960
4961REGEXP *
4962Perl_re_compile(pTHX_ SV * const pattern, U32 rx_flags)
4963{
4964 SV *pat = pattern; /* defeat constness! */
4965 PERL_ARGS_ASSERT_RE_COMPILE;
4966 return Perl_re_op_compile(aTHX_ &pat, 1, NULL, current_re_engine(),
4967 NULL, NULL, rx_flags, 0);
4968}
4969
4970/* see if there are any run-time code blocks in the pattern.
4971 * False positives are allowed */
4972
4973static bool
4974S_has_runtime_code(pTHX_ RExC_state_t * const pRExC_state, OP *expr,
4975 U32 pm_flags, char *pat, STRLEN plen)
4976{
4977 int n = 0;
4978 STRLEN s;
4979
4980 /* avoid infinitely recursing when we recompile the pattern parcelled up
4981 * as qr'...'. A single constant qr// string can't have have any
4982 * run-time component in it, and thus, no runtime code. (A non-qr
4983 * string, however, can, e.g. $x =~ '(?{})') */
4984 if ((pm_flags & PMf_IS_QR) && expr && expr->op_type == OP_CONST)
4985 return 0;
4986
4987 for (s = 0; s < plen; s++) {
4988 if (n < pRExC_state->num_code_blocks
4989 && s == pRExC_state->code_blocks[n].start)
4990 {
4991 s = pRExC_state->code_blocks[n].end;
4992 n++;
4993 continue;
4994 }
4995 /* TODO ideally should handle [..], (#..), /#.../x to reduce false
4996 * positives here */
4997 if (pat[s] == '(' && pat[s+1] == '?' &&
4998 (pat[s+2] == '{' || (pat[s+2] == '?' && pat[s+3] == '{'))
4999 )
5000 return 1;
5001 }
5002 return 0;
5003}
5004
5005/* Handle run-time code blocks. We will already have compiled any direct
5006 * or indirect literal code blocks. Now, take the pattern 'pat' and make a
5007 * copy of it, but with any literal code blocks blanked out and
5008 * appropriate chars escaped; then feed it into
5009 *
5010 * eval "qr'modified_pattern'"
5011 *
5012 * For example,
5013 *
5014 * a\bc(?{"this was literal"})def'ghi\\jkl(?{"this is runtime"})mno
5015 *
5016 * becomes
5017 *
5018 * qr'a\\bc def\'ghi\\\\jkl(?{"this is runtime"})mno'
5019 *
5020 * After eval_sv()-ing that, grab any new code blocks from the returned qr
5021 * and merge them with any code blocks of the original regexp.
5022 *
5023 * If the pat is non-UTF8, while the evalled qr is UTF8, don't merge;
5024 * instead, just save the qr and return FALSE; this tells our caller that
5025 * the original pattern needs upgrading to utf8.
5026 */
5027
5028bool
5029S_compile_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
5030 char *pat, STRLEN plen)
5031{
5032 SV *qr;
5033
5034 GET_RE_DEBUG_FLAGS_DECL;
5035
5036 if (pRExC_state->runtime_code_qr) {
5037 /* this is the second time we've been called; this should
5038 * only happen if the main pattern got upgraded to utf8
5039 * during compilation; re-use the qr we compiled first time
5040 * round (which should be utf8 too)
5041 */
5042 qr = pRExC_state->runtime_code_qr;
5043 pRExC_state->runtime_code_qr = NULL;
5044 assert(RExC_utf8 && SvUTF8(qr));
5045 }
5046 else {
5047 int n = 0;
5048 STRLEN s;
5049 char *p, *newpat;
5050 int newlen = plen + 6; /* allow for "qr''x\0" extra chars */
5051 SV *sv, *qr_ref;
5052 dSP;
5053
5054 /* determine how many extra chars we need for ' and \ escaping */
5055 for (s = 0; s < plen; s++) {
5056 if (pat[s] == '\'' || pat[s] == '\\')
5057 newlen++;
5058 }
5059
5060 Newx(newpat, newlen, char);
5061 p = newpat;
5062 *p++ = 'q'; *p++ = 'r'; *p++ = '\'';
5063
5064 for (s = 0; s < plen; s++) {
5065 if (n < pRExC_state->num_code_blocks
5066 && s == pRExC_state->code_blocks[n].start)
5067 {
5068 /* blank out literal code block */
5069 assert(pat[s] == '(');
5070 while (s <= pRExC_state->code_blocks[n].end) {
5071 *p++ = ' ';
5072 s++;
5073 }
5074 s--;
5075 n++;
5076 continue;
5077 }
5078 if (pat[s] == '\'' || pat[s] == '\\')
5079 *p++ = '\\';
5080 *p++ = pat[s];
5081 }
5082 *p++ = '\'';
5083 if (pRExC_state->pm_flags & RXf_PMf_EXTENDED)
5084 *p++ = 'x';
5085 *p++ = '\0';
5086 DEBUG_COMPILE_r({
5087 PerlIO_printf(Perl_debug_log,
5088 "%sre-parsing pattern for runtime code:%s %s\n",
5089 PL_colors[4],PL_colors[5],newpat);
5090 });
5091
5092 sv = newSVpvn_flags(newpat, p-newpat-1, RExC_utf8 ? SVf_UTF8 : 0);
5093 Safefree(newpat);
5094
5095 ENTER;
5096 SAVETMPS;
5097 save_re_context();
5098 PUSHSTACKi(PERLSI_REQUIRE);
5099 /* this causes the toker to collapse \\ into \ when parsing
5100 * qr''; normally only q'' does this. It also alters hints
5101 * handling */
5102 PL_reg_state.re_reparsing = TRUE;
5103 eval_sv(sv, G_SCALAR);
5104 SvREFCNT_dec(sv);
5105 SPAGAIN;
5106 qr_ref = POPs;
5107 PUTBACK;
5108 if (SvTRUE(ERRSV))
5109 Perl_croak(aTHX_ "%s", SvPVx_nolen_const(ERRSV));
5110 assert(SvROK(qr_ref));
5111 qr = SvRV(qr_ref);
5112 assert(SvTYPE(qr) == SVt_REGEXP && RX_ENGINE((REGEXP*)qr)->op_comp);
5113 /* the leaving below frees the tmp qr_ref.
5114 * Give qr a life of its own */
5115 SvREFCNT_inc(qr);
5116 POPSTACK;
5117 FREETMPS;
5118 LEAVE;
5119
5120 }
5121
5122 if (!RExC_utf8 && SvUTF8(qr)) {
5123 /* first time through; the pattern got upgraded; save the
5124 * qr for the next time through */
5125 assert(!pRExC_state->runtime_code_qr);
5126 pRExC_state->runtime_code_qr = qr;
5127 return 0;
5128 }
5129
5130
5131 /* extract any code blocks within the returned qr// */
5132
5133
5134 /* merge the main (r1) and run-time (r2) code blocks into one */
5135 {
5136 RXi_GET_DECL(((struct regexp*)SvANY(qr)), r2);
5137 struct reg_code_block *new_block, *dst;
5138 RExC_state_t * const r1 = pRExC_state; /* convenient alias */
5139 int i1 = 0, i2 = 0;
5140
5141 if (!r2->num_code_blocks) /* we guessed wrong */
5142 return 1;
5143
5144 Newx(new_block,
5145 r1->num_code_blocks + r2->num_code_blocks,
5146 struct reg_code_block);
5147 dst = new_block;
5148
5149 while ( i1 < r1->num_code_blocks
5150 || i2 < r2->num_code_blocks)
5151 {
5152 struct reg_code_block *src;
5153 bool is_qr = 0;
5154
5155 if (i1 == r1->num_code_blocks) {
5156 src = &r2->code_blocks[i2++];
5157 is_qr = 1;
5158 }
5159 else if (i2 == r2->num_code_blocks)
5160 src = &r1->code_blocks[i1++];
5161 else if ( r1->code_blocks[i1].start
5162 < r2->code_blocks[i2].start)
5163 {
5164 src = &r1->code_blocks[i1++];
5165 assert(src->end < r2->code_blocks[i2].start);
5166 }
5167 else {
5168 assert( r1->code_blocks[i1].start
5169 > r2->code_blocks[i2].start);
5170 src = &r2->code_blocks[i2++];
5171 is_qr = 1;
5172 assert(src->end < r1->code_blocks[i1].start);
5173 }
5174
5175 assert(pat[src->start] == '(');
5176 assert(pat[src->end] == ')');
5177 dst->start = src->start;
5178 dst->end = src->end;
5179 dst->block = src->block;
5180 dst->src_regex = is_qr ? (REGEXP*) SvREFCNT_inc( (SV*) qr)
5181 : src->src_regex;
5182 dst++;
5183 }
5184 r1->num_code_blocks += r2->num_code_blocks;
5185 Safefree(r1->code_blocks);
5186 r1->code_blocks = new_block;
5187 }
5188
5189 SvREFCNT_dec(qr);
5190 return 1;
5191}
5192
5193
5194/*
5195 * Perl_re_op_compile - the perl internal RE engine's function to compile a
5196 * regular expression into internal code.
5197 * The pattern may be passed either as:
5198 * a list of SVs (patternp plus pat_count)
5199 * a list of OPs (expr)
5200 * If both are passed, the SV list is used, but the OP list indicates
5201 * which SVs are actually pre-compiled code blocks
5202 *
5203 * The SVs in the list have magic and qr overloading applied to them (and
5204 * the list may be modified in-place with replacement SVs in the latter
5205 * case).
5206 *
5207 * If the pattern hasn't changed from old_re, then old_re will be
5208 * returned.
5209 *
5210 * eng is the current engine. If that engine has an op_comp method, then
5211 * handle directly (i.e. we assume that op_comp was us); otherwise, just
5212 * do the initial concatenation of arguments and pass on to the external
5213 * engine.
5214 *
5215 * If is_bare_re is not null, set it to a boolean indicating whether the
5216 * arg list reduced (after overloading) to a single bare regex which has
5217 * been returned (i.e. /$qr/).
5218 *
5219 * orig_rx_flags contains RXf_* flags. See perlreapi.pod for more details.
5220 *
5221 * pm_flags contains the PMf_* flags, typically based on those from the
5222 * pm_flags field of the related PMOP. Currently we're only interested in
5223 * PMf_HAS_CV, PMf_IS_QR, PMf_USE_RE_EVAL.
5224 *
5225 * We can't allocate space until we know how big the compiled form will be,
5226 * but we can't compile it (and thus know how big it is) until we've got a
5227 * place to put the code. So we cheat: we compile it twice, once with code
5228 * generation turned off and size counting turned on, and once "for real".
5229 * This also means that we don't allocate space until we are sure that the
5230 * thing really will compile successfully, and we never have to move the
5231 * code and thus invalidate pointers into it. (Note that it has to be in
5232 * one piece because free() must be able to free it all.) [NB: not true in perl]
5233 *
5234 * Beware that the optimization-preparation code in here knows about some
5235 * of the structure of the compiled regexp. [I'll say.]
5236 */
5237
5238REGEXP *
5239Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
5240 OP *expr, const regexp_engine* eng, REGEXP *VOL old_re,
5241 bool *is_bare_re, U32 orig_rx_flags, U32 pm_flags)
5242{
5243 dVAR;
5244 REGEXP *rx;
5245 struct regexp *r;
5246 register regexp_internal *ri;
5247 STRLEN plen;
5248 char * VOL exp;
5249 char* xend;
5250 regnode *scan;
5251 I32 flags;
5252 I32 minlen = 0;
5253 U32 rx_flags;
5254 SV * VOL pat;
5255
5256 /* these are all flags - maybe they should be turned
5257 * into a single int with different bit masks */
5258 I32 sawlookahead = 0;
5259 I32 sawplus = 0;
5260 I32 sawopen = 0;
5261 bool used_setjump = FALSE;
5262 regex_charset initial_charset = get_regex_charset(orig_rx_flags);
5263 bool code_is_utf8 = 0;
5264 bool VOL recompile = 0;
5265 bool runtime_code = 0;
5266 U8 jump_ret = 0;
5267 dJMPENV;
5268 scan_data_t data;
5269 RExC_state_t RExC_state;
5270 RExC_state_t * const pRExC_state = &RExC_state;
5271#ifdef TRIE_STUDY_OPT
5272 int restudied;
5273 RExC_state_t copyRExC_state;
5274#endif
5275 GET_RE_DEBUG_FLAGS_DECL;
5276
5277 PERL_ARGS_ASSERT_RE_OP_COMPILE;
5278
5279 DEBUG_r(if (!PL_colorset) reginitcolors());
5280
5281#ifndef PERL_IN_XSUB_RE
5282 /* Initialize these here instead of as-needed, as is quick and avoids
5283 * having to test them each time otherwise */
5284 if (! PL_AboveLatin1) {
5285 PL_AboveLatin1 = _new_invlist_C_array(AboveLatin1_invlist);
5286 PL_ASCII = _new_invlist_C_array(ASCII_invlist);
5287 PL_Latin1 = _new_invlist_C_array(Latin1_invlist);
5288
5289 PL_L1PosixAlnum = _new_invlist_C_array(L1PosixAlnum_invlist);
5290 PL_PosixAlnum = _new_invlist_C_array(PosixAlnum_invlist);
5291
5292 PL_L1PosixAlpha = _new_invlist_C_array(L1PosixAlpha_invlist);
5293 PL_PosixAlpha = _new_invlist_C_array(PosixAlpha_invlist);
5294
5295 PL_PosixBlank = _new_invlist_C_array(PosixBlank_invlist);
5296 PL_XPosixBlank = _new_invlist_C_array(XPosixBlank_invlist);
5297
5298 PL_L1Cased = _new_invlist_C_array(L1Cased_invlist);
5299
5300 PL_PosixCntrl = _new_invlist_C_array(PosixCntrl_invlist);
5301 PL_XPosixCntrl = _new_invlist_C_array(XPosixCntrl_invlist);
5302
5303 PL_PosixDigit = _new_invlist_C_array(PosixDigit_invlist);
5304
5305 PL_L1PosixGraph = _new_invlist_C_array(L1PosixGraph_invlist);
5306 PL_PosixGraph = _new_invlist_C_array(PosixGraph_invlist);
5307
5308 PL_L1PosixAlnum = _new_invlist_C_array(L1PosixAlnum_invlist);
5309 PL_PosixAlnum = _new_invlist_C_array(PosixAlnum_invlist);
5310
5311 PL_L1PosixLower = _new_invlist_C_array(L1PosixLower_invlist);
5312 PL_PosixLower = _new_invlist_C_array(PosixLower_invlist);
5313
5314 PL_L1PosixPrint = _new_invlist_C_array(L1PosixPrint_invlist);
5315 PL_PosixPrint = _new_invlist_C_array(PosixPrint_invlist);
5316
5317 PL_L1PosixPunct = _new_invlist_C_array(L1PosixPunct_invlist);
5318 PL_PosixPunct = _new_invlist_C_array(PosixPunct_invlist);
5319
5320 PL_PerlSpace = _new_invlist_C_array(PerlSpace_invlist);
5321 PL_XPerlSpace = _new_invlist_C_array(XPerlSpace_invlist);
5322
5323 PL_PosixSpace = _new_invlist_C_array(PosixSpace_invlist);
5324 PL_XPosixSpace = _new_invlist_C_array(XPosixSpace_invlist);
5325
5326 PL_L1PosixUpper = _new_invlist_C_array(L1PosixUpper_invlist);
5327 PL_PosixUpper = _new_invlist_C_array(PosixUpper_invlist);
5328
5329 PL_VertSpace = _new_invlist_C_array(VertSpace_invlist);
5330
5331 PL_PosixWord = _new_invlist_C_array(PosixWord_invlist);
5332 PL_L1PosixWord = _new_invlist_C_array(L1PosixWord_invlist);
5333
5334 PL_PosixXDigit = _new_invlist_C_array(PosixXDigit_invlist);
5335 PL_XPosixXDigit = _new_invlist_C_array(XPosixXDigit_invlist);
5336 }
5337#endif
5338
5339 pRExC_state->code_blocks = NULL;
5340 pRExC_state->num_code_blocks = 0;
5341
5342 if (is_bare_re)
5343 *is_bare_re = FALSE;
5344
5345 if (expr && (expr->op_type == OP_LIST ||
5346 (expr->op_type == OP_NULL && expr->op_targ == OP_LIST))) {
5347
5348 /* is the source UTF8, and how many code blocks are there? */
5349 OP *o;
5350 int ncode = 0;
5351
5352 for (o = cLISTOPx(expr)->op_first; o; o = o->op_sibling) {
5353 if (o->op_type == OP_CONST && SvUTF8(cSVOPo_sv))
5354 code_is_utf8 = 1;
5355 else if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL))
5356 /* count of DO blocks */
5357 ncode++;
5358 }
5359 if (ncode) {
5360 pRExC_state->num_code_blocks = ncode;
5361 Newx(pRExC_state->code_blocks, ncode, struct reg_code_block);
5362 }
5363 }
5364
5365 if (pat_count) {
5366 /* handle a list of SVs */
5367
5368 SV **svp;
5369
5370 /* apply magic and RE overloading to each arg */
5371 for (svp = patternp; svp < patternp + pat_count; svp++) {
5372 SV *rx = *svp;
5373 SvGETMAGIC(rx);
5374 if (SvROK(rx) && SvAMAGIC(rx)) {
5375 SV *sv = AMG_CALLunary(rx, regexp_amg);
5376 if (sv) {
5377 if (SvROK(sv))
5378 sv = SvRV(sv);
5379 if (SvTYPE(sv) != SVt_REGEXP)
5380 Perl_croak(aTHX_ "Overloaded qr did not return a REGEXP");
5381 *svp = sv;
5382 }
5383 }
5384 }
5385
5386 if (pat_count > 1) {
5387 /* concat multiple args and find any code block indexes */
5388
5389 OP *o = NULL;
5390 int n = 0;
5391 bool utf8 = 0;
5392 STRLEN orig_patlen = 0;
5393
5394 if (pRExC_state->num_code_blocks) {
5395 o = cLISTOPx(expr)->op_first;
5396 assert(o->op_type == OP_PUSHMARK);
5397 o = o->op_sibling;
5398 }
5399
5400 pat = newSVpvn("", 0);
5401 SAVEFREESV(pat);
5402
5403 /* determine if the pattern is going to be utf8 (needed
5404 * in advance to align code block indices correctly).
5405 * XXX This could fail to be detected for an arg with
5406 * overloading but not concat overloading; but the main effect
5407 * in this obscure case is to need a 'use re eval' for a
5408 * literal code block */
5409 for (svp = patternp; svp < patternp + pat_count; svp++) {
5410 if (SvUTF8(*svp))
5411 utf8 = 1;
5412 }
5413 if (utf8)
5414 SvUTF8_on(pat);
5415
5416 for (svp = patternp; svp < patternp + pat_count; svp++) {
5417 SV *sv, *msv = *svp;
5418 SV *rx;
5419 bool code = 0;
5420 if (o) {
5421 if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL)) {
5422 assert(n < pRExC_state->num_code_blocks);
5423 pRExC_state->code_blocks[n].start = SvCUR(pat);
5424 pRExC_state->code_blocks[n].block = o;
5425 pRExC_state->code_blocks[n].src_regex = NULL;
5426 n++;
5427 code = 1;
5428 o = o->op_sibling; /* skip CONST */
5429 assert(o);
5430 }
5431 o = o->op_sibling;;
5432 }
5433
5434 if ((SvAMAGIC(pat) || SvAMAGIC(msv)) &&
5435 (sv = amagic_call(pat, msv, concat_amg, AMGf_assign)))
5436 {
5437 sv_setsv(pat, sv);
5438 /* overloading involved: all bets are off over literal
5439 * code. Pretend we haven't seen it */
5440 pRExC_state->num_code_blocks -= n;
5441 n = 0;
5442 rx = NULL;
5443
5444 }
5445 else {
5446 while (SvAMAGIC(msv)
5447 && (sv = AMG_CALLunary(msv, string_amg))
5448 && sv != msv)
5449 {
5450 msv = sv;
5451 SvGETMAGIC(msv);
5452 }
5453 if (SvROK(msv) && SvTYPE(SvRV(msv)) == SVt_REGEXP)
5454 msv = SvRV(msv);
5455 orig_patlen = SvCUR(pat);
5456 sv_catsv_nomg(pat, msv);
5457 rx = msv;
5458 if (code)
5459 pRExC_state->code_blocks[n-1].end = SvCUR(pat)-1;
5460 }
5461
5462 /* extract any code blocks within any embedded qr//'s */
5463 if (rx && SvTYPE(rx) == SVt_REGEXP
5464 && RX_ENGINE((REGEXP*)rx)->op_comp)
5465 {
5466
5467 RXi_GET_DECL(((struct regexp*)SvANY(rx)), ri);
5468 if (ri->num_code_blocks) {
5469 int i;
5470 /* the presence of an embedded qr// with code means
5471 * we should always recompile: the text of the
5472 * qr// may not have changed, but it may be a
5473 * different closure than last time */
5474 recompile = 1;
5475 Renew(pRExC_state->code_blocks,
5476 pRExC_state->num_code_blocks + ri->num_code_blocks,
5477 struct reg_code_block);
5478 pRExC_state->num_code_blocks += ri->num_code_blocks;
5479 for (i=0; i < ri->num_code_blocks; i++) {
5480 struct reg_code_block *src, *dst;
5481 STRLEN offset = orig_patlen
5482 + ((struct regexp *)SvANY(rx))->pre_prefix;
5483 assert(n < pRExC_state->num_code_blocks);
5484 src = &ri->code_blocks[i];
5485 dst = &pRExC_state->code_blocks[n];
5486 dst->start = src->start + offset;
5487 dst->end = src->end + offset;
5488 dst->block = src->block;
5489 dst->src_regex = (REGEXP*) SvREFCNT_inc( (SV*)
5490 src->src_regex
5491 ? src->src_regex
5492 : (REGEXP*)rx);
5493 n++;
5494 }
5495 }
5496 }
5497 }
5498 SvSETMAGIC(pat);
5499 }
5500 else {
5501 SV *sv;
5502 pat = *patternp;
5503 while (SvAMAGIC(pat)
5504 && (sv = AMG_CALLunary(pat, string_amg))
5505 && sv != pat)
5506 {
5507 pat = sv;
5508 SvGETMAGIC(pat);
5509 }
5510 }
5511
5512 /* handle bare regex: foo =~ $re */
5513 {
5514 SV *re = pat;
5515 if (SvROK(re))
5516 re = SvRV(re);
5517 if (SvTYPE(re) == SVt_REGEXP) {
5518 if (is_bare_re)
5519 *is_bare_re = TRUE;
5520 SvREFCNT_inc(re);
5521 Safefree(pRExC_state->code_blocks);
5522 return (REGEXP*)re;
5523 }
5524 }
5525 }
5526 else {
5527 /* not a list of SVs, so must be a list of OPs */
5528 assert(expr);
5529 if (expr->op_type == OP_LIST) {
5530 int i = -1;
5531 bool is_code = 0;
5532 OP *o;
5533
5534 pat = newSVpvn("", 0);
5535 SAVEFREESV(pat);
5536 if (code_is_utf8)
5537 SvUTF8_on(pat);
5538
5539 /* given a list of CONSTs and DO blocks in expr, append all
5540 * the CONSTs to pat, and record the start and end of each
5541 * code block in code_blocks[] (each DO{} op is followed by an
5542 * OP_CONST containing the corresponding literal '(?{...})
5543 * text)
5544 */
5545 for (o = cLISTOPx(expr)->op_first; o; o = o->op_sibling) {
5546 if (o->op_type == OP_CONST) {
5547 sv_catsv(pat, cSVOPo_sv);
5548 if (is_code) {
5549 pRExC_state->code_blocks[i].end = SvCUR(pat)-1;
5550 is_code = 0;
5551 }
5552 }
5553 else if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL)) {
5554 assert(i+1 < pRExC_state->num_code_blocks);
5555 pRExC_state->code_blocks[++i].start = SvCUR(pat);
5556 pRExC_state->code_blocks[i].block = o;
5557 pRExC_state->code_blocks[i].src_regex = NULL;
5558 is_code = 1;
5559 }
5560 }
5561 }
5562 else {
5563 assert(expr->op_type == OP_CONST);
5564 pat = cSVOPx_sv(expr);
5565 }
5566 }
5567
5568 exp = SvPV_nomg(pat, plen);
5569
5570 if (!eng->op_comp) {
5571 if ((SvUTF8(pat) && IN_BYTES)
5572 || SvGMAGICAL(pat) || SvAMAGIC(pat))
5573 {
5574 /* make a temporary copy; either to convert to bytes,
5575 * or to avoid repeating get-magic / overloaded stringify */
5576 pat = newSVpvn_flags(exp, plen, SVs_TEMP |
5577 (IN_BYTES ? 0 : SvUTF8(pat)));
5578 }
5579 Safefree(pRExC_state->code_blocks);
5580 return CALLREGCOMP_ENG(eng, pat, orig_rx_flags);
5581 }
5582
5583 /* ignore the utf8ness if the pattern is 0 length */
5584 RExC_utf8 = RExC_orig_utf8 = (plen == 0 || IN_BYTES) ? 0 : SvUTF8(pat);
5585 RExC_uni_semantics = 0;
5586 RExC_contains_locale = 0;
5587 pRExC_state->runtime_code_qr = NULL;
5588
5589 /****************** LONG JUMP TARGET HERE***********************/
5590 /* Longjmp back to here if have to switch in midstream to utf8 */
5591 if (! RExC_orig_utf8) {
5592 JMPENV_PUSH(jump_ret);
5593 used_setjump = TRUE;
5594 }
5595
5596 if (jump_ret == 0) { /* First time through */
5597 xend = exp + plen;
5598
5599 DEBUG_COMPILE_r({
5600 SV *dsv= sv_newmortal();
5601 RE_PV_QUOTED_DECL(s, RExC_utf8,
5602 dsv, exp, plen, 60);
5603 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
5604 PL_colors[4],PL_colors[5],s);
5605 });
5606 }
5607 else { /* longjumped back */
5608 U8 *src, *dst;
5609 int n=0;
5610 STRLEN s = 0, d = 0;
5611 bool do_end = 0;
5612
5613 /* If the cause for the longjmp was other than changing to utf8, pop
5614 * our own setjmp, and longjmp to the correct handler */
5615 if (jump_ret != UTF8_LONGJMP) {
5616 JMPENV_POP;
5617 JMPENV_JUMP(jump_ret);
5618 }
5619
5620 GET_RE_DEBUG_FLAGS;
5621
5622 /* It's possible to write a regexp in ascii that represents Unicode
5623 codepoints outside of the byte range, such as via \x{100}. If we
5624 detect such a sequence we have to convert the entire pattern to utf8
5625 and then recompile, as our sizing calculation will have been based
5626 on 1 byte == 1 character, but we will need to use utf8 to encode
5627 at least some part of the pattern, and therefore must convert the whole
5628 thing.
5629 -- dmq */
5630 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
5631 "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
5632
5633 /* upgrade pattern to UTF8, and if there are code blocks,
5634 * recalculate the indices.
5635 * This is essentially an unrolled Perl_bytes_to_utf8() */
5636
5637 src = (U8*)SvPV_nomg(pat, plen);
5638 Newx(dst, plen * 2 + 1, U8);
5639
5640 while (s < plen) {
5641 const UV uv = NATIVE_TO_ASCII(src[s]);
5642 if (UNI_IS_INVARIANT(uv))
5643 dst[d] = (U8)UTF_TO_NATIVE(uv);
5644 else {
5645 dst[d++] = (U8)UTF8_EIGHT_BIT_HI(uv);
5646 dst[d] = (U8)UTF8_EIGHT_BIT_LO(uv);
5647 }
5648 if (n < pRExC_state->num_code_blocks) {
5649 if (!do_end && pRExC_state->code_blocks[n].start == s) {
5650 pRExC_state->code_blocks[n].start = d;
5651 assert(dst[d] == '(');
5652 do_end = 1;
5653 }
5654 else if (do_end && pRExC_state->code_blocks[n].end == s) {
5655 pRExC_state->code_blocks[n].end = d;
5656 assert(dst[d] == ')');
5657 do_end = 0;
5658 n++;
5659 }
5660 }
5661 s++;
5662 d++;
5663 }
5664 dst[d] = '\0';
5665 plen = d;
5666 exp = (char*) dst;
5667 xend = exp + plen;
5668 SAVEFREEPV(exp);
5669 RExC_orig_utf8 = RExC_utf8 = 1;
5670 }
5671
5672 /* return old regex if pattern hasn't changed */
5673
5674 if ( old_re
5675 && !recompile
5676 && !!RX_UTF8(old_re) == !!RExC_utf8
5677 && RX_PRECOMP(old_re)
5678 && RX_PRELEN(old_re) == plen
5679 && memEQ(RX_PRECOMP(old_re), exp, plen))
5680 {
5681 /* with runtime code, always recompile */
5682 runtime_code = S_has_runtime_code(aTHX_ pRExC_state, expr, pm_flags,
5683 exp, plen);
5684 if (!runtime_code) {
5685 ReREFCNT_inc(old_re);
5686 if (used_setjump) {
5687 JMPENV_POP;
5688 }
5689 Safefree(pRExC_state->code_blocks);
5690 return old_re;
5691 }
5692 }
5693 else if ((pm_flags & PMf_USE_RE_EVAL)
5694 /* this second condition covers the non-regex literal case,
5695 * i.e. $foo =~ '(?{})'. */
5696 || ( !PL_reg_state.re_reparsing && IN_PERL_COMPILETIME
5697 && (PL_hints & HINT_RE_EVAL))
5698 )
5699 runtime_code = S_has_runtime_code(aTHX_ pRExC_state, expr, pm_flags,
5700 exp, plen);
5701
5702#ifdef TRIE_STUDY_OPT
5703 restudied = 0;
5704#endif
5705
5706 rx_flags = orig_rx_flags;
5707
5708 if (initial_charset == REGEX_LOCALE_CHARSET) {
5709 RExC_contains_locale = 1;
5710 }
5711 else if (RExC_utf8 && initial_charset == REGEX_DEPENDS_CHARSET) {
5712
5713 /* Set to use unicode semantics if the pattern is in utf8 and has the
5714 * 'depends' charset specified, as it means unicode when utf8 */
5715 set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
5716 }
5717
5718 RExC_precomp = exp;
5719 RExC_flags = rx_flags;
5720 RExC_pm_flags = pm_flags;
5721
5722 if (runtime_code) {
5723 if (PL_tainting && PL_tainted)
5724 Perl_croak(aTHX_ "Eval-group in insecure regular expression");
5725
5726 if (!S_compile_runtime_code(aTHX_ pRExC_state, exp, plen)) {
5727 /* whoops, we have a non-utf8 pattern, whilst run-time code
5728 * got compiled as utf8. Try again with a utf8 pattern */
5729 JMPENV_JUMP(UTF8_LONGJMP);
5730 }
5731 }
5732 assert(!pRExC_state->runtime_code_qr);
5733
5734 RExC_sawback = 0;
5735
5736 RExC_seen = 0;
5737 RExC_in_lookbehind = 0;
5738 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
5739 RExC_extralen = 0;
5740 RExC_override_recoding = 0;
5741
5742 /* First pass: determine size, legality. */
5743 RExC_parse = exp;
5744 RExC_start = exp;
5745 RExC_end = xend;
5746 RExC_naughty = 0;
5747 RExC_npar = 1;
5748 RExC_nestroot = 0;
5749 RExC_size = 0L;
5750 RExC_emit = &PL_regdummy;
5751 RExC_whilem_seen = 0;
5752 RExC_open_parens = NULL;
5753 RExC_close_parens = NULL;
5754 RExC_opend = NULL;
5755 RExC_paren_names = NULL;
5756#ifdef DEBUGGING
5757 RExC_paren_name_list = NULL;
5758#endif
5759 RExC_recurse = NULL;
5760 RExC_recurse_count = 0;
5761 pRExC_state->code_index = 0;
5762
5763#if 0 /* REGC() is (currently) a NOP at the first pass.
5764 * Clever compilers notice this and complain. --jhi */
5765 REGC((U8)REG_MAGIC, (char*)RExC_emit);
5766#endif
5767 DEBUG_PARSE_r(
5768 PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n");
5769 RExC_lastnum=0;
5770 RExC_lastparse=NULL;
5771 );
5772 if (reg(pRExC_state, 0, &flags,1) == NULL) {
5773 RExC_precomp = NULL;
5774 Safefree(pRExC_state->code_blocks);
5775 return(NULL);
5776 }
5777
5778 /* Here, finished first pass. Get rid of any added setjmp */
5779 if (used_setjump) {
5780 JMPENV_POP;
5781 }
5782
5783 DEBUG_PARSE_r({
5784 PerlIO_printf(Perl_debug_log,
5785 "Required size %"IVdf" nodes\n"
5786 "Starting second pass (creation)\n",
5787 (IV)RExC_size);
5788 RExC_lastnum=0;
5789 RExC_lastparse=NULL;
5790 });
5791
5792 /* The first pass could have found things that force Unicode semantics */
5793 if ((RExC_utf8 || RExC_uni_semantics)
5794 && get_regex_charset(rx_flags) == REGEX_DEPENDS_CHARSET)
5795 {
5796 set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
5797 }
5798
5799 /* Small enough for pointer-storage convention?
5800 If extralen==0, this means that we will not need long jumps. */
5801 if (RExC_size >= 0x10000L && RExC_extralen)
5802 RExC_size += RExC_extralen;
5803 else
5804 RExC_extralen = 0;
5805 if (RExC_whilem_seen > 15)
5806 RExC_whilem_seen = 15;
5807
5808 /* Allocate space and zero-initialize. Note, the two step process
5809 of zeroing when in debug mode, thus anything assigned has to
5810 happen after that */
5811 rx = (REGEXP*) newSV_type(SVt_REGEXP);
5812 r = (struct regexp*)SvANY(rx);
5813 Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
5814 char, regexp_internal);
5815 if ( r == NULL || ri == NULL )
5816 FAIL("Regexp out of space");
5817#ifdef DEBUGGING
5818 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
5819 Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode), char);
5820#else
5821 /* bulk initialize base fields with 0. */
5822 Zero(ri, sizeof(regexp_internal), char);
5823#endif
5824
5825 /* non-zero initialization begins here */
5826 RXi_SET( r, ri );
5827 r->engine= eng;
5828 r->extflags = rx_flags;
5829 if (pm_flags & PMf_IS_QR) {
5830 ri->code_blocks = pRExC_state->code_blocks;
5831 ri->num_code_blocks = pRExC_state->num_code_blocks;
5832 }
5833 else
5834 SAVEFREEPV(pRExC_state->code_blocks);
5835
5836 {
5837 bool has_p = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
5838 bool has_charset = (get_regex_charset(r->extflags) != REGEX_DEPENDS_CHARSET);
5839
5840 /* The caret is output if there are any defaults: if not all the STD
5841 * flags are set, or if no character set specifier is needed */
5842 bool has_default =
5843 (((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD)
5844 || ! has_charset);
5845 bool has_runon = ((RExC_seen & REG_SEEN_RUN_ON_COMMENT)==REG_SEEN_RUN_ON_COMMENT);
5846 U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD)
5847 >> RXf_PMf_STD_PMMOD_SHIFT);
5848 const char *fptr = STD_PAT_MODS; /*"msix"*/
5849 char *p;
5850 /* Allocate for the worst case, which is all the std flags are turned
5851 * on. If more precision is desired, we could do a population count of
5852 * the flags set. This could be done with a small lookup table, or by
5853 * shifting, masking and adding, or even, when available, assembly
5854 * language for a machine-language population count.
5855 * We never output a minus, as all those are defaults, so are
5856 * covered by the caret */
5857 const STRLEN wraplen = plen + has_p + has_runon
5858 + has_default /* If needs a caret */
5859
5860 /* If needs a character set specifier */
5861 + ((has_charset) ? MAX_CHARSET_NAME_LENGTH : 0)
5862 + (sizeof(STD_PAT_MODS) - 1)
5863 + (sizeof("(?:)") - 1);
5864
5865 p = sv_grow(MUTABLE_SV(rx), wraplen + 1); /* +1 for the ending NUL */
5866 SvPOK_on(rx);
5867 if (RExC_utf8)
5868 SvFLAGS(rx) |= SVf_UTF8;
5869 *p++='('; *p++='?';
5870
5871 /* If a default, cover it using the caret */
5872 if (has_default) {
5873 *p++= DEFAULT_PAT_MOD;
5874 }
5875 if (has_charset) {
5876 STRLEN len;
5877 const char* const name = get_regex_charset_name(r->extflags, &len);
5878 Copy(name, p, len, char);
5879 p += len;
5880 }
5881 if (has_p)
5882 *p++ = KEEPCOPY_PAT_MOD; /*'p'*/
5883 {
5884 char ch;
5885 while((ch = *fptr++)) {
5886 if(reganch & 1)
5887 *p++ = ch;
5888 reganch >>= 1;
5889 }
5890 }
5891
5892 *p++ = ':';
5893 Copy(RExC_precomp, p, plen, char);
5894 assert ((RX_WRAPPED(rx) - p) < 16);
5895 r->pre_prefix = p - RX_WRAPPED(rx);
5896 p += plen;
5897 if (has_runon)
5898 *p++ = '\n';
5899 *p++ = ')';
5900 *p = 0;
5901 SvCUR_set(rx, p - SvPVX_const(rx));
5902 }
5903
5904 r->intflags = 0;
5905 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
5906
5907 if (RExC_seen & REG_SEEN_RECURSE) {
5908 Newxz(RExC_open_parens, RExC_npar,regnode *);
5909 SAVEFREEPV(RExC_open_parens);
5910 Newxz(RExC_close_parens,RExC_npar,regnode *);
5911 SAVEFREEPV(RExC_close_parens);
5912 }
5913
5914 /* Useful during FAIL. */
5915#ifdef RE_TRACK_PATTERN_OFFSETS
5916 Newxz(ri->u.offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
5917 DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
5918 "%s %"UVuf" bytes for offset annotations.\n",
5919 ri->u.offsets ? "Got" : "Couldn't get",
5920 (UV)((2*RExC_size+1) * sizeof(U32))));
5921#endif
5922 SetProgLen(ri,RExC_size);
5923 RExC_rx_sv = rx;
5924 RExC_rx = r;
5925 RExC_rxi = ri;
5926
5927 /* Second pass: emit code. */
5928 RExC_flags = rx_flags; /* don't let top level (?i) bleed */
5929 RExC_pm_flags = pm_flags;
5930 RExC_parse = exp;
5931 RExC_end = xend;
5932 RExC_naughty = 0;
5933 RExC_npar = 1;
5934 RExC_emit_start = ri->program;
5935 RExC_emit = ri->program;
5936 RExC_emit_bound = ri->program + RExC_size + 1;
5937 pRExC_state->code_index = 0;
5938
5939 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
5940 if (reg(pRExC_state, 0, &flags,1) == NULL) {
5941 ReREFCNT_dec(rx);
5942 return(NULL);
5943 }
5944 /* XXXX To minimize changes to RE engine we always allocate
5945 3-units-long substrs field. */
5946 Newx(r->substrs, 1, struct reg_substr_data);
5947 if (RExC_recurse_count) {
5948 Newxz(RExC_recurse,RExC_recurse_count,regnode *);
5949 SAVEFREEPV(RExC_recurse);
5950 }
5951
5952reStudy:
5953 r->minlen = minlen = sawlookahead = sawplus = sawopen = 0;
5954 Zero(r->substrs, 1, struct reg_substr_data);
5955
5956#ifdef TRIE_STUDY_OPT
5957 if (!restudied) {
5958 StructCopy(&zero_scan_data, &data, scan_data_t);
5959 copyRExC_state = RExC_state;
5960 } else {
5961 U32 seen=RExC_seen;
5962 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
5963
5964 RExC_state = copyRExC_state;
5965 if (seen & REG_TOP_LEVEL_BRANCHES)
5966 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
5967 else
5968 RExC_seen &= ~REG_TOP_LEVEL_BRANCHES;
5969 if (data.last_found) {
5970 SvREFCNT_dec(data.longest_fixed);
5971 SvREFCNT_dec(data.longest_float);
5972 SvREFCNT_dec(data.last_found);
5973 }
5974 StructCopy(&zero_scan_data, &data, scan_data_t);
5975 }
5976#else
5977 StructCopy(&zero_scan_data, &data, scan_data_t);
5978#endif
5979
5980 /* Dig out information for optimizations. */
5981 r->extflags = RExC_flags; /* was pm_op */
5982 /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */
5983
5984 if (UTF)
5985 SvUTF8_on(rx); /* Unicode in it? */
5986 ri->regstclass = NULL;
5987 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
5988 r->intflags |= PREGf_NAUGHTY;
5989 scan = ri->program + 1; /* First BRANCH. */
5990
5991 /* testing for BRANCH here tells us whether there is "must appear"
5992 data in the pattern. If there is then we can use it for optimisations */
5993 if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES)) { /* Only one top-level choice. */
5994 I32 fake;
5995 STRLEN longest_float_length, longest_fixed_length;
5996 struct regnode_charclass_class ch_class; /* pointed to by data */
5997 int stclass_flag;
5998 I32 last_close = 0; /* pointed to by data */
5999 regnode *first= scan;
6000 regnode *first_next= regnext(first);
6001 /*
6002 * Skip introductions and multiplicators >= 1
6003 * so that we can extract the 'meat' of the pattern that must
6004 * match in the large if() sequence following.
6005 * NOTE that EXACT is NOT covered here, as it is normally
6006 * picked up by the optimiser separately.
6007 *
6008 * This is unfortunate as the optimiser isnt handling lookahead
6009 * properly currently.
6010 *
6011 */
6012 while ((OP(first) == OPEN && (sawopen = 1)) ||
6013 /* An OR of *one* alternative - should not happen now. */
6014 (OP(first) == BRANCH && OP(first_next) != BRANCH) ||
6015 /* for now we can't handle lookbehind IFMATCH*/
6016 (OP(first) == IFMATCH && !first->flags && (sawlookahead = 1)) ||
6017 (OP(first) == PLUS) ||
6018 (OP(first) == MINMOD) ||
6019 /* An {n,m} with n>0 */
6020 (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) ||
6021 (OP(first) == NOTHING && PL_regkind[OP(first_next)] != END ))
6022 {
6023 /*
6024 * the only op that could be a regnode is PLUS, all the rest
6025 * will be regnode_1 or regnode_2.
6026 *
6027 */
6028 if (OP(first) == PLUS)
6029 sawplus = 1;
6030 else
6031 first += regarglen[OP(first)];
6032
6033 first = NEXTOPER(first);
6034 first_next= regnext(first);
6035 }
6036
6037 /* Starting-point info. */
6038 again:
6039 DEBUG_PEEP("first:",first,0);
6040 /* Ignore EXACT as we deal with it later. */
6041 if (PL_regkind[OP(first)] == EXACT) {
6042 if (OP(first) == EXACT)
6043 NOOP; /* Empty, get anchored substr later. */
6044 else
6045 ri->regstclass = first;
6046 }
6047#ifdef TRIE_STCLASS
6048 else if (PL_regkind[OP(first)] == TRIE &&
6049 ((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0)
6050 {
6051 regnode *trie_op;
6052 /* this can happen only on restudy */
6053 if ( OP(first) == TRIE ) {
6054 struct regnode_1 *trieop = (struct regnode_1 *)
6055 PerlMemShared_calloc(1, sizeof(struct regnode_1));
6056 StructCopy(first,trieop,struct regnode_1);
6057 trie_op=(regnode *)trieop;
6058 } else {
6059 struct regnode_charclass *trieop = (struct regnode_charclass *)
6060 PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
6061 StructCopy(first,trieop,struct regnode_charclass);
6062 trie_op=(regnode *)trieop;
6063 }
6064 OP(trie_op)+=2;
6065 make_trie_failtable(pRExC_state, (regnode *)first, trie_op, 0);
6066 ri->regstclass = trie_op;
6067 }
6068#endif
6069 else if (REGNODE_SIMPLE(OP(first)))
6070 ri->regstclass = first;
6071 else if (PL_regkind[OP(first)] == BOUND ||
6072 PL_regkind[OP(first)] == NBOUND)
6073 ri->regstclass = first;
6074 else if (PL_regkind[OP(first)] == BOL) {
6075 r->extflags |= (OP(first) == MBOL
6076 ? RXf_ANCH_MBOL
6077 : (OP(first) == SBOL
6078 ? RXf_ANCH_SBOL
6079 : RXf_ANCH_BOL));
6080 first = NEXTOPER(first);
6081 goto again;
6082 }
6083 else if (OP(first) == GPOS) {
6084 r->extflags |= RXf_ANCH_GPOS;
6085 first = NEXTOPER(first);
6086 goto again;
6087 }
6088 else if ((!sawopen || !RExC_sawback) &&
6089 (OP(first) == STAR &&
6090 PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
6091 !(r->extflags & RXf_ANCH) && !pRExC_state->num_code_blocks)
6092 {
6093 /* turn .* into ^.* with an implied $*=1 */
6094 const int type =
6095 (OP(NEXTOPER(first)) == REG_ANY)
6096 ? RXf_ANCH_MBOL
6097 : RXf_ANCH_SBOL;
6098 r->extflags |= type;
6099 r->intflags |= PREGf_IMPLICIT;
6100 first = NEXTOPER(first);
6101 goto again;
6102 }
6103 if (sawplus && !sawlookahead && (!sawopen || !RExC_sawback)
6104 && !pRExC_state->num_code_blocks) /* May examine pos and $& */
6105 /* x+ must match at the 1st pos of run of x's */
6106 r->intflags |= PREGf_SKIP;
6107
6108 /* Scan is after the zeroth branch, first is atomic matcher. */
6109#ifdef TRIE_STUDY_OPT
6110 DEBUG_PARSE_r(
6111 if (!restudied)
6112 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
6113 (IV)(first - scan + 1))
6114 );
6115#else
6116 DEBUG_PARSE_r(
6117 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
6118 (IV)(first - scan + 1))
6119 );
6120#endif
6121
6122
6123 /*
6124 * If there's something expensive in the r.e., find the
6125 * longest literal string that must appear and make it the
6126 * regmust. Resolve ties in favor of later strings, since
6127 * the regstart check works with the beginning of the r.e.
6128 * and avoiding duplication strengthens checking. Not a
6129 * strong reason, but sufficient in the absence of others.
6130 * [Now we resolve ties in favor of the earlier string if
6131 * it happens that c_offset_min has been invalidated, since the
6132 * earlier string may buy us something the later one won't.]
6133 */
6134
6135 data.longest_fixed = newSVpvs("");
6136 data.longest_float = newSVpvs("");
6137 data.last_found = newSVpvs("");
6138 data.longest = &(data.longest_fixed);
6139 first = scan;
6140 if (!ri->regstclass) {
6141 cl_init(pRExC_state, &ch_class);
6142 data.start_class = &ch_class;
6143 stclass_flag = SCF_DO_STCLASS_AND;
6144 } else /* XXXX Check for BOUND? */
6145 stclass_flag = 0;
6146 data.last_closep = &last_close;
6147
6148 minlen = study_chunk(pRExC_state, &first, &minlen, &fake, scan + RExC_size, /* Up to end */
6149 &data, -1, NULL, NULL,
6150 SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
6151
6152
6153 CHECK_RESTUDY_GOTO;
6154
6155
6156 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
6157 && data.last_start_min == 0 && data.last_end > 0
6158 && !RExC_seen_zerolen
6159 && !(RExC_seen & REG_SEEN_VERBARG)
6160 && (!(RExC_seen & REG_SEEN_GPOS) || (r->extflags & RXf_ANCH_GPOS)))
6161 r->extflags |= RXf_CHECK_ALL;
6162 scan_commit(pRExC_state, &data,&minlen,0);
6163 SvREFCNT_dec(data.last_found);
6164
6165 /* Note that code very similar to this but for anchored string
6166 follows immediately below, changes may need to be made to both.
6167 Be careful.
6168 */
6169 longest_float_length = CHR_SVLEN(data.longest_float);
6170 if (longest_float_length
6171 || (data.flags & SF_FL_BEFORE_EOL
6172 && (!(data.flags & SF_FL_BEFORE_MEOL)
6173 || (RExC_flags & RXf_PMf_MULTILINE))))
6174 {
6175 I32 t,ml;
6176
6177 /* See comments for join_exact for why REG_SEEN_EXACTF_SHARP_S */
6178 if ((RExC_seen & REG_SEEN_EXACTF_SHARP_S)
6179 || (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
6180 && data.offset_fixed == data.offset_float_min
6181 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float)))
6182 goto remove_float; /* As in (a)+. */
6183
6184 /* copy the information about the longest float from the reg_scan_data
6185 over to the program. */
6186 if (SvUTF8(data.longest_float)) {
6187 r->float_utf8 = data.longest_float;
6188 r->float_substr = NULL;
6189 } else {
6190 r->float_substr = data.longest_float;
6191 r->float_utf8 = NULL;
6192 }
6193 /* float_end_shift is how many chars that must be matched that
6194 follow this item. We calculate it ahead of time as once the
6195 lookbehind offset is added in we lose the ability to correctly
6196 calculate it.*/
6197 ml = data.minlen_float ? *(data.minlen_float)
6198 : (I32)longest_float_length;
6199 r->float_end_shift = ml - data.offset_float_min
6200 - longest_float_length + (SvTAIL(data.longest_float) != 0)
6201 + data.lookbehind_float;
6202 r->float_min_offset = data.offset_float_min - data.lookbehind_float;
6203 r->float_max_offset = data.offset_float_max;
6204 if (data.offset_float_max < I32_MAX) /* Don't offset infinity */
6205 r->float_max_offset -= data.lookbehind_float;
6206
6207 t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
6208 && (!(data.flags & SF_FL_BEFORE_MEOL)
6209 || (RExC_flags & RXf_PMf_MULTILINE)));
6210 fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
6211 }
6212 else {
6213 remove_float:
6214 r->float_substr = r->float_utf8 = NULL;
6215 SvREFCNT_dec(data.longest_float);
6216 longest_float_length = 0;
6217 }
6218
6219 /* Note that code very similar to this but for floating string
6220 is immediately above, changes may need to be made to both.
6221 Be careful.
6222 */
6223 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
6224
6225 /* See comments for join_exact for why REG_SEEN_EXACTF_SHARP_S */
6226 if (! (RExC_seen & REG_SEEN_EXACTF_SHARP_S)
6227 && (longest_fixed_length
6228 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
6229 && (!(data.flags & SF_FIX_BEFORE_MEOL)
6230 || (RExC_flags & RXf_PMf_MULTILINE)))) )
6231 {
6232 I32 t,ml;
6233
6234 /* copy the information about the longest fixed
6235 from the reg_scan_data over to the program. */
6236 if (SvUTF8(data.longest_fixed)) {
6237 r->anchored_utf8 = data.longest_fixed;
6238 r->anchored_substr = NULL;
6239 } else {
6240 r->anchored_substr = data.longest_fixed;
6241 r->anchored_utf8 = NULL;
6242 }
6243 /* fixed_end_shift is how many chars that must be matched that
6244 follow this item. We calculate it ahead of time as once the
6245 lookbehind offset is added in we lose the ability to correctly
6246 calculate it.*/
6247 ml = data.minlen_fixed ? *(data.minlen_fixed)
6248 : (I32)longest_fixed_length;
6249 r->anchored_end_shift = ml - data.offset_fixed
6250 - longest_fixed_length + (SvTAIL(data.longest_fixed) != 0)
6251 + data.lookbehind_fixed;
6252 r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
6253
6254 t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
6255 && (!(data.flags & SF_FIX_BEFORE_MEOL)
6256 || (RExC_flags & RXf_PMf_MULTILINE)));
6257 fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
6258 }
6259 else {
6260 r->anchored_substr = r->anchored_utf8 = NULL;
6261 SvREFCNT_dec(data.longest_fixed);
6262 longest_fixed_length = 0;
6263 }
6264 if (ri->regstclass
6265 && (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY))
6266 ri->regstclass = NULL;
6267
6268 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
6269 && stclass_flag
6270 && !(data.start_class->flags & ANYOF_EOS)
6271 && !cl_is_anything(data.start_class))
6272 {
6273 const U32 n = add_data(pRExC_state, 1, "f");
6274 data.start_class->flags |= ANYOF_IS_SYNTHETIC;
6275
6276 Newx(RExC_rxi->data->data[n], 1,
6277 struct regnode_charclass_class);
6278 StructCopy(data.start_class,
6279 (struct regnode_charclass_class*)RExC_rxi->data->data[n],
6280 struct regnode_charclass_class);
6281 ri->regstclass = (regnode*)RExC_rxi->data->data[n];
6282 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
6283 DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
6284 regprop(r, sv, (regnode*)data.start_class);
6285 PerlIO_printf(Perl_debug_log,
6286 "synthetic stclass \"%s\".\n",
6287 SvPVX_const(sv));});
6288 }
6289
6290 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
6291 if (longest_fixed_length > longest_float_length) {
6292 r->check_end_shift = r->anchored_end_shift;
6293 r->check_substr = r->anchored_substr;
6294 r->check_utf8 = r->anchored_utf8;
6295 r->check_offset_min = r->check_offset_max = r->anchored_offset;
6296 if (r->extflags & RXf_ANCH_SINGLE)
6297 r->extflags |= RXf_NOSCAN;
6298 }
6299 else {
6300 r->check_end_shift = r->float_end_shift;
6301 r->check_substr = r->float_substr;
6302 r->check_utf8 = r->float_utf8;
6303 r->check_offset_min = r->float_min_offset;
6304 r->check_offset_max = r->float_max_offset;
6305 }
6306 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
6307 This should be changed ASAP! */
6308 if ((r->check_substr || r->check_utf8) && !(r->extflags & RXf_ANCH_GPOS)) {
6309 r->extflags |= RXf_USE_INTUIT;
6310 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
6311 r->extflags |= RXf_INTUIT_TAIL;
6312 }
6313 /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
6314 if ( (STRLEN)minlen < longest_float_length )
6315 minlen= longest_float_length;
6316 if ( (STRLEN)minlen < longest_fixed_length )
6317 minlen= longest_fixed_length;
6318 */
6319 }
6320 else {
6321 /* Several toplevels. Best we can is to set minlen. */
6322 I32 fake;
6323 struct regnode_charclass_class ch_class;
6324 I32 last_close = 0;
6325
6326 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n"));
6327
6328 scan = ri->program + 1;
6329 cl_init(pRExC_state, &ch_class);
6330 data.start_class = &ch_class;
6331 data.last_closep = &last_close;
6332
6333
6334 minlen = study_chunk(pRExC_state, &scan, &minlen, &fake, scan + RExC_size,
6335 &data, -1, NULL, NULL, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
6336
6337 CHECK_RESTUDY_GOTO;
6338
6339 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
6340 = r->float_substr = r->float_utf8 = NULL;
6341
6342 if (!(data.start_class->flags & ANYOF_EOS)
6343 && !cl_is_anything(data.start_class))
6344 {
6345 const U32 n = add_data(pRExC_state, 1, "f");
6346 data.start_class->flags |= ANYOF_IS_SYNTHETIC;
6347
6348 Newx(RExC_rxi->data->data[n], 1,
6349 struct regnode_charclass_class);
6350 StructCopy(data.start_class,
6351 (struct regnode_charclass_class*)RExC_rxi->data->data[n],
6352 struct regnode_charclass_class);
6353 ri->regstclass = (regnode*)RExC_rxi->data->data[n];
6354 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
6355 DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
6356 regprop(r, sv, (regnode*)data.start_class);
6357 PerlIO_printf(Perl_debug_log,
6358 "synthetic stclass \"%s\".\n",
6359 SvPVX_const(sv));});
6360 }
6361 }
6362
6363 /* Guard against an embedded (?=) or (?<=) with a longer minlen than
6364 the "real" pattern. */
6365 DEBUG_OPTIMISE_r({
6366 PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf"\n",
6367 (IV)minlen, (IV)r->minlen);
6368 });
6369 r->minlenret = minlen;
6370 if (r->minlen < minlen)
6371 r->minlen = minlen;
6372
6373 if (RExC_seen & REG_SEEN_GPOS)
6374 r->extflags |= RXf_GPOS_SEEN;
6375 if (RExC_seen & REG_SEEN_LOOKBEHIND)
6376 r->extflags |= RXf_LOOKBEHIND_SEEN;
6377 if (pRExC_state->num_code_blocks)
6378 r->extflags |= RXf_EVAL_SEEN;
6379 if (RExC_seen & REG_SEEN_CANY)
6380 r->extflags |= RXf_CANY_SEEN;
6381 if (RExC_seen & REG_SEEN_VERBARG)
6382 r->intflags |= PREGf_VERBARG_SEEN;
6383 if (RExC_seen & REG_SEEN_CUTGROUP)
6384 r->intflags |= PREGf_CUTGROUP_SEEN;
6385 if (pm_flags & PMf_USE_RE_EVAL)
6386 r->intflags |= PREGf_USE_RE_EVAL;
6387 if (RExC_paren_names)
6388 RXp_PAREN_NAMES(r) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names));
6389 else
6390 RXp_PAREN_NAMES(r) = NULL;
6391
6392#ifdef STUPID_PATTERN_CHECKS
6393 if (RX_PRELEN(rx) == 0)
6394 r->extflags |= RXf_NULL;
6395 if (r->extflags & RXf_SPLIT && RX_PRELEN(rx) == 1 && RX_PRECOMP(rx)[0] == ' ')
6396 /* XXX: this should happen BEFORE we compile */
6397 r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
6398 else if (RX_PRELEN(rx) == 3 && memEQ("\\s+", RX_PRECOMP(rx), 3))
6399 r->extflags |= RXf_WHITE;
6400 else if (RX_PRELEN(rx) == 1 && RXp_PRECOMP(rx)[0] == '^')
6401 r->extflags |= RXf_START_ONLY;
6402#else
6403 if (r->extflags & RXf_SPLIT && RX_PRELEN(rx) == 1 && RX_PRECOMP(rx)[0] == ' ')
6404 /* XXX: this should happen BEFORE we compile */
6405 r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
6406 else {
6407 regnode *first = ri->program + 1;
6408 U8 fop = OP(first);
6409
6410 if (PL_regkind[fop] == NOTHING && OP(NEXTOPER(first)) == END)
6411 r->extflags |= RXf_NULL;
6412 else if (PL_regkind[fop] == BOL && OP(NEXTOPER(first)) == END)
6413 r->extflags |= RXf_START_ONLY;
6414 else if (fop == PLUS && OP(NEXTOPER(first)) == SPACE
6415 && OP(regnext(first)) == END)
6416 r->extflags |= RXf_WHITE;
6417 }
6418#endif
6419#ifdef DEBUGGING
6420 if (RExC_paren_names) {
6421 ri->name_list_idx = add_data( pRExC_state, 1, "a" );
6422 ri->data->data[ri->name_list_idx] = (void*)SvREFCNT_inc(RExC_paren_name_list);
6423 } else
6424#endif
6425 ri->name_list_idx = 0;
6426
6427 if (RExC_recurse_count) {
6428 for ( ; RExC_recurse_count ; RExC_recurse_count-- ) {
6429 const regnode *scan = RExC_recurse[RExC_recurse_count-1];
6430 ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan );
6431 }
6432 }
6433 Newxz(r->offs, RExC_npar, regexp_paren_pair);
6434 /* assume we don't need to swap parens around before we match */
6435
6436 DEBUG_DUMP_r({
6437 PerlIO_printf(Perl_debug_log,"Final program:\n");
6438 regdump(r);
6439 });
6440#ifdef RE_TRACK_PATTERN_OFFSETS
6441 DEBUG_OFFSETS_r(if (ri->u.offsets) {
6442 const U32 len = ri->u.offsets[0];
6443 U32 i;
6444 GET_RE_DEBUG_FLAGS_DECL;
6445 PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)ri->u.offsets[0]);
6446 for (i = 1; i <= len; i++) {
6447 if (ri->u.offsets[i*2-1] || ri->u.offsets[i*2])
6448 PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
6449 (UV)i, (UV)ri->u.offsets[i*2-1], (UV)ri->u.offsets[i*2]);
6450 }
6451 PerlIO_printf(Perl_debug_log, "\n");
6452 });
6453#endif
6454 return rx;
6455}
6456
6457
6458SV*
6459Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value,
6460 const U32 flags)
6461{
6462 PERL_ARGS_ASSERT_REG_NAMED_BUFF;
6463
6464 PERL_UNUSED_ARG(value);
6465
6466 if (flags & RXapif_FETCH) {
6467 return reg_named_buff_fetch(rx, key, flags);
6468 } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) {
6469 Perl_croak_no_modify(aTHX);
6470 return NULL;
6471 } else if (flags & RXapif_EXISTS) {
6472 return reg_named_buff_exists(rx, key, flags)
6473 ? &PL_sv_yes
6474 : &PL_sv_no;
6475 } else if (flags & RXapif_REGNAMES) {
6476 return reg_named_buff_all(rx, flags);
6477 } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) {
6478 return reg_named_buff_scalar(rx, flags);
6479 } else {
6480 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags);
6481 return NULL;
6482 }
6483}
6484
6485SV*
6486Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey,
6487 const U32 flags)
6488{
6489 PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER;
6490 PERL_UNUSED_ARG(lastkey);
6491
6492 if (flags & RXapif_FIRSTKEY)
6493 return reg_named_buff_firstkey(rx, flags);
6494 else if (flags & RXapif_NEXTKEY)
6495 return reg_named_buff_nextkey(rx, flags);
6496 else {
6497 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter", (int)flags);
6498 return NULL;
6499 }
6500}
6501
6502SV*
6503Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv,
6504 const U32 flags)
6505{
6506 AV *retarray = NULL;
6507 SV *ret;
6508 struct regexp *const rx = (struct regexp *)SvANY(r);
6509
6510 PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH;
6511
6512 if (flags & RXapif_ALL)
6513 retarray=newAV();
6514
6515 if (rx && RXp_PAREN_NAMES(rx)) {
6516 HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 );
6517 if (he_str) {
6518 IV i;
6519 SV* sv_dat=HeVAL(he_str);
6520 I32 *nums=(I32*)SvPVX(sv_dat);
6521 for ( i=0; i<SvIVX(sv_dat); i++ ) {
6522 if ((I32)(rx->nparens) >= nums[i]
6523 && rx->offs[nums[i]].start != -1
6524 && rx->offs[nums[i]].end != -1)
6525 {
6526 ret = newSVpvs("");
6527 CALLREG_NUMBUF_FETCH(r,nums[i],ret);
6528 if (!retarray)
6529 return ret;
6530 } else {
6531 if (retarray)
6532 ret = newSVsv(&PL_sv_undef);
6533 }
6534 if (retarray)
6535 av_push(retarray, ret);
6536 }
6537 if (retarray)
6538 return newRV_noinc(MUTABLE_SV(retarray));
6539 }
6540 }
6541 return NULL;
6542}
6543
6544bool
6545Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key,
6546 const U32 flags)
6547{
6548 struct regexp *const rx = (struct regexp *)SvANY(r);
6549
6550 PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS;
6551
6552 if (rx && RXp_PAREN_NAMES(rx)) {
6553 if (flags & RXapif_ALL) {
6554 return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0);
6555 } else {
6556 SV *sv = CALLREG_NAMED_BUFF_FETCH(r, key, flags);
6557 if (sv) {
6558 SvREFCNT_dec(sv);
6559 return TRUE;
6560 } else {
6561 return FALSE;
6562 }
6563 }
6564 } else {
6565 return FALSE;
6566 }
6567}
6568
6569SV*
6570Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags)
6571{
6572 struct regexp *const rx = (struct regexp *)SvANY(r);
6573
6574 PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY;
6575
6576 if ( rx && RXp_PAREN_NAMES(rx) ) {
6577 (void)hv_iterinit(RXp_PAREN_NAMES(rx));
6578
6579 return CALLREG_NAMED_BUFF_NEXTKEY(r, NULL, flags & ~RXapif_FIRSTKEY);
6580 } else {
6581 return FALSE;
6582 }
6583}
6584
6585SV*
6586Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags)
6587{
6588 struct regexp *const rx = (struct regexp *)SvANY(r);
6589 GET_RE_DEBUG_FLAGS_DECL;
6590
6591 PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY;
6592
6593 if (rx && RXp_PAREN_NAMES(rx)) {
6594 HV *hv = RXp_PAREN_NAMES(rx);
6595 HE *temphe;
6596 while ( (temphe = hv_iternext_flags(hv,0)) ) {
6597 IV i;
6598 IV parno = 0;
6599 SV* sv_dat = HeVAL(temphe);
6600 I32 *nums = (I32*)SvPVX(sv_dat);
6601 for ( i = 0; i < SvIVX(sv_dat); i++ ) {
6602 if ((I32)(rx->lastparen) >= nums[i] &&
6603 rx->offs[nums[i]].start != -1 &&
6604 rx->offs[nums[i]].end != -1)
6605 {
6606 parno = nums[i];
6607 break;
6608 }
6609 }
6610 if (parno || flags & RXapif_ALL) {
6611 return newSVhek(HeKEY_hek(temphe));
6612 }
6613 }
6614 }
6615 return NULL;
6616}
6617
6618SV*
6619Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags)
6620{
6621 SV *ret;
6622 AV *av;
6623 I32 length;
6624 struct regexp *const rx = (struct regexp *)SvANY(r);
6625
6626 PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR;
6627
6628 if (rx && RXp_PAREN_NAMES(rx)) {
6629 if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) {
6630 return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx)));
6631 } else if (flags & RXapif_ONE) {
6632 ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES));
6633 av = MUTABLE_AV(SvRV(ret));
6634 length = av_len(av);
6635 SvREFCNT_dec(ret);
6636 return newSViv(length + 1);
6637 } else {
6638 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar", (int)flags);
6639 return NULL;
6640 }
6641 }
6642 return &PL_sv_undef;
6643}
6644
6645SV*
6646Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags)
6647{
6648 struct regexp *const rx = (struct regexp *)SvANY(r);
6649 AV *av = newAV();
6650
6651 PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL;
6652
6653 if (rx && RXp_PAREN_NAMES(rx)) {
6654 HV *hv= RXp_PAREN_NAMES(rx);
6655 HE *temphe;
6656 (void)hv_iterinit(hv);
6657 while ( (temphe = hv_iternext_flags(hv,0)) ) {
6658 IV i;
6659 IV parno = 0;
6660 SV* sv_dat = HeVAL(temphe);
6661 I32 *nums = (I32*)SvPVX(sv_dat);
6662 for ( i = 0; i < SvIVX(sv_dat); i++ ) {
6663 if ((I32)(rx->lastparen) >= nums[i] &&
6664 rx->offs[nums[i]].start != -1 &&
6665 rx->offs[nums[i]].end != -1)
6666 {
6667 parno = nums[i];
6668 break;
6669 }
6670 }
6671 if (parno || flags & RXapif_ALL) {
6672 av_push(av, newSVhek(HeKEY_hek(temphe)));
6673 }
6674 }
6675 }
6676
6677 return newRV_noinc(MUTABLE_SV(av));
6678}
6679
6680void
6681Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren,
6682 SV * const sv)
6683{
6684 struct regexp *const rx = (struct regexp *)SvANY(r);
6685 char *s = NULL;
6686 I32 i = 0;
6687 I32 s1, t1;
6688
6689 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH;
6690
6691 if (!rx->subbeg) {
6692 sv_setsv(sv,&PL_sv_undef);
6693 return;
6694 }
6695 else
6696 if (paren == RX_BUFF_IDX_PREMATCH && rx->offs[0].start != -1) {
6697 /* $` */
6698 i = rx->offs[0].start;
6699 s = rx->subbeg;
6700 }
6701 else
6702 if (paren == RX_BUFF_IDX_POSTMATCH && rx->offs[0].end != -1) {
6703 /* $' */
6704 s = rx->subbeg + rx->offs[0].end;
6705 i = rx->sublen - rx->offs[0].end;
6706 }
6707 else
6708 if ( 0 <= paren && paren <= (I32)rx->nparens &&
6709 (s1 = rx->offs[paren].start) != -1 &&
6710 (t1 = rx->offs[paren].end) != -1)
6711 {
6712 /* $& $1 ... */
6713 i = t1 - s1;
6714 s = rx->subbeg + s1;
6715 } else {
6716 sv_setsv(sv,&PL_sv_undef);
6717 return;
6718 }
6719 assert(rx->sublen >= (s - rx->subbeg) + i );
6720 if (i >= 0) {
6721 const int oldtainted = PL_tainted;
6722 TAINT_NOT;
6723 sv_setpvn(sv, s, i);
6724 PL_tainted = oldtainted;
6725 if ( (rx->extflags & RXf_CANY_SEEN)
6726 ? (RXp_MATCH_UTF8(rx)
6727 && (!i || is_utf8_string((U8*)s, i)))
6728 : (RXp_MATCH_UTF8(rx)) )
6729 {
6730 SvUTF8_on(sv);
6731 }
6732 else
6733 SvUTF8_off(sv);
6734 if (PL_tainting) {
6735 if (RXp_MATCH_TAINTED(rx)) {
6736 if (SvTYPE(sv) >= SVt_PVMG) {
6737 MAGIC* const mg = SvMAGIC(sv);
6738 MAGIC* mgt;
6739 PL_tainted = 1;
6740 SvMAGIC_set(sv, mg->mg_moremagic);
6741 SvTAINT(sv);
6742 if ((mgt = SvMAGIC(sv))) {
6743 mg->mg_moremagic = mgt;
6744 SvMAGIC_set(sv, mg);
6745 }
6746 } else {
6747 PL_tainted = 1;
6748 SvTAINT(sv);
6749 }
6750 } else
6751 SvTAINTED_off(sv);
6752 }
6753 } else {
6754 sv_setsv(sv,&PL_sv_undef);
6755 return;
6756 }
6757}
6758
6759void
6760Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren,
6761 SV const * const value)
6762{
6763 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE;
6764
6765 PERL_UNUSED_ARG(rx);
6766 PERL_UNUSED_ARG(paren);
6767 PERL_UNUSED_ARG(value);
6768
6769 if (!PL_localizing)
6770 Perl_croak_no_modify(aTHX);
6771}
6772
6773I32
6774Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv,
6775 const I32 paren)
6776{
6777 struct regexp *const rx = (struct regexp *)SvANY(r);
6778 I32 i;
6779 I32 s1, t1;
6780
6781 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH;
6782
6783 /* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */
6784 switch (paren) {
6785 /* $` / ${^PREMATCH} */
6786 case RX_BUFF_IDX_PREMATCH:
6787 if (rx->offs[0].start != -1) {
6788 i = rx->offs[0].start;
6789 if (i > 0) {
6790 s1 = 0;
6791 t1 = i;
6792 goto getlen;
6793 }
6794 }
6795 return 0;
6796 /* $' / ${^POSTMATCH} */
6797 case RX_BUFF_IDX_POSTMATCH:
6798 if (rx->offs[0].end != -1) {
6799 i = rx->sublen - rx->offs[0].end;
6800 if (i > 0) {
6801 s1 = rx->offs[0].end;
6802 t1 = rx->sublen;
6803 goto getlen;
6804 }
6805 }
6806 return 0;
6807 /* $& / ${^MATCH}, $1, $2, ... */
6808 default:
6809 if (paren <= (I32)rx->nparens &&
6810 (s1 = rx->offs[paren].start) != -1 &&
6811 (t1 = rx->offs[paren].end) != -1)
6812 {
6813 i = t1 - s1;
6814 goto getlen;
6815 } else {
6816 if (ckWARN(WARN_UNINITIALIZED))
6817 report_uninit((const SV *)sv);
6818 return 0;
6819 }
6820 }
6821 getlen:
6822 if (i > 0 && RXp_MATCH_UTF8(rx)) {
6823 const char * const s = rx->subbeg + s1;
6824 const U8 *ep;
6825 STRLEN el;
6826
6827 i = t1 - s1;
6828 if (is_utf8_string_loclen((U8*)s, i, &ep, &el))
6829 i = el;
6830 }
6831 return i;
6832}
6833
6834SV*
6835Perl_reg_qr_package(pTHX_ REGEXP * const rx)
6836{
6837 PERL_ARGS_ASSERT_REG_QR_PACKAGE;
6838 PERL_UNUSED_ARG(rx);
6839 if (0)
6840 return NULL;
6841 else
6842 return newSVpvs("Regexp");
6843}
6844
6845/* Scans the name of a named buffer from the pattern.
6846 * If flags is REG_RSN_RETURN_NULL returns null.
6847 * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
6848 * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
6849 * to the parsed name as looked up in the RExC_paren_names hash.
6850 * If there is an error throws a vFAIL().. type exception.
6851 */
6852
6853#define REG_RSN_RETURN_NULL 0
6854#define REG_RSN_RETURN_NAME 1
6855#define REG_RSN_RETURN_DATA 2
6856
6857STATIC SV*
6858S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
6859{
6860 char *name_start = RExC_parse;
6861
6862 PERL_ARGS_ASSERT_REG_SCAN_NAME;
6863
6864 if (isIDFIRST_lazy_if(RExC_parse, UTF)) {
6865 /* skip IDFIRST by using do...while */
6866 if (UTF)
6867 do {
6868 RExC_parse += UTF8SKIP(RExC_parse);
6869 } while (isALNUM_utf8((U8*)RExC_parse));
6870 else
6871 do {
6872 RExC_parse++;
6873 } while (isALNUM(*RExC_parse));
6874 } else {
6875 RExC_parse++; /* so the <- from the vFAIL is after the offending character */
6876 vFAIL("Group name must start with a non-digit word character");
6877 }
6878 if ( flags ) {
6879 SV* sv_name
6880 = newSVpvn_flags(name_start, (int)(RExC_parse - name_start),
6881 SVs_TEMP | (UTF ? SVf_UTF8 : 0));
6882 if ( flags == REG_RSN_RETURN_NAME)
6883 return sv_name;
6884 else if (flags==REG_RSN_RETURN_DATA) {
6885 HE *he_str = NULL;
6886 SV *sv_dat = NULL;
6887 if ( ! sv_name ) /* should not happen*/
6888 Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
6889 if (RExC_paren_names)
6890 he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
6891 if ( he_str )
6892 sv_dat = HeVAL(he_str);
6893 if ( ! sv_dat )
6894 vFAIL("Reference to nonexistent named group");
6895 return sv_dat;
6896 }
6897 else {
6898 Perl_croak(aTHX_ "panic: bad flag %lx in reg_scan_name",
6899 (unsigned long) flags);
6900 }
6901 assert(0); /* NOT REACHED */
6902 }
6903 return NULL;
6904}
6905
6906#define DEBUG_PARSE_MSG(funcname) DEBUG_PARSE_r({ \
6907 int rem=(int)(RExC_end - RExC_parse); \
6908 int cut; \
6909 int num; \
6910 int iscut=0; \
6911 if (rem>10) { \
6912 rem=10; \
6913 iscut=1; \
6914 } \
6915 cut=10-rem; \
6916 if (RExC_lastparse!=RExC_parse) \
6917 PerlIO_printf(Perl_debug_log," >%.*s%-*s", \
6918 rem, RExC_parse, \
6919 cut + 4, \
6920 iscut ? "..." : "<" \
6921 ); \
6922 else \
6923 PerlIO_printf(Perl_debug_log,"%16s",""); \
6924 \
6925 if (SIZE_ONLY) \
6926 num = RExC_size + 1; \
6927 else \
6928 num=REG_NODE_NUM(RExC_emit); \
6929 if (RExC_lastnum!=num) \
6930 PerlIO_printf(Perl_debug_log,"|%4d",num); \
6931 else \
6932 PerlIO_printf(Perl_debug_log,"|%4s",""); \
6933 PerlIO_printf(Perl_debug_log,"|%*s%-4s", \
6934 (int)((depth*2)), "", \
6935 (funcname) \
6936 ); \
6937 RExC_lastnum=num; \
6938 RExC_lastparse=RExC_parse; \
6939})
6940
6941
6942
6943#define DEBUG_PARSE(funcname) DEBUG_PARSE_r({ \
6944 DEBUG_PARSE_MSG((funcname)); \
6945 PerlIO_printf(Perl_debug_log,"%4s","\n"); \
6946})
6947#define DEBUG_PARSE_FMT(funcname,fmt,args) DEBUG_PARSE_r({ \
6948 DEBUG_PARSE_MSG((funcname)); \
6949 PerlIO_printf(Perl_debug_log,fmt "\n",args); \
6950})
6951
6952/* This section of code defines the inversion list object and its methods. The
6953 * interfaces are highly subject to change, so as much as possible is static to
6954 * this file. An inversion list is here implemented as a malloc'd C UV array
6955 * with some added info that is placed as UVs at the beginning in a header
6956 * portion. An inversion list for Unicode is an array of code points, sorted
6957 * by ordinal number. The zeroth element is the first code point in the list.
6958 * The 1th element is the first element beyond that not in the list. In other
6959 * words, the first range is
6960 * invlist[0]..(invlist[1]-1)
6961 * The other ranges follow. Thus every element whose index is divisible by two
6962 * marks the beginning of a range that is in the list, and every element not
6963 * divisible by two marks the beginning of a range not in the list. A single
6964 * element inversion list that contains the single code point N generally
6965 * consists of two elements
6966 * invlist[0] == N
6967 * invlist[1] == N+1
6968 * (The exception is when N is the highest representable value on the
6969 * machine, in which case the list containing just it would be a single
6970 * element, itself. By extension, if the last range in the list extends to
6971 * infinity, then the first element of that range will be in the inversion list
6972 * at a position that is divisible by two, and is the final element in the
6973 * list.)
6974 * Taking the complement (inverting) an inversion list is quite simple, if the
6975 * first element is 0, remove it; otherwise add a 0 element at the beginning.
6976 * This implementation reserves an element at the beginning of each inversion list
6977 * to contain 0 when the list contains 0, and contains 1 otherwise. The actual
6978 * beginning of the list is either that element if 0, or the next one if 1.
6979 *
6980 * More about inversion lists can be found in "Unicode Demystified"
6981 * Chapter 13 by Richard Gillam, published by Addison-Wesley.
6982 * More will be coming when functionality is added later.
6983 *
6984 * The inversion list data structure is currently implemented as an SV pointing
6985 * to an array of UVs that the SV thinks are bytes. This allows us to have an
6986 * array of UV whose memory management is automatically handled by the existing
6987 * facilities for SV's.
6988 *
6989 * Some of the methods should always be private to the implementation, and some
6990 * should eventually be made public */
6991
6992#define INVLIST_LEN_OFFSET 0 /* Number of elements in the inversion list */
6993#define INVLIST_ITER_OFFSET 1 /* Current iteration position */
6994
6995/* This is a combination of a version and data structure type, so that one
6996 * being passed in can be validated to be an inversion list of the correct
6997 * vintage. When the structure of the header is changed, a new random number
6998 * in the range 2**31-1 should be generated and the new() method changed to
6999 * insert that at this location. Then, if an auxiliary program doesn't change
7000 * correspondingly, it will be discovered immediately */
7001#define INVLIST_VERSION_ID_OFFSET 2
7002#define INVLIST_VERSION_ID 1064334010
7003
7004/* For safety, when adding new elements, remember to #undef them at the end of
7005 * the inversion list code section */
7006
7007#define INVLIST_ZERO_OFFSET 3 /* 0 or 1; must be last element in header */
7008/* The UV at position ZERO contains either 0 or 1. If 0, the inversion list
7009 * contains the code point U+00000, and begins here. If 1, the inversion list
7010 * doesn't contain U+0000, and it begins at the next UV in the array.
7011 * Inverting an inversion list consists of adding or removing the 0 at the
7012 * beginning of it. By reserving a space for that 0, inversion can be made
7013 * very fast */
7014
7015#define HEADER_LENGTH (INVLIST_ZERO_OFFSET + 1)
7016
7017/* Internally things are UVs */
7018#define TO_INTERNAL_SIZE(x) ((x + HEADER_LENGTH) * sizeof(UV))
7019#define FROM_INTERNAL_SIZE(x) ((x / sizeof(UV)) - HEADER_LENGTH)
7020
7021#define INVLIST_INITIAL_LEN 10
7022
7023PERL_STATIC_INLINE UV*
7024S__invlist_array_init(pTHX_ SV* const invlist, const bool will_have_0)
7025{
7026 /* Returns a pointer to the first element in the inversion list's array.
7027 * This is called upon initialization of an inversion list. Where the
7028 * array begins depends on whether the list has the code point U+0000
7029 * in it or not. The other parameter tells it whether the code that
7030 * follows this call is about to put a 0 in the inversion list or not.
7031 * The first element is either the element with 0, if 0, or the next one,
7032 * if 1 */
7033
7034 UV* zero = get_invlist_zero_addr(invlist);
7035
7036 PERL_ARGS_ASSERT__INVLIST_ARRAY_INIT;
7037
7038 /* Must be empty */
7039 assert(! *get_invlist_len_addr(invlist));
7040
7041 /* 1^1 = 0; 1^0 = 1 */
7042 *zero = 1 ^ will_have_0;
7043 return zero + *zero;
7044}
7045
7046PERL_STATIC_INLINE UV*
7047S_invlist_array(pTHX_ SV* const invlist)
7048{
7049 /* Returns the pointer to the inversion list's array. Every time the
7050 * length changes, this needs to be called in case malloc or realloc moved
7051 * it */
7052
7053 PERL_ARGS_ASSERT_INVLIST_ARRAY;
7054
7055 /* Must not be empty. If these fail, you probably didn't check for <len>
7056 * being non-zero before trying to get the array */
7057 assert(*get_invlist_len_addr(invlist));
7058 assert(*get_invlist_zero_addr(invlist) == 0
7059 || *get_invlist_zero_addr(invlist) == 1);
7060
7061 /* The array begins either at the element reserved for zero if the
7062 * list contains 0 (that element will be set to 0), or otherwise the next
7063 * element (in which case the reserved element will be set to 1). */
7064 return (UV *) (get_invlist_zero_addr(invlist)
7065 + *get_invlist_zero_addr(invlist));
7066}
7067
7068PERL_STATIC_INLINE UV*
7069S_get_invlist_len_addr(pTHX_ SV* invlist)
7070{
7071 /* Return the address of the UV that contains the current number
7072 * of used elements in the inversion list */
7073
7074 PERL_ARGS_ASSERT_GET_INVLIST_LEN_ADDR;
7075
7076 return (UV *) (SvPVX(invlist) + (INVLIST_LEN_OFFSET * sizeof (UV)));
7077}
7078
7079PERL_STATIC_INLINE UV
7080S_invlist_len(pTHX_ SV* const invlist)
7081{
7082 /* Returns the current number of elements stored in the inversion list's
7083 * array */
7084
7085 PERL_ARGS_ASSERT_INVLIST_LEN;
7086
7087 return *get_invlist_len_addr(invlist);
7088}
7089
7090PERL_STATIC_INLINE void
7091S_invlist_set_len(pTHX_ SV* const invlist, const UV len)
7092{
7093 /* Sets the current number of elements stored in the inversion list */
7094
7095 PERL_ARGS_ASSERT_INVLIST_SET_LEN;
7096
7097 *get_invlist_len_addr(invlist) = len;
7098
7099 assert(len <= SvLEN(invlist));
7100
7101 SvCUR_set(invlist, TO_INTERNAL_SIZE(len));
7102 /* If the list contains U+0000, that element is part of the header,
7103 * and should not be counted as part of the array. It will contain
7104 * 0 in that case, and 1 otherwise. So we could flop 0=>1, 1=>0 and
7105 * subtract:
7106 * SvCUR_set(invlist,
7107 * TO_INTERNAL_SIZE(len
7108 * - (*get_invlist_zero_addr(inv_list) ^ 1)));
7109 * But, this is only valid if len is not 0. The consequences of not doing
7110 * this is that the memory allocation code may think that 1 more UV is
7111 * being used than actually is, and so might do an unnecessary grow. That
7112 * seems worth not bothering to make this the precise amount.
7113 *
7114 * Note that when inverting, SvCUR shouldn't change */
7115}
7116
7117PERL_STATIC_INLINE UV
7118S_invlist_max(pTHX_ SV* const invlist)
7119{
7120 /* Returns the maximum number of elements storable in the inversion list's
7121 * array, without having to realloc() */
7122
7123 PERL_ARGS_ASSERT_INVLIST_MAX;
7124
7125 return FROM_INTERNAL_SIZE(SvLEN(invlist));
7126}
7127
7128PERL_STATIC_INLINE UV*
7129S_get_invlist_zero_addr(pTHX_ SV* invlist)
7130{
7131 /* Return the address of the UV that is reserved to hold 0 if the inversion
7132 * list contains 0. This has to be the last element of the heading, as the
7133 * list proper starts with either it if 0, or the next element if not.
7134 * (But we force it to contain either 0 or 1) */
7135
7136 PERL_ARGS_ASSERT_GET_INVLIST_ZERO_ADDR;
7137
7138 return (UV *) (SvPVX(invlist) + (INVLIST_ZERO_OFFSET * sizeof (UV)));
7139}
7140
7141#ifndef PERL_IN_XSUB_RE
7142SV*
7143Perl__new_invlist(pTHX_ IV initial_size)
7144{
7145
7146 /* Return a pointer to a newly constructed inversion list, with enough
7147 * space to store 'initial_size' elements. If that number is negative, a
7148 * system default is used instead */
7149
7150 SV* new_list;
7151
7152 if (initial_size < 0) {
7153 initial_size = INVLIST_INITIAL_LEN;
7154 }
7155
7156 /* Allocate the initial space */
7157 new_list = newSV(TO_INTERNAL_SIZE(initial_size));
7158 invlist_set_len(new_list, 0);
7159
7160 /* Force iterinit() to be used to get iteration to work */
7161 *get_invlist_iter_addr(new_list) = UV_MAX;
7162
7163 /* This should force a segfault if a method doesn't initialize this
7164 * properly */
7165 *get_invlist_zero_addr(new_list) = UV_MAX;
7166
7167 *get_invlist_version_id_addr(new_list) = INVLIST_VERSION_ID;
7168#if HEADER_LENGTH != 4
7169# error Need to regenerate VERSION_ID by running perl -E 'say int(rand 2**31-1)', and then changing the #if to the new length
7170#endif
7171
7172 return new_list;
7173}
7174#endif
7175
7176STATIC SV*
7177S__new_invlist_C_array(pTHX_ UV* list)
7178{
7179 /* Return a pointer to a newly constructed inversion list, initialized to
7180 * point to <list>, which has to be in the exact correct inversion list
7181 * form, including internal fields. Thus this is a dangerous routine that
7182 * should not be used in the wrong hands */
7183
7184 SV* invlist = newSV_type(SVt_PV);
7185
7186 PERL_ARGS_ASSERT__NEW_INVLIST_C_ARRAY;
7187
7188 SvPV_set(invlist, (char *) list);
7189 SvLEN_set(invlist, 0); /* Means we own the contents, and the system
7190 shouldn't touch it */
7191 SvCUR_set(invlist, TO_INTERNAL_SIZE(invlist_len(invlist)));
7192
7193 if (*get_invlist_version_id_addr(invlist) != INVLIST_VERSION_ID) {
7194 Perl_croak(aTHX_ "panic: Incorrect version for previously generated inversion list");
7195 }
7196
7197 return invlist;
7198}
7199
7200STATIC void
7201S_invlist_extend(pTHX_ SV* const invlist, const UV new_max)
7202{
7203 /* Grow the maximum size of an inversion list */
7204
7205 PERL_ARGS_ASSERT_INVLIST_EXTEND;
7206
7207 SvGROW((SV *)invlist, TO_INTERNAL_SIZE(new_max));
7208}
7209
7210PERL_STATIC_INLINE void
7211S_invlist_trim(pTHX_ SV* const invlist)
7212{
7213 PERL_ARGS_ASSERT_INVLIST_TRIM;
7214
7215 /* Change the length of the inversion list to how many entries it currently
7216 * has */
7217
7218 SvPV_shrink_to_cur((SV *) invlist);
7219}
7220
7221/* An element is in an inversion list iff its index is even numbered: 0, 2, 4,
7222 * etc */
7223#define ELEMENT_RANGE_MATCHES_INVLIST(i) (! ((i) & 1))
7224#define PREV_RANGE_MATCHES_INVLIST(i) (! ELEMENT_RANGE_MATCHES_INVLIST(i))
7225
7226#define _invlist_union_complement_2nd(a, b, output) _invlist_union_maybe_complement_2nd(a, b, TRUE, output)
7227
7228STATIC void
7229S__append_range_to_invlist(pTHX_ SV* const invlist, const UV start, const UV end)
7230{
7231 /* Subject to change or removal. Append the range from 'start' to 'end' at
7232 * the end of the inversion list. The range must be above any existing
7233 * ones. */
7234
7235 UV* array;
7236 UV max = invlist_max(invlist);
7237 UV len = invlist_len(invlist);
7238
7239 PERL_ARGS_ASSERT__APPEND_RANGE_TO_INVLIST;
7240
7241 if (len == 0) { /* Empty lists must be initialized */
7242 array = _invlist_array_init(invlist, start == 0);
7243 }
7244 else {
7245 /* Here, the existing list is non-empty. The current max entry in the
7246 * list is generally the first value not in the set, except when the
7247 * set extends to the end of permissible values, in which case it is
7248 * the first entry in that final set, and so this call is an attempt to
7249 * append out-of-order */
7250
7251 UV final_element = len - 1;
7252 array = invlist_array(invlist);
7253 if (array[final_element] > start
7254 || ELEMENT_RANGE_MATCHES_INVLIST(final_element))
7255 {
7256 Perl_croak(aTHX_ "panic: attempting to append to an inversion list, but wasn't at the end of the list, final=%"UVuf", start=%"UVuf", match=%c",
7257 array[final_element], start,
7258 ELEMENT_RANGE_MATCHES_INVLIST(final_element) ? 't' : 'f');
7259 }
7260
7261 /* Here, it is a legal append. If the new range begins with the first
7262 * value not in the set, it is extending the set, so the new first
7263 * value not in the set is one greater than the newly extended range.
7264 * */
7265 if (array[final_element] == start) {
7266 if (end != UV_MAX) {
7267 array[final_element] = end + 1;
7268 }
7269 else {
7270 /* But if the end is the maximum representable on the machine,
7271 * just let the range that this would extend to have no end */
7272 invlist_set_len(invlist, len - 1);
7273 }
7274 return;
7275 }
7276 }
7277
7278 /* Here the new range doesn't extend any existing set. Add it */
7279
7280 len += 2; /* Includes an element each for the start and end of range */
7281
7282 /* If overflows the existing space, extend, which may cause the array to be
7283 * moved */
7284 if (max < len) {
7285 invlist_extend(invlist, len);
7286 invlist_set_len(invlist, len); /* Have to set len here to avoid assert
7287 failure in invlist_array() */
7288 array = invlist_array(invlist);
7289 }
7290 else {
7291 invlist_set_len(invlist, len);
7292 }
7293
7294 /* The next item on the list starts the range, the one after that is
7295 * one past the new range. */
7296 array[len - 2] = start;
7297 if (end != UV_MAX) {
7298 array[len - 1] = end + 1;
7299 }
7300 else {
7301 /* But if the end is the maximum representable on the machine, just let
7302 * the range have no end */
7303 invlist_set_len(invlist, len - 1);
7304 }
7305}
7306
7307#ifndef PERL_IN_XSUB_RE
7308
7309STATIC IV
7310S_invlist_search(pTHX_ SV* const invlist, const UV cp)
7311{
7312 /* Searches the inversion list for the entry that contains the input code
7313 * point <cp>. If <cp> is not in the list, -1 is returned. Otherwise, the
7314 * return value is the index into the list's array of the range that
7315 * contains <cp> */
7316
7317 IV low = 0;
7318 IV high = invlist_len(invlist);
7319 const UV * const array = invlist_array(invlist);
7320
7321 PERL_ARGS_ASSERT_INVLIST_SEARCH;
7322
7323 /* If list is empty or the code point is before the first element, return
7324 * failure. */
7325 if (high == 0 || cp < array[0]) {
7326 return -1;
7327 }
7328
7329 /* Binary search. What we are looking for is <i> such that
7330 * array[i] <= cp < array[i+1]
7331 * The loop below converges on the i+1. */
7332 while (low < high) {
7333 IV mid = (low + high) / 2;
7334 if (array[mid] <= cp) {
7335 low = mid + 1;
7336
7337 /* We could do this extra test to exit the loop early.
7338 if (cp < array[low]) {
7339 return mid;
7340 }
7341 */
7342 }
7343 else { /* cp < array[mid] */
7344 high = mid;
7345 }
7346 }
7347
7348 return high - 1;
7349}
7350
7351void
7352Perl__invlist_populate_swatch(pTHX_ SV* const invlist, const UV start, const UV end, U8* swatch)
7353{
7354 /* populates a swatch of a swash the same way swatch_get() does in utf8.c,
7355 * but is used when the swash has an inversion list. This makes this much
7356 * faster, as it uses a binary search instead of a linear one. This is
7357 * intimately tied to that function, and perhaps should be in utf8.c,
7358 * except it is intimately tied to inversion lists as well. It assumes
7359 * that <swatch> is all 0's on input */
7360
7361 UV current = start;
7362 const IV len = invlist_len(invlist);
7363 IV i;
7364 const UV * array;
7365
7366 PERL_ARGS_ASSERT__INVLIST_POPULATE_SWATCH;
7367
7368 if (len == 0) { /* Empty inversion list */
7369 return;
7370 }
7371
7372 array = invlist_array(invlist);
7373
7374 /* Find which element it is */
7375 i = invlist_search(invlist, start);
7376
7377 /* We populate from <start> to <end> */
7378 while (current < end) {
7379 UV upper;
7380
7381 /* The inversion list gives the results for every possible code point
7382 * after the first one in the list. Only those ranges whose index is
7383 * even are ones that the inversion list matches. For the odd ones,
7384 * and if the initial code point is not in the list, we have to skip
7385 * forward to the next element */
7386 if (i == -1 || ! ELEMENT_RANGE_MATCHES_INVLIST(i)) {
7387 i++;
7388 if (i >= len) { /* Finished if beyond the end of the array */
7389 return;
7390 }
7391 current = array[i];
7392 if (current >= end) { /* Finished if beyond the end of what we
7393 are populating */
7394 return;
7395 }
7396 }
7397 assert(current >= start);
7398
7399 /* The current range ends one below the next one, except don't go past
7400 * <end> */
7401 i++;
7402 upper = (i < len && array[i] < end) ? array[i] : end;
7403
7404 /* Here we are in a range that matches. Populate a bit in the 3-bit U8
7405 * for each code point in it */
7406 for (; current < upper; current++) {
7407 const STRLEN offset = (STRLEN)(current - start);
7408 swatch[offset >> 3] |= 1 << (offset & 7);
7409 }
7410
7411 /* Quit if at the end of the list */
7412 if (i >= len) {
7413
7414 /* But first, have to deal with the highest possible code point on
7415 * the platform. The previous code assumes that <end> is one
7416 * beyond where we want to populate, but that is impossible at the
7417 * platform's infinity, so have to handle it specially */
7418 if (UNLIKELY(end == UV_MAX && ELEMENT_RANGE_MATCHES_INVLIST(len-1)))
7419 {
7420 const STRLEN offset = (STRLEN)(end - start);
7421 swatch[offset >> 3] |= 1 << (offset & 7);
7422 }
7423 return;
7424 }
7425
7426 /* Advance to the next range, which will be for code points not in the
7427 * inversion list */
7428 current = array[i];
7429 }
7430
7431 return;
7432}
7433
7434
7435void
7436Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, bool complement_b, SV** output)
7437{
7438 /* Take the union of two inversion lists and point <output> to it. *output
7439 * should be defined upon input, and if it points to one of the two lists,
7440 * the reference count to that list will be decremented. The first list,
7441 * <a>, may be NULL, in which case a copy of the second list is returned.
7442 * If <complement_b> is TRUE, the union is taken of the complement
7443 * (inversion) of <b> instead of b itself.
7444 *
7445 * The basis for this comes from "Unicode Demystified" Chapter 13 by
7446 * Richard Gillam, published by Addison-Wesley, and explained at some
7447 * length there. The preface says to incorporate its examples into your
7448 * code at your own risk.
7449 *
7450 * The algorithm is like a merge sort.
7451 *
7452 * XXX A potential performance improvement is to keep track as we go along
7453 * if only one of the inputs contributes to the result, meaning the other
7454 * is a subset of that one. In that case, we can skip the final copy and
7455 * return the larger of the input lists, but then outside code might need
7456 * to keep track of whether to free the input list or not */
7457
7458 UV* array_a; /* a's array */
7459 UV* array_b;
7460 UV len_a; /* length of a's array */
7461 UV len_b;
7462
7463 SV* u; /* the resulting union */
7464 UV* array_u;
7465 UV len_u;
7466
7467 UV i_a = 0; /* current index into a's array */
7468 UV i_b = 0;
7469 UV i_u = 0;
7470
7471 /* running count, as explained in the algorithm source book; items are
7472 * stopped accumulating and are output when the count changes to/from 0.
7473 * The count is incremented when we start a range that's in the set, and
7474 * decremented when we start a range that's not in the set. So its range
7475 * is 0 to 2. Only when the count is zero is something not in the set.
7476 */
7477 UV count = 0;
7478
7479 PERL_ARGS_ASSERT__INVLIST_UNION_MAYBE_COMPLEMENT_2ND;
7480 assert(a != b);
7481
7482 /* If either one is empty, the union is the other one */
7483 if (a == NULL || ((len_a = invlist_len(a)) == 0)) {
7484 if (*output == a) {
7485 if (a != NULL) {
7486 SvREFCNT_dec(a);
7487 }
7488 }
7489 if (*output != b) {
7490 *output = invlist_clone(b);
7491 if (complement_b) {
7492 _invlist_invert(*output);
7493 }
7494 } /* else *output already = b; */
7495 return;
7496 }
7497 else if ((len_b = invlist_len(b)) == 0) {
7498 if (*output == b) {
7499 SvREFCNT_dec(b);
7500 }
7501
7502 /* The complement of an empty list is a list that has everything in it,
7503 * so the union with <a> includes everything too */
7504 if (complement_b) {
7505 if (a == *output) {
7506 SvREFCNT_dec(a);
7507 }
7508 *output = _new_invlist(1);
7509 _append_range_to_invlist(*output, 0, UV_MAX);
7510 }
7511 else if (*output != a) {
7512 *output = invlist_clone(a);
7513 }
7514 /* else *output already = a; */
7515 return;
7516 }
7517
7518 /* Here both lists exist and are non-empty */
7519 array_a = invlist_array(a);
7520 array_b = invlist_array(b);
7521
7522 /* If are to take the union of 'a' with the complement of b, set it
7523 * up so are looking at b's complement. */
7524 if (complement_b) {
7525
7526 /* To complement, we invert: if the first element is 0, remove it. To
7527 * do this, we just pretend the array starts one later, and clear the
7528 * flag as we don't have to do anything else later */
7529 if (array_b[0] == 0) {
7530 array_b++;
7531 len_b--;
7532 complement_b = FALSE;
7533 }
7534 else {
7535
7536 /* But if the first element is not zero, we unshift a 0 before the
7537 * array. The data structure reserves a space for that 0 (which
7538 * should be a '1' right now), so physical shifting is unneeded,
7539 * but temporarily change that element to 0. Before exiting the
7540 * routine, we must restore the element to '1' */
7541 array_b--;
7542 len_b++;
7543 array_b[0] = 0;
7544 }
7545 }
7546
7547 /* Size the union for the worst case: that the sets are completely
7548 * disjoint */
7549 u = _new_invlist(len_a + len_b);
7550
7551 /* Will contain U+0000 if either component does */
7552 array_u = _invlist_array_init(u, (len_a > 0 && array_a[0] == 0)
7553 || (len_b > 0 && array_b[0] == 0));
7554
7555 /* Go through each list item by item, stopping when exhausted one of
7556 * them */
7557 while (i_a < len_a && i_b < len_b) {
7558 UV cp; /* The element to potentially add to the union's array */
7559 bool cp_in_set; /* is it in the the input list's set or not */
7560
7561 /* We need to take one or the other of the two inputs for the union.
7562 * Since we are merging two sorted lists, we take the smaller of the
7563 * next items. In case of a tie, we take the one that is in its set
7564 * first. If we took one not in the set first, it would decrement the
7565 * count, possibly to 0 which would cause it to be output as ending the
7566 * range, and the next time through we would take the same number, and
7567 * output it again as beginning the next range. By doing it the
7568 * opposite way, there is no possibility that the count will be
7569 * momentarily decremented to 0, and thus the two adjoining ranges will
7570 * be seamlessly merged. (In a tie and both are in the set or both not
7571 * in the set, it doesn't matter which we take first.) */
7572 if (array_a[i_a] < array_b[i_b]
7573 || (array_a[i_a] == array_b[i_b]
7574 && ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
7575 {
7576 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
7577 cp= array_a[i_a++];
7578 }
7579 else {
7580 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
7581 cp= array_b[i_b++];
7582 }
7583
7584 /* Here, have chosen which of the two inputs to look at. Only output
7585 * if the running count changes to/from 0, which marks the
7586 * beginning/end of a range in that's in the set */
7587 if (cp_in_set) {
7588 if (count == 0) {
7589 array_u[i_u++] = cp;
7590 }
7591 count++;
7592 }
7593 else {
7594 count--;
7595 if (count == 0) {
7596 array_u[i_u++] = cp;
7597 }
7598 }
7599 }
7600
7601 /* Here, we are finished going through at least one of the lists, which
7602 * means there is something remaining in at most one. We check if the list
7603 * that hasn't been exhausted is positioned such that we are in the middle
7604 * of a range in its set or not. (i_a and i_b point to the element beyond
7605 * the one we care about.) If in the set, we decrement 'count'; if 0, there
7606 * is potentially more to output.
7607 * There are four cases:
7608 * 1) Both weren't in their sets, count is 0, and remains 0. What's left
7609 * in the union is entirely from the non-exhausted set.
7610 * 2) Both were in their sets, count is 2. Nothing further should
7611 * be output, as everything that remains will be in the exhausted
7612 * list's set, hence in the union; decrementing to 1 but not 0 insures
7613 * that
7614 * 3) the exhausted was in its set, non-exhausted isn't, count is 1.
7615 * Nothing further should be output because the union includes
7616 * everything from the exhausted set. Not decrementing ensures that.
7617 * 4) the exhausted wasn't in its set, non-exhausted is, count is 1;
7618 * decrementing to 0 insures that we look at the remainder of the
7619 * non-exhausted set */
7620 if ((i_a != len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
7621 || (i_b != len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
7622 {
7623 count--;
7624 }
7625
7626 /* The final length is what we've output so far, plus what else is about to
7627 * be output. (If 'count' is non-zero, then the input list we exhausted
7628 * has everything remaining up to the machine's limit in its set, and hence
7629 * in the union, so there will be no further output. */
7630 len_u = i_u;
7631 if (count == 0) {
7632 /* At most one of the subexpressions will be non-zero */
7633 len_u += (len_a - i_a) + (len_b - i_b);
7634 }
7635
7636 /* Set result to final length, which can change the pointer to array_u, so
7637 * re-find it */
7638 if (len_u != invlist_len(u)) {
7639 invlist_set_len(u, len_u);
7640 invlist_trim(u);
7641 array_u = invlist_array(u);
7642 }
7643
7644 /* When 'count' is 0, the list that was exhausted (if one was shorter than
7645 * the other) ended with everything above it not in its set. That means
7646 * that the remaining part of the union is precisely the same as the
7647 * non-exhausted list, so can just copy it unchanged. (If both list were
7648 * exhausted at the same time, then the operations below will be both 0.)
7649 */
7650 if (count == 0) {
7651 IV copy_count; /* At most one will have a non-zero copy count */
7652 if ((copy_count = len_a - i_a) > 0) {
7653 Copy(array_a + i_a, array_u + i_u, copy_count, UV);
7654 }
7655 else if ((copy_count = len_b - i_b) > 0) {
7656 Copy(array_b + i_b, array_u + i_u, copy_count, UV);
7657 }
7658 }
7659
7660 /* We may be removing a reference to one of the inputs */
7661 if (a == *output || b == *output) {
7662 SvREFCNT_dec(*output);
7663 }
7664
7665 /* If we've changed b, restore it */
7666 if (complement_b) {
7667 array_b[0] = 1;
7668 }
7669
7670 *output = u;
7671 return;
7672}
7673
7674void
7675Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, bool complement_b, SV** i)
7676{
7677 /* Take the intersection of two inversion lists and point <i> to it. *i
7678 * should be defined upon input, and if it points to one of the two lists,
7679 * the reference count to that list will be decremented.
7680 * If <complement_b> is TRUE, the result will be the intersection of <a>
7681 * and the complement (or inversion) of <b> instead of <b> directly.
7682 *
7683 * The basis for this comes from "Unicode Demystified" Chapter 13 by
7684 * Richard Gillam, published by Addison-Wesley, and explained at some
7685 * length there. The preface says to incorporate its examples into your
7686 * code at your own risk. In fact, it had bugs
7687 *
7688 * The algorithm is like a merge sort, and is essentially the same as the
7689 * union above
7690 */
7691
7692 UV* array_a; /* a's array */
7693 UV* array_b;
7694 UV len_a; /* length of a's array */
7695 UV len_b;
7696
7697 SV* r; /* the resulting intersection */
7698 UV* array_r;
7699 UV len_r;
7700
7701 UV i_a = 0; /* current index into a's array */
7702 UV i_b = 0;
7703 UV i_r = 0;
7704
7705 /* running count, as explained in the algorithm source book; items are
7706 * stopped accumulating and are output when the count changes to/from 2.
7707 * The count is incremented when we start a range that's in the set, and
7708 * decremented when we start a range that's not in the set. So its range
7709 * is 0 to 2. Only when the count is 2 is something in the intersection.
7710 */
7711 UV count = 0;
7712
7713 PERL_ARGS_ASSERT__INVLIST_INTERSECTION_MAYBE_COMPLEMENT_2ND;
7714 assert(a != b);
7715
7716 /* Special case if either one is empty */
7717 len_a = invlist_len(a);
7718 if ((len_a == 0) || ((len_b = invlist_len(b)) == 0)) {
7719
7720 if (len_a != 0 && complement_b) {
7721
7722 /* Here, 'a' is not empty, therefore from the above 'if', 'b' must
7723 * be empty. Here, also we are using 'b's complement, which hence
7724 * must be every possible code point. Thus the intersection is
7725 * simply 'a'. */
7726 if (*i != a) {
7727 *i = invlist_clone(a);
7728
7729 if (*i == b) {
7730 SvREFCNT_dec(b);
7731 }
7732 }
7733 /* else *i is already 'a' */
7734 return;
7735 }
7736
7737 /* Here, 'a' or 'b' is empty and not using the complement of 'b'. The
7738 * intersection must be empty */
7739 if (*i == a) {
7740 SvREFCNT_dec(a);
7741 }
7742 else if (*i == b) {
7743 SvREFCNT_dec(b);
7744 }
7745 *i = _new_invlist(0);
7746 return;
7747 }
7748
7749 /* Here both lists exist and are non-empty */
7750 array_a = invlist_array(a);
7751 array_b = invlist_array(b);
7752
7753 /* If are to take the intersection of 'a' with the complement of b, set it
7754 * up so are looking at b's complement. */
7755 if (complement_b) {
7756
7757 /* To complement, we invert: if the first element is 0, remove it. To
7758 * do this, we just pretend the array starts one later, and clear the
7759 * flag as we don't have to do anything else later */
7760 if (array_b[0] == 0) {
7761 array_b++;
7762 len_b--;
7763 complement_b = FALSE;
7764 }
7765 else {
7766
7767 /* But if the first element is not zero, we unshift a 0 before the
7768 * array. The data structure reserves a space for that 0 (which
7769 * should be a '1' right now), so physical shifting is unneeded,
7770 * but temporarily change that element to 0. Before exiting the
7771 * routine, we must restore the element to '1' */
7772 array_b--;
7773 len_b++;
7774 array_b[0] = 0;
7775 }
7776 }
7777
7778 /* Size the intersection for the worst case: that the intersection ends up
7779 * fragmenting everything to be completely disjoint */
7780 r= _new_invlist(len_a + len_b);
7781
7782 /* Will contain U+0000 iff both components do */
7783 array_r = _invlist_array_init(r, len_a > 0 && array_a[0] == 0
7784 && len_b > 0 && array_b[0] == 0);
7785
7786 /* Go through each list item by item, stopping when exhausted one of
7787 * them */
7788 while (i_a < len_a && i_b < len_b) {
7789 UV cp; /* The element to potentially add to the intersection's
7790 array */
7791 bool cp_in_set; /* Is it in the input list's set or not */
7792
7793 /* We need to take one or the other of the two inputs for the
7794 * intersection. Since we are merging two sorted lists, we take the
7795 * smaller of the next items. In case of a tie, we take the one that
7796 * is not in its set first (a difference from the union algorithm). If
7797 * we took one in the set first, it would increment the count, possibly
7798 * to 2 which would cause it to be output as starting a range in the
7799 * intersection, and the next time through we would take that same
7800 * number, and output it again as ending the set. By doing it the
7801 * opposite of this, there is no possibility that the count will be
7802 * momentarily incremented to 2. (In a tie and both are in the set or
7803 * both not in the set, it doesn't matter which we take first.) */
7804 if (array_a[i_a] < array_b[i_b]
7805 || (array_a[i_a] == array_b[i_b]
7806 && ! ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
7807 {
7808 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
7809 cp= array_a[i_a++];
7810 }
7811 else {
7812 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
7813 cp= array_b[i_b++];
7814 }
7815
7816 /* Here, have chosen which of the two inputs to look at. Only output
7817 * if the running count changes to/from 2, which marks the
7818 * beginning/end of a range that's in the intersection */
7819 if (cp_in_set) {
7820 count++;
7821 if (count == 2) {
7822 array_r[i_r++] = cp;
7823 }
7824 }
7825 else {
7826 if (count == 2) {
7827 array_r[i_r++] = cp;
7828 }
7829 count--;
7830 }
7831 }
7832
7833 /* Here, we are finished going through at least one of the lists, which
7834 * means there is something remaining in at most one. We check if the list
7835 * that has been exhausted is positioned such that we are in the middle
7836 * of a range in its set or not. (i_a and i_b point to elements 1 beyond
7837 * the ones we care about.) There are four cases:
7838 * 1) Both weren't in their sets, count is 0, and remains 0. There's
7839 * nothing left in the intersection.
7840 * 2) Both were in their sets, count is 2 and perhaps is incremented to
7841 * above 2. What should be output is exactly that which is in the
7842 * non-exhausted set, as everything it has is also in the intersection
7843 * set, and everything it doesn't have can't be in the intersection
7844 * 3) The exhausted was in its set, non-exhausted isn't, count is 1, and
7845 * gets incremented to 2. Like the previous case, the intersection is
7846 * everything that remains in the non-exhausted set.
7847 * 4) the exhausted wasn't in its set, non-exhausted is, count is 1, and
7848 * remains 1. And the intersection has nothing more. */
7849 if ((i_a == len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
7850 || (i_b == len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
7851 {
7852 count++;
7853 }
7854
7855 /* The final length is what we've output so far plus what else is in the
7856 * intersection. At most one of the subexpressions below will be non-zero */
7857 len_r = i_r;
7858 if (count >= 2) {
7859 len_r += (len_a - i_a) + (len_b - i_b);
7860 }
7861
7862 /* Set result to final length, which can change the pointer to array_r, so
7863 * re-find it */
7864 if (len_r != invlist_len(r)) {
7865 invlist_set_len(r, len_r);
7866 invlist_trim(r);
7867 array_r = invlist_array(r);
7868 }
7869
7870 /* Finish outputting any remaining */
7871 if (count >= 2) { /* At most one will have a non-zero copy count */
7872 IV copy_count;
7873 if ((copy_count = len_a - i_a) > 0) {
7874 Copy(array_a + i_a, array_r + i_r, copy_count, UV);
7875 }
7876 else if ((copy_count = len_b - i_b) > 0) {
7877 Copy(array_b + i_b, array_r + i_r, copy_count, UV);
7878 }
7879 }
7880
7881 /* We may be removing a reference to one of the inputs */
7882 if (a == *i || b == *i) {
7883 SvREFCNT_dec(*i);
7884 }
7885
7886 /* If we've changed b, restore it */
7887 if (complement_b) {
7888 array_b[0] = 1;
7889 }
7890
7891 *i = r;
7892 return;
7893}
7894
7895SV*
7896Perl__add_range_to_invlist(pTHX_ SV* invlist, const UV start, const UV end)
7897{
7898 /* Add the range from 'start' to 'end' inclusive to the inversion list's
7899 * set. A pointer to the inversion list is returned. This may actually be
7900 * a new list, in which case the passed in one has been destroyed. The
7901 * passed in inversion list can be NULL, in which case a new one is created
7902 * with just the one range in it */
7903
7904 SV* range_invlist;
7905 UV len;
7906
7907 if (invlist == NULL) {
7908 invlist = _new_invlist(2);
7909 len = 0;
7910 }
7911 else {
7912 len = invlist_len(invlist);
7913 }
7914
7915 /* If comes after the final entry, can just append it to the end */
7916 if (len == 0
7917 || start >= invlist_array(invlist)
7918 [invlist_len(invlist) - 1])
7919 {
7920 _append_range_to_invlist(invlist, start, end);
7921 return invlist;
7922 }
7923
7924 /* Here, can't just append things, create and return a new inversion list
7925 * which is the union of this range and the existing inversion list */
7926 range_invlist = _new_invlist(2);
7927 _append_range_to_invlist(range_invlist, start, end);
7928
7929 _invlist_union(invlist, range_invlist, &invlist);
7930
7931 /* The temporary can be freed */
7932 SvREFCNT_dec(range_invlist);
7933
7934 return invlist;
7935}
7936
7937#endif
7938
7939PERL_STATIC_INLINE SV*
7940S_add_cp_to_invlist(pTHX_ SV* invlist, const UV cp) {
7941 return _add_range_to_invlist(invlist, cp, cp);
7942}
7943
7944#ifndef PERL_IN_XSUB_RE
7945void
7946Perl__invlist_invert(pTHX_ SV* const invlist)
7947{
7948 /* Complement the input inversion list. This adds a 0 if the list didn't
7949 * have a zero; removes it otherwise. As described above, the data
7950 * structure is set up so that this is very efficient */
7951
7952 UV* len_pos = get_invlist_len_addr(invlist);
7953
7954 PERL_ARGS_ASSERT__INVLIST_INVERT;
7955
7956 /* The inverse of matching nothing is matching everything */
7957 if (*len_pos == 0) {
7958 _append_range_to_invlist(invlist, 0, UV_MAX);
7959 return;
7960 }
7961
7962 /* The exclusive or complents 0 to 1; and 1 to 0. If the result is 1, the
7963 * zero element was a 0, so it is being removed, so the length decrements
7964 * by 1; and vice-versa. SvCUR is unaffected */
7965 if (*get_invlist_zero_addr(invlist) ^= 1) {
7966 (*len_pos)--;
7967 }
7968 else {
7969 (*len_pos)++;
7970 }
7971}
7972
7973void
7974Perl__invlist_invert_prop(pTHX_ SV* const invlist)
7975{
7976 /* Complement the input inversion list (which must be a Unicode property,
7977 * all of which don't match above the Unicode maximum code point.) And
7978 * Perl has chosen to not have the inversion match above that either. This
7979 * adds a 0x110000 if the list didn't end with it, and removes it if it did
7980 */
7981
7982 UV len;
7983 UV* array;
7984
7985 PERL_ARGS_ASSERT__INVLIST_INVERT_PROP;
7986
7987 _invlist_invert(invlist);
7988
7989 len = invlist_len(invlist);
7990
7991 if (len != 0) { /* If empty do nothing */
7992 array = invlist_array(invlist);
7993 if (array[len - 1] != PERL_UNICODE_MAX + 1) {
7994 /* Add 0x110000. First, grow if necessary */
7995 len++;
7996 if (invlist_max(invlist) < len) {
7997 invlist_extend(invlist, len);
7998 array = invlist_array(invlist);
7999 }
8000 invlist_set_len(invlist, len);
8001 array[len - 1] = PERL_UNICODE_MAX + 1;
8002 }
8003 else { /* Remove the 0x110000 */
8004 invlist_set_len(invlist, len - 1);
8005 }
8006 }
8007
8008 return;
8009}
8010#endif
8011
8012PERL_STATIC_INLINE SV*
8013S_invlist_clone(pTHX_ SV* const invlist)
8014{
8015
8016 /* Return a new inversion list that is a copy of the input one, which is
8017 * unchanged */
8018
8019 /* Need to allocate extra space to accommodate Perl's addition of a
8020 * trailing NUL to SvPV's, since it thinks they are always strings */
8021 SV* new_invlist = _new_invlist(invlist_len(invlist) + 1);
8022 STRLEN length = SvCUR(invlist);
8023
8024 PERL_ARGS_ASSERT_INVLIST_CLONE;
8025
8026 SvCUR_set(new_invlist, length); /* This isn't done automatically */
8027 Copy(SvPVX(invlist), SvPVX(new_invlist), length, char);
8028
8029 return new_invlist;
8030}
8031
8032PERL_STATIC_INLINE UV*
8033S_get_invlist_iter_addr(pTHX_ SV* invlist)
8034{
8035 /* Return the address of the UV that contains the current iteration
8036 * position */
8037
8038 PERL_ARGS_ASSERT_GET_INVLIST_ITER_ADDR;
8039
8040 return (UV *) (SvPVX(invlist) + (INVLIST_ITER_OFFSET * sizeof (UV)));
8041}
8042
8043PERL_STATIC_INLINE UV*
8044S_get_invlist_version_id_addr(pTHX_ SV* invlist)
8045{
8046 /* Return the address of the UV that contains the version id. */
8047
8048 PERL_ARGS_ASSERT_GET_INVLIST_VERSION_ID_ADDR;
8049
8050 return (UV *) (SvPVX(invlist) + (INVLIST_VERSION_ID_OFFSET * sizeof (UV)));
8051}
8052
8053PERL_STATIC_INLINE void
8054S_invlist_iterinit(pTHX_ SV* invlist) /* Initialize iterator for invlist */
8055{
8056 PERL_ARGS_ASSERT_INVLIST_ITERINIT;
8057
8058 *get_invlist_iter_addr(invlist) = 0;
8059}
8060
8061STATIC bool
8062S_invlist_iternext(pTHX_ SV* invlist, UV* start, UV* end)
8063{
8064 /* An C<invlist_iterinit> call on <invlist> must be used to set this up.
8065 * This call sets in <*start> and <*end>, the next range in <invlist>.
8066 * Returns <TRUE> if successful and the next call will return the next
8067 * range; <FALSE> if was already at the end of the list. If the latter,
8068 * <*start> and <*end> are unchanged, and the next call to this function
8069 * will start over at the beginning of the list */
8070
8071 UV* pos = get_invlist_iter_addr(invlist);
8072 UV len = invlist_len(invlist);
8073 UV *array;
8074
8075 PERL_ARGS_ASSERT_INVLIST_ITERNEXT;
8076
8077 if (*pos >= len) {
8078 *pos = UV_MAX; /* Force iternit() to be required next time */
8079 return FALSE;
8080 }
8081
8082 array = invlist_array(invlist);
8083
8084 *start = array[(*pos)++];
8085
8086 if (*pos >= len) {
8087 *end = UV_MAX;
8088 }
8089 else {
8090 *end = array[(*pos)++] - 1;
8091 }
8092
8093 return TRUE;
8094}
8095
8096#ifndef PERL_IN_XSUB_RE
8097SV *
8098Perl__invlist_contents(pTHX_ SV* const invlist)
8099{
8100 /* Get the contents of an inversion list into a string SV so that they can
8101 * be printed out. It uses the format traditionally done for debug tracing
8102 */
8103
8104 UV start, end;
8105 SV* output = newSVpvs("\n");
8106
8107 PERL_ARGS_ASSERT__INVLIST_CONTENTS;
8108
8109 invlist_iterinit(invlist);
8110 while (invlist_iternext(invlist, &start, &end)) {
8111 if (end == UV_MAX) {
8112 Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\tINFINITY\n", start);
8113 }
8114 else if (end != start) {
8115 Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\t%04"UVXf"\n",
8116 start, end);
8117 }
8118 else {
8119 Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\n", start);
8120 }
8121 }
8122
8123 return output;
8124}
8125#endif
8126
8127#if 0
8128void
8129S_invlist_dump(pTHX_ SV* const invlist, const char * const header)
8130{
8131 /* Dumps out the ranges in an inversion list. The string 'header'
8132 * if present is output on a line before the first range */
8133
8134 UV start, end;
8135
8136 if (header && strlen(header)) {
8137 PerlIO_printf(Perl_debug_log, "%s\n", header);
8138 }
8139 invlist_iterinit(invlist);
8140 while (invlist_iternext(invlist, &start, &end)) {
8141 if (end == UV_MAX) {
8142 PerlIO_printf(Perl_debug_log, "0x%04"UVXf" .. INFINITY\n", start);
8143 }
8144 else {
8145 PerlIO_printf(Perl_debug_log, "0x%04"UVXf" .. 0x%04"UVXf"\n", start, end);
8146 }
8147 }
8148}
8149#endif
8150
8151#undef HEADER_LENGTH
8152#undef INVLIST_INITIAL_LENGTH
8153#undef TO_INTERNAL_SIZE
8154#undef FROM_INTERNAL_SIZE
8155#undef INVLIST_LEN_OFFSET
8156#undef INVLIST_ZERO_OFFSET
8157#undef INVLIST_ITER_OFFSET
8158#undef INVLIST_VERSION_ID
8159
8160/* End of inversion list object */
8161
8162/*
8163 - reg - regular expression, i.e. main body or parenthesized thing
8164 *
8165 * Caller must absorb opening parenthesis.
8166 *
8167 * Combining parenthesis handling with the base level of regular expression
8168 * is a trifle forced, but the need to tie the tails of the branches to what
8169 * follows makes it hard to avoid.
8170 */
8171#define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
8172#ifdef DEBUGGING
8173#define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
8174#else
8175#define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
8176#endif
8177
8178STATIC regnode *
8179S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
8180 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
8181{
8182 dVAR;
8183 register regnode *ret; /* Will be the head of the group. */
8184 register regnode *br;
8185 register regnode *lastbr;
8186 register regnode *ender = NULL;
8187 register I32 parno = 0;
8188 I32 flags;
8189 U32 oregflags = RExC_flags;
8190 bool have_branch = 0;
8191 bool is_open = 0;
8192 I32 freeze_paren = 0;
8193 I32 after_freeze = 0;
8194
8195 /* for (?g), (?gc), and (?o) warnings; warning
8196 about (?c) will warn about (?g) -- japhy */
8197
8198#define WASTED_O 0x01
8199#define WASTED_G 0x02
8200#define WASTED_C 0x04
8201#define WASTED_GC (0x02|0x04)
8202 I32 wastedflags = 0x00;
8203
8204 char * parse_start = RExC_parse; /* MJD */
8205 char * const oregcomp_parse = RExC_parse;
8206
8207 GET_RE_DEBUG_FLAGS_DECL;
8208
8209 PERL_ARGS_ASSERT_REG;
8210 DEBUG_PARSE("reg ");
8211
8212 *flagp = 0; /* Tentatively. */
8213
8214
8215 /* Make an OPEN node, if parenthesized. */
8216 if (paren) {
8217 if ( *RExC_parse == '*') { /* (*VERB:ARG) */
8218 char *start_verb = RExC_parse;
8219 STRLEN verb_len = 0;
8220 char *start_arg = NULL;
8221 unsigned char op = 0;
8222 int argok = 1;
8223 int internal_argval = 0; /* internal_argval is only useful if !argok */
8224 while ( *RExC_parse && *RExC_parse != ')' ) {
8225 if ( *RExC_parse == ':' ) {
8226 start_arg = RExC_parse + 1;
8227 break;
8228 }
8229 RExC_parse++;
8230 }
8231 ++start_verb;
8232 verb_len = RExC_parse - start_verb;
8233 if ( start_arg ) {
8234 RExC_parse++;
8235 while ( *RExC_parse && *RExC_parse != ')' )
8236 RExC_parse++;
8237 if ( *RExC_parse != ')' )
8238 vFAIL("Unterminated verb pattern argument");
8239 if ( RExC_parse == start_arg )
8240 start_arg = NULL;
8241 } else {
8242 if ( *RExC_parse != ')' )
8243 vFAIL("Unterminated verb pattern");
8244 }
8245
8246 switch ( *start_verb ) {
8247 case 'A': /* (*ACCEPT) */
8248 if ( memEQs(start_verb,verb_len,"ACCEPT") ) {
8249 op = ACCEPT;
8250 internal_argval = RExC_nestroot;
8251 }
8252 break;
8253 case 'C': /* (*COMMIT) */
8254 if ( memEQs(start_verb,verb_len,"COMMIT") )
8255 op = COMMIT;
8256 break;
8257 case 'F': /* (*FAIL) */
8258 if ( verb_len==1 || memEQs(start_verb,verb_len,"FAIL") ) {
8259 op = OPFAIL;
8260 argok = 0;
8261 }
8262 break;
8263 case ':': /* (*:NAME) */
8264 case 'M': /* (*MARK:NAME) */
8265 if ( verb_len==0 || memEQs(start_verb,verb_len,"MARK") ) {
8266 op = MARKPOINT;
8267 argok = -1;
8268 }
8269 break;
8270 case 'P': /* (*PRUNE) */
8271 if ( memEQs(start_verb,verb_len,"PRUNE") )
8272 op = PRUNE;
8273 break;
8274 case 'S': /* (*SKIP) */
8275 if ( memEQs(start_verb,verb_len,"SKIP") )
8276 op = SKIP;
8277 break;
8278 case 'T': /* (*THEN) */
8279 /* [19:06] <TimToady> :: is then */
8280 if ( memEQs(start_verb,verb_len,"THEN") ) {
8281 op = CUTGROUP;
8282 RExC_seen |= REG_SEEN_CUTGROUP;
8283 }
8284 break;
8285 }
8286 if ( ! op ) {
8287 RExC_parse++;
8288 vFAIL3("Unknown verb pattern '%.*s'",
8289 verb_len, start_verb);
8290 }
8291 if ( argok ) {
8292 if ( start_arg && internal_argval ) {
8293 vFAIL3("Verb pattern '%.*s' may not have an argument",
8294 verb_len, start_verb);
8295 } else if ( argok < 0 && !start_arg ) {
8296 vFAIL3("Verb pattern '%.*s' has a mandatory argument",
8297 verb_len, start_verb);
8298 } else {
8299 ret = reganode(pRExC_state, op, internal_argval);
8300 if ( ! internal_argval && ! SIZE_ONLY ) {
8301 if (start_arg) {
8302 SV *sv = newSVpvn( start_arg, RExC_parse - start_arg);
8303 ARG(ret) = add_data( pRExC_state, 1, "S" );
8304 RExC_rxi->data->data[ARG(ret)]=(void*)sv;
8305 ret->flags = 0;
8306 } else {
8307 ret->flags = 1;
8308 }
8309 }
8310 }
8311 if (!internal_argval)
8312 RExC_seen |= REG_SEEN_VERBARG;
8313 } else if ( start_arg ) {
8314 vFAIL3("Verb pattern '%.*s' may not have an argument",
8315 verb_len, start_verb);
8316 } else {
8317 ret = reg_node(pRExC_state, op);
8318 }
8319 nextchar(pRExC_state);
8320 return ret;
8321 } else
8322 if (*RExC_parse == '?') { /* (?...) */
8323 bool is_logical = 0;
8324 const char * const seqstart = RExC_parse;
8325 bool has_use_defaults = FALSE;
8326
8327 RExC_parse++;
8328 paren = *RExC_parse++;
8329 ret = NULL; /* For look-ahead/behind. */
8330 switch (paren) {
8331
8332 case 'P': /* (?P...) variants for those used to PCRE/Python */
8333 paren = *RExC_parse++;
8334 if ( paren == '<') /* (?P<...>) named capture */
8335 goto named_capture;
8336 else if (paren == '>') { /* (?P>name) named recursion */
8337 goto named_recursion;
8338 }
8339 else if (paren == '=') { /* (?P=...) named backref */
8340 /* this pretty much dupes the code for \k<NAME> in regatom(), if
8341 you change this make sure you change that */
8342 char* name_start = RExC_parse;
8343 U32 num = 0;
8344 SV *sv_dat = reg_scan_name(pRExC_state,
8345 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
8346 if (RExC_parse == name_start || *RExC_parse != ')')
8347 vFAIL2("Sequence %.3s... not terminated",parse_start);
8348
8349 if (!SIZE_ONLY) {
8350 num = add_data( pRExC_state, 1, "S" );
8351 RExC_rxi->data->data[num]=(void*)sv_dat;
8352 SvREFCNT_inc_simple_void(sv_dat);
8353 }
8354 RExC_sawback = 1;
8355 ret = reganode(pRExC_state,
8356 ((! FOLD)
8357 ? NREF
8358 : (MORE_ASCII_RESTRICTED)
8359 ? NREFFA
8360 : (AT_LEAST_UNI_SEMANTICS)
8361 ? NREFFU
8362 : (LOC)
8363 ? NREFFL
8364 : NREFF),
8365 num);
8366 *flagp |= HASWIDTH;
8367
8368 Set_Node_Offset(ret, parse_start+1);
8369 Set_Node_Cur_Length(ret); /* MJD */
8370
8371 nextchar(pRExC_state);
8372 return ret;
8373 }
8374 RExC_parse++;
8375 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
8376 /*NOTREACHED*/
8377 case '<': /* (?<...) */
8378 if (*RExC_parse == '!')
8379 paren = ',';
8380 else if (*RExC_parse != '=')
8381 named_capture:
8382 { /* (?<...>) */
8383 char *name_start;
8384 SV *svname;
8385 paren= '>';
8386 case '\'': /* (?'...') */
8387 name_start= RExC_parse;
8388 svname = reg_scan_name(pRExC_state,
8389 SIZE_ONLY ? /* reverse test from the others */
8390 REG_RSN_RETURN_NAME :
8391 REG_RSN_RETURN_NULL);
8392 if (RExC_parse == name_start) {
8393 RExC_parse++;
8394 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
8395 /*NOTREACHED*/
8396 }
8397 if (*RExC_parse != paren)
8398 vFAIL2("Sequence (?%c... not terminated",
8399 paren=='>' ? '<' : paren);
8400 if (SIZE_ONLY) {
8401 HE *he_str;
8402 SV *sv_dat = NULL;
8403 if (!svname) /* shouldn't happen */
8404 Perl_croak(aTHX_
8405 "panic: reg_scan_name returned NULL");
8406 if (!RExC_paren_names) {
8407 RExC_paren_names= newHV();
8408 sv_2mortal(MUTABLE_SV(RExC_paren_names));
8409#ifdef DEBUGGING
8410 RExC_paren_name_list= newAV();
8411 sv_2mortal(MUTABLE_SV(RExC_paren_name_list));
8412#endif
8413 }
8414 he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
8415 if ( he_str )
8416 sv_dat = HeVAL(he_str);
8417 if ( ! sv_dat ) {
8418 /* croak baby croak */
8419 Perl_croak(aTHX_
8420 "panic: paren_name hash element allocation failed");
8421 } else if ( SvPOK(sv_dat) ) {
8422 /* (?|...) can mean we have dupes so scan to check
8423 its already been stored. Maybe a flag indicating
8424 we are inside such a construct would be useful,
8425 but the arrays are likely to be quite small, so
8426 for now we punt -- dmq */
8427 IV count = SvIV(sv_dat);
8428 I32 *pv = (I32*)SvPVX(sv_dat);
8429 IV i;
8430 for ( i = 0 ; i < count ; i++ ) {
8431 if ( pv[i] == RExC_npar ) {
8432 count = 0;
8433 break;
8434 }
8435 }
8436 if ( count ) {
8437 pv = (I32*)SvGROW(sv_dat, SvCUR(sv_dat) + sizeof(I32)+1);
8438 SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32));
8439 pv[count] = RExC_npar;
8440 SvIV_set(sv_dat, SvIVX(sv_dat) + 1);
8441 }
8442 } else {
8443 (void)SvUPGRADE(sv_dat,SVt_PVNV);
8444 sv_setpvn(sv_dat, (char *)&(RExC_npar), sizeof(I32));
8445 SvIOK_on(sv_dat);
8446 SvIV_set(sv_dat, 1);
8447 }
8448#ifdef DEBUGGING
8449 /* Yes this does cause a memory leak in debugging Perls */
8450 if (!av_store(RExC_paren_name_list, RExC_npar, SvREFCNT_inc(svname)))
8451 SvREFCNT_dec(svname);
8452#endif
8453
8454 /*sv_dump(sv_dat);*/
8455 }
8456 nextchar(pRExC_state);
8457 paren = 1;
8458 goto capturing_parens;
8459 }
8460 RExC_seen |= REG_SEEN_LOOKBEHIND;
8461 RExC_in_lookbehind++;
8462 RExC_parse++;
8463 case '=': /* (?=...) */
8464 RExC_seen_zerolen++;
8465 break;
8466 case '!': /* (?!...) */
8467 RExC_seen_zerolen++;
8468 if (*RExC_parse == ')') {
8469 ret=reg_node(pRExC_state, OPFAIL);
8470 nextchar(pRExC_state);
8471 return ret;
8472 }
8473 break;
8474 case '|': /* (?|...) */
8475 /* branch reset, behave like a (?:...) except that
8476 buffers in alternations share the same numbers */
8477 paren = ':';
8478 after_freeze = freeze_paren = RExC_npar;
8479 break;
8480 case ':': /* (?:...) */
8481 case '>': /* (?>...) */
8482 break;
8483 case '$': /* (?$...) */
8484 case '@': /* (?@...) */
8485 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
8486 break;
8487 case '#': /* (?#...) */
8488 while (*RExC_parse && *RExC_parse != ')')
8489 RExC_parse++;
8490 if (*RExC_parse != ')')
8491 FAIL("Sequence (?#... not terminated");
8492 nextchar(pRExC_state);
8493 *flagp = TRYAGAIN;
8494 return NULL;
8495 case '0' : /* (?0) */
8496 case 'R' : /* (?R) */
8497 if (*RExC_parse != ')')
8498 FAIL("Sequence (?R) not terminated");
8499 ret = reg_node(pRExC_state, GOSTART);
8500 *flagp |= POSTPONED;
8501 nextchar(pRExC_state);
8502 return ret;
8503 /*notreached*/
8504 { /* named and numeric backreferences */
8505 I32 num;
8506 case '&': /* (?&NAME) */
8507 parse_start = RExC_parse - 1;
8508 named_recursion:
8509 {
8510 SV *sv_dat = reg_scan_name(pRExC_state,
8511 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
8512 num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
8513 }
8514 goto gen_recurse_regop;
8515 assert(0); /* NOT REACHED */
8516 case '+':
8517 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
8518 RExC_parse++;
8519 vFAIL("Illegal pattern");
8520 }
8521 goto parse_recursion;
8522 /* NOT REACHED*/
8523 case '-': /* (?-1) */
8524 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
8525 RExC_parse--; /* rewind to let it be handled later */
8526 goto parse_flags;
8527 }
8528 /*FALLTHROUGH */
8529 case '1': case '2': case '3': case '4': /* (?1) */
8530 case '5': case '6': case '7': case '8': case '9':
8531 RExC_parse--;
8532 parse_recursion:
8533 num = atoi(RExC_parse);
8534 parse_start = RExC_parse - 1; /* MJD */
8535 if (*RExC_parse == '-')
8536 RExC_parse++;
8537 while (isDIGIT(*RExC_parse))
8538 RExC_parse++;
8539 if (*RExC_parse!=')')
8540 vFAIL("Expecting close bracket");
8541
8542 gen_recurse_regop:
8543 if ( paren == '-' ) {
8544 /*
8545 Diagram of capture buffer numbering.
8546 Top line is the normal capture buffer numbers
8547 Bottom line is the negative indexing as from
8548 the X (the (?-2))
8549
8550 + 1 2 3 4 5 X 6 7
8551 /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
8552 - 5 4 3 2 1 X x x
8553
8554 */
8555 num = RExC_npar + num;
8556 if (num < 1) {
8557 RExC_parse++;
8558 vFAIL("Reference to nonexistent group");
8559 }
8560 } else if ( paren == '+' ) {
8561 num = RExC_npar + num - 1;
8562 }
8563
8564 ret = reganode(pRExC_state, GOSUB, num);
8565 if (!SIZE_ONLY) {
8566 if (num > (I32)RExC_rx->nparens) {
8567 RExC_parse++;
8568 vFAIL("Reference to nonexistent group");
8569 }
8570 ARG2L_SET( ret, RExC_recurse_count++);
8571 RExC_emit++;
8572 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
8573 "Recurse #%"UVuf" to %"IVdf"\n", (UV)ARG(ret), (IV)ARG2L(ret)));
8574 } else {
8575 RExC_size++;
8576 }
8577 RExC_seen |= REG_SEEN_RECURSE;
8578 Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
8579 Set_Node_Offset(ret, parse_start); /* MJD */
8580
8581 *flagp |= POSTPONED;
8582 nextchar(pRExC_state);
8583 return ret;
8584 } /* named and numeric backreferences */
8585 assert(0); /* NOT REACHED */
8586
8587 case '?': /* (??...) */
8588 is_logical = 1;
8589 if (*RExC_parse != '{') {
8590 RExC_parse++;
8591 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
8592 /*NOTREACHED*/
8593 }
8594 *flagp |= POSTPONED;
8595 paren = *RExC_parse++;
8596 /* FALL THROUGH */
8597 case '{': /* (?{...}) */
8598 {
8599 U32 n = 0;
8600 struct reg_code_block *cb;
8601
8602 RExC_seen_zerolen++;
8603
8604 if ( !pRExC_state->num_code_blocks
8605 || pRExC_state->code_index >= pRExC_state->num_code_blocks
8606 || pRExC_state->code_blocks[pRExC_state->code_index].start
8607 != (STRLEN)((RExC_parse -3 - (is_logical ? 1 : 0))
8608 - RExC_start)
8609 ) {
8610 if (RExC_pm_flags & PMf_USE_RE_EVAL)
8611 FAIL("panic: Sequence (?{...}): no code block found\n");
8612 FAIL("Eval-group not allowed at runtime, use re 'eval'");
8613 }
8614 /* this is a pre-compiled code block (?{...}) */
8615 cb = &pRExC_state->code_blocks[pRExC_state->code_index];
8616 RExC_parse = RExC_start + cb->end;
8617 if (!SIZE_ONLY) {
8618 OP *o = cb->block;
8619 if (cb->src_regex) {
8620 n = add_data(pRExC_state, 2, "rl");
8621 RExC_rxi->data->data[n] =
8622 (void*)SvREFCNT_inc((SV*)cb->src_regex);
8623 RExC_rxi->data->data[n+1] = (void*)o;
8624 }
8625 else {
8626 n = add_data(pRExC_state, 1,
8627 (RExC_pm_flags & PMf_HAS_CV) ? "L" : "l");
8628 RExC_rxi->data->data[n] = (void*)o;
8629 }
8630 }
8631 pRExC_state->code_index++;
8632 nextchar(pRExC_state);
8633
8634 if (is_logical) {
8635 regnode *eval;
8636 ret = reg_node(pRExC_state, LOGICAL);
8637 eval = reganode(pRExC_state, EVAL, n);
8638 if (!SIZE_ONLY) {
8639 ret->flags = 2;
8640 /* for later propagation into (??{}) return value */
8641 eval->flags = (U8) (RExC_flags & RXf_PMf_COMPILETIME);
8642 }
8643 REGTAIL(pRExC_state, ret, eval);
8644 /* deal with the length of this later - MJD */
8645 return ret;
8646 }
8647 ret = reganode(pRExC_state, EVAL, n);
8648 Set_Node_Length(ret, RExC_parse - parse_start + 1);
8649 Set_Node_Offset(ret, parse_start);
8650 return ret;
8651 }
8652 case '(': /* (?(?{...})...) and (?(?=...)...) */
8653 {
8654 int is_define= 0;
8655 if (RExC_parse[0] == '?') { /* (?(?...)) */
8656 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
8657 || RExC_parse[1] == '<'
8658 || RExC_parse[1] == '{') { /* Lookahead or eval. */
8659 I32 flag;
8660
8661 ret = reg_node(pRExC_state, LOGICAL);
8662 if (!SIZE_ONLY)
8663 ret->flags = 1;
8664 REGTAIL(pRExC_state, ret, reg(pRExC_state, 1, &flag,depth+1));
8665 goto insert_if;
8666 }
8667 }
8668 else if ( RExC_parse[0] == '<' /* (?(<NAME>)...) */
8669 || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
8670 {
8671 char ch = RExC_parse[0] == '<' ? '>' : '\'';
8672 char *name_start= RExC_parse++;
8673 U32 num = 0;
8674 SV *sv_dat=reg_scan_name(pRExC_state,
8675 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
8676 if (RExC_parse == name_start || *RExC_parse != ch)
8677 vFAIL2("Sequence (?(%c... not terminated",
8678 (ch == '>' ? '<' : ch));
8679 RExC_parse++;
8680 if (!SIZE_ONLY) {
8681 num = add_data( pRExC_state, 1, "S" );
8682 RExC_rxi->data->data[num]=(void*)sv_dat;
8683 SvREFCNT_inc_simple_void(sv_dat);
8684 }
8685 ret = reganode(pRExC_state,NGROUPP,num);
8686 goto insert_if_check_paren;
8687 }
8688 else if (RExC_parse[0] == 'D' &&
8689 RExC_parse[1] == 'E' &&
8690 RExC_parse[2] == 'F' &&
8691 RExC_parse[3] == 'I' &&
8692 RExC_parse[4] == 'N' &&
8693 RExC_parse[5] == 'E')
8694 {
8695 ret = reganode(pRExC_state,DEFINEP,0);
8696 RExC_parse +=6 ;
8697 is_define = 1;
8698 goto insert_if_check_paren;
8699 }
8700 else if (RExC_parse[0] == 'R') {
8701 RExC_parse++;
8702 parno = 0;
8703 if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
8704 parno = atoi(RExC_parse++);
8705 while (isDIGIT(*RExC_parse))
8706 RExC_parse++;
8707 } else if (RExC_parse[0] == '&') {
8708 SV *sv_dat;
8709 RExC_parse++;
8710 sv_dat = reg_scan_name(pRExC_state,
8711 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
8712 parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
8713 }
8714 ret = reganode(pRExC_state,INSUBP,parno);
8715 goto insert_if_check_paren;
8716 }
8717 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
8718 /* (?(1)...) */
8719 char c;
8720 parno = atoi(RExC_parse++);
8721
8722 while (isDIGIT(*RExC_parse))
8723 RExC_parse++;
8724 ret = reganode(pRExC_state, GROUPP, parno);
8725
8726 insert_if_check_paren:
8727 if ((c = *nextchar(pRExC_state)) != ')')
8728 vFAIL("Switch condition not recognized");
8729 insert_if:
8730 REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
8731 br = regbranch(pRExC_state, &flags, 1,depth+1);
8732 if (br == NULL)
8733 br = reganode(pRExC_state, LONGJMP, 0);
8734 else
8735 REGTAIL(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
8736 c = *nextchar(pRExC_state);
8737 if (flags&HASWIDTH)
8738 *flagp |= HASWIDTH;
8739 if (c == '|') {
8740 if (is_define)
8741 vFAIL("(?(DEFINE)....) does not allow branches");
8742 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
8743 regbranch(pRExC_state, &flags, 1,depth+1);
8744 REGTAIL(pRExC_state, ret, lastbr);
8745 if (flags&HASWIDTH)
8746 *flagp |= HASWIDTH;
8747 c = *nextchar(pRExC_state);
8748 }
8749 else
8750 lastbr = NULL;
8751 if (c != ')')
8752 vFAIL("Switch (?(condition)... contains too many branches");
8753 ender = reg_node(pRExC_state, TAIL);
8754 REGTAIL(pRExC_state, br, ender);
8755 if (lastbr) {
8756 REGTAIL(pRExC_state, lastbr, ender);
8757 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
8758 }
8759 else
8760 REGTAIL(pRExC_state, ret, ender);
8761 RExC_size++; /* XXX WHY do we need this?!!
8762 For large programs it seems to be required
8763 but I can't figure out why. -- dmq*/
8764 return ret;
8765 }
8766 else {
8767 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
8768 }
8769 }
8770 case 0:
8771 RExC_parse--; /* for vFAIL to print correctly */
8772 vFAIL("Sequence (? incomplete");
8773 break;
8774 case DEFAULT_PAT_MOD: /* Use default flags with the exceptions
8775 that follow */
8776 has_use_defaults = TRUE;
8777 STD_PMMOD_FLAGS_CLEAR(&RExC_flags);
8778 set_regex_charset(&RExC_flags, (RExC_utf8 || RExC_uni_semantics)
8779 ? REGEX_UNICODE_CHARSET
8780 : REGEX_DEPENDS_CHARSET);
8781 goto parse_flags;
8782 default:
8783 --RExC_parse;
8784 parse_flags: /* (?i) */
8785 {
8786 U32 posflags = 0, negflags = 0;
8787 U32 *flagsp = &posflags;
8788 char has_charset_modifier = '\0';
8789 regex_charset cs = get_regex_charset(RExC_flags);
8790 if (cs == REGEX_DEPENDS_CHARSET
8791 && (RExC_utf8 || RExC_uni_semantics))
8792 {
8793 cs = REGEX_UNICODE_CHARSET;
8794 }
8795
8796 while (*RExC_parse) {
8797 /* && strchr("iogcmsx", *RExC_parse) */
8798 /* (?g), (?gc) and (?o) are useless here
8799 and must be globally applied -- japhy */
8800 switch (*RExC_parse) {
8801 CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp);
8802 case LOCALE_PAT_MOD:
8803 if (has_charset_modifier) {
8804 goto excess_modifier;
8805 }
8806 else if (flagsp == &negflags) {
8807 goto neg_modifier;
8808 }
8809 cs = REGEX_LOCALE_CHARSET;
8810 has_charset_modifier = LOCALE_PAT_MOD;
8811 RExC_contains_locale = 1;
8812 break;
8813 case UNICODE_PAT_MOD:
8814 if (has_charset_modifier) {
8815 goto excess_modifier;
8816 }
8817 else if (flagsp == &negflags) {
8818 goto neg_modifier;
8819 }
8820 cs = REGEX_UNICODE_CHARSET;
8821 has_charset_modifier = UNICODE_PAT_MOD;
8822 break;
8823 case ASCII_RESTRICT_PAT_MOD:
8824 if (flagsp == &negflags) {
8825 goto neg_modifier;
8826 }
8827 if (has_charset_modifier) {
8828 if (cs != REGEX_ASCII_RESTRICTED_CHARSET) {
8829 goto excess_modifier;
8830 }
8831 /* Doubled modifier implies more restricted */
8832 cs = REGEX_ASCII_MORE_RESTRICTED_CHARSET;
8833 }
8834 else {
8835 cs = REGEX_ASCII_RESTRICTED_CHARSET;
8836 }
8837 has_charset_modifier = ASCII_RESTRICT_PAT_MOD;
8838 break;
8839 case DEPENDS_PAT_MOD:
8840 if (has_use_defaults) {
8841 goto fail_modifiers;
8842 }
8843 else if (flagsp == &negflags) {
8844 goto neg_modifier;
8845 }
8846 else if (has_charset_modifier) {
8847 goto excess_modifier;
8848 }
8849
8850 /* The dual charset means unicode semantics if the
8851 * pattern (or target, not known until runtime) are
8852 * utf8, or something in the pattern indicates unicode
8853 * semantics */
8854 cs = (RExC_utf8 || RExC_uni_semantics)
8855 ? REGEX_UNICODE_CHARSET
8856 : REGEX_DEPENDS_CHARSET;
8857 has_charset_modifier = DEPENDS_PAT_MOD;
8858 break;
8859 excess_modifier:
8860 RExC_parse++;
8861 if (has_charset_modifier == ASCII_RESTRICT_PAT_MOD) {
8862 vFAIL2("Regexp modifier \"%c\" may appear a maximum of twice", ASCII_RESTRICT_PAT_MOD);
8863 }
8864 else if (has_charset_modifier == *(RExC_parse - 1)) {
8865 vFAIL2("Regexp modifier \"%c\" may not appear twice", *(RExC_parse - 1));
8866 }
8867 else {
8868 vFAIL3("Regexp modifiers \"%c\" and \"%c\" are mutually exclusive", has_charset_modifier, *(RExC_parse - 1));
8869 }
8870 /*NOTREACHED*/
8871 neg_modifier:
8872 RExC_parse++;
8873 vFAIL2("Regexp modifier \"%c\" may not appear after the \"-\"", *(RExC_parse - 1));
8874 /*NOTREACHED*/
8875 case ONCE_PAT_MOD: /* 'o' */
8876 case GLOBAL_PAT_MOD: /* 'g' */
8877 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
8878 const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G;
8879 if (! (wastedflags & wflagbit) ) {
8880 wastedflags |= wflagbit;
8881 vWARN5(
8882 RExC_parse + 1,
8883 "Useless (%s%c) - %suse /%c modifier",
8884 flagsp == &negflags ? "?-" : "?",
8885 *RExC_parse,
8886 flagsp == &negflags ? "don't " : "",
8887 *RExC_parse
8888 );
8889 }
8890 }
8891 break;
8892
8893 case CONTINUE_PAT_MOD: /* 'c' */
8894 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
8895 if (! (wastedflags & WASTED_C) ) {
8896 wastedflags |= WASTED_GC;
8897 vWARN3(
8898 RExC_parse + 1,
8899 "Useless (%sc) - %suse /gc modifier",
8900 flagsp == &negflags ? "?-" : "?",
8901 flagsp == &negflags ? "don't " : ""
8902 );
8903 }
8904 }
8905 break;
8906 case KEEPCOPY_PAT_MOD: /* 'p' */
8907 if (flagsp == &negflags) {
8908 if (SIZE_ONLY)
8909 ckWARNreg(RExC_parse + 1,"Useless use of (?-p)");
8910 } else {
8911 *flagsp |= RXf_PMf_KEEPCOPY;
8912 }
8913 break;
8914 case '-':
8915 /* A flag is a default iff it is following a minus, so
8916 * if there is a minus, it means will be trying to
8917 * re-specify a default which is an error */
8918 if (has_use_defaults || flagsp == &negflags) {
8919 fail_modifiers:
8920 RExC_parse++;
8921 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
8922 /*NOTREACHED*/
8923 }
8924 flagsp = &negflags;
8925 wastedflags = 0; /* reset so (?g-c) warns twice */
8926 break;
8927 case ':':
8928 paren = ':';
8929 /*FALLTHROUGH*/
8930 case ')':
8931 RExC_flags |= posflags;
8932 RExC_flags &= ~negflags;
8933 set_regex_charset(&RExC_flags, cs);
8934 if (paren != ':') {
8935 oregflags |= posflags;
8936 oregflags &= ~negflags;
8937 set_regex_charset(&oregflags, cs);
8938 }
8939 nextchar(pRExC_state);
8940 if (paren != ':') {
8941 *flagp = TRYAGAIN;
8942 return NULL;
8943 } else {
8944 ret = NULL;
8945 goto parse_rest;
8946 }
8947 /*NOTREACHED*/
8948 default:
8949 RExC_parse++;
8950 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
8951 /*NOTREACHED*/
8952 }
8953 ++RExC_parse;
8954 }
8955 }} /* one for the default block, one for the switch */
8956 }
8957 else { /* (...) */
8958 capturing_parens:
8959 parno = RExC_npar;
8960 RExC_npar++;
8961
8962 ret = reganode(pRExC_state, OPEN, parno);
8963 if (!SIZE_ONLY ){
8964 if (!RExC_nestroot)
8965 RExC_nestroot = parno;
8966 if (RExC_seen & REG_SEEN_RECURSE
8967 && !RExC_open_parens[parno-1])
8968 {
8969 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
8970 "Setting open paren #%"IVdf" to %d\n",
8971 (IV)parno, REG_NODE_NUM(ret)));
8972 RExC_open_parens[parno-1]= ret;
8973 }
8974 }
8975 Set_Node_Length(ret, 1); /* MJD */
8976 Set_Node_Offset(ret, RExC_parse); /* MJD */
8977 is_open = 1;
8978 }
8979 }
8980 else /* ! paren */
8981 ret = NULL;
8982
8983 parse_rest:
8984 /* Pick up the branches, linking them together. */
8985 parse_start = RExC_parse; /* MJD */
8986 br = regbranch(pRExC_state, &flags, 1,depth+1);
8987
8988 /* branch_len = (paren != 0); */
8989
8990 if (br == NULL)
8991 return(NULL);
8992 if (*RExC_parse == '|') {
8993 if (!SIZE_ONLY && RExC_extralen) {
8994 reginsert(pRExC_state, BRANCHJ, br, depth+1);
8995 }
8996 else { /* MJD */
8997 reginsert(pRExC_state, BRANCH, br, depth+1);
8998 Set_Node_Length(br, paren != 0);
8999 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
9000 }
9001 have_branch = 1;
9002 if (SIZE_ONLY)
9003 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
9004 }
9005 else if (paren == ':') {
9006 *flagp |= flags&SIMPLE;
9007 }
9008 if (is_open) { /* Starts with OPEN. */
9009 REGTAIL(pRExC_state, ret, br); /* OPEN -> first. */
9010 }
9011 else if (paren != '?') /* Not Conditional */
9012 ret = br;
9013 *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
9014 lastbr = br;
9015 while (*RExC_parse == '|') {
9016 if (!SIZE_ONLY && RExC_extralen) {
9017 ender = reganode(pRExC_state, LONGJMP,0);
9018 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
9019 }
9020 if (SIZE_ONLY)
9021 RExC_extralen += 2; /* Account for LONGJMP. */
9022 nextchar(pRExC_state);
9023 if (freeze_paren) {
9024 if (RExC_npar > after_freeze)
9025 after_freeze = RExC_npar;
9026 RExC_npar = freeze_paren;
9027 }
9028 br = regbranch(pRExC_state, &flags, 0, depth+1);
9029
9030 if (br == NULL)
9031 return(NULL);
9032 REGTAIL(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
9033 lastbr = br;
9034 *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
9035 }
9036
9037 if (have_branch || paren != ':') {
9038 /* Make a closing node, and hook it on the end. */
9039 switch (paren) {
9040 case ':':
9041 ender = reg_node(pRExC_state, TAIL);
9042 break;
9043 case 1:
9044 ender = reganode(pRExC_state, CLOSE, parno);
9045 if (!SIZE_ONLY && RExC_seen & REG_SEEN_RECURSE) {
9046 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
9047 "Setting close paren #%"IVdf" to %d\n",
9048 (IV)parno, REG_NODE_NUM(ender)));
9049 RExC_close_parens[parno-1]= ender;
9050 if (RExC_nestroot == parno)
9051 RExC_nestroot = 0;
9052 }
9053 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
9054 Set_Node_Length(ender,1); /* MJD */
9055 break;
9056 case '<':
9057 case ',':
9058 case '=':
9059 case '!':
9060 *flagp &= ~HASWIDTH;
9061 /* FALL THROUGH */
9062 case '>':
9063 ender = reg_node(pRExC_state, SUCCEED);
9064 break;
9065 case 0:
9066 ender = reg_node(pRExC_state, END);
9067 if (!SIZE_ONLY) {
9068 assert(!RExC_opend); /* there can only be one! */
9069 RExC_opend = ender;
9070 }
9071 break;
9072 }
9073 DEBUG_PARSE_r(if (!SIZE_ONLY) {
9074 SV * const mysv_val1=sv_newmortal();
9075 SV * const mysv_val2=sv_newmortal();
9076 DEBUG_PARSE_MSG("lsbr");
9077 regprop(RExC_rx, mysv_val1, lastbr);
9078 regprop(RExC_rx, mysv_val2, ender);
9079 PerlIO_printf(Perl_debug_log, "~ tying lastbr %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n",
9080 SvPV_nolen_const(mysv_val1),
9081 (IV)REG_NODE_NUM(lastbr),
9082 SvPV_nolen_const(mysv_val2),
9083 (IV)REG_NODE_NUM(ender),
9084 (IV)(ender - lastbr)
9085 );
9086 });
9087 REGTAIL(pRExC_state, lastbr, ender);
9088
9089 if (have_branch && !SIZE_ONLY) {
9090 char is_nothing= 1;
9091 if (depth==1)
9092 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
9093
9094 /* Hook the tails of the branches to the closing node. */
9095 for (br = ret; br; br = regnext(br)) {
9096 const U8 op = PL_regkind[OP(br)];
9097 if (op == BRANCH) {
9098 REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
9099 if (OP(NEXTOPER(br)) != NOTHING || regnext(NEXTOPER(br)) != ender)
9100 is_nothing= 0;
9101 }
9102 else if (op == BRANCHJ) {
9103 REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
9104 /* for now we always disable this optimisation * /
9105 if (OP(NEXTOPER(NEXTOPER(br))) != NOTHING || regnext(NEXTOPER(NEXTOPER(br))) != ender)
9106 */
9107 is_nothing= 0;
9108 }
9109 }
9110 if (is_nothing) {
9111 br= PL_regkind[OP(ret)] != BRANCH ? regnext(ret) : ret;
9112 DEBUG_PARSE_r(if (!SIZE_ONLY) {
9113 SV * const mysv_val1=sv_newmortal();
9114 SV * const mysv_val2=sv_newmortal();
9115 DEBUG_PARSE_MSG("NADA");
9116 regprop(RExC_rx, mysv_val1, ret);
9117 regprop(RExC_rx, mysv_val2, ender);
9118 PerlIO_printf(Perl_debug_log, "~ converting ret %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n",
9119 SvPV_nolen_const(mysv_val1),
9120 (IV)REG_NODE_NUM(ret),
9121 SvPV_nolen_const(mysv_val2),
9122 (IV)REG_NODE_NUM(ender),
9123 (IV)(ender - ret)
9124 );
9125 });
9126 OP(br)= NOTHING;
9127 if (OP(ender) == TAIL) {
9128 NEXT_OFF(br)= 0;
9129 RExC_emit= br + 1;
9130 } else {
9131 regnode *opt;
9132 for ( opt= br + 1; opt < ender ; opt++ )
9133 OP(opt)= OPTIMIZED;
9134 NEXT_OFF(br)= ender - br;
9135 }
9136 }
9137 }
9138 }
9139
9140 {
9141 const char *p;
9142 static const char parens[] = "=!<,>";
9143
9144 if (paren && (p = strchr(parens, paren))) {
9145 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
9146 int flag = (p - parens) > 1;
9147
9148 if (paren == '>')
9149 node = SUSPEND, flag = 0;
9150 reginsert(pRExC_state, node,ret, depth+1);
9151 Set_Node_Cur_Length(ret);
9152 Set_Node_Offset(ret, parse_start + 1);
9153 ret->flags = flag;
9154 REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
9155 }
9156 }
9157
9158 /* Check for proper termination. */
9159 if (paren) {
9160 RExC_flags = oregflags;
9161 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
9162 RExC_parse = oregcomp_parse;
9163 vFAIL("Unmatched (");
9164 }
9165 }
9166 else if (!paren && RExC_parse < RExC_end) {
9167 if (*RExC_parse == ')') {
9168 RExC_parse++;
9169 vFAIL("Unmatched )");
9170 }
9171 else
9172 FAIL("Junk on end of regexp"); /* "Can't happen". */
9173 assert(0); /* NOTREACHED */
9174 }
9175
9176 if (RExC_in_lookbehind) {
9177 RExC_in_lookbehind--;
9178 }
9179 if (after_freeze > RExC_npar)
9180 RExC_npar = after_freeze;
9181 return(ret);
9182}
9183
9184/*
9185 - regbranch - one alternative of an | operator
9186 *
9187 * Implements the concatenation operator.
9188 */
9189STATIC regnode *
9190S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
9191{
9192 dVAR;
9193 register regnode *ret;
9194 register regnode *chain = NULL;
9195 register regnode *latest;
9196 I32 flags = 0, c = 0;
9197 GET_RE_DEBUG_FLAGS_DECL;
9198
9199 PERL_ARGS_ASSERT_REGBRANCH;
9200
9201 DEBUG_PARSE("brnc");
9202
9203 if (first)
9204 ret = NULL;
9205 else {
9206 if (!SIZE_ONLY && RExC_extralen)
9207 ret = reganode(pRExC_state, BRANCHJ,0);
9208 else {
9209 ret = reg_node(pRExC_state, BRANCH);
9210 Set_Node_Length(ret, 1);
9211 }
9212 }
9213
9214 if (!first && SIZE_ONLY)
9215 RExC_extralen += 1; /* BRANCHJ */
9216
9217 *flagp = WORST; /* Tentatively. */
9218
9219 RExC_parse--;
9220 nextchar(pRExC_state);
9221 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
9222 flags &= ~TRYAGAIN;
9223 latest = regpiece(pRExC_state, &flags,depth+1);
9224 if (latest == NULL) {
9225 if (flags & TRYAGAIN)
9226 continue;
9227 return(NULL);
9228 }
9229 else if (ret == NULL)
9230 ret = latest;
9231 *flagp |= flags&(HASWIDTH|POSTPONED);
9232 if (chain == NULL) /* First piece. */
9233 *flagp |= flags&SPSTART;
9234 else {
9235 RExC_naughty++;
9236 REGTAIL(pRExC_state, chain, latest);
9237 }
9238 chain = latest;
9239 c++;
9240 }
9241 if (chain == NULL) { /* Loop ran zero times. */
9242 chain = reg_node(pRExC_state, NOTHING);
9243 if (ret == NULL)
9244 ret = chain;
9245 }
9246 if (c == 1) {
9247 *flagp |= flags&SIMPLE;
9248 }
9249
9250 return ret;
9251}
9252
9253/*
9254 - regpiece - something followed by possible [*+?]
9255 *
9256 * Note that the branching code sequences used for ? and the general cases
9257 * of * and + are somewhat optimized: they use the same NOTHING node as
9258 * both the endmarker for their branch list and the body of the last branch.
9259 * It might seem that this node could be dispensed with entirely, but the
9260 * endmarker role is not redundant.
9261 */
9262STATIC regnode *
9263S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
9264{
9265 dVAR;
9266 register regnode *ret;
9267 register char op;
9268 register char *next;
9269 I32 flags;
9270 const char * const origparse = RExC_parse;
9271 I32 min;
9272 I32 max = REG_INFTY;
9273#ifdef RE_TRACK_PATTERN_OFFSETS
9274 char *parse_start;
9275#endif
9276 const char *maxpos = NULL;
9277 GET_RE_DEBUG_FLAGS_DECL;
9278
9279 PERL_ARGS_ASSERT_REGPIECE;
9280
9281 DEBUG_PARSE("piec");
9282
9283 ret = regatom(pRExC_state, &flags,depth+1);
9284 if (ret == NULL) {
9285 if (flags & TRYAGAIN)
9286 *flagp |= TRYAGAIN;
9287 return(NULL);
9288 }
9289
9290 op = *RExC_parse;
9291
9292 if (op == '{' && regcurly(RExC_parse)) {
9293 maxpos = NULL;
9294#ifdef RE_TRACK_PATTERN_OFFSETS
9295 parse_start = RExC_parse; /* MJD */
9296#endif
9297 next = RExC_parse + 1;
9298 while (isDIGIT(*next) || *next == ',') {
9299 if (*next == ',') {
9300 if (maxpos)
9301 break;
9302 else
9303 maxpos = next;
9304 }
9305 next++;
9306 }
9307 if (*next == '}') { /* got one */
9308 if (!maxpos)
9309 maxpos = next;
9310 RExC_parse++;
9311 min = atoi(RExC_parse);
9312 if (*maxpos == ',')
9313 maxpos++;
9314 else
9315 maxpos = RExC_parse;
9316 max = atoi(maxpos);
9317 if (!max && *maxpos != '0')
9318 max = REG_INFTY; /* meaning "infinity" */
9319 else if (max >= REG_INFTY)
9320 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
9321 RExC_parse = next;
9322 nextchar(pRExC_state);
9323
9324 do_curly:
9325 if ((flags&SIMPLE)) {
9326 RExC_naughty += 2 + RExC_naughty / 2;
9327 reginsert(pRExC_state, CURLY, ret, depth+1);
9328 Set_Node_Offset(ret, parse_start+1); /* MJD */
9329 Set_Node_Cur_Length(ret);
9330 }
9331 else {
9332 regnode * const w = reg_node(pRExC_state, WHILEM);
9333
9334 w->flags = 0;
9335 REGTAIL(pRExC_state, ret, w);
9336 if (!SIZE_ONLY && RExC_extralen) {
9337 reginsert(pRExC_state, LONGJMP,ret, depth+1);
9338 reginsert(pRExC_state, NOTHING,ret, depth+1);
9339 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
9340 }
9341 reginsert(pRExC_state, CURLYX,ret, depth+1);
9342 /* MJD hk */
9343 Set_Node_Offset(ret, parse_start+1);
9344 Set_Node_Length(ret,
9345 op == '{' ? (RExC_parse - parse_start) : 1);
9346
9347 if (!SIZE_ONLY && RExC_extralen)
9348 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
9349 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
9350 if (SIZE_ONLY)
9351 RExC_whilem_seen++, RExC_extralen += 3;
9352 RExC_naughty += 4 + RExC_naughty; /* compound interest */
9353 }
9354 ret->flags = 0;
9355
9356 if (min > 0)
9357 *flagp = WORST;
9358 if (max > 0)
9359 *flagp |= HASWIDTH;
9360 if (max < min)
9361 vFAIL("Can't do {n,m} with n > m");
9362 if (!SIZE_ONLY) {
9363 ARG1_SET(ret, (U16)min);
9364 ARG2_SET(ret, (U16)max);
9365 }
9366
9367 goto nest_check;
9368 }
9369 }
9370
9371 if (!ISMULT1(op)) {
9372 *flagp = flags;
9373 return(ret);
9374 }
9375
9376#if 0 /* Now runtime fix should be reliable. */
9377
9378 /* if this is reinstated, don't forget to put this back into perldiag:
9379
9380 =item Regexp *+ operand could be empty at {#} in regex m/%s/
9381
9382 (F) The part of the regexp subject to either the * or + quantifier
9383 could match an empty string. The {#} shows in the regular
9384 expression about where the problem was discovered.
9385
9386 */
9387
9388 if (!(flags&HASWIDTH) && op != '?')
9389 vFAIL("Regexp *+ operand could be empty");
9390#endif
9391
9392#ifdef RE_TRACK_PATTERN_OFFSETS
9393 parse_start = RExC_parse;
9394#endif
9395 nextchar(pRExC_state);
9396
9397 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
9398
9399 if (op == '*' && (flags&SIMPLE)) {
9400 reginsert(pRExC_state, STAR, ret, depth+1);
9401 ret->flags = 0;
9402 RExC_naughty += 4;
9403 }
9404 else if (op == '*') {
9405 min = 0;
9406 goto do_curly;
9407 }
9408 else if (op == '+' && (flags&SIMPLE)) {
9409 reginsert(pRExC_state, PLUS, ret, depth+1);
9410 ret->flags = 0;
9411 RExC_naughty += 3;
9412 }
9413 else if (op == '+') {
9414 min = 1;
9415 goto do_curly;
9416 }
9417 else if (op == '?') {
9418 min = 0; max = 1;
9419 goto do_curly;
9420 }
9421 nest_check:
9422 if (!SIZE_ONLY && !(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3) {
9423 ckWARN3reg(RExC_parse,
9424 "%.*s matches null string many times",
9425 (int)(RExC_parse >= origparse ? RExC_parse - origparse : 0),
9426 origparse);
9427 }
9428
9429 if (RExC_parse < RExC_end && *RExC_parse == '?') {
9430 nextchar(pRExC_state);
9431 reginsert(pRExC_state, MINMOD, ret, depth+1);
9432 REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
9433 }
9434#ifndef REG_ALLOW_MINMOD_SUSPEND
9435 else
9436#endif
9437 if (RExC_parse < RExC_end && *RExC_parse == '+') {
9438 regnode *ender;
9439 nextchar(pRExC_state);
9440 ender = reg_node(pRExC_state, SUCCEED);
9441 REGTAIL(pRExC_state, ret, ender);
9442 reginsert(pRExC_state, SUSPEND, ret, depth+1);
9443 ret->flags = 0;
9444 ender = reg_node(pRExC_state, TAIL);
9445 REGTAIL(pRExC_state, ret, ender);
9446 /*ret= ender;*/
9447 }
9448
9449 if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) {
9450 RExC_parse++;
9451 vFAIL("Nested quantifiers");
9452 }
9453
9454 return(ret);
9455}
9456
9457
9458/* reg_namedseq(pRExC_state,UVp, UV depth)
9459
9460 This is expected to be called by a parser routine that has
9461 recognized '\N' and needs to handle the rest. RExC_parse is
9462 expected to point at the first char following the N at the time
9463 of the call.
9464
9465 The \N may be inside (indicated by valuep not being NULL) or outside a
9466 character class.
9467
9468 \N may begin either a named sequence, or if outside a character class, mean
9469 to match a non-newline. For non single-quoted regexes, the tokenizer has
9470 attempted to decide which, and in the case of a named sequence converted it
9471 into one of the forms: \N{} (if the sequence is null), or \N{U+c1.c2...},
9472 where c1... are the characters in the sequence. For single-quoted regexes,
9473 the tokenizer passes the \N sequence through unchanged; this code will not
9474 attempt to determine this nor expand those. The net effect is that if the
9475 beginning of the passed-in pattern isn't '{U+' or there is no '}', it
9476 signals that this \N occurrence means to match a non-newline.
9477
9478 Only the \N{U+...} form should occur in a character class, for the same
9479 reason that '.' inside a character class means to just match a period: it
9480 just doesn't make sense.
9481
9482 If valuep is non-null then it is assumed that we are parsing inside
9483 of a charclass definition and the first codepoint in the resolved
9484 string is returned via *valuep and the routine will return NULL.
9485 In this mode if a multichar string is returned from the charnames
9486 handler, a warning will be issued, and only the first char in the
9487 sequence will be examined. If the string returned is zero length
9488 then the value of *valuep is undefined and NON-NULL will
9489 be returned to indicate failure. (This will NOT be a valid pointer
9490 to a regnode.)
9491
9492 If valuep is null then it is assumed that we are parsing normal text and a
9493 new EXACT node is inserted into the program containing the resolved string,
9494 and a pointer to the new node is returned. But if the string is zero length
9495 a NOTHING node is emitted instead.
9496
9497 On success RExC_parse is set to the char following the endbrace.
9498 Parsing failures will generate a fatal error via vFAIL(...)
9499 */
9500STATIC regnode *
9501S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep, I32 *flagp, U32 depth)
9502{
9503 char * endbrace; /* '}' following the name */
9504 regnode *ret = NULL;
9505 char* p;
9506
9507 GET_RE_DEBUG_FLAGS_DECL;
9508
9509 PERL_ARGS_ASSERT_REG_NAMEDSEQ;
9510
9511 GET_RE_DEBUG_FLAGS;
9512
9513 /* The [^\n] meaning of \N ignores spaces and comments under the /x
9514 * modifier. The other meaning does not */
9515 p = (RExC_flags & RXf_PMf_EXTENDED)
9516 ? regwhite( pRExC_state, RExC_parse )
9517 : RExC_parse;
9518
9519 /* Disambiguate between \N meaning a named character versus \N meaning
9520 * [^\n]. The former is assumed when it can't be the latter. */
9521 if (*p != '{' || regcurly(p)) {
9522 RExC_parse = p;
9523 if (valuep) {
9524 /* no bare \N in a charclass */
9525 vFAIL("\\N in a character class must be a named character: \\N{...}");
9526 }
9527 nextchar(pRExC_state);
9528 ret = reg_node(pRExC_state, REG_ANY);
9529 *flagp |= HASWIDTH|SIMPLE;
9530 RExC_naughty++;
9531 RExC_parse--;
9532 Set_Node_Length(ret, 1); /* MJD */
9533 return ret;
9534 }
9535
9536 /* Here, we have decided it should be a named sequence */
9537
9538 /* The test above made sure that the next real character is a '{', but
9539 * under the /x modifier, it could be separated by space (or a comment and
9540 * \n) and this is not allowed (for consistency with \x{...} and the
9541 * tokenizer handling of \N{NAME}). */
9542 if (*RExC_parse != '{') {
9543 vFAIL("Missing braces on \\N{}");
9544 }
9545
9546 RExC_parse++; /* Skip past the '{' */
9547
9548 if (! (endbrace = strchr(RExC_parse, '}')) /* no trailing brace */
9549 || ! (endbrace == RExC_parse /* nothing between the {} */
9550 || (endbrace - RExC_parse >= 2 /* U+ (bad hex is checked below */
9551 && strnEQ(RExC_parse, "U+", 2)))) /* for a better error msg) */
9552 {
9553 if (endbrace) RExC_parse = endbrace; /* position msg's '<--HERE' */
9554 vFAIL("\\N{NAME} must be resolved by the lexer");
9555 }
9556
9557 if (endbrace == RExC_parse) { /* empty: \N{} */
9558 if (! valuep) {
9559 RExC_parse = endbrace + 1;
9560 return reg_node(pRExC_state,NOTHING);
9561 }
9562
9563 if (SIZE_ONLY) {
9564 ckWARNreg(RExC_parse,
9565 "Ignoring zero length \\N{} in character class"
9566 );
9567 RExC_parse = endbrace + 1;
9568 }
9569 *valuep = 0;
9570 return (regnode *) &RExC_parse; /* Invalid regnode pointer */
9571 }
9572
9573 REQUIRE_UTF8; /* named sequences imply Unicode semantics */
9574 RExC_parse += 2; /* Skip past the 'U+' */
9575
9576 if (valuep) { /* In a bracketed char class */
9577 /* We only pay attention to the first char of
9578 multichar strings being returned. I kinda wonder
9579 if this makes sense as it does change the behaviour
9580 from earlier versions, OTOH that behaviour was broken
9581 as well. XXX Solution is to recharacterize as
9582 [rest-of-class]|multi1|multi2... */
9583
9584 STRLEN length_of_hex;
9585 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
9586 | PERL_SCAN_DISALLOW_PREFIX
9587 | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
9588
9589 char * endchar = RExC_parse + strcspn(RExC_parse, ".}");
9590 if (endchar < endbrace) {
9591 ckWARNreg(endchar, "Using just the first character returned by \\N{} in character class");
9592 }
9593
9594 length_of_hex = (STRLEN)(endchar - RExC_parse);
9595 *valuep = grok_hex(RExC_parse, &length_of_hex, &flags, NULL);
9596
9597 /* The tokenizer should have guaranteed validity, but it's possible to
9598 * bypass it by using single quoting, so check */
9599 if (length_of_hex == 0
9600 || length_of_hex != (STRLEN)(endchar - RExC_parse) )
9601 {
9602 RExC_parse += length_of_hex; /* Includes all the valid */
9603 RExC_parse += (RExC_orig_utf8) /* point to after 1st invalid */
9604 ? UTF8SKIP(RExC_parse)
9605 : 1;
9606 /* Guard against malformed utf8 */
9607 if (RExC_parse >= endchar) RExC_parse = endchar;
9608 vFAIL("Invalid hexadecimal number in \\N{U+...}");
9609 }
9610
9611 RExC_parse = endbrace + 1;
9612 if (endchar == endbrace) return NULL;
9613
9614 ret = (regnode *) &RExC_parse; /* Invalid regnode pointer */
9615 }
9616 else { /* Not a char class */
9617
9618 /* What is done here is to convert this to a sub-pattern of the form
9619 * (?:\x{char1}\x{char2}...)
9620 * and then call reg recursively. That way, it retains its atomicness,
9621 * while not having to worry about special handling that some code
9622 * points may have. toke.c has converted the original Unicode values
9623 * to native, so that we can just pass on the hex values unchanged. We
9624 * do have to set a flag to keep recoding from happening in the
9625 * recursion */
9626
9627 SV * substitute_parse = newSVpvn_flags("?:", 2, SVf_UTF8|SVs_TEMP);
9628 STRLEN len;
9629 char *endchar; /* Points to '.' or '}' ending cur char in the input
9630 stream */
9631 char *orig_end = RExC_end;
9632
9633 while (RExC_parse < endbrace) {
9634
9635 /* Code points are separated by dots. If none, there is only one
9636 * code point, and is terminated by the brace */
9637 endchar = RExC_parse + strcspn(RExC_parse, ".}");
9638
9639 /* Convert to notation the rest of the code understands */
9640 sv_catpv(substitute_parse, "\\x{");
9641 sv_catpvn(substitute_parse, RExC_parse, endchar - RExC_parse);
9642 sv_catpv(substitute_parse, "}");
9643
9644 /* Point to the beginning of the next character in the sequence. */
9645 RExC_parse = endchar + 1;
9646 }
9647 sv_catpv(substitute_parse, ")");
9648
9649 RExC_parse = SvPV(substitute_parse, len);
9650
9651 /* Don't allow empty number */
9652 if (len < 8) {
9653 vFAIL("Invalid hexadecimal number in \\N{U+...}");
9654 }
9655 RExC_end = RExC_parse + len;
9656
9657 /* The values are Unicode, and therefore not subject to recoding */
9658 RExC_override_recoding = 1;
9659
9660 ret = reg(pRExC_state, 1, flagp, depth+1);
9661
9662 RExC_parse = endbrace;
9663 RExC_end = orig_end;
9664 RExC_override_recoding = 0;
9665
9666 nextchar(pRExC_state);
9667 }
9668
9669 return ret;
9670}
9671
9672
9673/*
9674 * reg_recode
9675 *
9676 * It returns the code point in utf8 for the value in *encp.
9677 * value: a code value in the source encoding
9678 * encp: a pointer to an Encode object
9679 *
9680 * If the result from Encode is not a single character,
9681 * it returns U+FFFD (Replacement character) and sets *encp to NULL.
9682 */
9683STATIC UV
9684S_reg_recode(pTHX_ const char value, SV **encp)
9685{
9686 STRLEN numlen = 1;
9687 SV * const sv = newSVpvn_flags(&value, numlen, SVs_TEMP);
9688 const char * const s = *encp ? sv_recode_to_utf8(sv, *encp) : SvPVX(sv);
9689 const STRLEN newlen = SvCUR(sv);
9690 UV uv = UNICODE_REPLACEMENT;
9691
9692 PERL_ARGS_ASSERT_REG_RECODE;
9693
9694 if (newlen)
9695 uv = SvUTF8(sv)
9696 ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT)
9697 : *(U8*)s;
9698
9699 if (!newlen || numlen != newlen) {
9700 uv = UNICODE_REPLACEMENT;
9701 *encp = NULL;
9702 }
9703 return uv;
9704}
9705
9706
9707/*
9708 - regatom - the lowest level
9709
9710 Try to identify anything special at the start of the pattern. If there
9711 is, then handle it as required. This may involve generating a single regop,
9712 such as for an assertion; or it may involve recursing, such as to
9713 handle a () structure.
9714
9715 If the string doesn't start with something special then we gobble up
9716 as much literal text as we can.
9717
9718 Once we have been able to handle whatever type of thing started the
9719 sequence, we return.
9720
9721 Note: we have to be careful with escapes, as they can be both literal
9722 and special, and in the case of \10 and friends, context determines which.
9723
9724 A summary of the code structure is:
9725
9726 switch (first_byte) {
9727 cases for each special:
9728 handle this special;
9729 break;
9730 case '\\':
9731 switch (2nd byte) {
9732 cases for each unambiguous special:
9733 handle this special;
9734 break;
9735 cases for each ambigous special/literal:
9736 disambiguate;
9737 if (special) handle here
9738 else goto defchar;
9739 default: // unambiguously literal:
9740 goto defchar;
9741 }
9742 default: // is a literal char
9743 // FALL THROUGH
9744 defchar:
9745 create EXACTish node for literal;
9746 while (more input and node isn't full) {
9747 switch (input_byte) {
9748 cases for each special;
9749 make sure parse pointer is set so that the next call to
9750 regatom will see this special first
9751 goto loopdone; // EXACTish node terminated by prev. char
9752 default:
9753 append char to EXACTISH node;
9754 }
9755 get next input byte;
9756 }
9757 loopdone:
9758 }
9759 return the generated node;
9760
9761 Specifically there are two separate switches for handling
9762 escape sequences, with the one for handling literal escapes requiring
9763 a dummy entry for all of the special escapes that are actually handled
9764 by the other.
9765*/
9766
9767STATIC regnode *
9768S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
9769{
9770 dVAR;
9771 register regnode *ret = NULL;
9772 I32 flags;
9773 char *parse_start = RExC_parse;
9774 U8 op;
9775 GET_RE_DEBUG_FLAGS_DECL;
9776 DEBUG_PARSE("atom");
9777 *flagp = WORST; /* Tentatively. */
9778
9779 PERL_ARGS_ASSERT_REGATOM;
9780
9781tryagain:
9782 switch ((U8)*RExC_parse) {
9783 case '^':
9784 RExC_seen_zerolen++;
9785 nextchar(pRExC_state);
9786 if (RExC_flags & RXf_PMf_MULTILINE)
9787 ret = reg_node(pRExC_state, MBOL);
9788 else if (RExC_flags & RXf_PMf_SINGLELINE)
9789 ret = reg_node(pRExC_state, SBOL);
9790 else
9791 ret = reg_node(pRExC_state, BOL);
9792 Set_Node_Length(ret, 1); /* MJD */
9793 break;
9794 case '$':
9795 nextchar(pRExC_state);
9796 if (*RExC_parse)
9797 RExC_seen_zerolen++;
9798 if (RExC_flags & RXf_PMf_MULTILINE)
9799 ret = reg_node(pRExC_state, MEOL);
9800 else if (RExC_flags & RXf_PMf_SINGLELINE)
9801 ret = reg_node(pRExC_state, SEOL);
9802 else
9803 ret = reg_node(pRExC_state, EOL);
9804 Set_Node_Length(ret, 1); /* MJD */
9805 break;
9806 case '.':
9807 nextchar(pRExC_state);
9808 if (RExC_flags & RXf_PMf_SINGLELINE)
9809 ret = reg_node(pRExC_state, SANY);
9810 else
9811 ret = reg_node(pRExC_state, REG_ANY);
9812 *flagp |= HASWIDTH|SIMPLE;
9813 RExC_naughty++;
9814 Set_Node_Length(ret, 1); /* MJD */
9815 break;
9816 case '[':
9817 {
9818 char * const oregcomp_parse = ++RExC_parse;
9819 ret = regclass(pRExC_state,depth+1);
9820 if (*RExC_parse != ']') {
9821 RExC_parse = oregcomp_parse;
9822 vFAIL("Unmatched [");
9823 }
9824 nextchar(pRExC_state);
9825 *flagp |= HASWIDTH|SIMPLE;
9826 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
9827 break;
9828 }
9829 case '(':
9830 nextchar(pRExC_state);
9831 ret = reg(pRExC_state, 1, &flags,depth+1);
9832 if (ret == NULL) {
9833 if (flags & TRYAGAIN) {
9834 if (RExC_parse == RExC_end) {
9835 /* Make parent create an empty node if needed. */
9836 *flagp |= TRYAGAIN;
9837 return(NULL);
9838 }
9839 goto tryagain;
9840 }
9841 return(NULL);
9842 }
9843 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
9844 break;
9845 case '|':
9846 case ')':
9847 if (flags & TRYAGAIN) {
9848 *flagp |= TRYAGAIN;
9849 return NULL;
9850 }
9851 vFAIL("Internal urp");
9852 /* Supposed to be caught earlier. */
9853 break;
9854 case '?':
9855 case '+':
9856 case '*':
9857 RExC_parse++;
9858 vFAIL("Quantifier follows nothing");
9859 break;
9860 case '\\':
9861 /* Special Escapes
9862
9863 This switch handles escape sequences that resolve to some kind
9864 of special regop and not to literal text. Escape sequnces that
9865 resolve to literal text are handled below in the switch marked
9866 "Literal Escapes".
9867
9868 Every entry in this switch *must* have a corresponding entry
9869 in the literal escape switch. However, the opposite is not
9870 required, as the default for this switch is to jump to the
9871 literal text handling code.
9872 */
9873 switch ((U8)*++RExC_parse) {
9874 /* Special Escapes */
9875 case 'A':
9876 RExC_seen_zerolen++;
9877 ret = reg_node(pRExC_state, SBOL);
9878 *flagp |= SIMPLE;
9879 goto finish_meta_pat;
9880 case 'G':
9881 ret = reg_node(pRExC_state, GPOS);
9882 RExC_seen |= REG_SEEN_GPOS;
9883 *flagp |= SIMPLE;
9884 goto finish_meta_pat;
9885 case 'K':
9886 RExC_seen_zerolen++;
9887 ret = reg_node(pRExC_state, KEEPS);
9888 *flagp |= SIMPLE;
9889 /* XXX:dmq : disabling in-place substitution seems to
9890 * be necessary here to avoid cases of memory corruption, as
9891 * with: C<$_="x" x 80; s/x\K/y/> -- rgs
9892 */
9893 RExC_seen |= REG_SEEN_LOOKBEHIND;
9894 goto finish_meta_pat;
9895 case 'Z':
9896 ret = reg_node(pRExC_state, SEOL);
9897 *flagp |= SIMPLE;
9898 RExC_seen_zerolen++; /* Do not optimize RE away */
9899 goto finish_meta_pat;
9900 case 'z':
9901 ret = reg_node(pRExC_state, EOS);
9902 *flagp |= SIMPLE;
9903 RExC_seen_zerolen++; /* Do not optimize RE away */
9904 goto finish_meta_pat;
9905 case 'C':
9906 ret = reg_node(pRExC_state, CANY);
9907 RExC_seen |= REG_SEEN_CANY;
9908 *flagp |= HASWIDTH|SIMPLE;
9909 goto finish_meta_pat;
9910 case 'X':
9911 ret = reg_node(pRExC_state, CLUMP);
9912 *flagp |= HASWIDTH;
9913 goto finish_meta_pat;
9914 case 'w':
9915 op = ALNUM + get_regex_charset(RExC_flags);
9916 if (op > ALNUMA) { /* /aa is same as /a */
9917 op = ALNUMA;
9918 }
9919 ret = reg_node(pRExC_state, op);
9920 *flagp |= HASWIDTH|SIMPLE;
9921 goto finish_meta_pat;
9922 case 'W':
9923 op = NALNUM + get_regex_charset(RExC_flags);
9924 if (op > NALNUMA) { /* /aa is same as /a */
9925 op = NALNUMA;
9926 }
9927 ret = reg_node(pRExC_state, op);
9928 *flagp |= HASWIDTH|SIMPLE;
9929 goto finish_meta_pat;
9930 case 'b':
9931 RExC_seen_zerolen++;
9932 RExC_seen |= REG_SEEN_LOOKBEHIND;
9933 op = BOUND + get_regex_charset(RExC_flags);
9934 if (op > BOUNDA) { /* /aa is same as /a */
9935 op = BOUNDA;
9936 }
9937 ret = reg_node(pRExC_state, op);
9938 FLAGS(ret) = get_regex_charset(RExC_flags);
9939 *flagp |= SIMPLE;
9940 goto finish_meta_pat;
9941 case 'B':
9942 RExC_seen_zerolen++;
9943 RExC_seen |= REG_SEEN_LOOKBEHIND;
9944 op = NBOUND + get_regex_charset(RExC_flags);
9945 if (op > NBOUNDA) { /* /aa is same as /a */
9946 op = NBOUNDA;
9947 }
9948 ret = reg_node(pRExC_state, op);
9949 FLAGS(ret) = get_regex_charset(RExC_flags);
9950 *flagp |= SIMPLE;
9951 goto finish_meta_pat;
9952 case 's':
9953 op = SPACE + get_regex_charset(RExC_flags);
9954 if (op > SPACEA) { /* /aa is same as /a */
9955 op = SPACEA;
9956 }
9957 ret = reg_node(pRExC_state, op);
9958 *flagp |= HASWIDTH|SIMPLE;
9959 goto finish_meta_pat;
9960 case 'S':
9961 op = NSPACE + get_regex_charset(RExC_flags);
9962 if (op > NSPACEA) { /* /aa is same as /a */
9963 op = NSPACEA;
9964 }
9965 ret = reg_node(pRExC_state, op);
9966 *flagp |= HASWIDTH|SIMPLE;
9967 goto finish_meta_pat;
9968 case 'D':
9969 op = NDIGIT;
9970 goto join_D_and_d;
9971 case 'd':
9972 op = DIGIT;
9973 join_D_and_d:
9974 {
9975 U8 offset = get_regex_charset(RExC_flags);
9976 if (offset == REGEX_UNICODE_CHARSET) {
9977 offset = REGEX_DEPENDS_CHARSET;
9978 }
9979 else if (offset == REGEX_ASCII_MORE_RESTRICTED_CHARSET) {
9980 offset = REGEX_ASCII_RESTRICTED_CHARSET;
9981 }
9982 op += offset;
9983 }
9984 ret = reg_node(pRExC_state, op);
9985 *flagp |= HASWIDTH|SIMPLE;
9986 goto finish_meta_pat;
9987 case 'R':
9988 ret = reg_node(pRExC_state, LNBREAK);
9989 *flagp |= HASWIDTH|SIMPLE;
9990 goto finish_meta_pat;
9991 case 'h':
9992 ret = reg_node(pRExC_state, HORIZWS);
9993 *flagp |= HASWIDTH|SIMPLE;
9994 goto finish_meta_pat;
9995 case 'H':
9996 ret = reg_node(pRExC_state, NHORIZWS);
9997 *flagp |= HASWIDTH|SIMPLE;
9998 goto finish_meta_pat;
9999 case 'v':
10000 ret = reg_node(pRExC_state, VERTWS);
10001 *flagp |= HASWIDTH|SIMPLE;
10002 goto finish_meta_pat;
10003 case 'V':
10004 ret = reg_node(pRExC_state, NVERTWS);
10005 *flagp |= HASWIDTH|SIMPLE;
10006 finish_meta_pat:
10007 nextchar(pRExC_state);
10008 Set_Node_Length(ret, 2); /* MJD */
10009 break;
10010 case 'p':
10011 case 'P':
10012 {
10013 char* const oldregxend = RExC_end;
10014#ifdef DEBUGGING
10015 char* parse_start = RExC_parse - 2;
10016#endif
10017
10018 if (RExC_parse[1] == '{') {
10019 /* a lovely hack--pretend we saw [\pX] instead */
10020 RExC_end = strchr(RExC_parse, '}');
10021 if (!RExC_end) {
10022 const U8 c = (U8)*RExC_parse;
10023 RExC_parse += 2;
10024 RExC_end = oldregxend;
10025 vFAIL2("Missing right brace on \\%c{}", c);
10026 }
10027 RExC_end++;
10028 }
10029 else {
10030 RExC_end = RExC_parse + 2;
10031 if (RExC_end > oldregxend)
10032 RExC_end = oldregxend;
10033 }
10034 RExC_parse--;
10035
10036 ret = regclass(pRExC_state,depth+1);
10037
10038 RExC_end = oldregxend;
10039 RExC_parse--;
10040
10041 Set_Node_Offset(ret, parse_start + 2);
10042 Set_Node_Cur_Length(ret);
10043 nextchar(pRExC_state);
10044 *flagp |= HASWIDTH|SIMPLE;
10045 }
10046 break;
10047 case 'N':
10048 /* Handle \N and \N{NAME} here and not below because it can be
10049 multicharacter. join_exact() will join them up later on.
10050 Also this makes sure that things like /\N{BLAH}+/ and
10051 \N{BLAH} being multi char Just Happen. dmq*/
10052 ++RExC_parse;
10053 ret= reg_namedseq(pRExC_state, NULL, flagp, depth);
10054 break;
10055 case 'k': /* Handle \k<NAME> and \k'NAME' */
10056 parse_named_seq:
10057 {
10058 char ch= RExC_parse[1];
10059 if (ch != '<' && ch != '\'' && ch != '{') {
10060 RExC_parse++;
10061 vFAIL2("Sequence %.2s... not terminated",parse_start);
10062 } else {
10063 /* this pretty much dupes the code for (?P=...) in reg(), if
10064 you change this make sure you change that */
10065 char* name_start = (RExC_parse += 2);
10066 U32 num = 0;
10067 SV *sv_dat = reg_scan_name(pRExC_state,
10068 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
10069 ch= (ch == '<') ? '>' : (ch == '{') ? '}' : '\'';
10070 if (RExC_parse == name_start || *RExC_parse != ch)
10071 vFAIL2("Sequence %.3s... not terminated",parse_start);
10072
10073 if (!SIZE_ONLY) {
10074 num = add_data( pRExC_state, 1, "S" );
10075 RExC_rxi->data->data[num]=(void*)sv_dat;
10076 SvREFCNT_inc_simple_void(sv_dat);
10077 }
10078
10079 RExC_sawback = 1;
10080 ret = reganode(pRExC_state,
10081 ((! FOLD)
10082 ? NREF
10083 : (MORE_ASCII_RESTRICTED)
10084 ? NREFFA
10085 : (AT_LEAST_UNI_SEMANTICS)
10086 ? NREFFU
10087 : (LOC)
10088 ? NREFFL
10089 : NREFF),
10090 num);
10091 *flagp |= HASWIDTH;
10092
10093 /* override incorrect value set in reganode MJD */
10094 Set_Node_Offset(ret, parse_start+1);
10095 Set_Node_Cur_Length(ret); /* MJD */
10096 nextchar(pRExC_state);
10097
10098 }
10099 break;
10100 }
10101 case 'g':
10102 case '1': case '2': case '3': case '4':
10103 case '5': case '6': case '7': case '8': case '9':
10104 {
10105 I32 num;
10106 bool isg = *RExC_parse == 'g';
10107 bool isrel = 0;
10108 bool hasbrace = 0;
10109 if (isg) {
10110 RExC_parse++;
10111 if (*RExC_parse == '{') {
10112 RExC_parse++;
10113 hasbrace = 1;
10114 }
10115 if (*RExC_parse == '-') {
10116 RExC_parse++;
10117 isrel = 1;
10118 }
10119 if (hasbrace && !isDIGIT(*RExC_parse)) {
10120 if (isrel) RExC_parse--;
10121 RExC_parse -= 2;
10122 goto parse_named_seq;
10123 } }
10124 num = atoi(RExC_parse);
10125 if (isg && num == 0)
10126 vFAIL("Reference to invalid group 0");
10127 if (isrel) {
10128 num = RExC_npar - num;
10129 if (num < 1)
10130 vFAIL("Reference to nonexistent or unclosed group");
10131 }
10132 if (!isg && num > 9 && num >= RExC_npar)
10133 /* Probably a character specified in octal, e.g. \35 */
10134 goto defchar;
10135 else {
10136 char * const parse_start = RExC_parse - 1; /* MJD */
10137 while (isDIGIT(*RExC_parse))
10138 RExC_parse++;
10139 if (parse_start == RExC_parse - 1)
10140 vFAIL("Unterminated \\g... pattern");
10141 if (hasbrace) {
10142 if (*RExC_parse != '}')
10143 vFAIL("Unterminated \\g{...} pattern");
10144 RExC_parse++;
10145 }
10146 if (!SIZE_ONLY) {
10147 if (num > (I32)RExC_rx->nparens)
10148 vFAIL("Reference to nonexistent group");
10149 }
10150 RExC_sawback = 1;
10151 ret = reganode(pRExC_state,
10152 ((! FOLD)
10153 ? REF
10154 : (MORE_ASCII_RESTRICTED)
10155 ? REFFA
10156 : (AT_LEAST_UNI_SEMANTICS)
10157 ? REFFU
10158 : (LOC)
10159 ? REFFL
10160 : REFF),
10161 num);
10162 *flagp |= HASWIDTH;
10163
10164 /* override incorrect value set in reganode MJD */
10165 Set_Node_Offset(ret, parse_start+1);
10166 Set_Node_Cur_Length(ret); /* MJD */
10167 RExC_parse--;
10168 nextchar(pRExC_state);
10169 }
10170 }
10171 break;
10172 case '\0':
10173 if (RExC_parse >= RExC_end)
10174 FAIL("Trailing \\");
10175 /* FALL THROUGH */
10176 default:
10177 /* Do not generate "unrecognized" warnings here, we fall
10178 back into the quick-grab loop below */
10179 parse_start--;
10180 goto defchar;
10181 }
10182 break;
10183
10184 case '#':
10185 if (RExC_flags & RXf_PMf_EXTENDED) {
10186 if ( reg_skipcomment( pRExC_state ) )
10187 goto tryagain;
10188 }
10189 /* FALL THROUGH */
10190
10191 default:
10192
10193 parse_start = RExC_parse - 1;
10194
10195 RExC_parse++;
10196
10197 defchar: {
10198 register STRLEN len;
10199 register UV ender;
10200 register char *p;
10201 char *s;
10202 STRLEN foldlen;
10203 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
10204 U8 node_type;
10205
10206 /* Is this a LATIN LOWER CASE SHARP S in an EXACTFU node? If so,
10207 * it is folded to 'ss' even if not utf8 */
10208 bool is_exactfu_sharp_s;
10209
10210 ender = 0;
10211 if (! FOLD) {
10212 node_type = EXACT;
10213 }
10214 else {
10215 node_type = get_regex_charset(RExC_flags);
10216 if (node_type >= REGEX_ASCII_RESTRICTED_CHARSET) {
10217 node_type--; /* /a is same as /u, and map /aa's offset to
10218 what /a's would have been, so there is no
10219 hole */
10220 }
10221 node_type += EXACTF;
10222 }
10223 ret = reg_node(pRExC_state, node_type);
10224 s = STRING(ret);
10225
10226 /* XXX The node can hold up to 255 bytes, yet this only goes to
10227 * 127. I (khw) do not know why. Keeping it somewhat less than
10228 * 255 allows us to not have to worry about overflow due to
10229 * converting to utf8 and fold expansion, but that value is
10230 * 255-UTF8_MAXBYTES_CASE. join_exact() may join adjacent nodes
10231 * split up by this limit into a single one using the real max of
10232 * 255. Even at 127, this breaks under rare circumstances. If
10233 * folding, we do not want to split a node at a character that is a
10234 * non-final in a multi-char fold, as an input string could just
10235 * happen to want to match across the node boundary. The join
10236 * would solve that problem if the join actually happens. But a
10237 * series of more than two nodes in a row each of 127 would cause
10238 * the first join to succeed to get to 254, but then there wouldn't
10239 * be room for the next one, which could at be one of those split
10240 * multi-char folds. I don't know of any fool-proof solution. One
10241 * could back off to end with only a code point that isn't such a
10242 * non-final, but it is possible for there not to be any in the
10243 * entire node. */
10244 for (len = 0, p = RExC_parse - 1;
10245 len < 127 && p < RExC_end;
10246 len++)
10247 {
10248 char * const oldp = p;
10249
10250 if (RExC_flags & RXf_PMf_EXTENDED)
10251 p = regwhite( pRExC_state, p );
10252 switch ((U8)*p) {
10253 case '^':
10254 case '$':
10255 case '.':
10256 case '[':
10257 case '(':
10258 case ')':
10259 case '|':
10260 goto loopdone;
10261 case '\\':
10262 /* Literal Escapes Switch
10263
10264 This switch is meant to handle escape sequences that
10265 resolve to a literal character.
10266
10267 Every escape sequence that represents something
10268 else, like an assertion or a char class, is handled
10269 in the switch marked 'Special Escapes' above in this
10270 routine, but also has an entry here as anything that
10271 isn't explicitly mentioned here will be treated as
10272 an unescaped equivalent literal.
10273 */
10274
10275 switch ((U8)*++p) {
10276 /* These are all the special escapes. */
10277 case 'A': /* Start assertion */
10278 case 'b': case 'B': /* Word-boundary assertion*/
10279 case 'C': /* Single char !DANGEROUS! */
10280 case 'd': case 'D': /* digit class */
10281 case 'g': case 'G': /* generic-backref, pos assertion */
10282 case 'h': case 'H': /* HORIZWS */
10283 case 'k': case 'K': /* named backref, keep marker */
10284 case 'N': /* named char sequence */
10285 case 'p': case 'P': /* Unicode property */
10286 case 'R': /* LNBREAK */
10287 case 's': case 'S': /* space class */
10288 case 'v': case 'V': /* VERTWS */
10289 case 'w': case 'W': /* word class */
10290 case 'X': /* eXtended Unicode "combining character sequence" */
10291 case 'z': case 'Z': /* End of line/string assertion */
10292 --p;
10293 goto loopdone;
10294
10295 /* Anything after here is an escape that resolves to a
10296 literal. (Except digits, which may or may not)
10297 */
10298 case 'n':
10299 ender = '\n';
10300 p++;
10301 break;
10302 case 'r':
10303 ender = '\r';
10304 p++;
10305 break;
10306 case 't':
10307 ender = '\t';
10308 p++;
10309 break;
10310 case 'f':
10311 ender = '\f';
10312 p++;
10313 break;
10314 case 'e':
10315 ender = ASCII_TO_NATIVE('\033');
10316 p++;
10317 break;
10318 case 'a':
10319 ender = ASCII_TO_NATIVE('\007');
10320 p++;
10321 break;
10322 case 'o':
10323 {
10324 STRLEN brace_len = len;
10325 UV result;
10326 const char* error_msg;
10327
10328 bool valid = grok_bslash_o(p,
10329 &result,
10330 &brace_len,
10331 &error_msg,
10332 1);
10333 p += brace_len;
10334 if (! valid) {
10335 RExC_parse = p; /* going to die anyway; point
10336 to exact spot of failure */
10337 vFAIL(error_msg);
10338 }
10339 else
10340 {
10341 ender = result;
10342 }
10343 if (PL_encoding && ender < 0x100) {
10344 goto recode_encoding;
10345 }
10346 if (ender > 0xff) {
10347 REQUIRE_UTF8;
10348 }
10349 break;
10350 }
10351 case 'x':
10352 {
10353 STRLEN brace_len = len;
10354 UV result;
10355 const char* error_msg;
10356
10357 bool valid = grok_bslash_x(p,
10358 &result,
10359 &brace_len,
10360 &error_msg,
10361 1);
10362 p += brace_len;
10363 if (! valid) {
10364 RExC_parse = p; /* going to die anyway; point
10365 to exact spot of failure */
10366 vFAIL(error_msg);
10367 }
10368 else {
10369 ender = result;
10370 }
10371 if (PL_encoding && ender < 0x100) {
10372 goto recode_encoding;
10373 }
10374 if (ender > 0xff) {
10375 REQUIRE_UTF8;
10376 }
10377 break;
10378 }
10379 case 'c':
10380 p++;
10381 ender = grok_bslash_c(*p++, UTF, SIZE_ONLY);
10382 break;
10383 case '0': case '1': case '2': case '3':case '4':
10384 case '5': case '6': case '7':
10385 if (*p == '0' ||
10386 (isDIGIT(p[1]) && atoi(p) >= RExC_npar))
10387 {
10388 I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
10389 STRLEN numlen = 3;
10390 ender = grok_oct(p, &numlen, &flags, NULL);
10391 if (ender > 0xff) {
10392 REQUIRE_UTF8;
10393 }
10394 p += numlen;
10395 }
10396 else {
10397 --p;
10398 goto loopdone;
10399 }
10400 if (PL_encoding && ender < 0x100)
10401 goto recode_encoding;
10402 break;
10403 recode_encoding:
10404 if (! RExC_override_recoding) {
10405 SV* enc = PL_encoding;
10406 ender = reg_recode((const char)(U8)ender, &enc);
10407 if (!enc && SIZE_ONLY)
10408 ckWARNreg(p, "Invalid escape in the specified encoding");
10409 REQUIRE_UTF8;
10410 }
10411 break;
10412 case '\0':
10413 if (p >= RExC_end)
10414 FAIL("Trailing \\");
10415 /* FALL THROUGH */
10416 default:
10417 if (!SIZE_ONLY&& isALNUMC(*p)) {
10418 ckWARN2reg(p + 1, "Unrecognized escape \\%.1s passed through", p);
10419 }
10420 goto normal_default;
10421 }
10422 break;
10423 case '{':
10424 /* Currently we don't warn when the lbrace is at the start
10425 * of a construct. This catches it in the middle of a
10426 * literal string, or when its the first thing after
10427 * something like "\b" */
10428 if (! SIZE_ONLY
10429 && (len || (p > RExC_start && isALPHA_A(*(p -1)))))
10430 {
10431 ckWARNregdep(p + 1, "Unescaped left brace in regex is deprecated, passed through");
10432 }
10433 /*FALLTHROUGH*/
10434 default:
10435 normal_default:
10436 if (UTF8_IS_START(*p) && UTF) {
10437 STRLEN numlen;
10438 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
10439 &numlen, UTF8_ALLOW_DEFAULT);
10440 p += numlen;
10441 }
10442 else
10443 ender = (U8) *p++;
10444 break;
10445 } /* End of switch on the literal */
10446
10447 is_exactfu_sharp_s = (node_type == EXACTFU
10448 && ender == LATIN_SMALL_LETTER_SHARP_S);
10449 if ( RExC_flags & RXf_PMf_EXTENDED)
10450 p = regwhite( pRExC_state, p );
10451 if ((UTF && FOLD) || is_exactfu_sharp_s) {
10452 /* Prime the casefolded buffer. Locale rules, which apply
10453 * only to code points < 256, aren't known until execution,
10454 * so for them, just output the original character using
10455 * utf8. If we start to fold non-UTF patterns, be sure to
10456 * update join_exact() */
10457 if (LOC && ender < 256) {
10458 if (UNI_IS_INVARIANT(ender)) {
10459 *tmpbuf = (U8) ender;
10460 foldlen = 1;
10461 } else {
10462 *tmpbuf = UTF8_TWO_BYTE_HI(ender);
10463 *(tmpbuf + 1) = UTF8_TWO_BYTE_LO(ender);
10464 foldlen = 2;
10465 }
10466 }
10467 else if (isASCII(ender)) { /* Note: Here can't also be LOC
10468 */
10469 ender = toLOWER(ender);
10470 *tmpbuf = (U8) ender;
10471 foldlen = 1;
10472 }
10473 else if (! MORE_ASCII_RESTRICTED && ! LOC) {
10474
10475 /* Locale and /aa require more selectivity about the
10476 * fold, so are handled below. Otherwise, here, just
10477 * use the fold */
10478 ender = toFOLD_uni(ender, tmpbuf, &foldlen);
10479 }
10480 else {
10481 /* Under locale rules or /aa we are not to mix,
10482 * respectively, ords < 256 or ASCII with non-. So
10483 * reject folds that mix them, using only the
10484 * non-folded code point. So do the fold to a
10485 * temporary, and inspect each character in it. */
10486 U8 trialbuf[UTF8_MAXBYTES_CASE+1];
10487 U8* s = trialbuf;
10488 UV tmpender = toFOLD_uni(ender, trialbuf, &foldlen);
10489 U8* e = s + foldlen;
10490 bool fold_ok = TRUE;
10491
10492 while (s < e) {
10493 if (isASCII(*s)
10494 || (LOC && (UTF8_IS_INVARIANT(*s)
10495 || UTF8_IS_DOWNGRADEABLE_START(*s))))
10496 {
10497 fold_ok = FALSE;
10498 break;
10499 }
10500 s += UTF8SKIP(s);
10501 }
10502 if (fold_ok) {
10503 Copy(trialbuf, tmpbuf, foldlen, U8);
10504 ender = tmpender;
10505 }
10506 else {
10507 uvuni_to_utf8(tmpbuf, ender);
10508 foldlen = UNISKIP(ender);
10509 }
10510 }
10511 }
10512 if (p < RExC_end && ISMULT2(p)) { /* Back off on ?+*. */
10513 if (len)
10514 p = oldp;
10515 else if (UTF || is_exactfu_sharp_s) {
10516 if (FOLD) {
10517 /* Emit all the Unicode characters. */
10518 STRLEN numlen;
10519 for (foldbuf = tmpbuf;
10520 foldlen;
10521 foldlen -= numlen) {
10522
10523 /* tmpbuf has been constructed by us, so we
10524 * know it is valid utf8 */
10525 ender = valid_utf8_to_uvchr(foldbuf, &numlen);
10526 if (numlen > 0) {
10527 const STRLEN unilen = reguni(pRExC_state, ender, s);
10528 s += unilen;
10529 len += unilen;
10530 /* In EBCDIC the numlen
10531 * and unilen can differ. */
10532 foldbuf += numlen;
10533 if (numlen >= foldlen)
10534 break;
10535 }
10536 else
10537 break; /* "Can't happen." */
10538 }
10539 }
10540 else {
10541 const STRLEN unilen = reguni(pRExC_state, ender, s);
10542 if (unilen > 0) {
10543 s += unilen;
10544 len += unilen;
10545 }
10546 }
10547 }
10548 else {
10549 len++;
10550 REGC((char)ender, s++);
10551 }
10552 break;
10553 }
10554 if (UTF || is_exactfu_sharp_s) {
10555 if (FOLD) {
10556 /* Emit all the Unicode characters. */
10557 STRLEN numlen;
10558 for (foldbuf = tmpbuf;
10559 foldlen;
10560 foldlen -= numlen) {
10561 ender = valid_utf8_to_uvchr(foldbuf, &numlen);
10562 if (numlen > 0) {
10563 const STRLEN unilen = reguni(pRExC_state, ender, s);
10564 len += unilen;
10565 s += unilen;
10566 /* In EBCDIC the numlen
10567 * and unilen can differ. */
10568 foldbuf += numlen;
10569 if (numlen >= foldlen)
10570 break;
10571 }
10572 else
10573 break;
10574 }
10575 }
10576 else {
10577 const STRLEN unilen = reguni(pRExC_state, ender, s);
10578 if (unilen > 0) {
10579 s += unilen;
10580 len += unilen;
10581 }
10582 }
10583 len--;
10584 }
10585 else {
10586 REGC((char)ender, s++);
10587 }
10588 }
10589 loopdone: /* Jumped to when encounters something that shouldn't be in
10590 the node */
10591 RExC_parse = p - 1;
10592 Set_Node_Cur_Length(ret); /* MJD */
10593 nextchar(pRExC_state);
10594 {
10595 /* len is STRLEN which is unsigned, need to copy to signed */
10596 IV iv = len;
10597 if (iv < 0)
10598 vFAIL("Internal disaster");
10599 }
10600 if (len > 0)
10601 *flagp |= HASWIDTH;
10602 if (len == 1 && UNI_IS_INVARIANT(ender))
10603 *flagp |= SIMPLE;
10604
10605 if (SIZE_ONLY)
10606 RExC_size += STR_SZ(len);
10607 else {
10608 STR_LEN(ret) = len;
10609 RExC_emit += STR_SZ(len);
10610 }
10611 }
10612 break;
10613 }
10614
10615 return(ret);
10616}
10617
10618STATIC char *
10619S_regwhite( RExC_state_t *pRExC_state, char *p )
10620{
10621 const char *e = RExC_end;
10622
10623 PERL_ARGS_ASSERT_REGWHITE;
10624
10625 while (p < e) {
10626 if (isSPACE(*p))
10627 ++p;
10628 else if (*p == '#') {
10629 bool ended = 0;
10630 do {
10631 if (*p++ == '\n') {
10632 ended = 1;
10633 break;
10634 }
10635 } while (p < e);
10636 if (!ended)
10637 RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
10638 }
10639 else
10640 break;
10641 }
10642 return p;
10643}
10644
10645/* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
10646 Character classes ([:foo:]) can also be negated ([:^foo:]).
10647 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
10648 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
10649 but trigger failures because they are currently unimplemented. */
10650
10651#define POSIXCC_DONE(c) ((c) == ':')
10652#define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
10653#define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
10654
10655STATIC I32
10656S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
10657{
10658 dVAR;
10659 I32 namedclass = OOB_NAMEDCLASS;
10660
10661 PERL_ARGS_ASSERT_REGPPOSIXCC;
10662
10663 if (value == '[' && RExC_parse + 1 < RExC_end &&
10664 /* I smell either [: or [= or [. -- POSIX has been here, right? */
10665 POSIXCC(UCHARAT(RExC_parse))) {
10666 const char c = UCHARAT(RExC_parse);
10667 char* const s = RExC_parse++;
10668
10669 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
10670 RExC_parse++;
10671 if (RExC_parse == RExC_end)
10672 /* Grandfather lone [:, [=, [. */
10673 RExC_parse = s;
10674 else {
10675 const char* const t = RExC_parse++; /* skip over the c */
10676 assert(*t == c);
10677
10678 if (UCHARAT(RExC_parse) == ']') {
10679 const char *posixcc = s + 1;
10680 RExC_parse++; /* skip over the ending ] */
10681
10682 if (*s == ':') {
10683 const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
10684 const I32 skip = t - posixcc;
10685
10686 /* Initially switch on the length of the name. */
10687 switch (skip) {
10688 case 4:
10689 if (memEQ(posixcc, "word", 4)) /* this is not POSIX, this is the Perl \w */
10690 namedclass = complement ? ANYOF_NALNUM : ANYOF_ALNUM;
10691 break;
10692 case 5:
10693 /* Names all of length 5. */
10694 /* alnum alpha ascii blank cntrl digit graph lower
10695 print punct space upper */
10696 /* Offset 4 gives the best switch position. */
10697 switch (posixcc[4]) {
10698 case 'a':
10699 if (memEQ(posixcc, "alph", 4)) /* alpha */
10700 namedclass = complement ? ANYOF_NALPHA : ANYOF_ALPHA;
10701 break;
10702 case 'e':
10703 if (memEQ(posixcc, "spac", 4)) /* space */
10704 namedclass = complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
10705 break;
10706 case 'h':
10707 if (memEQ(posixcc, "grap", 4)) /* graph */
10708 namedclass = complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
10709 break;
10710 case 'i':
10711 if (memEQ(posixcc, "asci", 4)) /* ascii */
10712 namedclass = complement ? ANYOF_NASCII : ANYOF_ASCII;
10713 break;
10714 case 'k':
10715 if (memEQ(posixcc, "blan", 4)) /* blank */
10716 namedclass = complement ? ANYOF_NBLANK : ANYOF_BLANK;
10717 break;
10718 case 'l':
10719 if (memEQ(posixcc, "cntr", 4)) /* cntrl */
10720 namedclass = complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
10721 break;
10722 case 'm':
10723 if (memEQ(posixcc, "alnu", 4)) /* alnum */
10724 namedclass = complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
10725 break;
10726 case 'r':
10727 if (memEQ(posixcc, "lowe", 4)) /* lower */
10728 namedclass = complement ? ANYOF_NLOWER : ANYOF_LOWER;
10729 else if (memEQ(posixcc, "uppe", 4)) /* upper */
10730 namedclass = complement ? ANYOF_NUPPER : ANYOF_UPPER;
10731 break;
10732 case 't':
10733 if (memEQ(posixcc, "digi", 4)) /* digit */
10734 namedclass = complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
10735 else if (memEQ(posixcc, "prin", 4)) /* print */
10736 namedclass = complement ? ANYOF_NPRINT : ANYOF_PRINT;
10737 else if (memEQ(posixcc, "punc", 4)) /* punct */
10738 namedclass = complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
10739 break;
10740 }
10741 break;
10742 case 6:
10743 if (memEQ(posixcc, "xdigit", 6))
10744 namedclass = complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
10745 break;
10746 }
10747
10748 if (namedclass == OOB_NAMEDCLASS)
10749 Simple_vFAIL3("POSIX class [:%.*s:] unknown",
10750 t - s - 1, s + 1);
10751 assert (posixcc[skip] == ':');
10752 assert (posixcc[skip+1] == ']');
10753 } else if (!SIZE_ONLY) {
10754 /* [[=foo=]] and [[.foo.]] are still future. */
10755
10756 /* adjust RExC_parse so the warning shows after
10757 the class closes */
10758 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
10759 RExC_parse++;
10760 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
10761 }
10762 } else {
10763 /* Maternal grandfather:
10764 * "[:" ending in ":" but not in ":]" */
10765 RExC_parse = s;
10766 }
10767 }
10768 }
10769
10770 return namedclass;
10771}
10772
10773STATIC void
10774S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
10775{
10776 dVAR;
10777
10778 PERL_ARGS_ASSERT_CHECKPOSIXCC;
10779
10780 if (POSIXCC(UCHARAT(RExC_parse))) {
10781 const char *s = RExC_parse;
10782 const char c = *s++;
10783
10784 while (isALNUM(*s))
10785 s++;
10786 if (*s && c == *s && s[1] == ']') {
10787 ckWARN3reg(s+2,
10788 "POSIX syntax [%c %c] belongs inside character classes",
10789 c, c);
10790
10791 /* [[=foo=]] and [[.foo.]] are still future. */
10792 if (POSIXCC_NOTYET(c)) {
10793 /* adjust RExC_parse so the error shows after
10794 the class closes */
10795 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
10796 NOOP;
10797 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
10798 }
10799 }
10800 }
10801}
10802
10803/* Generate the code to add a full posix character <class> to the bracketed
10804 * character class given by <node>. (<node> is needed only under locale rules)
10805 * destlist is the inversion list for non-locale rules that this class is
10806 * to be added to
10807 * sourcelist is the ASCII-range inversion list to add under /a rules
10808 * Xsourcelist is the full Unicode range list to use otherwise. */
10809#define DO_POSIX(node, class, destlist, sourcelist, Xsourcelist) \
10810 if (LOC) { \
10811 SV* scratch_list = NULL; \
10812 \
10813 /* Set this class in the node for runtime matching */ \
10814 ANYOF_CLASS_SET(node, class); \
10815 \
10816 /* For above Latin1 code points, we use the full Unicode range */ \
10817 _invlist_intersection(PL_AboveLatin1, \
10818 Xsourcelist, \
10819 &scratch_list); \
10820 /* And set the output to it, adding instead if there already is an \
10821 * output. Checking if <destlist> is NULL first saves an extra \
10822 * clone. Its reference count will be decremented at the next \
10823 * union, etc, or if this is the only instance, at the end of the \
10824 * routine */ \
10825 if (! destlist) { \
10826 destlist = scratch_list; \
10827 } \
10828 else { \
10829 _invlist_union(destlist, scratch_list, &destlist); \
10830 SvREFCNT_dec(scratch_list); \
10831 } \
10832 } \
10833 else { \
10834 /* For non-locale, just add it to any existing list */ \
10835 _invlist_union(destlist, \
10836 (AT_LEAST_ASCII_RESTRICTED) \
10837 ? sourcelist \
10838 : Xsourcelist, \
10839 &destlist); \
10840 }
10841
10842/* Like DO_POSIX, but matches the complement of <sourcelist> and <Xsourcelist>.
10843 */
10844#define DO_N_POSIX(node, class, destlist, sourcelist, Xsourcelist) \
10845 if (LOC) { \
10846 SV* scratch_list = NULL; \
10847 ANYOF_CLASS_SET(node, class); \
10848 _invlist_subtract(PL_AboveLatin1, Xsourcelist, &scratch_list); \
10849 if (! destlist) { \
10850 destlist = scratch_list; \
10851 } \
10852 else { \
10853 _invlist_union(destlist, scratch_list, &destlist); \
10854 SvREFCNT_dec(scratch_list); \
10855 } \
10856 } \
10857 else { \
10858 _invlist_union_complement_2nd(destlist, \
10859 (AT_LEAST_ASCII_RESTRICTED) \
10860 ? sourcelist \
10861 : Xsourcelist, \
10862 &destlist); \
10863 /* Under /d, everything in the upper half of the Latin1 range \
10864 * matches this complement */ \
10865 if (DEPENDS_SEMANTICS) { \
10866 ANYOF_FLAGS(node) |= ANYOF_NON_UTF8_LATIN1_ALL; \
10867 } \
10868 }
10869
10870/* Generate the code to add a posix character <class> to the bracketed
10871 * character class given by <node>. (<node> is needed only under locale rules)
10872 * destlist is the inversion list for non-locale rules that this class is
10873 * to be added to
10874 * sourcelist is the ASCII-range inversion list to add under /a rules
10875 * l1_sourcelist is the Latin1 range list to use otherwise.
10876 * Xpropertyname is the name to add to <run_time_list> of the property to
10877 * specify the code points above Latin1 that will have to be
10878 * determined at run-time
10879 * run_time_list is a SV* that contains text names of properties that are to
10880 * be computed at run time. This concatenates <Xpropertyname>
10881 * to it, apppropriately
10882 * This is essentially DO_POSIX, but we know only the Latin1 values at compile
10883 * time */
10884#define DO_POSIX_LATIN1_ONLY_KNOWN(node, class, destlist, sourcelist, \
10885 l1_sourcelist, Xpropertyname, run_time_list) \
10886 /* First, resolve whether to use the ASCII-only list or the L1 \
10887 * list */ \
10888 DO_POSIX_LATIN1_ONLY_KNOWN_L1_RESOLVED(node, class, destlist, \
10889 ((AT_LEAST_ASCII_RESTRICTED) ? sourcelist : l1_sourcelist),\
10890 Xpropertyname, run_time_list)
10891
10892#define DO_POSIX_LATIN1_ONLY_KNOWN_L1_RESOLVED(node, class, destlist, sourcelist, \
10893 Xpropertyname, run_time_list) \
10894 /* If not /a matching, there are going to be code points we will have \
10895 * to defer to runtime to look-up */ \
10896 if (! AT_LEAST_ASCII_RESTRICTED) { \
10897 Perl_sv_catpvf(aTHX_ run_time_list, "+utf8::%s\n", Xpropertyname); \
10898 } \
10899 if (LOC) { \
10900 ANYOF_CLASS_SET(node, class); \
10901 } \
10902 else { \
10903 _invlist_union(destlist, sourcelist, &destlist); \
10904 }
10905
10906/* Like DO_POSIX_LATIN1_ONLY_KNOWN, but for the complement. A combination of
10907 * this and DO_N_POSIX */
10908#define DO_N_POSIX_LATIN1_ONLY_KNOWN(node, class, destlist, sourcelist, \
10909 l1_sourcelist, Xpropertyname, run_time_list) \
10910 if (AT_LEAST_ASCII_RESTRICTED) { \
10911 _invlist_union_complement_2nd(destlist, sourcelist, &destlist); \
10912 } \
10913 else { \
10914 Perl_sv_catpvf(aTHX_ run_time_list, "!utf8::%s\n", Xpropertyname); \
10915 if (LOC) { \
10916 ANYOF_CLASS_SET(node, namedclass); \
10917 } \
10918 else { \
10919 SV* scratch_list = NULL; \
10920 _invlist_subtract(PL_Latin1, l1_sourcelist, &scratch_list); \
10921 if (! destlist) { \
10922 destlist = scratch_list; \
10923 } \
10924 else { \
10925 _invlist_union(destlist, scratch_list, &destlist); \
10926 SvREFCNT_dec(scratch_list); \
10927 } \
10928 if (DEPENDS_SEMANTICS) { \
10929 ANYOF_FLAGS(node) |= ANYOF_NON_UTF8_LATIN1_ALL; \
10930 } \
10931 } \
10932 }
10933
10934STATIC void
10935S_add_alternate(pTHX_ AV** alternate_ptr, U8* string, STRLEN len)
10936{
10937 /* Adds input 'string' with length 'len' to the ANYOF node's unicode
10938 * alternate list, pointed to by 'alternate_ptr'. This is an array of
10939 * the multi-character folds of characters in the node */
10940 SV *sv;
10941
10942 PERL_ARGS_ASSERT_ADD_ALTERNATE;
10943
10944 if (! *alternate_ptr) {
10945 *alternate_ptr = newAV();
10946 }
10947 sv = newSVpvn_utf8((char*)string, len, TRUE);
10948 av_push(*alternate_ptr, sv);
10949 return;
10950}
10951
10952/*
10953 parse a class specification and produce either an ANYOF node that
10954 matches the pattern or perhaps will be optimized into an EXACTish node
10955 instead. The node contains a bit map for the first 256 characters, with the
10956 corresponding bit set if that character is in the list. For characters
10957 above 255, a range list is used */
10958
10959STATIC regnode *
10960S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth)
10961{
10962 dVAR;
10963 register UV nextvalue;
10964 register IV prevvalue = OOB_UNICODE;
10965 register IV range = 0;
10966 UV value = 0; /* XXX:dmq: needs to be referenceable (unfortunately) */
10967 register regnode *ret;
10968 STRLEN numlen;
10969 IV namedclass;
10970 char *rangebegin = NULL;
10971 bool need_class = 0;
10972 bool allow_full_fold = TRUE; /* Assume wants multi-char folding */
10973 SV *listsv = NULL;
10974 STRLEN initial_listsv_len = 0; /* Kind of a kludge to see if it is more
10975 than just initialized. */
10976 SV* properties = NULL; /* Code points that match \p{} \P{} */
10977 UV element_count = 0; /* Number of distinct elements in the class.
10978 Optimizations may be possible if this is tiny */
10979 UV n;
10980
10981 /* Certain named classes have equivalents that can appear outside a
10982 * character class, e.g. \w. These flags are set for these classes. The
10983 * first flag indicates the op depends on the character set modifier, like
10984 * /d, /u.... The second is for those that don't have this dependency. */
10985 bool has_special_charset_op = FALSE;
10986 bool has_special_non_charset_op = FALSE;
10987
10988 /* Unicode properties are stored in a swash; this holds the current one
10989 * being parsed. If this swash is the only above-latin1 component of the
10990 * character class, an optimization is to pass it directly on to the
10991 * execution engine. Otherwise, it is set to NULL to indicate that there
10992 * are other things in the class that have to be dealt with at execution
10993 * time */
10994 SV* swash = NULL; /* Code points that match \p{} \P{} */
10995
10996 /* Set if a component of this character class is user-defined; just passed
10997 * on to the engine */
10998 UV has_user_defined_property = 0;
10999
11000 /* inversion list of code points this node matches only when the target
11001 * string is in UTF-8. (Because is under /d) */
11002 SV* depends_list = NULL;
11003
11004 /* inversion list of code points this node matches. For much of the
11005 * function, it includes only those that match regardless of the utf8ness
11006 * of the target string */
11007 SV* cp_list = NULL;
11008
11009 /* List of multi-character folds that are matched by this node */
11010 AV* unicode_alternate = NULL;
11011#ifdef EBCDIC
11012 /* In a range, counts how many 0-2 of the ends of it came from literals,
11013 * not escapes. Thus we can tell if 'A' was input vs \x{C1} */
11014 UV literal_endpoint = 0;
11015#endif
11016 UV stored = 0; /* how many chars stored in the bitmap */
11017
11018 regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
11019 case we need to change the emitted regop to an EXACT. */
11020 const char * orig_parse = RExC_parse;
11021 GET_RE_DEBUG_FLAGS_DECL;
11022
11023 PERL_ARGS_ASSERT_REGCLASS;
11024#ifndef DEBUGGING
11025 PERL_UNUSED_ARG(depth);
11026#endif
11027
11028 DEBUG_PARSE("clas");
11029
11030 /* Assume we are going to generate an ANYOF node. */
11031 ret = reganode(pRExC_state, ANYOF, 0);
11032
11033
11034 if (!SIZE_ONLY) {
11035 ANYOF_FLAGS(ret) = 0;
11036 }
11037
11038 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
11039 RExC_naughty++;
11040 RExC_parse++;
11041 if (!SIZE_ONLY)
11042 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
11043
11044 /* We have decided to not allow multi-char folds in inverted character
11045 * classes, due to the confusion that can happen, especially with
11046 * classes that are designed for a non-Unicode world: You have the
11047 * peculiar case that:
11048 "s s" =~ /^[^\xDF]+$/i => Y
11049 "ss" =~ /^[^\xDF]+$/i => N
11050 *
11051 * See [perl #89750] */
11052 allow_full_fold = FALSE;
11053 }
11054
11055 if (SIZE_ONLY) {
11056 RExC_size += ANYOF_SKIP;
11057 listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
11058 }
11059 else {
11060 RExC_emit += ANYOF_SKIP;
11061 if (LOC) {
11062 ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
11063 }
11064 ANYOF_BITMAP_ZERO(ret);
11065 listsv = newSVpvs("# comment\n");
11066 initial_listsv_len = SvCUR(listsv);
11067 }
11068
11069 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
11070
11071 if (!SIZE_ONLY && POSIXCC(nextvalue))
11072 checkposixcc(pRExC_state);
11073
11074 /* allow 1st char to be ] (allowing it to be - is dealt with later) */
11075 if (UCHARAT(RExC_parse) == ']')
11076 goto charclassloop;
11077
11078parseit:
11079 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
11080
11081 charclassloop:
11082
11083 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
11084
11085 if (!range) {
11086 rangebegin = RExC_parse;
11087 element_count++;
11088 }
11089 if (UTF) {
11090 value = utf8n_to_uvchr((U8*)RExC_parse,
11091 RExC_end - RExC_parse,
11092 &numlen, UTF8_ALLOW_DEFAULT);
11093 RExC_parse += numlen;
11094 }
11095 else
11096 value = UCHARAT(RExC_parse++);
11097
11098 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
11099 if (value == '[' && POSIXCC(nextvalue))
11100 namedclass = regpposixcc(pRExC_state, value);
11101 else if (value == '\\') {
11102 if (UTF) {
11103 value = utf8n_to_uvchr((U8*)RExC_parse,
11104 RExC_end - RExC_parse,
11105 &numlen, UTF8_ALLOW_DEFAULT);
11106 RExC_parse += numlen;
11107 }
11108 else
11109 value = UCHARAT(RExC_parse++);
11110 /* Some compilers cannot handle switching on 64-bit integer
11111 * values, therefore value cannot be an UV. Yes, this will
11112 * be a problem later if we want switch on Unicode.
11113 * A similar issue a little bit later when switching on
11114 * namedclass. --jhi */
11115 switch ((I32)value) {
11116 case 'w': namedclass = ANYOF_ALNUM; break;
11117 case 'W': namedclass = ANYOF_NALNUM; break;
11118 case 's': namedclass = ANYOF_SPACE; break;
11119 case 'S': namedclass = ANYOF_NSPACE; break;
11120 case 'd': namedclass = ANYOF_DIGIT; break;
11121 case 'D': namedclass = ANYOF_NDIGIT; break;
11122 case 'v': namedclass = ANYOF_VERTWS; break;
11123 case 'V': namedclass = ANYOF_NVERTWS; break;
11124 case 'h': namedclass = ANYOF_HORIZWS; break;
11125 case 'H': namedclass = ANYOF_NHORIZWS; break;
11126 case 'N': /* Handle \N{NAME} in class */
11127 {
11128 /* We only pay attention to the first char of
11129 multichar strings being returned. I kinda wonder
11130 if this makes sense as it does change the behaviour
11131 from earlier versions, OTOH that behaviour was broken
11132 as well. */
11133 UV v; /* value is register so we cant & it /grrr */
11134 if (reg_namedseq(pRExC_state, &v, NULL, depth)) {
11135 goto parseit;
11136 }
11137 value= v;
11138 }
11139 break;
11140 case 'p':
11141 case 'P':
11142 {
11143 char *e;
11144 if (RExC_parse >= RExC_end)
11145 vFAIL2("Empty \\%c{}", (U8)value);
11146 if (*RExC_parse == '{') {
11147 const U8 c = (U8)value;
11148 e = strchr(RExC_parse++, '}');
11149 if (!e)
11150 vFAIL2("Missing right brace on \\%c{}", c);
11151 while (isSPACE(UCHARAT(RExC_parse)))
11152 RExC_parse++;
11153 if (e == RExC_parse)
11154 vFAIL2("Empty \\%c{}", c);
11155 n = e - RExC_parse;
11156 while (isSPACE(UCHARAT(RExC_parse + n - 1)))
11157 n--;
11158 }
11159 else {
11160 e = RExC_parse;
11161 n = 1;
11162 }
11163 if (!SIZE_ONLY) {
11164 SV** invlistsvp;
11165 SV* invlist;
11166 char* name;
11167 if (UCHARAT(RExC_parse) == '^') {
11168 RExC_parse++;
11169 n--;
11170 value = value == 'p' ? 'P' : 'p'; /* toggle */
11171 while (isSPACE(UCHARAT(RExC_parse))) {
11172 RExC_parse++;
11173 n--;
11174 }
11175 }
11176 /* Try to get the definition of the property into
11177 * <invlist>. If /i is in effect, the effective property
11178 * will have its name be <__NAME_i>. The design is
11179 * discussed in commit
11180 * 2f833f5208e26b208886e51e09e2c072b5eabb46 */
11181 Newx(name, n + sizeof("_i__\n"), char);
11182
11183 sprintf(name, "%s%.*s%s\n",
11184 (FOLD) ? "__" : "",
11185 (int)n,
11186 RExC_parse,
11187 (FOLD) ? "_i" : ""
11188 );
11189
11190 /* Look up the property name, and get its swash and
11191 * inversion list, if the property is found */
11192 if (swash) {
11193 SvREFCNT_dec(swash);
11194 }
11195 swash = _core_swash_init("utf8", name, &PL_sv_undef,
11196 1, /* binary */
11197 0, /* not tr/// */
11198 TRUE, /* this routine will handle
11199 undefined properties */
11200 NULL, FALSE /* No inversion list */
11201 );
11202 if ( ! swash
11203 || ! SvROK(swash)
11204 || ! SvTYPE(SvRV(swash)) == SVt_PVHV
11205 || ! (invlistsvp =
11206 hv_fetchs(MUTABLE_HV(SvRV(swash)),
11207 "INVLIST", FALSE))
11208 || ! (invlist = *invlistsvp))
11209 {
11210 if (swash) {
11211 SvREFCNT_dec(swash);
11212 swash = NULL;
11213 }
11214
11215 /* Here didn't find it. It could be a user-defined
11216 * property that will be available at run-time. Add it
11217 * to the list to look up then */
11218 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%s\n",
11219 (value == 'p' ? '+' : '!'),
11220 name);
11221 has_user_defined_property = 1;
11222
11223 /* We don't know yet, so have to assume that the
11224 * property could match something in the Latin1 range,
11225 * hence something that isn't utf8 */
11226 ANYOF_FLAGS(ret) |= ANYOF_NONBITMAP_NON_UTF8;
11227 }
11228 else {
11229
11230 /* Here, did get the swash and its inversion list. If
11231 * the swash is from a user-defined property, then this
11232 * whole character class should be regarded as such */
11233 SV** user_defined_svp =
11234 hv_fetchs(MUTABLE_HV(SvRV(swash)),
11235 "USER_DEFINED", FALSE);
11236 if (user_defined_svp) {
11237 has_user_defined_property
11238 |= SvUV(*user_defined_svp);
11239 }
11240
11241 /* Invert if asking for the complement */
11242 if (value == 'P') {
11243 _invlist_union_complement_2nd(properties, invlist, &properties);
11244
11245 /* The swash can't be used as-is, because we've
11246 * inverted things; delay removing it to here after
11247 * have copied its invlist above */
11248 SvREFCNT_dec(swash);
11249 swash = NULL;
11250 }
11251 else {
11252 _invlist_union(properties, invlist, &properties);
11253 }
11254 }
11255 Safefree(name);
11256 }
11257 RExC_parse = e + 1;
11258 namedclass = ANYOF_MAX; /* no official name, but it's named */
11259
11260 /* \p means they want Unicode semantics */
11261 RExC_uni_semantics = 1;
11262 }
11263 break;
11264 case 'n': value = '\n'; break;
11265 case 'r': value = '\r'; break;
11266 case 't': value = '\t'; break;
11267 case 'f': value = '\f'; break;
11268 case 'b': value = '\b'; break;
11269 case 'e': value = ASCII_TO_NATIVE('\033');break;
11270 case 'a': value = ASCII_TO_NATIVE('\007');break;
11271 case 'o':
11272 RExC_parse--; /* function expects to be pointed at the 'o' */
11273 {
11274 const char* error_msg;
11275 bool valid = grok_bslash_o(RExC_parse,
11276 &value,
11277 &numlen,
11278 &error_msg,
11279 SIZE_ONLY);
11280 RExC_parse += numlen;
11281 if (! valid) {
11282 vFAIL(error_msg);
11283 }
11284 }
11285 if (PL_encoding && value < 0x100) {
11286 goto recode_encoding;
11287 }
11288 break;
11289 case 'x':
11290 RExC_parse--; /* function expects to be pointed at the 'x' */
11291 {
11292 const char* error_msg;
11293 bool valid = grok_bslash_x(RExC_parse,
11294 &value,
11295 &numlen,
11296 &error_msg,
11297 1);
11298 RExC_parse += numlen;
11299 if (! valid) {
11300 vFAIL(error_msg);
11301 }
11302 }
11303 if (PL_encoding && value < 0x100)
11304 goto recode_encoding;
11305 break;
11306 case 'c':
11307 value = grok_bslash_c(*RExC_parse++, UTF, SIZE_ONLY);
11308 break;
11309 case '0': case '1': case '2': case '3': case '4':
11310 case '5': case '6': case '7':
11311 {
11312 /* Take 1-3 octal digits */
11313 I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
11314 numlen = 3;
11315 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
11316 RExC_parse += numlen;
11317 if (PL_encoding && value < 0x100)
11318 goto recode_encoding;
11319 break;
11320 }
11321 recode_encoding:
11322 if (! RExC_override_recoding) {
11323 SV* enc = PL_encoding;
11324 value = reg_recode((const char)(U8)value, &enc);
11325 if (!enc && SIZE_ONLY)
11326 ckWARNreg(RExC_parse,
11327 "Invalid escape in the specified encoding");
11328 break;
11329 }
11330 default:
11331 /* Allow \_ to not give an error */
11332 if (!SIZE_ONLY && isALNUM(value) && value != '_') {
11333 ckWARN2reg(RExC_parse,
11334 "Unrecognized escape \\%c in character class passed through",
11335 (int)value);
11336 }
11337 break;
11338 }
11339 } /* end of \blah */
11340#ifdef EBCDIC
11341 else
11342 literal_endpoint++;
11343#endif
11344
11345 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
11346
11347 /* What matches in a locale is not known until runtime, so need to
11348 * (one time per class) allocate extra space to pass to regexec.
11349 * The space will contain a bit for each named class that is to be
11350 * matched against. This isn't needed for \p{} and pseudo-classes,
11351 * as they are not affected by locale, and hence are dealt with
11352 * separately */
11353 if (LOC && namedclass < ANYOF_MAX && ! need_class) {
11354 need_class = 1;
11355 if (SIZE_ONLY) {
11356 RExC_size += ANYOF_CLASS_SKIP - ANYOF_SKIP;
11357 }
11358 else {
11359 RExC_emit += ANYOF_CLASS_SKIP - ANYOF_SKIP;
11360 ANYOF_CLASS_ZERO(ret);
11361 }
11362 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
11363 }
11364
11365 /* a bad range like a-\d, a-[:digit:]. The '-' is taken as a
11366 * literal, as is the character that began the false range, i.e.
11367 * the 'a' in the examples */
11368 if (range) {
11369 if (!SIZE_ONLY) {
11370 const int w =
11371 RExC_parse >= rangebegin ?
11372 RExC_parse - rangebegin : 0;
11373 ckWARN4reg(RExC_parse,
11374 "False [] range \"%*.*s\"",
11375 w, w, rangebegin);
11376 cp_list = add_cp_to_invlist(cp_list, '-');
11377 cp_list = add_cp_to_invlist(cp_list, prevvalue);
11378 }
11379
11380 range = 0; /* this was not a true range */
11381 element_count += 2; /* So counts for three values */
11382 }
11383
11384 if (SIZE_ONLY) {
11385
11386 /* In the first pass, do a little extra work so below can
11387 * possibly optimize the whole node to one of the nodes that
11388 * correspond to the classes given below */
11389
11390 /* The optimization will only take place if there is a single
11391 * element in the class, so can skip if there is more than one
11392 */
11393 if (element_count == 1) {
11394
11395 /* Possible truncation here but in some 64-bit environments
11396 * the compiler gets heartburn about switch on 64-bit values.
11397 * A similar issue a little earlier when switching on value.
11398 * --jhi */
11399 switch ((I32)namedclass) {
11400 case ANYOF_ALNUM:
11401 case ANYOF_NALNUM:
11402 case ANYOF_DIGIT:
11403 case ANYOF_NDIGIT:
11404 case ANYOF_SPACE:
11405 case ANYOF_NSPACE:
11406 has_special_charset_op = TRUE;
11407 break;
11408
11409 case ANYOF_HORIZWS:
11410 case ANYOF_NHORIZWS:
11411 case ANYOF_VERTWS:
11412 case ANYOF_NVERTWS:
11413 has_special_non_charset_op = TRUE;
11414 break;
11415 }
11416 }
11417 }
11418 else {
11419 switch ((I32)namedclass) {
11420
11421 case ANYOF_ALNUMC: /* C's alnum, in contrast to \w */
11422 DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
11423 PL_PosixAlnum, PL_L1PosixAlnum, "XPosixAlnum", listsv);
11424 break;
11425 case ANYOF_NALNUMC:
11426 DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
11427 PL_PosixAlnum, PL_L1PosixAlnum, "XPosixAlnum", listsv);
11428 break;
11429 case ANYOF_ALPHA:
11430 DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
11431 PL_PosixAlpha, PL_L1PosixAlpha, "XPosixAlpha", listsv);
11432 break;
11433 case ANYOF_NALPHA:
11434 DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
11435 PL_PosixAlpha, PL_L1PosixAlpha, "XPosixAlpha", listsv);
11436 break;
11437 case ANYOF_ASCII:
11438 if (LOC) {
11439 ANYOF_CLASS_SET(ret, namedclass);
11440 }
11441 else {
11442 _invlist_union(properties, PL_ASCII, &properties);
11443 }
11444 break;
11445 case ANYOF_NASCII:
11446 if (LOC) {
11447 ANYOF_CLASS_SET(ret, namedclass);
11448 }
11449 else {
11450 _invlist_union_complement_2nd(properties,
11451 PL_ASCII, &properties);
11452 if (DEPENDS_SEMANTICS) {
11453 ANYOF_FLAGS(ret) |= ANYOF_NON_UTF8_LATIN1_ALL;
11454 }
11455 }
11456 break;
11457 case ANYOF_BLANK:
11458 DO_POSIX(ret, namedclass, properties,
11459 PL_PosixBlank, PL_XPosixBlank);
11460 break;
11461 case ANYOF_NBLANK:
11462 DO_N_POSIX(ret, namedclass, properties,
11463 PL_PosixBlank, PL_XPosixBlank);
11464 break;
11465 case ANYOF_CNTRL:
11466 DO_POSIX(ret, namedclass, properties,
11467 PL_PosixCntrl, PL_XPosixCntrl);
11468 break;
11469 case ANYOF_NCNTRL:
11470 DO_N_POSIX(ret, namedclass, properties,
11471 PL_PosixCntrl, PL_XPosixCntrl);
11472 break;
11473 case ANYOF_DIGIT:
11474 /* There are no digits in the Latin1 range outside of
11475 * ASCII, so call the macro that doesn't have to resolve
11476 * them */
11477 DO_POSIX_LATIN1_ONLY_KNOWN_L1_RESOLVED(ret, namedclass, properties,
11478 PL_PosixDigit, "XPosixDigit", listsv);
11479 has_special_charset_op = TRUE;
11480 break;
11481 case ANYOF_NDIGIT:
11482 DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
11483 PL_PosixDigit, PL_PosixDigit, "XPosixDigit", listsv);
11484 has_special_charset_op = TRUE;
11485 break;
11486 case ANYOF_GRAPH:
11487 DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
11488 PL_PosixGraph, PL_L1PosixGraph, "XPosixGraph", listsv);
11489 break;
11490 case ANYOF_NGRAPH:
11491 DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
11492 PL_PosixGraph, PL_L1PosixGraph, "XPosixGraph", listsv);
11493 break;
11494 case ANYOF_HORIZWS:
11495 /* For these, we use the cp_list, as /d doesn't make a
11496 * difference in what these match. There would be problems
11497 * if these characters had folds other than themselves, as
11498 * cp_list is subject to folding. It turns out that \h
11499 * is just a synonym for XPosixBlank */
11500 _invlist_union(cp_list, PL_XPosixBlank, &cp_list);
11501 has_special_non_charset_op = TRUE;
11502 break;
11503 case ANYOF_NHORIZWS:
11504 _invlist_union_complement_2nd(cp_list,
11505 PL_XPosixBlank, &cp_list);
11506 has_special_non_charset_op = TRUE;
11507 break;
11508 case ANYOF_LOWER:
11509 case ANYOF_NLOWER:
11510 { /* These require special handling, as they differ under
11511 folding, matching Cased there (which in the ASCII range
11512 is the same as Alpha */
11513
11514 SV* ascii_source;
11515 SV* l1_source;
11516 const char *Xname;
11517
11518 if (FOLD && ! LOC) {
11519 ascii_source = PL_PosixAlpha;
11520 l1_source = PL_L1Cased;
11521 Xname = "Cased";
11522 }
11523 else {
11524 ascii_source = PL_PosixLower;
11525 l1_source = PL_L1PosixLower;
11526 Xname = "XPosixLower";
11527 }
11528 if (namedclass == ANYOF_LOWER) {
11529 DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
11530 ascii_source, l1_source, Xname, listsv);
11531 }
11532 else {
11533 DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass,
11534 properties, ascii_source, l1_source, Xname, listsv);
11535 }
11536 break;
11537 }
11538 case ANYOF_PRINT:
11539 DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
11540 PL_PosixPrint, PL_L1PosixPrint, "XPosixPrint", listsv);
11541 break;
11542 case ANYOF_NPRINT:
11543 DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
11544 PL_PosixPrint, PL_L1PosixPrint, "XPosixPrint", listsv);
11545 break;
11546 case ANYOF_PUNCT:
11547 DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
11548 PL_PosixPunct, PL_L1PosixPunct, "XPosixPunct", listsv);
11549 break;
11550 case ANYOF_NPUNCT:
11551 DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
11552 PL_PosixPunct, PL_L1PosixPunct, "XPosixPunct", listsv);
11553 break;
11554 case ANYOF_PSXSPC:
11555 DO_POSIX(ret, namedclass, properties,
11556 PL_PosixSpace, PL_XPosixSpace);
11557 break;
11558 case ANYOF_NPSXSPC:
11559 DO_N_POSIX(ret, namedclass, properties,
11560 PL_PosixSpace, PL_XPosixSpace);
11561 break;
11562 case ANYOF_SPACE:
11563 DO_POSIX(ret, namedclass, properties,
11564 PL_PerlSpace, PL_XPerlSpace);
11565 has_special_charset_op = TRUE;
11566 break;
11567 case ANYOF_NSPACE:
11568 DO_N_POSIX(ret, namedclass, properties,
11569 PL_PerlSpace, PL_XPerlSpace);
11570 has_special_charset_op = TRUE;
11571 break;
11572 case ANYOF_UPPER: /* Same as LOWER, above */
11573 case ANYOF_NUPPER:
11574 {
11575 SV* ascii_source;
11576 SV* l1_source;
11577 const char *Xname;
11578
11579 if (FOLD && ! LOC) {
11580 ascii_source = PL_PosixAlpha;
11581 l1_source = PL_L1Cased;
11582 Xname = "Cased";
11583 }
11584 else {
11585 ascii_source = PL_PosixUpper;
11586 l1_source = PL_L1PosixUpper;
11587 Xname = "XPosixUpper";
11588 }
11589 if (namedclass == ANYOF_UPPER) {
11590 DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
11591 ascii_source, l1_source, Xname, listsv);
11592 }
11593 else {
11594 DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass,
11595 properties, ascii_source, l1_source, Xname, listsv);
11596 }
11597 break;
11598 }
11599 case ANYOF_ALNUM: /* Really is 'Word' */
11600 DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
11601 PL_PosixWord, PL_L1PosixWord, "XPosixWord", listsv);
11602 has_special_charset_op = TRUE;
11603 break;
11604 case ANYOF_NALNUM:
11605 DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
11606 PL_PosixWord, PL_L1PosixWord, "XPosixWord", listsv);
11607 has_special_charset_op = TRUE;
11608 break;
11609 case ANYOF_VERTWS:
11610 /* For these, we use the cp_list, as /d doesn't make a
11611 * difference in what these match. There would be problems
11612 * if these characters had folds other than themselves, as
11613 * cp_list is subject to folding */
11614 _invlist_union(cp_list, PL_VertSpace, &cp_list);
11615 has_special_non_charset_op = TRUE;
11616 break;
11617 case ANYOF_NVERTWS:
11618 _invlist_union_complement_2nd(cp_list,
11619 PL_VertSpace, &cp_list);
11620 has_special_non_charset_op = TRUE;
11621 break;
11622 case ANYOF_XDIGIT:
11623 DO_POSIX(ret, namedclass, properties,
11624 PL_PosixXDigit, PL_XPosixXDigit);
11625 break;
11626 case ANYOF_NXDIGIT:
11627 DO_N_POSIX(ret, namedclass, properties,
11628 PL_PosixXDigit, PL_XPosixXDigit);
11629 break;
11630 case ANYOF_MAX:
11631 /* this is to handle \p and \P */
11632 break;
11633 default:
11634 vFAIL("Invalid [::] class");
11635 break;
11636 }
11637
11638 continue;
11639 }
11640 } /* end of namedclass \blah */
11641
11642 if (range) {
11643 if (prevvalue > (IV)value) /* b-a */ {
11644 const int w = RExC_parse - rangebegin;
11645 Simple_vFAIL4("Invalid [] range \"%*.*s\"", w, w, rangebegin);
11646 range = 0; /* not a valid range */
11647 }
11648 }
11649 else {
11650 prevvalue = value; /* save the beginning of the range */
11651 if (RExC_parse+1 < RExC_end
11652 && *RExC_parse == '-'
11653 && RExC_parse[1] != ']')
11654 {
11655 RExC_parse++;
11656
11657 /* a bad range like \w-, [:word:]- ? */
11658 if (namedclass > OOB_NAMEDCLASS) {
11659 if (ckWARN(WARN_REGEXP)) {
11660 const int w =
11661 RExC_parse >= rangebegin ?
11662 RExC_parse - rangebegin : 0;
11663 vWARN4(RExC_parse,
11664 "False [] range \"%*.*s\"",
11665 w, w, rangebegin);
11666 }
11667 if (!SIZE_ONLY)
11668 cp_list = add_cp_to_invlist(cp_list, '-');
11669 } else
11670 range = 1; /* yeah, it's a range! */
11671 continue; /* but do it the next time */
11672 }
11673 }
11674
11675 /* non-Latin1 code point implies unicode semantics. Must be set in
11676 * pass1 so is there for the whole of pass 2 */
11677 if (value > 255) {
11678 RExC_uni_semantics = 1;
11679 }
11680
11681 /* now is the next time */
11682 if (!SIZE_ONLY) {
11683#ifndef EBCDIC
11684 cp_list = _add_range_to_invlist(cp_list, prevvalue, value);
11685#else
11686 UV* this_range = _new_invlist(1);
11687 _append_range_to_invlist(this_range, prevvalue, value);
11688
11689 /* In EBCDIC, the ranges 'A-Z' and 'a-z' are each not contiguous.
11690 * If this range was specified using something like 'i-j', we want
11691 * to include only the 'i' and the 'j', and not anything in
11692 * between, so exclude non-ASCII, non-alphabetics from it.
11693 * However, if the range was specified with something like
11694 * [\x89-\x91] or [\x89-j], all code points within it should be
11695 * included. literal_endpoint==2 means both ends of the range used
11696 * a literal character, not \x{foo} */
11697 if (literal_endpoint == 2
11698 && (prevvalue >= 'a' && value <= 'z')
11699 || (prevvalue >= 'A' && value <= 'Z'))
11700 {
11701 _invlist_intersection(this_range, PL_ASCII, &this_range, );
11702 _invlist_intersection(this_range, PL_Alpha, &this_range, );
11703 }
11704 _invlist_union(cp_list, this_range, &cp_list);
11705 literal_endpoint = 0;
11706#endif
11707 }
11708
11709 range = 0; /* this range (if it was one) is done now */
11710 }
11711
11712 /* [\w] can be optimized into \w, but not if there is anything else in the
11713 * brackets (except for an initial '^' which indictes omplementing). We
11714 * also can optimize the common special case /[0-9]/ into /\d/a */
11715 if (element_count == 1 &&
11716 (has_special_charset_op
11717 || has_special_non_charset_op
11718 || (prevvalue == '0' && value == '9')))
11719 {
11720 U8 op;
11721 bool invert = ANYOF_FLAGS(ret) & ANYOF_INVERT;
11722 const char * cur_parse = RExC_parse;
11723
11724 if (has_special_charset_op) {
11725 U8 offset = get_regex_charset(RExC_flags);
11726
11727 /* /aa is the same as /a for these */
11728 if (offset == REGEX_ASCII_MORE_RESTRICTED_CHARSET) {
11729 offset = REGEX_ASCII_RESTRICTED_CHARSET;
11730 }
11731 switch ((I32)namedclass) {
11732 case ANYOF_NALNUM:
11733 invert = ! invert;
11734 /* FALLTHROUGH */
11735 case ANYOF_ALNUM:
11736 op = ALNUM;
11737 break;
11738 case ANYOF_NSPACE:
11739 invert = ! invert;
11740 /* FALLTHROUGH */
11741 case ANYOF_SPACE:
11742 op = SPACE;
11743 break;
11744 case ANYOF_NDIGIT:
11745 invert = ! invert;
11746 /* FALLTHROUGH */
11747 case ANYOF_DIGIT:
11748 op = DIGIT;
11749
11750 /* There is no DIGITU */
11751 if (offset == REGEX_UNICODE_CHARSET) {
11752 offset = REGEX_DEPENDS_CHARSET;
11753 }
11754 break;
11755 default:
11756 Perl_croak(aTHX_ "panic: Named character class %"IVdf" is not expected to have a non-[...] version", namedclass);
11757 }
11758
11759 /* The number of varieties of each of these is the same, hence, so
11760 * is the delta between the normal and complemented nodes */
11761 if (invert) {
11762 offset += NALNUM - ALNUM;
11763 }
11764
11765 op += offset;
11766 }
11767 else if (has_special_non_charset_op) {
11768 switch ((I32)namedclass) {
11769 case ANYOF_NHORIZWS:
11770 invert = ! invert;
11771 /* FALLTHROUGH */
11772 case ANYOF_HORIZWS:
11773 op = HORIZWS;
11774 break;
11775 case ANYOF_NVERTWS:
11776 invert = ! invert;
11777 /* FALLTHROUGH */
11778 case ANYOF_VERTWS:
11779 op = VERTWS;
11780 break;
11781 default:
11782 Perl_croak(aTHX_ "panic: Named character class %"IVdf" is not expected to have a non-[...] version", namedclass);
11783 }
11784
11785 /* The complement version of each of these nodes is adjacently next
11786 * */
11787 if (invert) {
11788 op++;
11789 }
11790 }
11791 else { /* The remaining possibility is [0-9] */
11792 op = (invert) ? NDIGITA : DIGITA;
11793 }
11794
11795 /* Throw away this ANYOF regnode, and emit the calculated one, which
11796 * should correspond to the beginning, not current, state of the parse
11797 */
11798 RExC_parse = (char *)orig_parse;
11799 RExC_emit = (regnode *)orig_emit;
11800 ret = reg_node(pRExC_state, op);
11801 RExC_parse = (char *) cur_parse;
11802
11803 SvREFCNT_dec(listsv);
11804 return ret;
11805 }
11806
11807 if (SIZE_ONLY)
11808 return ret;
11809 /****** !SIZE_ONLY AFTER HERE *********/
11810
11811 /* If folding, we calculate all characters that could fold to or from the
11812 * ones already on the list */
11813 if (FOLD && cp_list) {
11814 UV start, end; /* End points of code point ranges */
11815
11816 SV* fold_intersection = NULL;
11817
11818 const UV highest_index = invlist_len(cp_list) - 1;
11819
11820 /* In the Latin1 range, the characters that can be folded-to or -from
11821 * are precisely the alphabetic characters. If the highest code point
11822 * is within Latin1, we can use the compiled-in list, and not have to
11823 * go out to disk. If the last element in the array is in the
11824 * inversion list set, it starts a range that goes to infinity, so the
11825 * maximum of the inversion list is definitely above Latin1.
11826 * Otherwise, it starts a range that isn't in the set, so the max is
11827 * one less than it */
11828 if (! ELEMENT_RANGE_MATCHES_INVLIST(highest_index)
11829 && invlist_array(cp_list)[highest_index] <= 256)
11830 {
11831 _invlist_intersection(PL_L1PosixAlpha, cp_list, &fold_intersection);
11832 }
11833 else {
11834
11835 /* This is a list of all the characters that participate in folds
11836 * (except marks, etc in multi-char folds */
11837 if (! PL_utf8_foldable) {
11838 SV* swash = swash_init("utf8", "Cased", &PL_sv_undef, 1, 0);
11839 PL_utf8_foldable = _swash_to_invlist(swash);
11840 SvREFCNT_dec(swash);
11841 }
11842
11843 /* This is a hash that for a particular fold gives all characters
11844 * that are involved in it */
11845 if (! PL_utf8_foldclosures) {
11846
11847 /* If we were unable to find any folds, then we likely won't be
11848 * able to find the closures. So just create an empty list.
11849 * Folding will effectively be restricted to the non-Unicode
11850 * rules hard-coded into Perl. (This case happens legitimately
11851 * during compilation of Perl itself before the Unicode tables
11852 * are generated) */
11853 if (invlist_len(PL_utf8_foldable) == 0) {
11854 PL_utf8_foldclosures = newHV();
11855 }
11856 else {
11857 /* If the folds haven't been read in, call a fold function
11858 * to force that */
11859 if (! PL_utf8_tofold) {
11860 U8 dummy[UTF8_MAXBYTES+1];
11861 STRLEN dummy_len;
11862
11863 /* This particular string is above \xff in both UTF-8
11864 * and UTFEBCDIC */
11865 to_utf8_fold((U8*) "\xC8\x80", dummy, &dummy_len);
11866 assert(PL_utf8_tofold); /* Verify that worked */
11867 }
11868 PL_utf8_foldclosures =
11869 _swash_inversion_hash(PL_utf8_tofold);
11870 }
11871 }
11872
11873 /* Only the characters in this class that participate in folds need
11874 * be checked. Get the intersection of this class and all the
11875 * possible characters that are foldable. This can quickly narrow
11876 * down a large class */
11877 _invlist_intersection(PL_utf8_foldable, cp_list,
11878 &fold_intersection);
11879 }
11880
11881 /* Now look at the foldable characters in this class individually */
11882 invlist_iterinit(fold_intersection);
11883 while (invlist_iternext(fold_intersection, &start, &end)) {
11884 UV j;
11885
11886 /* Locale folding for Latin1 characters is deferred until runtime */
11887 if (LOC && start < 256) {
11888 start = 256;
11889 }
11890
11891 /* Look at every character in the range */
11892 for (j = start; j <= end; j++) {
11893
11894 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
11895 STRLEN foldlen;
11896 UV f;
11897
11898 if (j < 256) {
11899
11900 /* We have the latin1 folding rules hard-coded here so that
11901 * an innocent-looking character class, like /[ks]/i won't
11902 * have to go out to disk to find the possible matches.
11903 * XXX It would be better to generate these via regen, in
11904 * case a new version of the Unicode standard adds new
11905 * mappings, though that is not really likely, and may be
11906 * caught by the default: case of the switch below. */
11907
11908 if (PL_fold_latin1[j] != j) {
11909
11910 /* ASCII is always matched; non-ASCII is matched only
11911 * under Unicode rules */
11912 if (isASCII(j) || AT_LEAST_UNI_SEMANTICS) {
11913 cp_list =
11914 add_cp_to_invlist(cp_list, PL_fold_latin1[j]);
11915 }
11916 else {
11917 depends_list =
11918 add_cp_to_invlist(depends_list, PL_fold_latin1[j]);
11919 }
11920 }
11921
11922 if (HAS_NONLATIN1_FOLD_CLOSURE(j)
11923 && (! isASCII(j) || ! MORE_ASCII_RESTRICTED))
11924 {
11925 /* Certain Latin1 characters have matches outside
11926 * Latin1, or are multi-character. To get here, 'j' is
11927 * one of those characters. None of these matches is
11928 * valid for ASCII characters under /aa, which is why
11929 * the 'if' just above excludes those. The matches
11930 * fall into three categories:
11931 * 1) They are singly folded-to or -from an above 255
11932 * character, e.g., LATIN SMALL LETTER Y WITH
11933 * DIAERESIS and LATIN CAPITAL LETTER Y WITH
11934 * DIAERESIS;
11935 * 2) They are part of a multi-char fold with another
11936 * latin1 character; only LATIN SMALL LETTER
11937 * SHARP S => "ss" fits this;
11938 * 3) They are part of a multi-char fold with a
11939 * character outside of Latin1, such as various
11940 * ligatures.
11941 * We aren't dealing fully with multi-char folds, except
11942 * we do deal with the pattern containing a character
11943 * that has a multi-char fold (not so much the inverse).
11944 * For types 1) and 3), the matches only happen when the
11945 * target string is utf8; that's not true for 2), and we
11946 * set a flag for it.
11947 *
11948 * The code below adds the single fold closures for 'j'
11949 * to the inversion list. */
11950 switch (j) {
11951 case 'k':
11952 case 'K':
11953 /* KELVIN SIGN */
11954 cp_list =
11955 add_cp_to_invlist(cp_list, 0x212A);
11956 break;
11957 case 's':
11958 case 'S':
11959 /* LATIN SMALL LETTER LONG S */
11960 cp_list =
11961 add_cp_to_invlist(cp_list, 0x017F);
11962 break;
11963 case MICRO_SIGN:
11964 cp_list = add_cp_to_invlist(cp_list,
11965 GREEK_SMALL_LETTER_MU);
11966 cp_list = add_cp_to_invlist(cp_list,
11967 GREEK_CAPITAL_LETTER_MU);
11968 break;
11969 case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE:
11970 case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE:
11971 /* ANGSTROM SIGN */
11972 cp_list =
11973 add_cp_to_invlist(cp_list, 0x212B);
11974 break;
11975 case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
11976 cp_list = add_cp_to_invlist(cp_list,
11977 LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS);
11978 break;
11979 case LATIN_SMALL_LETTER_SHARP_S:
11980 cp_list = add_cp_to_invlist(cp_list,
11981 LATIN_CAPITAL_LETTER_SHARP_S);
11982
11983 /* Under /a, /d, and /u, this can match the two
11984 * chars "ss" */
11985 if (! MORE_ASCII_RESTRICTED) {
11986 add_alternate(&unicode_alternate,
11987 (U8 *) "ss", 2);
11988
11989 /* And under /u or /a, it can match even if
11990 * the target is not utf8 */
11991 if (AT_LEAST_UNI_SEMANTICS) {
11992 ANYOF_FLAGS(ret) |=
11993 ANYOF_NONBITMAP_NON_UTF8;
11994 }
11995 }
11996 break;
11997 case 'F': case 'f':
11998 case 'I': case 'i':
11999 case 'L': case 'l':
12000 case 'T': case 't':
12001 case 'A': case 'a':
12002 case 'H': case 'h':
12003 case 'J': case 'j':
12004 case 'N': case 'n':
12005 case 'W': case 'w':
12006 case 'Y': case 'y':
12007 /* These all are targets of multi-character
12008 * folds from code points that require UTF8 to
12009 * express, so they can't match unless the
12010 * target string is in UTF-8, so no action here
12011 * is necessary, as regexec.c properly handles
12012 * the general case for UTF-8 matching */
12013 break;
12014 default:
12015 /* Use deprecated warning to increase the
12016 * chances of this being output */
12017 ckWARN2regdep(RExC_parse, "Perl folding rules are not up-to-date for 0x%"UVXf"; please use the perlbug utility to report;", j);
12018 break;
12019 }
12020 }
12021 continue;
12022 }
12023
12024 /* Here is an above Latin1 character. We don't have the rules
12025 * hard-coded for it. First, get its fold */
12026 f = _to_uni_fold_flags(j, foldbuf, &foldlen,
12027 ((allow_full_fold) ? FOLD_FLAGS_FULL : 0)
12028 | ((LOC)
12029 ? FOLD_FLAGS_LOCALE
12030 : (MORE_ASCII_RESTRICTED)
12031 ? FOLD_FLAGS_NOMIX_ASCII
12032 : 0));
12033
12034 if (foldlen > (STRLEN)UNISKIP(f)) {
12035
12036 /* Any multicharacter foldings (disallowed in lookbehind
12037 * patterns) require the following transform: [ABCDEF] ->
12038 * (?:[ABCabcDEFd]|pq|rst) where E folds into "pq" and F
12039 * folds into "rst", all other characters fold to single
12040 * characters. We save away these multicharacter foldings,
12041 * to be later saved as part of the additional "s" data. */
12042 if (! RExC_in_lookbehind) {
12043 U8* loc = foldbuf;
12044 U8* e = foldbuf + foldlen;
12045
12046 /* If any of the folded characters of this are in the
12047 * Latin1 range, tell the regex engine that this can
12048 * match a non-utf8 target string. */
12049 while (loc < e) {
12050 if (UTF8_IS_INVARIANT(*loc)
12051 || UTF8_IS_DOWNGRADEABLE_START(*loc))
12052 {
12053 ANYOF_FLAGS(ret)
12054 |= ANYOF_NONBITMAP_NON_UTF8;
12055 break;
12056 }
12057 loc += UTF8SKIP(loc);
12058 }
12059
12060 add_alternate(&unicode_alternate, foldbuf, foldlen);
12061 }
12062 }
12063 else {
12064 /* Single character fold of above Latin1. Add everything
12065 * in its fold closure to the list that this node should
12066 * match */
12067 SV** listp;
12068
12069 /* The fold closures data structure is a hash with the keys
12070 * being every character that is folded to, like 'k', and
12071 * the values each an array of everything that folds to its
12072 * key. e.g. [ 'k', 'K', KELVIN_SIGN ] */
12073 if ((listp = hv_fetch(PL_utf8_foldclosures,
12074 (char *) foldbuf, foldlen, FALSE)))
12075 {
12076 AV* list = (AV*) *listp;
12077 IV k;
12078 for (k = 0; k <= av_len(list); k++) {
12079 SV** c_p = av_fetch(list, k, FALSE);
12080 UV c;
12081 if (c_p == NULL) {
12082 Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
12083 }
12084 c = SvUV(*c_p);
12085
12086 /* /aa doesn't allow folds between ASCII and non-;
12087 * /l doesn't allow them between above and below
12088 * 256 */
12089 if ((MORE_ASCII_RESTRICTED && (isASCII(c) != isASCII(j)))
12090 || (LOC && ((c < 256) != (j < 256))))
12091 {
12092 continue;
12093 }
12094
12095 /* Folds involving non-ascii Latin1 characters
12096 * under /d are added to a separate list */
12097 if (isASCII(c) || c > 255 || AT_LEAST_UNI_SEMANTICS)
12098 {
12099 cp_list = add_cp_to_invlist(cp_list, c);
12100 }
12101 else {
12102 depends_list = add_cp_to_invlist(depends_list, c);
12103 }
12104 }
12105 }
12106 }
12107 }
12108 }
12109 SvREFCNT_dec(fold_intersection);
12110 }
12111
12112 /* And combine the result (if any) with any inversion list from properties.
12113 * The lists are kept separate up to now because we don't want to fold the
12114 * properties */
12115 if (properties) {
12116 if (AT_LEAST_UNI_SEMANTICS) {
12117 if (cp_list) {
12118 _invlist_union(cp_list, properties, &cp_list);
12119 SvREFCNT_dec(properties);
12120 }
12121 else {
12122 cp_list = properties;
12123 }
12124 }
12125 else {
12126
12127 /* Under /d, we put the things that match only when the target
12128 * string is utf8, into a separate list */
12129 SV* nonascii_but_latin1_properties = NULL;
12130 _invlist_intersection(properties, PL_Latin1,
12131 &nonascii_but_latin1_properties);
12132 _invlist_subtract(nonascii_but_latin1_properties, PL_ASCII,
12133 &nonascii_but_latin1_properties);
12134 _invlist_subtract(properties, nonascii_but_latin1_properties,
12135 &properties);
12136 if (cp_list) {
12137 _invlist_union(cp_list, properties, &cp_list);
12138 SvREFCNT_dec(properties);
12139 }
12140 else {
12141 cp_list = properties;
12142 }
12143
12144 if (depends_list) {
12145 _invlist_union(depends_list, nonascii_but_latin1_properties,
12146 &depends_list);
12147 SvREFCNT_dec(nonascii_but_latin1_properties);
12148 }
12149 else {
12150 depends_list = nonascii_but_latin1_properties;
12151 }
12152 }
12153 }
12154
12155 /* Here, we have calculated what code points should be in the character
12156 * class.
12157 *
12158 * Now we can see about various optimizations. Fold calculation (which we
12159 * did above) needs to take place before inversion. Otherwise /[^k]/i
12160 * would invert to include K, which under /i would match k, which it
12161 * shouldn't. */
12162
12163 /* Optimize inverted simple patterns (e.g. [^a-z]). Note that we haven't
12164 * set the FOLD flag yet, so this does optimize those. It doesn't
12165 * optimize locale. Doing so perhaps could be done as long as there is
12166 * nothing like \w in it; some thought also would have to be given to the
12167 * interaction with above 0x100 chars */
12168 if ((ANYOF_FLAGS(ret) & ANYOF_INVERT)
12169 && ! LOC
12170 && ! depends_list
12171 && ! unicode_alternate
12172 && SvCUR(listsv) == initial_listsv_len)
12173 {
12174 _invlist_invert(cp_list);
12175
12176 /* Any swash can't be used as-is, because we've inverted things */
12177 if (swash) {
12178 SvREFCNT_dec(swash);
12179 swash = NULL;
12180 }
12181
12182 /* Clear the invert flag since have just done it here */
12183 ANYOF_FLAGS(ret) &= ~ANYOF_INVERT;
12184 }
12185
12186 /* Here, <cp_list> contains all the code points we can determine at
12187 * compile time that match under all conditions. Go through it, and
12188 * for things that belong in the bitmap, put them there, and delete from
12189 * <cp_list> */
12190 if (cp_list) {
12191
12192 /* This gets set if we actually need to modify things */
12193 bool change_invlist = FALSE;
12194
12195 UV start, end;
12196
12197 /* Start looking through <cp_list> */
12198 invlist_iterinit(cp_list);
12199 while (invlist_iternext(cp_list, &start, &end)) {
12200 UV high;
12201 int i;
12202
12203 /* Quit if are above what we should change */
12204 if (start > 255) {
12205 break;
12206 }
12207
12208 change_invlist = TRUE;
12209
12210 /* Set all the bits in the range, up to the max that we are doing */
12211 high = (end < 255) ? end : 255;
12212 for (i = start; i <= (int) high; i++) {
12213 if (! ANYOF_BITMAP_TEST(ret, i)) {
12214 ANYOF_BITMAP_SET(ret, i);
12215 stored++;
12216 prevvalue = value;
12217 value = i;
12218 }
12219 }
12220 }
12221
12222 /* Done with loop; remove any code points that are in the bitmap from
12223 * <cp_list> */
12224 if (change_invlist) {
12225 _invlist_subtract(cp_list, PL_Latin1, &cp_list);
12226 }
12227
12228 /* If have completely emptied it, remove it completely */
12229 if (invlist_len(cp_list) == 0) {
12230 SvREFCNT_dec(cp_list);
12231 cp_list = NULL;
12232 }
12233 }
12234
12235 /* Combine the two lists into one. */
12236 if (depends_list) {
12237 if (cp_list) {
12238 _invlist_union(cp_list, depends_list, &cp_list);
12239 SvREFCNT_dec(depends_list);
12240 }
12241 else {
12242 cp_list = depends_list;
12243 }
12244 }
12245
12246 /* Folding in the bitmap is taken care of above, but not for locale (for
12247 * which we have to wait to see what folding is in effect at runtime), and
12248 * for some things not in the bitmap (only the upper latin folds in this
12249 * case, as all other single-char folding has been set above). Set
12250 * run-time fold flag for these */
12251 if (FOLD && (LOC
12252 || (DEPENDS_SEMANTICS
12253 && cp_list
12254 && ! (ANYOF_FLAGS(ret) & ANYOF_NONBITMAP_NON_UTF8))
12255 || unicode_alternate))
12256 {
12257 ANYOF_FLAGS(ret) |= ANYOF_LOC_NONBITMAP_FOLD;
12258 }
12259
12260 /* A single character class can be "optimized" into an EXACTish node.
12261 * Note that since we don't currently count how many characters there are
12262 * outside the bitmap, we are XXX missing optimization possibilities for
12263 * them. This optimization can't happen unless this is a truly single
12264 * character class, which means that it can't be an inversion into a
12265 * many-character class, and there must be no possibility of there being
12266 * things outside the bitmap. 'stored' (only) for locales doesn't include
12267 * \w, etc, so have to make a special test that they aren't present
12268 *
12269 * Similarly A 2-character class of the very special form like [bB] can be
12270 * optimized into an EXACTFish node, but only for non-locales, and for
12271 * characters which only have the two folds; so things like 'fF' and 'Ii'
12272 * wouldn't work because they are part of the fold of 'LATIN SMALL LIGATURE
12273 * FI'. */
12274 if (! cp_list
12275 && ! unicode_alternate
12276 && SvCUR(listsv) == initial_listsv_len
12277 && ! (ANYOF_FLAGS(ret) & (ANYOF_INVERT|ANYOF_UNICODE_ALL))
12278 && (((stored == 1 && ((! (ANYOF_FLAGS(ret) & ANYOF_LOCALE))
12279 || (! ANYOF_CLASS_TEST_ANY_SET(ret)))))
12280 || (stored == 2 && ((! (ANYOF_FLAGS(ret) & ANYOF_LOCALE))
12281 && (! _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(value))
12282 /* If the latest code point has a fold whose
12283 * bit is set, it must be the only other one */
12284 && ((prevvalue = PL_fold_latin1[value]) != (IV)value)
12285 && ANYOF_BITMAP_TEST(ret, prevvalue)))))
12286 {
12287 /* Note that the information needed to decide to do this optimization
12288 * is not currently available until the 2nd pass, and that the actually
12289 * used EXACTish node takes less space than the calculated ANYOF node,
12290 * and hence the amount of space calculated in the first pass is larger
12291 * than actually used, so this optimization doesn't gain us any space.
12292 * But an EXACT node is faster than an ANYOF node, and can be combined
12293 * with any adjacent EXACT nodes later by the optimizer for further
12294 * gains. The speed of executing an EXACTF is similar to an ANYOF
12295 * node, so the optimization advantage comes from the ability to join
12296 * it to adjacent EXACT nodes */
12297
12298 const char * cur_parse= RExC_parse;
12299 U8 op;
12300 RExC_emit = (regnode *)orig_emit;
12301 RExC_parse = (char *)orig_parse;
12302
12303 if (stored == 1) {
12304
12305 /* A locale node with one point can be folded; all the other cases
12306 * with folding will have two points, since we calculate them above
12307 */
12308 if (ANYOF_FLAGS(ret) & ANYOF_LOC_NONBITMAP_FOLD) {
12309 op = EXACTFL;
12310 }
12311 else {
12312 op = EXACT;
12313 }
12314 }
12315 else { /* else 2 chars in the bit map: the folds of each other */
12316
12317 /* Use the folded value, which for the cases where we get here,
12318 * is just the lower case of the current one (which may resolve to
12319 * itself, or to the other one */
12320 value = toLOWER_LATIN1(value);
12321
12322 /* To join adjacent nodes, they must be the exact EXACTish type.
12323 * Try to use the most likely type, by using EXACTFA if possible,
12324 * then EXACTFU if the regex calls for it, or is required because
12325 * the character is non-ASCII. (If <value> is ASCII, its fold is
12326 * also ASCII for the cases where we get here.) */
12327 if (MORE_ASCII_RESTRICTED && isASCII(value)) {
12328 op = EXACTFA;
12329 }
12330 else if (AT_LEAST_UNI_SEMANTICS || !isASCII(value)) {
12331 op = EXACTFU;
12332 }
12333 else { /* Otherwise, more likely to be EXACTF type */
12334 op = EXACTF;
12335 }
12336 }
12337
12338 ret = reg_node(pRExC_state, op);
12339 RExC_parse = (char *)cur_parse;
12340 if (UTF && ! NATIVE_IS_INVARIANT(value)) {
12341 *STRING(ret)= UTF8_EIGHT_BIT_HI((U8) value);
12342 *(STRING(ret) + 1)= UTF8_EIGHT_BIT_LO((U8) value);
12343 STR_LEN(ret)= 2;
12344 RExC_emit += STR_SZ(2);
12345 }
12346 else {
12347 *STRING(ret)= (char)value;
12348 STR_LEN(ret)= 1;
12349 RExC_emit += STR_SZ(1);
12350 }
12351 SvREFCNT_dec(listsv);
12352 return ret;
12353 }
12354
12355 /* If there is a swash and more than one element, we can't use the swash in
12356 * the optimization below. */
12357 if (swash && element_count > 1) {
12358 SvREFCNT_dec(swash);
12359 swash = NULL;
12360 }
12361 if (! cp_list
12362 && SvCUR(listsv) == initial_listsv_len
12363 && ! unicode_alternate)
12364 {
12365 ARG_SET(ret, ANYOF_NONBITMAP_EMPTY);
12366 SvREFCNT_dec(listsv);
12367 SvREFCNT_dec(unicode_alternate);
12368 }
12369 else {
12370 /* av[0] stores the character class description in its textual form:
12371 * used later (regexec.c:Perl_regclass_swash()) to initialize the
12372 * appropriate swash, and is also useful for dumping the regnode.
12373 * av[1] if NULL, is a placeholder to later contain the swash computed
12374 * from av[0]. But if no further computation need be done, the
12375 * swash is stored there now.
12376 * av[2] stores the multicharacter foldings, used later in
12377 * regexec.c:S_reginclass().
12378 * av[3] stores the cp_list inversion list for use in addition or
12379 * instead of av[0]; not used if av[1] isn't NULL
12380 * av[4] is set if any component of the class is from a user-defined
12381 * property; not used if av[1] isn't NULL */
12382 AV * const av = newAV();
12383 SV *rv;
12384
12385 av_store(av, 0, (SvCUR(listsv) == initial_listsv_len)
12386 ? &PL_sv_undef
12387 : listsv);
12388 if (swash) {
12389 av_store(av, 1, swash);
12390 SvREFCNT_dec(cp_list);
12391 }
12392 else {
12393 av_store(av, 1, NULL);
12394 if (cp_list) {
12395 av_store(av, 3, cp_list);
12396 av_store(av, 4, newSVuv(has_user_defined_property));
12397 }
12398 }
12399
12400 /* Store any computed multi-char folds only if we are allowing
12401 * them */
12402 if (allow_full_fold) {
12403 av_store(av, 2, MUTABLE_SV(unicode_alternate));
12404 if (unicode_alternate) { /* This node is variable length */
12405 OP(ret) = ANYOFV;
12406 }
12407 }
12408 else {
12409 av_store(av, 2, NULL);
12410 }
12411 rv = newRV_noinc(MUTABLE_SV(av));
12412 n = add_data(pRExC_state, 1, "s");
12413 RExC_rxi->data->data[n] = (void*)rv;
12414 ARG_SET(ret, n);
12415 }
12416 return ret;
12417}
12418
12419
12420/* reg_skipcomment()
12421
12422 Absorbs an /x style # comments from the input stream.
12423 Returns true if there is more text remaining in the stream.
12424 Will set the REG_SEEN_RUN_ON_COMMENT flag if the comment
12425 terminates the pattern without including a newline.
12426
12427 Note its the callers responsibility to ensure that we are
12428 actually in /x mode
12429
12430*/
12431
12432STATIC bool
12433S_reg_skipcomment(pTHX_ RExC_state_t *pRExC_state)
12434{
12435 bool ended = 0;
12436
12437 PERL_ARGS_ASSERT_REG_SKIPCOMMENT;
12438
12439 while (RExC_parse < RExC_end)
12440 if (*RExC_parse++ == '\n') {
12441 ended = 1;
12442 break;
12443 }
12444 if (!ended) {
12445 /* we ran off the end of the pattern without ending
12446 the comment, so we have to add an \n when wrapping */
12447 RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
12448 return 0;
12449 } else
12450 return 1;
12451}
12452
12453/* nextchar()
12454
12455 Advances the parse position, and optionally absorbs
12456 "whitespace" from the inputstream.
12457
12458 Without /x "whitespace" means (?#...) style comments only,
12459 with /x this means (?#...) and # comments and whitespace proper.
12460
12461 Returns the RExC_parse point from BEFORE the scan occurs.
12462
12463 This is the /x friendly way of saying RExC_parse++.
12464*/
12465
12466STATIC char*
12467S_nextchar(pTHX_ RExC_state_t *pRExC_state)
12468{
12469 char* const retval = RExC_parse++;
12470
12471 PERL_ARGS_ASSERT_NEXTCHAR;
12472
12473 for (;;) {
12474 if (RExC_end - RExC_parse >= 3
12475 && *RExC_parse == '('
12476 && RExC_parse[1] == '?'
12477 && RExC_parse[2] == '#')
12478 {
12479 while (*RExC_parse != ')') {
12480 if (RExC_parse == RExC_end)
12481 FAIL("Sequence (?#... not terminated");
12482 RExC_parse++;
12483 }
12484 RExC_parse++;
12485 continue;
12486 }
12487 if (RExC_flags & RXf_PMf_EXTENDED) {
12488 if (isSPACE(*RExC_parse)) {
12489 RExC_parse++;
12490 continue;
12491 }
12492 else if (*RExC_parse == '#') {
12493 if ( reg_skipcomment( pRExC_state ) )
12494 continue;
12495 }
12496 }
12497 return retval;
12498 }
12499}
12500
12501/*
12502- reg_node - emit a node
12503*/
12504STATIC regnode * /* Location. */
12505S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
12506{
12507 dVAR;
12508 register regnode *ptr;
12509 regnode * const ret = RExC_emit;
12510 GET_RE_DEBUG_FLAGS_DECL;
12511
12512 PERL_ARGS_ASSERT_REG_NODE;
12513
12514 if (SIZE_ONLY) {
12515 SIZE_ALIGN(RExC_size);
12516 RExC_size += 1;
12517 return(ret);
12518 }
12519 if (RExC_emit >= RExC_emit_bound)
12520 Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p",
12521 op, RExC_emit, RExC_emit_bound);
12522
12523 NODE_ALIGN_FILL(ret);
12524 ptr = ret;
12525 FILL_ADVANCE_NODE(ptr, op);
12526#ifdef RE_TRACK_PATTERN_OFFSETS
12527 if (RExC_offsets) { /* MJD */
12528 MJD_OFFSET_DEBUG(("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
12529 "reg_node", __LINE__,
12530 PL_reg_name[op],
12531 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
12532 ? "Overwriting end of array!\n" : "OK",
12533 (UV)(RExC_emit - RExC_emit_start),
12534 (UV)(RExC_parse - RExC_start),
12535 (UV)RExC_offsets[0]));
12536 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
12537 }
12538#endif
12539 RExC_emit = ptr;
12540 return(ret);
12541}
12542
12543/*
12544- reganode - emit a node with an argument
12545*/
12546STATIC regnode * /* Location. */
12547S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
12548{
12549 dVAR;
12550 register regnode *ptr;
12551 regnode * const ret = RExC_emit;
12552 GET_RE_DEBUG_FLAGS_DECL;
12553
12554 PERL_ARGS_ASSERT_REGANODE;
12555
12556 if (SIZE_ONLY) {
12557 SIZE_ALIGN(RExC_size);
12558 RExC_size += 2;
12559 /*
12560 We can't do this:
12561
12562 assert(2==regarglen[op]+1);
12563
12564 Anything larger than this has to allocate the extra amount.
12565 If we changed this to be:
12566
12567 RExC_size += (1 + regarglen[op]);
12568
12569 then it wouldn't matter. Its not clear what side effect
12570 might come from that so its not done so far.
12571 -- dmq
12572 */
12573 return(ret);
12574 }
12575 if (RExC_emit >= RExC_emit_bound)
12576 Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p",
12577 op, RExC_emit, RExC_emit_bound);
12578
12579 NODE_ALIGN_FILL(ret);
12580 ptr = ret;
12581 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
12582#ifdef RE_TRACK_PATTERN_OFFSETS
12583 if (RExC_offsets) { /* MJD */
12584 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
12585 "reganode",
12586 __LINE__,
12587 PL_reg_name[op],
12588 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ?
12589 "Overwriting end of array!\n" : "OK",
12590 (UV)(RExC_emit - RExC_emit_start),
12591 (UV)(RExC_parse - RExC_start),
12592 (UV)RExC_offsets[0]));
12593 Set_Cur_Node_Offset;
12594 }
12595#endif
12596 RExC_emit = ptr;
12597 return(ret);
12598}
12599
12600/*
12601- reguni - emit (if appropriate) a Unicode character
12602*/
12603STATIC STRLEN
12604S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
12605{
12606 dVAR;
12607
12608 PERL_ARGS_ASSERT_REGUNI;
12609
12610 return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
12611}
12612
12613/*
12614- reginsert - insert an operator in front of already-emitted operand
12615*
12616* Means relocating the operand.
12617*/
12618STATIC void
12619S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
12620{
12621 dVAR;
12622 register regnode *src;
12623 register regnode *dst;
12624 register regnode *place;
12625 const int offset = regarglen[(U8)op];
12626 const int size = NODE_STEP_REGNODE + offset;
12627 GET_RE_DEBUG_FLAGS_DECL;
12628
12629 PERL_ARGS_ASSERT_REGINSERT;
12630 PERL_UNUSED_ARG(depth);
12631/* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
12632 DEBUG_PARSE_FMT("inst"," - %s",PL_reg_name[op]);
12633 if (SIZE_ONLY) {
12634 RExC_size += size;
12635 return;
12636 }
12637
12638 src = RExC_emit;
12639 RExC_emit += size;
12640 dst = RExC_emit;
12641 if (RExC_open_parens) {
12642 int paren;
12643 /*DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);*/
12644 for ( paren=0 ; paren < RExC_npar ; paren++ ) {
12645 if ( RExC_open_parens[paren] >= opnd ) {
12646 /*DEBUG_PARSE_FMT("open"," - %d",size);*/
12647 RExC_open_parens[paren] += size;
12648 } else {
12649 /*DEBUG_PARSE_FMT("open"," - %s","ok");*/
12650 }
12651 if ( RExC_close_parens[paren] >= opnd ) {
12652 /*DEBUG_PARSE_FMT("close"," - %d",size);*/
12653 RExC_close_parens[paren] += size;
12654 } else {
12655 /*DEBUG_PARSE_FMT("close"," - %s","ok");*/
12656 }
12657 }
12658 }
12659
12660 while (src > opnd) {
12661 StructCopy(--src, --dst, regnode);
12662#ifdef RE_TRACK_PATTERN_OFFSETS
12663 if (RExC_offsets) { /* MJD 20010112 */
12664 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
12665 "reg_insert",
12666 __LINE__,
12667 PL_reg_name[op],
12668 (UV)(dst - RExC_emit_start) > RExC_offsets[0]
12669 ? "Overwriting end of array!\n" : "OK",
12670 (UV)(src - RExC_emit_start),
12671 (UV)(dst - RExC_emit_start),
12672 (UV)RExC_offsets[0]));
12673 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
12674 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
12675 }
12676#endif
12677 }
12678
12679
12680 place = opnd; /* Op node, where operand used to be. */
12681#ifdef RE_TRACK_PATTERN_OFFSETS
12682 if (RExC_offsets) { /* MJD */
12683 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
12684 "reginsert",
12685 __LINE__,
12686 PL_reg_name[op],
12687 (UV)(place - RExC_emit_start) > RExC_offsets[0]
12688 ? "Overwriting end of array!\n" : "OK",
12689 (UV)(place - RExC_emit_start),
12690 (UV)(RExC_parse - RExC_start),
12691 (UV)RExC_offsets[0]));
12692 Set_Node_Offset(place, RExC_parse);
12693 Set_Node_Length(place, 1);
12694 }
12695#endif
12696 src = NEXTOPER(place);
12697 FILL_ADVANCE_NODE(place, op);
12698 Zero(src, offset, regnode);
12699}
12700
12701/*
12702- regtail - set the next-pointer at the end of a node chain of p to val.
12703- SEE ALSO: regtail_study
12704*/
12705/* TODO: All three parms should be const */
12706STATIC void
12707S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
12708{
12709 dVAR;
12710 register regnode *scan;
12711 GET_RE_DEBUG_FLAGS_DECL;
12712
12713 PERL_ARGS_ASSERT_REGTAIL;
12714#ifndef DEBUGGING
12715 PERL_UNUSED_ARG(depth);
12716#endif
12717
12718 if (SIZE_ONLY)
12719 return;
12720
12721 /* Find last node. */
12722 scan = p;
12723 for (;;) {
12724 regnode * const temp = regnext(scan);
12725 DEBUG_PARSE_r({
12726 SV * const mysv=sv_newmortal();
12727 DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
12728 regprop(RExC_rx, mysv, scan);
12729 PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n",
12730 SvPV_nolen_const(mysv), REG_NODE_NUM(scan),
12731 (temp == NULL ? "->" : ""),
12732 (temp == NULL ? PL_reg_name[OP(val)] : "")
12733 );
12734 });
12735 if (temp == NULL)
12736 break;
12737 scan = temp;
12738 }
12739
12740 if (reg_off_by_arg[OP(scan)]) {
12741 ARG_SET(scan, val - scan);
12742 }
12743 else {
12744 NEXT_OFF(scan) = val - scan;
12745 }
12746}
12747
12748#ifdef DEBUGGING
12749/*
12750- regtail_study - set the next-pointer at the end of a node chain of p to val.
12751- Look for optimizable sequences at the same time.
12752- currently only looks for EXACT chains.
12753
12754This is experimental code. The idea is to use this routine to perform
12755in place optimizations on branches and groups as they are constructed,
12756with the long term intention of removing optimization from study_chunk so
12757that it is purely analytical.
12758
12759Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
12760to control which is which.
12761
12762*/
12763/* TODO: All four parms should be const */
12764
12765STATIC U8
12766S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
12767{
12768 dVAR;
12769 register regnode *scan;
12770 U8 exact = PSEUDO;
12771#ifdef EXPERIMENTAL_INPLACESCAN
12772 I32 min = 0;
12773#endif
12774 GET_RE_DEBUG_FLAGS_DECL;
12775
12776 PERL_ARGS_ASSERT_REGTAIL_STUDY;
12777
12778
12779 if (SIZE_ONLY)
12780 return exact;
12781
12782 /* Find last node. */
12783
12784 scan = p;
12785 for (;;) {
12786 regnode * const temp = regnext(scan);
12787#ifdef EXPERIMENTAL_INPLACESCAN
12788 if (PL_regkind[OP(scan)] == EXACT) {
12789 bool has_exactf_sharp_s; /* Unexamined in this routine */
12790 if (join_exact(pRExC_state,scan,&min, &has_exactf_sharp_s, 1,val,depth+1))
12791 return EXACT;
12792 }
12793#endif
12794 if ( exact ) {
12795 switch (OP(scan)) {
12796 case EXACT:
12797 case EXACTF:
12798 case EXACTFA:
12799 case EXACTFU:
12800 case EXACTFU_SS:
12801 case EXACTFU_TRICKYFOLD:
12802 case EXACTFL:
12803 if( exact == PSEUDO )
12804 exact= OP(scan);
12805 else if ( exact != OP(scan) )
12806 exact= 0;
12807 case NOTHING:
12808 break;
12809 default:
12810 exact= 0;
12811 }
12812 }
12813 DEBUG_PARSE_r({
12814 SV * const mysv=sv_newmortal();
12815 DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
12816 regprop(RExC_rx, mysv, scan);
12817 PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n",
12818 SvPV_nolen_const(mysv),
12819 REG_NODE_NUM(scan),
12820 PL_reg_name[exact]);
12821 });
12822 if (temp == NULL)
12823 break;
12824 scan = temp;
12825 }
12826 DEBUG_PARSE_r({
12827 SV * const mysv_val=sv_newmortal();
12828 DEBUG_PARSE_MSG("");
12829 regprop(RExC_rx, mysv_val, val);
12830 PerlIO_printf(Perl_debug_log, "~ attach to %s (%"IVdf") offset to %"IVdf"\n",
12831 SvPV_nolen_const(mysv_val),
12832 (IV)REG_NODE_NUM(val),
12833 (IV)(val - scan)
12834 );
12835 });
12836 if (reg_off_by_arg[OP(scan)]) {
12837 ARG_SET(scan, val - scan);
12838 }
12839 else {
12840 NEXT_OFF(scan) = val - scan;
12841 }
12842
12843 return exact;
12844}
12845#endif
12846
12847/*
12848 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
12849 */
12850#ifdef DEBUGGING
12851static void
12852S_regdump_extflags(pTHX_ const char *lead, const U32 flags)
12853{
12854 int bit;
12855 int set=0;
12856 regex_charset cs;
12857
12858 for (bit=0; bit<32; bit++) {
12859 if (flags & (1<<bit)) {
12860 if ((1<<bit) & RXf_PMf_CHARSET) { /* Output separately, below */
12861 continue;
12862 }
12863 if (!set++ && lead)
12864 PerlIO_printf(Perl_debug_log, "%s",lead);
12865 PerlIO_printf(Perl_debug_log, "%s ",PL_reg_extflags_name[bit]);
12866 }
12867 }
12868 if ((cs = get_regex_charset(flags)) != REGEX_DEPENDS_CHARSET) {
12869 if (!set++ && lead) {
12870 PerlIO_printf(Perl_debug_log, "%s",lead);
12871 }
12872 switch (cs) {
12873 case REGEX_UNICODE_CHARSET:
12874 PerlIO_printf(Perl_debug_log, "UNICODE");
12875 break;
12876 case REGEX_LOCALE_CHARSET:
12877 PerlIO_printf(Perl_debug_log, "LOCALE");
12878 break;
12879 case REGEX_ASCII_RESTRICTED_CHARSET:
12880 PerlIO_printf(Perl_debug_log, "ASCII-RESTRICTED");
12881 break;
12882 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
12883 PerlIO_printf(Perl_debug_log, "ASCII-MORE_RESTRICTED");
12884 break;
12885 default:
12886 PerlIO_printf(Perl_debug_log, "UNKNOWN CHARACTER SET");
12887 break;
12888 }
12889 }
12890 if (lead) {
12891 if (set)
12892 PerlIO_printf(Perl_debug_log, "\n");
12893 else
12894 PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
12895 }
12896}
12897#endif
12898
12899void
12900Perl_regdump(pTHX_ const regexp *r)
12901{
12902#ifdef DEBUGGING
12903 dVAR;
12904 SV * const sv = sv_newmortal();
12905 SV *dsv= sv_newmortal();
12906 RXi_GET_DECL(r,ri);
12907 GET_RE_DEBUG_FLAGS_DECL;
12908
12909 PERL_ARGS_ASSERT_REGDUMP;
12910
12911 (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
12912
12913 /* Header fields of interest. */
12914 if (r->anchored_substr) {
12915 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
12916 RE_SV_DUMPLEN(r->anchored_substr), 30);
12917 PerlIO_printf(Perl_debug_log,
12918 "anchored %s%s at %"IVdf" ",
12919 s, RE_SV_TAIL(r->anchored_substr),
12920 (IV)r->anchored_offset);
12921 } else if (r->anchored_utf8) {
12922 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
12923 RE_SV_DUMPLEN(r->anchored_utf8), 30);
12924 PerlIO_printf(Perl_debug_log,
12925 "anchored utf8 %s%s at %"IVdf" ",
12926 s, RE_SV_TAIL(r->anchored_utf8),
12927 (IV)r->anchored_offset);
12928 }
12929 if (r->float_substr) {
12930 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
12931 RE_SV_DUMPLEN(r->float_substr), 30);
12932 PerlIO_printf(Perl_debug_log,
12933 "floating %s%s at %"IVdf"..%"UVuf" ",
12934 s, RE_SV_TAIL(r->float_substr),
12935 (IV)r->float_min_offset, (UV)r->float_max_offset);
12936 } else if (r->float_utf8) {
12937 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
12938 RE_SV_DUMPLEN(r->float_utf8), 30);
12939 PerlIO_printf(Perl_debug_log,
12940 "floating utf8 %s%s at %"IVdf"..%"UVuf" ",
12941 s, RE_SV_TAIL(r->float_utf8),
12942 (IV)r->float_min_offset, (UV)r->float_max_offset);
12943 }
12944 if (r->check_substr || r->check_utf8)
12945 PerlIO_printf(Perl_debug_log,
12946 (const char *)
12947 (r->check_substr == r->float_substr
12948 && r->check_utf8 == r->float_utf8
12949 ? "(checking floating" : "(checking anchored"));
12950 if (r->extflags & RXf_NOSCAN)
12951 PerlIO_printf(Perl_debug_log, " noscan");
12952 if (r->extflags & RXf_CHECK_ALL)
12953 PerlIO_printf(Perl_debug_log, " isall");
12954 if (r->check_substr || r->check_utf8)
12955 PerlIO_printf(Perl_debug_log, ") ");
12956
12957 if (ri->regstclass) {
12958 regprop(r, sv, ri->regstclass);
12959 PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
12960 }
12961 if (r->extflags & RXf_ANCH) {
12962 PerlIO_printf(Perl_debug_log, "anchored");
12963 if (r->extflags & RXf_ANCH_BOL)
12964 PerlIO_printf(Perl_debug_log, "(BOL)");
12965 if (r->extflags & RXf_ANCH_MBOL)
12966 PerlIO_printf(Perl_debug_log, "(MBOL)");
12967 if (r->extflags & RXf_ANCH_SBOL)
12968 PerlIO_printf(Perl_debug_log, "(SBOL)");
12969 if (r->extflags & RXf_ANCH_GPOS)
12970 PerlIO_printf(Perl_debug_log, "(GPOS)");
12971 PerlIO_putc(Perl_debug_log, ' ');
12972 }
12973 if (r->extflags & RXf_GPOS_SEEN)
12974 PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", (UV)r->gofs);
12975 if (r->intflags & PREGf_SKIP)
12976 PerlIO_printf(Perl_debug_log, "plus ");
12977 if (r->intflags & PREGf_IMPLICIT)
12978 PerlIO_printf(Perl_debug_log, "implicit ");
12979 PerlIO_printf(Perl_debug_log, "minlen %"IVdf" ", (IV)r->minlen);
12980 if (r->extflags & RXf_EVAL_SEEN)
12981 PerlIO_printf(Perl_debug_log, "with eval ");
12982 PerlIO_printf(Perl_debug_log, "\n");
12983 DEBUG_FLAGS_r(regdump_extflags("r->extflags: ",r->extflags));
12984#else
12985 PERL_ARGS_ASSERT_REGDUMP;
12986 PERL_UNUSED_CONTEXT;
12987 PERL_UNUSED_ARG(r);
12988#endif /* DEBUGGING */
12989}
12990
12991/*
12992- regprop - printable representation of opcode
12993*/
12994#define EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags) \
12995STMT_START { \
12996 if (do_sep) { \
12997 Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]); \
12998 if (flags & ANYOF_INVERT) \
12999 /*make sure the invert info is in each */ \
13000 sv_catpvs(sv, "^"); \
13001 do_sep = 0; \
13002 } \
13003} STMT_END
13004
13005void
13006Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
13007{
13008#ifdef DEBUGGING
13009 dVAR;
13010 register int k;
13011 RXi_GET_DECL(prog,progi);
13012 GET_RE_DEBUG_FLAGS_DECL;
13013
13014 PERL_ARGS_ASSERT_REGPROP;
13015
13016 sv_setpvs(sv, "");
13017
13018 if (OP(o) > REGNODE_MAX) /* regnode.type is unsigned */
13019 /* It would be nice to FAIL() here, but this may be called from
13020 regexec.c, and it would be hard to supply pRExC_state. */
13021 Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(o), (int)REGNODE_MAX);
13022 sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */
13023
13024 k = PL_regkind[OP(o)];
13025
13026 if (k == EXACT) {
13027 sv_catpvs(sv, " ");
13028 /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
13029 * is a crude hack but it may be the best for now since
13030 * we have no flag "this EXACTish node was UTF-8"
13031 * --jhi */
13032 pv_pretty(sv, STRING(o), STR_LEN(o), 60, PL_colors[0], PL_colors[1],
13033 PERL_PV_ESCAPE_UNI_DETECT |
13034 PERL_PV_ESCAPE_NONASCII |
13035 PERL_PV_PRETTY_ELLIPSES |
13036 PERL_PV_PRETTY_LTGT |
13037 PERL_PV_PRETTY_NOCLEAR
13038 );
13039 } else if (k == TRIE) {
13040 /* print the details of the trie in dumpuntil instead, as
13041 * progi->data isn't available here */
13042 const char op = OP(o);
13043 const U32 n = ARG(o);
13044 const reg_ac_data * const ac = IS_TRIE_AC(op) ?
13045 (reg_ac_data *)progi->data->data[n] :
13046 NULL;
13047 const reg_trie_data * const trie
13048 = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
13049
13050 Perl_sv_catpvf(aTHX_ sv, "-%s",PL_reg_name[o->flags]);
13051 DEBUG_TRIE_COMPILE_r(
13052 Perl_sv_catpvf(aTHX_ sv,
13053 "<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
13054 (UV)trie->startstate,
13055 (IV)trie->statecount-1, /* -1 because of the unused 0 element */
13056 (UV)trie->wordcount,
13057 (UV)trie->minlen,
13058 (UV)trie->maxlen,
13059 (UV)TRIE_CHARCOUNT(trie),
13060 (UV)trie->uniquecharcount
13061 )
13062 );
13063 if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
13064 int i;
13065 int rangestart = -1;
13066 U8* bitmap = IS_ANYOF_TRIE(op) ? (U8*)ANYOF_BITMAP(o) : (U8*)TRIE_BITMAP(trie);
13067 sv_catpvs(sv, "[");
13068 for (i = 0; i <= 256; i++) {
13069 if (i < 256 && BITMAP_TEST(bitmap,i)) {
13070 if (rangestart == -1)
13071 rangestart = i;
13072 } else if (rangestart != -1) {
13073 if (i <= rangestart + 3)
13074 for (; rangestart < i; rangestart++)
13075 put_byte(sv, rangestart);
13076 else {
13077 put_byte(sv, rangestart);
13078 sv_catpvs(sv, "-");
13079 put_byte(sv, i - 1);
13080 }
13081 rangestart = -1;
13082 }
13083 }
13084 sv_catpvs(sv, "]");
13085 }
13086
13087 } else if (k == CURLY) {
13088 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
13089 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
13090 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
13091 }
13092 else if (k == WHILEM && o->flags) /* Ordinal/of */
13093 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
13094 else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP || OP(o)==ACCEPT) {
13095 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
13096 if ( RXp_PAREN_NAMES(prog) ) {
13097 if ( k != REF || (OP(o) < NREF)) {
13098 AV *list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
13099 SV **name= av_fetch(list, ARG(o), 0 );
13100 if (name)
13101 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
13102 }
13103 else {
13104 AV *list= MUTABLE_AV(progi->data->data[ progi->name_list_idx ]);
13105 SV *sv_dat= MUTABLE_SV(progi->data->data[ ARG( o ) ]);
13106 I32 *nums=(I32*)SvPVX(sv_dat);
13107 SV **name= av_fetch(list, nums[0], 0 );
13108 I32 n;
13109 if (name) {
13110 for ( n=0; n<SvIVX(sv_dat); n++ ) {
13111 Perl_sv_catpvf(aTHX_ sv, "%s%"IVdf,
13112 (n ? "," : ""), (IV)nums[n]);
13113 }
13114 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
13115 }
13116 }
13117 }
13118 } else if (k == GOSUB)
13119 Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o)); /* Paren and offset */
13120 else if (k == VERB) {
13121 if (!o->flags)
13122 Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
13123 SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ]))));
13124 } else if (k == LOGICAL)
13125 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
13126 else if (k == ANYOF) {
13127 int i, rangestart = -1;
13128 const U8 flags = ANYOF_FLAGS(o);
13129 int do_sep = 0;
13130
13131 /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
13132 static const char * const anyofs[] = {
13133 "\\w",
13134 "\\W",
13135 "\\s",
13136 "\\S",
13137 "\\d",
13138 "\\D",
13139 "[:alnum:]",
13140 "[:^alnum:]",
13141 "[:alpha:]",
13142 "[:^alpha:]",
13143 "[:ascii:]",
13144 "[:^ascii:]",
13145 "[:cntrl:]",
13146 "[:^cntrl:]",
13147 "[:graph:]",
13148 "[:^graph:]",
13149 "[:lower:]",
13150 "[:^lower:]",
13151 "[:print:]",
13152 "[:^print:]",
13153 "[:punct:]",
13154 "[:^punct:]",
13155 "[:upper:]",
13156 "[:^upper:]",
13157 "[:xdigit:]",
13158 "[:^xdigit:]",
13159 "[:space:]",
13160 "[:^space:]",
13161 "[:blank:]",
13162 "[:^blank:]"
13163 };
13164
13165 if (flags & ANYOF_LOCALE)
13166 sv_catpvs(sv, "{loc}");
13167 if (flags & ANYOF_LOC_NONBITMAP_FOLD)
13168 sv_catpvs(sv, "{i}");
13169 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
13170 if (flags & ANYOF_INVERT)
13171 sv_catpvs(sv, "^");
13172
13173 /* output what the standard cp 0-255 bitmap matches */
13174 for (i = 0; i <= 256; i++) {
13175 if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
13176 if (rangestart == -1)
13177 rangestart = i;
13178 } else if (rangestart != -1) {
13179 if (i <= rangestart + 3)
13180 for (; rangestart < i; rangestart++)
13181 put_byte(sv, rangestart);
13182 else {
13183 put_byte(sv, rangestart);
13184 sv_catpvs(sv, "-");
13185 put_byte(sv, i - 1);
13186 }
13187 do_sep = 1;
13188 rangestart = -1;
13189 }
13190 }
13191
13192 EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
13193 /* output any special charclass tests (used entirely under use locale) */
13194 if (ANYOF_CLASS_TEST_ANY_SET(o))
13195 for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++)
13196 if (ANYOF_CLASS_TEST(o,i)) {
13197 sv_catpv(sv, anyofs[i]);
13198 do_sep = 1;
13199 }
13200
13201 EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
13202
13203 if (flags & ANYOF_NON_UTF8_LATIN1_ALL) {
13204 sv_catpvs(sv, "{non-utf8-latin1-all}");
13205 }
13206
13207 /* output information about the unicode matching */
13208 if (flags & ANYOF_UNICODE_ALL)
13209 sv_catpvs(sv, "{unicode_all}");
13210 else if (ANYOF_NONBITMAP(o))
13211 sv_catpvs(sv, "{unicode}");
13212 if (flags & ANYOF_NONBITMAP_NON_UTF8)
13213 sv_catpvs(sv, "{outside bitmap}");
13214
13215 if (ANYOF_NONBITMAP(o)) {
13216 SV *lv; /* Set if there is something outside the bit map */
13217 SV * const sw = regclass_swash(prog, o, FALSE, &lv, 0);
13218 bool byte_output = FALSE; /* If something in the bitmap has been
13219 output */
13220
13221 if (lv && lv != &PL_sv_undef) {
13222 if (sw) {
13223 U8 s[UTF8_MAXBYTES_CASE+1];
13224
13225 for (i = 0; i <= 256; i++) { /* Look at chars in bitmap */
13226 uvchr_to_utf8(s, i);
13227
13228 if (i < 256
13229 && ! ANYOF_BITMAP_TEST(o, i) /* Don't duplicate
13230 things already
13231 output as part
13232 of the bitmap */
13233 && swash_fetch(sw, s, TRUE))
13234 {
13235 if (rangestart == -1)
13236 rangestart = i;
13237 } else if (rangestart != -1) {
13238 byte_output = TRUE;
13239 if (i <= rangestart + 3)
13240 for (; rangestart < i; rangestart++) {
13241 put_byte(sv, rangestart);
13242 }
13243 else {
13244 put_byte(sv, rangestart);
13245 sv_catpvs(sv, "-");
13246 put_byte(sv, i-1);
13247 }
13248 rangestart = -1;
13249 }
13250 }
13251 }
13252
13253 {
13254 char *s = savesvpv(lv);
13255 char * const origs = s;
13256
13257 while (*s && *s != '\n')
13258 s++;
13259
13260 if (*s == '\n') {
13261 const char * const t = ++s;
13262
13263 if (byte_output) {
13264 sv_catpvs(sv, " ");
13265 }
13266
13267 while (*s) {
13268 if (*s == '\n') {
13269
13270 /* Truncate very long output */
13271 if (s - origs > 256) {
13272 Perl_sv_catpvf(aTHX_ sv,
13273 "%.*s...",
13274 (int) (s - origs - 1),
13275 t);
13276 goto out_dump;
13277 }
13278 *s = ' ';
13279 }
13280 else if (*s == '\t') {
13281 *s = '-';
13282 }
13283 s++;
13284 }
13285 if (s[-1] == ' ')
13286 s[-1] = 0;
13287
13288 sv_catpv(sv, t);
13289 }
13290
13291 out_dump:
13292
13293 Safefree(origs);
13294 }
13295 SvREFCNT_dec(lv);
13296 }
13297 }
13298
13299 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
13300 }
13301 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
13302 Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
13303#else
13304 PERL_UNUSED_CONTEXT;
13305 PERL_UNUSED_ARG(sv);
13306 PERL_UNUSED_ARG(o);
13307 PERL_UNUSED_ARG(prog);
13308#endif /* DEBUGGING */
13309}
13310
13311SV *
13312Perl_re_intuit_string(pTHX_ REGEXP * const r)
13313{ /* Assume that RE_INTUIT is set */
13314 dVAR;
13315 struct regexp *const prog = (struct regexp *)SvANY(r);
13316 GET_RE_DEBUG_FLAGS_DECL;
13317
13318 PERL_ARGS_ASSERT_RE_INTUIT_STRING;
13319 PERL_UNUSED_CONTEXT;
13320
13321 DEBUG_COMPILE_r(
13322 {
13323 const char * const s = SvPV_nolen_const(prog->check_substr
13324 ? prog->check_substr : prog->check_utf8);
13325
13326 if (!PL_colorset) reginitcolors();
13327 PerlIO_printf(Perl_debug_log,
13328 "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
13329 PL_colors[4],
13330 prog->check_substr ? "" : "utf8 ",
13331 PL_colors[5],PL_colors[0],
13332 s,
13333 PL_colors[1],
13334 (strlen(s) > 60 ? "..." : ""));
13335 } );
13336
13337 return prog->check_substr ? prog->check_substr : prog->check_utf8;
13338}
13339
13340/*
13341 pregfree()
13342
13343 handles refcounting and freeing the perl core regexp structure. When
13344 it is necessary to actually free the structure the first thing it
13345 does is call the 'free' method of the regexp_engine associated to
13346 the regexp, allowing the handling of the void *pprivate; member
13347 first. (This routine is not overridable by extensions, which is why
13348 the extensions free is called first.)
13349
13350 See regdupe and regdupe_internal if you change anything here.
13351*/
13352#ifndef PERL_IN_XSUB_RE
13353void
13354Perl_pregfree(pTHX_ REGEXP *r)
13355{
13356 SvREFCNT_dec(r);
13357}
13358
13359void
13360Perl_pregfree2(pTHX_ REGEXP *rx)
13361{
13362 dVAR;
13363 struct regexp *const r = (struct regexp *)SvANY(rx);
13364 GET_RE_DEBUG_FLAGS_DECL;
13365
13366 PERL_ARGS_ASSERT_PREGFREE2;
13367
13368 if (r->mother_re) {
13369 ReREFCNT_dec(r->mother_re);
13370 } else {
13371 CALLREGFREE_PVT(rx); /* free the private data */
13372 SvREFCNT_dec(RXp_PAREN_NAMES(r));
13373 }
13374 if (r->substrs) {
13375 SvREFCNT_dec(r->anchored_substr);
13376 SvREFCNT_dec(r->anchored_utf8);
13377 SvREFCNT_dec(r->float_substr);
13378 SvREFCNT_dec(r->float_utf8);
13379 Safefree(r->substrs);
13380 }
13381 RX_MATCH_COPY_FREE(rx);
13382#ifdef PERL_OLD_COPY_ON_WRITE
13383 SvREFCNT_dec(r->saved_copy);
13384#endif
13385 Safefree(r->offs);
13386 SvREFCNT_dec(r->qr_anoncv);
13387}
13388
13389/* reg_temp_copy()
13390
13391 This is a hacky workaround to the structural issue of match results
13392 being stored in the regexp structure which is in turn stored in
13393 PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern
13394 could be PL_curpm in multiple contexts, and could require multiple
13395 result sets being associated with the pattern simultaneously, such
13396 as when doing a recursive match with (??{$qr})
13397
13398 The solution is to make a lightweight copy of the regexp structure
13399 when a qr// is returned from the code executed by (??{$qr}) this
13400 lightweight copy doesn't actually own any of its data except for
13401 the starp/end and the actual regexp structure itself.
13402
13403*/
13404
13405
13406REGEXP *
13407Perl_reg_temp_copy (pTHX_ REGEXP *ret_x, REGEXP *rx)
13408{
13409 struct regexp *ret;
13410 struct regexp *const r = (struct regexp *)SvANY(rx);
13411
13412 PERL_ARGS_ASSERT_REG_TEMP_COPY;
13413
13414 if (!ret_x)
13415 ret_x = (REGEXP*) newSV_type(SVt_REGEXP);
13416 ret = (struct regexp *)SvANY(ret_x);
13417
13418 (void)ReREFCNT_inc(rx);
13419 /* We can take advantage of the existing "copied buffer" mechanism in SVs
13420 by pointing directly at the buffer, but flagging that the allocated
13421 space in the copy is zero. As we've just done a struct copy, it's now
13422 a case of zero-ing that, rather than copying the current length. */
13423 SvPV_set(ret_x, RX_WRAPPED(rx));
13424 SvFLAGS(ret_x) |= SvFLAGS(rx) & (SVf_POK|SVp_POK|SVf_UTF8);
13425 memcpy(&(ret->xpv_cur), &(r->xpv_cur),
13426 sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur));
13427 SvLEN_set(ret_x, 0);
13428 SvSTASH_set(ret_x, NULL);
13429 SvMAGIC_set(ret_x, NULL);
13430 if (r->offs) {
13431 const I32 npar = r->nparens+1;
13432 Newx(ret->offs, npar, regexp_paren_pair);
13433 Copy(r->offs, ret->offs, npar, regexp_paren_pair);
13434 }
13435 if (r->substrs) {
13436 Newx(ret->substrs, 1, struct reg_substr_data);
13437 StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
13438
13439 SvREFCNT_inc_void(ret->anchored_substr);
13440 SvREFCNT_inc_void(ret->anchored_utf8);
13441 SvREFCNT_inc_void(ret->float_substr);
13442 SvREFCNT_inc_void(ret->float_utf8);
13443
13444 /* check_substr and check_utf8, if non-NULL, point to either their
13445 anchored or float namesakes, and don't hold a second reference. */
13446 }
13447 RX_MATCH_COPIED_off(ret_x);
13448#ifdef PERL_OLD_COPY_ON_WRITE
13449 ret->saved_copy = NULL;
13450#endif
13451 ret->mother_re = rx;
13452 SvREFCNT_inc_void(ret->qr_anoncv);
13453
13454 return ret_x;
13455}
13456#endif
13457
13458/* regfree_internal()
13459
13460 Free the private data in a regexp. This is overloadable by
13461 extensions. Perl takes care of the regexp structure in pregfree(),
13462 this covers the *pprivate pointer which technically perl doesn't
13463 know about, however of course we have to handle the
13464 regexp_internal structure when no extension is in use.
13465
13466 Note this is called before freeing anything in the regexp
13467 structure.
13468 */
13469
13470void
13471Perl_regfree_internal(pTHX_ REGEXP * const rx)
13472{
13473 dVAR;
13474 struct regexp *const r = (struct regexp *)SvANY(rx);
13475 RXi_GET_DECL(r,ri);
13476 GET_RE_DEBUG_FLAGS_DECL;
13477
13478 PERL_ARGS_ASSERT_REGFREE_INTERNAL;
13479
13480 DEBUG_COMPILE_r({
13481 if (!PL_colorset)
13482 reginitcolors();
13483 {
13484 SV *dsv= sv_newmortal();
13485 RE_PV_QUOTED_DECL(s, RX_UTF8(rx),
13486 dsv, RX_PRECOMP(rx), RX_PRELEN(rx), 60);
13487 PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
13488 PL_colors[4],PL_colors[5],s);
13489 }
13490 });
13491#ifdef RE_TRACK_PATTERN_OFFSETS
13492 if (ri->u.offsets)
13493 Safefree(ri->u.offsets); /* 20010421 MJD */
13494#endif
13495 if (ri->code_blocks) {
13496 int n;
13497 for (n = 0; n < ri->num_code_blocks; n++)
13498 SvREFCNT_dec(ri->code_blocks[n].src_regex);
13499 Safefree(ri->code_blocks);
13500 }
13501
13502 if (ri->data) {
13503 int n = ri->data->count;
13504
13505 while (--n >= 0) {
13506 /* If you add a ->what type here, update the comment in regcomp.h */
13507 switch (ri->data->what[n]) {
13508 case 'a':
13509 case 'r':
13510 case 's':
13511 case 'S':
13512 case 'u':
13513 SvREFCNT_dec(MUTABLE_SV(ri->data->data[n]));
13514 break;
13515 case 'f':
13516 Safefree(ri->data->data[n]);
13517 break;
13518 case 'l':
13519 case 'L':
13520 break;
13521 case 'T':
13522 { /* Aho Corasick add-on structure for a trie node.
13523 Used in stclass optimization only */
13524 U32 refcount;
13525 reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
13526 OP_REFCNT_LOCK;
13527 refcount = --aho->refcount;
13528 OP_REFCNT_UNLOCK;
13529 if ( !refcount ) {
13530 PerlMemShared_free(aho->states);
13531 PerlMemShared_free(aho->fail);
13532 /* do this last!!!! */
13533 PerlMemShared_free(ri->data->data[n]);
13534 PerlMemShared_free(ri->regstclass);
13535 }
13536 }
13537 break;
13538 case 't':
13539 {
13540 /* trie structure. */
13541 U32 refcount;
13542 reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
13543 OP_REFCNT_LOCK;
13544 refcount = --trie->refcount;
13545 OP_REFCNT_UNLOCK;
13546 if ( !refcount ) {
13547 PerlMemShared_free(trie->charmap);
13548 PerlMemShared_free(trie->states);
13549 PerlMemShared_free(trie->trans);
13550 if (trie->bitmap)
13551 PerlMemShared_free(trie->bitmap);
13552 if (trie->jump)
13553 PerlMemShared_free(trie->jump);
13554 PerlMemShared_free(trie->wordinfo);
13555 /* do this last!!!! */
13556 PerlMemShared_free(ri->data->data[n]);
13557 }
13558 }
13559 break;
13560 default:
13561 Perl_croak(aTHX_ "panic: regfree data code '%c'", ri->data->what[n]);
13562 }
13563 }
13564 Safefree(ri->data->what);
13565 Safefree(ri->data);
13566 }
13567
13568 Safefree(ri);
13569}
13570
13571#define av_dup_inc(s,t) MUTABLE_AV(sv_dup_inc((const SV *)s,t))
13572#define hv_dup_inc(s,t) MUTABLE_HV(sv_dup_inc((const SV *)s,t))
13573#define SAVEPVN(p,n) ((p) ? savepvn(p,n) : NULL)
13574
13575/*
13576 re_dup - duplicate a regexp.
13577
13578 This routine is expected to clone a given regexp structure. It is only
13579 compiled under USE_ITHREADS.
13580
13581 After all of the core data stored in struct regexp is duplicated
13582 the regexp_engine.dupe method is used to copy any private data
13583 stored in the *pprivate pointer. This allows extensions to handle
13584 any duplication it needs to do.
13585
13586 See pregfree() and regfree_internal() if you change anything here.
13587*/
13588#if defined(USE_ITHREADS)
13589#ifndef PERL_IN_XSUB_RE
13590void
13591Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param)
13592{
13593 dVAR;
13594 I32 npar;
13595 const struct regexp *r = (const struct regexp *)SvANY(sstr);
13596 struct regexp *ret = (struct regexp *)SvANY(dstr);
13597
13598 PERL_ARGS_ASSERT_RE_DUP_GUTS;
13599
13600 npar = r->nparens+1;
13601 Newx(ret->offs, npar, regexp_paren_pair);
13602 Copy(r->offs, ret->offs, npar, regexp_paren_pair);
13603 if(ret->swap) {
13604 /* no need to copy these */
13605 Newx(ret->swap, npar, regexp_paren_pair);
13606 }
13607
13608 if (ret->substrs) {
13609 /* Do it this way to avoid reading from *r after the StructCopy().
13610 That way, if any of the sv_dup_inc()s dislodge *r from the L1
13611 cache, it doesn't matter. */
13612 const bool anchored = r->check_substr
13613 ? r->check_substr == r->anchored_substr
13614 : r->check_utf8 == r->anchored_utf8;
13615 Newx(ret->substrs, 1, struct reg_substr_data);
13616 StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
13617
13618 ret->anchored_substr = sv_dup_inc(ret->anchored_substr, param);
13619 ret->anchored_utf8 = sv_dup_inc(ret->anchored_utf8, param);
13620 ret->float_substr = sv_dup_inc(ret->float_substr, param);
13621 ret->float_utf8 = sv_dup_inc(ret->float_utf8, param);
13622
13623 /* check_substr and check_utf8, if non-NULL, point to either their
13624 anchored or float namesakes, and don't hold a second reference. */
13625
13626 if (ret->check_substr) {
13627 if (anchored) {
13628 assert(r->check_utf8 == r->anchored_utf8);
13629 ret->check_substr = ret->anchored_substr;
13630 ret->check_utf8 = ret->anchored_utf8;
13631 } else {
13632 assert(r->check_substr == r->float_substr);
13633 assert(r->check_utf8 == r->float_utf8);
13634 ret->check_substr = ret->float_substr;
13635 ret->check_utf8 = ret->float_utf8;
13636 }
13637 } else if (ret->check_utf8) {
13638 if (anchored) {
13639 ret->check_utf8 = ret->anchored_utf8;
13640 } else {
13641 ret->check_utf8 = ret->float_utf8;
13642 }
13643 }
13644 }
13645
13646 RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param);
13647 ret->qr_anoncv = MUTABLE_CV(sv_dup_inc((const SV *)ret->qr_anoncv, param));
13648
13649 if (ret->pprivate)
13650 RXi_SET(ret,CALLREGDUPE_PVT(dstr,param));
13651
13652 if (RX_MATCH_COPIED(dstr))
13653 ret->subbeg = SAVEPVN(ret->subbeg, ret->sublen);
13654 else
13655 ret->subbeg = NULL;
13656#ifdef PERL_OLD_COPY_ON_WRITE
13657 ret->saved_copy = NULL;
13658#endif
13659
13660 if (ret->mother_re) {
13661 if (SvPVX_const(dstr) == SvPVX_const(ret->mother_re)) {
13662 /* Our storage points directly to our mother regexp, but that's
13663 1: a buffer in a different thread
13664 2: something we no longer hold a reference on
13665 so we need to copy it locally. */
13666 /* Note we need to use SvCUR(), rather than
13667 SvLEN(), on our mother_re, because it, in
13668 turn, may well be pointing to its own mother_re. */
13669 SvPV_set(dstr, SAVEPVN(SvPVX_const(ret->mother_re),
13670 SvCUR(ret->mother_re)+1));
13671 SvLEN_set(dstr, SvCUR(ret->mother_re)+1);
13672 }
13673 ret->mother_re = NULL;
13674 }
13675 ret->gofs = 0;
13676}
13677#endif /* PERL_IN_XSUB_RE */
13678
13679/*
13680 regdupe_internal()
13681
13682 This is the internal complement to regdupe() which is used to copy
13683 the structure pointed to by the *pprivate pointer in the regexp.
13684 This is the core version of the extension overridable cloning hook.
13685 The regexp structure being duplicated will be copied by perl prior
13686 to this and will be provided as the regexp *r argument, however
13687 with the /old/ structures pprivate pointer value. Thus this routine
13688 may override any copying normally done by perl.
13689
13690 It returns a pointer to the new regexp_internal structure.
13691*/
13692
13693void *
13694Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param)
13695{
13696 dVAR;
13697 struct regexp *const r = (struct regexp *)SvANY(rx);
13698 regexp_internal *reti;
13699 int len;
13700 RXi_GET_DECL(r,ri);
13701
13702 PERL_ARGS_ASSERT_REGDUPE_INTERNAL;
13703
13704 len = ProgLen(ri);
13705
13706 Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode), char, regexp_internal);
13707 Copy(ri->program, reti->program, len+1, regnode);
13708
13709 reti->num_code_blocks = ri->num_code_blocks;
13710 if (ri->code_blocks) {
13711 int n;
13712 Newxc(reti->code_blocks, ri->num_code_blocks, struct reg_code_block,
13713 struct reg_code_block);
13714 Copy(ri->code_blocks, reti->code_blocks, ri->num_code_blocks,
13715 struct reg_code_block);
13716 for (n = 0; n < ri->num_code_blocks; n++)
13717 reti->code_blocks[n].src_regex = (REGEXP*)
13718 sv_dup_inc((SV*)(ri->code_blocks[n].src_regex), param);
13719 }
13720 else
13721 reti->code_blocks = NULL;
13722
13723 reti->regstclass = NULL;
13724
13725 if (ri->data) {
13726 struct reg_data *d;
13727 const int count = ri->data->count;
13728 int i;
13729
13730 Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
13731 char, struct reg_data);
13732 Newx(d->what, count, U8);
13733
13734 d->count = count;
13735 for (i = 0; i < count; i++) {
13736 d->what[i] = ri->data->what[i];
13737 switch (d->what[i]) {
13738 /* see also regcomp.h and regfree_internal() */
13739 case 'a': /* actually an AV, but the dup function is identical. */
13740 case 'r':
13741 case 's':
13742 case 'S':
13743 case 'u': /* actually an HV, but the dup function is identical. */
13744 d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param);
13745 break;
13746 case 'f':
13747 /* This is cheating. */
13748 Newx(d->data[i], 1, struct regnode_charclass_class);
13749 StructCopy(ri->data->data[i], d->data[i],
13750 struct regnode_charclass_class);
13751 reti->regstclass = (regnode*)d->data[i];
13752 break;
13753 case 'T':
13754 /* Trie stclasses are readonly and can thus be shared
13755 * without duplication. We free the stclass in pregfree
13756 * when the corresponding reg_ac_data struct is freed.
13757 */
13758 reti->regstclass= ri->regstclass;
13759 /* Fall through */
13760 case 't':
13761 OP_REFCNT_LOCK;
13762 ((reg_trie_data*)ri->data->data[i])->refcount++;
13763 OP_REFCNT_UNLOCK;
13764 /* Fall through */
13765 case 'l':
13766 case 'L':
13767 d->data[i] = ri->data->data[i];
13768 break;
13769 default:
13770 Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'", ri->data->what[i]);
13771 }
13772 }
13773
13774 reti->data = d;
13775 }
13776 else
13777 reti->data = NULL;
13778
13779 reti->name_list_idx = ri->name_list_idx;
13780
13781#ifdef RE_TRACK_PATTERN_OFFSETS
13782 if (ri->u.offsets) {
13783 Newx(reti->u.offsets, 2*len+1, U32);
13784 Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32);
13785 }
13786#else
13787 SetProgLen(reti,len);
13788#endif
13789
13790 return (void*)reti;
13791}
13792
13793#endif /* USE_ITHREADS */
13794
13795#ifndef PERL_IN_XSUB_RE
13796
13797/*
13798 - regnext - dig the "next" pointer out of a node
13799 */
13800regnode *
13801Perl_regnext(pTHX_ register regnode *p)
13802{
13803 dVAR;
13804 register I32 offset;
13805
13806 if (!p)
13807 return(NULL);
13808
13809 if (OP(p) > REGNODE_MAX) { /* regnode.type is unsigned */
13810 Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(p), (int)REGNODE_MAX);
13811 }
13812
13813 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
13814 if (offset == 0)
13815 return(NULL);
13816
13817 return(p+offset);
13818}
13819#endif
13820
13821STATIC void
13822S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
13823{
13824 va_list args;
13825 STRLEN l1 = strlen(pat1);
13826 STRLEN l2 = strlen(pat2);
13827 char buf[512];
13828 SV *msv;
13829 const char *message;
13830
13831 PERL_ARGS_ASSERT_RE_CROAK2;
13832
13833 if (l1 > 510)
13834 l1 = 510;
13835 if (l1 + l2 > 510)
13836 l2 = 510 - l1;
13837 Copy(pat1, buf, l1 , char);
13838 Copy(pat2, buf + l1, l2 , char);
13839 buf[l1 + l2] = '\n';
13840 buf[l1 + l2 + 1] = '\0';
13841#ifdef I_STDARG
13842 /* ANSI variant takes additional second argument */
13843 va_start(args, pat2);
13844#else
13845 va_start(args);
13846#endif
13847 msv = vmess(buf, &args);
13848 va_end(args);
13849 message = SvPV_const(msv,l1);
13850 if (l1 > 512)
13851 l1 = 512;
13852 Copy(message, buf, l1 , char);
13853 buf[l1-1] = '\0'; /* Overwrite \n */
13854 Perl_croak(aTHX_ "%s", buf);
13855}
13856
13857/* XXX Here's a total kludge. But we need to re-enter for swash routines. */
13858
13859#ifndef PERL_IN_XSUB_RE
13860void
13861Perl_save_re_context(pTHX)
13862{
13863 dVAR;
13864
13865 struct re_save_state *state;
13866
13867 SAVEVPTR(PL_curcop);
13868 SSGROW(SAVESTACK_ALLOC_FOR_RE_SAVE_STATE + 1);
13869
13870 state = (struct re_save_state *)(PL_savestack + PL_savestack_ix);
13871 PL_savestack_ix += SAVESTACK_ALLOC_FOR_RE_SAVE_STATE;
13872 SSPUSHUV(SAVEt_RE_STATE);
13873
13874 Copy(&PL_reg_state, state, 1, struct re_save_state);
13875
13876 PL_reg_oldsaved = NULL;
13877 PL_reg_oldsavedlen = 0;
13878 PL_reg_maxiter = 0;
13879 PL_reg_leftiter = 0;
13880 PL_reg_poscache = NULL;
13881 PL_reg_poscache_size = 0;
13882#ifdef PERL_OLD_COPY_ON_WRITE
13883 PL_nrs = NULL;
13884#endif
13885
13886 /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
13887 if (PL_curpm) {
13888 const REGEXP * const rx = PM_GETRE(PL_curpm);
13889 if (rx) {
13890 U32 i;
13891 for (i = 1; i <= RX_NPARENS(rx); i++) {
13892 char digits[TYPE_CHARS(long)];
13893 const STRLEN len = my_snprintf(digits, sizeof(digits), "%lu", (long)i);
13894 GV *const *const gvp
13895 = (GV**)hv_fetch(PL_defstash, digits, len, 0);
13896
13897 if (gvp) {
13898 GV * const gv = *gvp;
13899 if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
13900 save_scalar(gv);
13901 }
13902 }
13903 }
13904 }
13905}
13906#endif
13907
13908static void
13909clear_re(pTHX_ void *r)
13910{
13911 dVAR;
13912 ReREFCNT_dec((REGEXP *)r);
13913}
13914
13915#ifdef DEBUGGING
13916
13917STATIC void
13918S_put_byte(pTHX_ SV *sv, int c)
13919{
13920 PERL_ARGS_ASSERT_PUT_BYTE;
13921
13922 /* Our definition of isPRINT() ignores locales, so only bytes that are
13923 not part of UTF-8 are considered printable. I assume that the same
13924 holds for UTF-EBCDIC.
13925 Also, code point 255 is not printable in either (it's E0 in EBCDIC,
13926 which Wikipedia says:
13927
13928 EO, or Eight Ones, is an 8-bit EBCDIC character code represented as all
13929 ones (binary 1111 1111, hexadecimal FF). It is similar, but not
13930 identical, to the ASCII delete (DEL) or rubout control character.
13931 ) So the old condition can be simplified to !isPRINT(c) */
13932 if (!isPRINT(c)) {
13933 if (c < 256) {
13934 Perl_sv_catpvf(aTHX_ sv, "\\x%02x", c);
13935 }
13936 else {
13937 Perl_sv_catpvf(aTHX_ sv, "\\x{%x}", c);
13938 }
13939 }
13940 else {
13941 const char string = c;
13942 if (c == '-' || c == ']' || c == '\\' || c == '^')
13943 sv_catpvs(sv, "\\");
13944 sv_catpvn(sv, &string, 1);
13945 }
13946}
13947
13948
13949#define CLEAR_OPTSTART \
13950 if (optstart) STMT_START { \
13951 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, " (%"IVdf" nodes)\n", (IV)(node - optstart))); \
13952 optstart=NULL; \
13953 } STMT_END
13954
13955#define DUMPUNTIL(b,e) CLEAR_OPTSTART; node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
13956
13957STATIC const regnode *
13958S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
13959 const regnode *last, const regnode *plast,
13960 SV* sv, I32 indent, U32 depth)
13961{
13962 dVAR;
13963 register U8 op = PSEUDO; /* Arbitrary non-END op. */
13964 register const regnode *next;
13965 const regnode *optstart= NULL;
13966
13967 RXi_GET_DECL(r,ri);
13968 GET_RE_DEBUG_FLAGS_DECL;
13969
13970 PERL_ARGS_ASSERT_DUMPUNTIL;
13971
13972#ifdef DEBUG_DUMPUNTIL
13973 PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
13974 last ? last-start : 0,plast ? plast-start : 0);
13975#endif
13976
13977 if (plast && plast < last)
13978 last= plast;
13979
13980 while (PL_regkind[op] != END && (!last || node < last)) {
13981 /* While that wasn't END last time... */
13982 NODE_ALIGN(node);
13983 op = OP(node);
13984 if (op == CLOSE || op == WHILEM)
13985 indent--;
13986 next = regnext((regnode *)node);
13987
13988 /* Where, what. */
13989 if (OP(node) == OPTIMIZED) {
13990 if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
13991 optstart = node;
13992 else
13993 goto after_print;
13994 } else
13995 CLEAR_OPTSTART;
13996
13997 regprop(r, sv, node);
13998 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
13999 (int)(2*indent + 1), "", SvPVX_const(sv));
14000
14001 if (OP(node) != OPTIMIZED) {
14002 if (next == NULL) /* Next ptr. */
14003 PerlIO_printf(Perl_debug_log, " (0)");
14004 else if (PL_regkind[(U8)op] == BRANCH && PL_regkind[OP(next)] != BRANCH )
14005 PerlIO_printf(Perl_debug_log, " (FAIL)");
14006 else
14007 PerlIO_printf(Perl_debug_log, " (%"IVdf")", (IV)(next - start));
14008 (void)PerlIO_putc(Perl_debug_log, '\n');
14009 }
14010
14011 after_print:
14012 if (PL_regkind[(U8)op] == BRANCHJ) {
14013 assert(next);
14014 {
14015 register const regnode *nnode = (OP(next) == LONGJMP
14016 ? regnext((regnode *)next)
14017 : next);
14018 if (last && nnode > last)
14019 nnode = last;
14020 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
14021 }
14022 }
14023 else if (PL_regkind[(U8)op] == BRANCH) {
14024 assert(next);
14025 DUMPUNTIL(NEXTOPER(node), next);
14026 }
14027 else if ( PL_regkind[(U8)op] == TRIE ) {
14028 const regnode *this_trie = node;
14029 const char op = OP(node);
14030 const U32 n = ARG(node);
14031 const reg_ac_data * const ac = op>=AHOCORASICK ?
14032 (reg_ac_data *)ri->data->data[n] :
14033 NULL;
14034 const reg_trie_data * const trie =
14035 (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
14036#ifdef DEBUGGING
14037 AV *const trie_words = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]);
14038#endif
14039 const regnode *nextbranch= NULL;
14040 I32 word_idx;
14041 sv_setpvs(sv, "");
14042 for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
14043 SV ** const elem_ptr = av_fetch(trie_words,word_idx,0);
14044
14045 PerlIO_printf(Perl_debug_log, "%*s%s ",
14046 (int)(2*(indent+3)), "",
14047 elem_ptr ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr), SvCUR(*elem_ptr), 60,
14048 PL_colors[0], PL_colors[1],
14049 (SvUTF8(*elem_ptr) ? PERL_PV_ESCAPE_UNI : 0) |
14050 PERL_PV_PRETTY_ELLIPSES |
14051 PERL_PV_PRETTY_LTGT
14052 )
14053 : "???"
14054 );
14055 if (trie->jump) {
14056 U16 dist= trie->jump[word_idx+1];
14057 PerlIO_printf(Perl_debug_log, "(%"UVuf")\n",
14058 (UV)((dist ? this_trie + dist : next) - start));
14059 if (dist) {
14060 if (!nextbranch)
14061 nextbranch= this_trie + trie->jump[0];
14062 DUMPUNTIL(this_trie + dist, nextbranch);
14063 }
14064 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
14065 nextbranch= regnext((regnode *)nextbranch);
14066 } else {
14067 PerlIO_printf(Perl_debug_log, "\n");
14068 }
14069 }
14070 if (last && next > last)
14071 node= last;
14072 else
14073 node= next;
14074 }
14075 else if ( op == CURLY ) { /* "next" might be very big: optimizer */
14076 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
14077 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
14078 }
14079 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
14080 assert(next);
14081 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
14082 }
14083 else if ( op == PLUS || op == STAR) {
14084 DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
14085 }
14086 else if (PL_regkind[(U8)op] == ANYOF) {
14087 /* arglen 1 + class block */
14088 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_CLASS)
14089 ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
14090 node = NEXTOPER(node);
14091 }
14092 else if (PL_regkind[(U8)op] == EXACT) {
14093 /* Literal string, where present. */
14094 node += NODE_SZ_STR(node) - 1;
14095 node = NEXTOPER(node);
14096 }
14097 else {
14098 node = NEXTOPER(node);
14099 node += regarglen[(U8)op];
14100 }
14101 if (op == CURLYX || op == OPEN)
14102 indent++;
14103 }
14104 CLEAR_OPTSTART;
14105#ifdef DEBUG_DUMPUNTIL
14106 PerlIO_printf(Perl_debug_log, "--- %d\n", (int)indent);
14107#endif
14108 return node;
14109}
14110
14111#endif /* DEBUGGING */
14112
14113/*
14114 * Local variables:
14115 * c-indentation-style: bsd
14116 * c-basic-offset: 4
14117 * indent-tabs-mode: nil
14118 * End:
14119 *
14120 * ex: set ts=8 sts=4 sw=4 et:
14121 */